
Commit 25aba2b

[gpt-oss] Add IncompleteDetails to ResponsesRepsonse (vllm-project#24561)
Signed-off-by: Andrew Xia <axia@meta.com>
1 parent 94b03f8 commit 25aba2b

7 files changed: +67, -25 lines changed


tests/entrypoints/openai/test_response_api_with_harmony.py

Lines changed: 14 additions & 0 deletions
@@ -74,6 +74,20 @@ async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
     assert response.status == "completed"
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_max_tokens(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="What is the first paragraph of Moby Dick?",
+        reasoning={"effort": "low"},
+        max_output_tokens=30,
+    )
+    assert response is not None
+    assert response.status == "incomplete"
+    assert response.incomplete_details.reason == "max_output_tokens"
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_chat(client: OpenAI, model_name: str):
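
For reference, the new incomplete path can also be exercised by hand against a running server. A minimal sketch, assuming a vLLM OpenAI-compatible endpoint is already serving a gpt-oss model; the base URL, API key, and model name below are placeholders, not part of this commit:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.responses.create(
    model="openai/gpt-oss-20b",  # placeholder model name
    input="What is the first paragraph of Moby Dick?",
    reasoning={"effort": "low"},
    max_output_tokens=30,  # small cap so generation gets cut off
)

# With the cap hit, the response reports why it stopped early.
print(response.status)                     # expected: "incomplete"
print(response.incomplete_details.reason)  # expected: "max_output_tokens"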

vllm/entrypoints/context.py

Lines changed: 15 additions & 10 deletions
@@ -112,6 +112,7 @@ def __init__(
         available_tools: list[str],
     ):
         self._messages = messages
+        self.finish_reason: Optional[str] = None
         self.available_tools = available_tools
         self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {}
         self.called_tools: set[str] = set()
@@ -135,7 +136,8 @@ def _update_num_reasoning_tokens(self):
         if self.parser.current_channel in {"analysis", "commentary"}:
             self.num_reasoning_tokens += 1
 
-    def append_output(self, output) -> None:
+    def append_output(self, output: Union[RequestOutput,
+                                          list[Message]]) -> None:
         if isinstance(output, RequestOutput):
             output_token_ids = output.outputs[0].token_ids
             self.parser = get_streamable_parser_for_assistant()
@@ -150,25 +152,27 @@ def append_output(self, output) -> None:
             # Move current turn to previous turn for next turn's calculations
             self.previous_turn = self.current_turn.copy()
             output_msgs = self.parser.messages
+            # The responses finish reason is set in the last message
+            self.finish_reason = output.outputs[0].finish_reason
         else:
             # Tool output.
             output_msgs = output
         self._messages.extend(output_msgs)
 
     def _update_prefill_token_usage(self, output: RequestOutput) -> None:
         """Update token usage statistics for the prefill phase of generation.
-
+
         The prefill phase processes the input prompt tokens. This method:
         1. Counts the prompt tokens for this turn
         2. Calculates tool output tokens for multi-turn conversations
         3. Updates cached token counts
         4. Tracks state for next turn calculations
-
+
         Tool output tokens are calculated as:
-        current_prompt_tokens - last_turn_prompt_tokens -
+        current_prompt_tokens - last_turn_prompt_tokens -
         last_turn_output_tokens
         This represents tokens added between turns (typically tool responses).
-
+
         Args:
             output: The RequestOutput containing prompt token information
         """
@@ -214,18 +218,18 @@ def _update_prefill_token_usage(self, output: RequestOutput) -> None:
 
     def _update_decode_token_usage(self, output: RequestOutput) -> int:
         """Update token usage statistics for the decode phase of generation.
-
+
         The decode phase processes the generated output tokens. This method:
         1. Counts output tokens from all completion outputs
         2. Updates the total output token count
         3. Tracks tokens generated in the current turn
-
+
         In streaming mode, this is called for each token generated.
         In non-streaming mode, this is called once with all output tokens.
-
+
         Args:
             output: The RequestOutput containing generated token information
-
+
         Returns:
             int: Number of output tokens processed in this call
         """
@@ -385,7 +389,8 @@ def __init__(self, *args, **kwargs):
     def messages(self) -> list:
         return self.parser.messages
 
-    def append_output(self, output) -> None:
+    def append_output(self, output: Union[RequestOutput,
+                                          list[Message]]) -> None:
         if isinstance(output, RequestOutput):
             # append_output is called for each output token in streaming case,
             # so we only want to add the prompt tokens once for each message.
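
In short, the context now records the engine's finish reason next to the parsed messages so the serving layer can map it to a response status later. A simplified sketch of that bookkeeping, using stand-in classes rather than vLLM's real RequestOutput/CompletionOutput types:

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class FakeCompletion:
    # Stand-in for vllm's CompletionOutput; only the fields used here.
    token_ids: list[int]
    finish_reason: Optional[str]  # e.g. "stop", "length", "abort"

@dataclass
class FakeRequestOutput:
    # Stand-in for vllm's RequestOutput.
    outputs: list[FakeCompletion] = field(default_factory=list)

class ContextSketch:
    def __init__(self) -> None:
        # Mirrors the new attribute: no finish reason until the engine
        # produces output for this request.
        self.finish_reason: Optional[str] = None

    def append_output(self, output) -> None:
        if isinstance(output, FakeRequestOutput):
            # The engine output carries the reason generation ended.
            self.finish_reason = output.outputs[0].finish_reason

ctx = ContextSketch()
ctx.append_output(FakeRequestOutput([FakeCompletion([1, 2, 3], "length")]))
assert ctx.finish_reason == "length"  # later surfaced as status "incomplete"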

vllm/entrypoints/harmony_utils.py

Lines changed: 3 additions & 1 deletion
@@ -387,7 +387,9 @@ def parse_remaining_state(
         id=f"msg_{random_uuid()}",
         content=[output_text],
         role="assistant",
-        status="completed",
+        # if the parser still has messages (ie if the generator got cut
+        # abruptly), this should be incomplete
+        status="incomplete",
         type="message",
     )
     return [text_item]

vllm/entrypoints/openai/protocol.py

Lines changed: 13 additions & 4 deletions
@@ -30,7 +30,7 @@
 from openai.types.responses import (ResponseFormatTextConfig as
                                     ResponseTextConfig)
 
-from openai.types.responses.response import ToolChoice
+from openai.types.responses.response import IncompleteDetails, ToolChoice
 from openai.types.responses.tool import Tool
 from openai.types.shared import Metadata, Reasoning
 from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
@@ -1868,7 +1868,7 @@ class ResponsesResponse(OpenAIBaseModel):
     id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
     created_at: int = Field(default_factory=lambda: int(time.time()))
     # error: Optional[ResponseError] = None
-    # incomplete_details: Optional[IncompleteDetails] = None
+    incomplete_details: Optional[IncompleteDetails] = None
     instructions: Optional[str] = None
     metadata: Optional[Metadata] = None
     model: str
@@ -1904,9 +1904,18 @@ def from_request(
         status: ResponseStatus,
         usage: Optional[ResponseUsage] = None,
     ) -> "ResponsesResponse":
+
+        incomplete_details: Optional[IncompleteDetails] = None
+        if status == 'incomplete':
+            incomplete_details = IncompleteDetails(reason='max_output_tokens')
+        # TODO: implement the other reason for incomplete_details,
+        # which is content_filter
+        # incomplete_details = IncompleteDetails(reason='content_filter')
+
         return cls(
             id=request.request_id,
             created_at=created_time,
+            incomplete_details=incomplete_details,
             instructions=request.instructions,
             metadata=request.metadata,
             model=model_name,
@@ -2109,7 +2118,7 @@ class DetokenizeResponse(OpenAIBaseModel):
 
 class TokenizerInfoResponse(OpenAIBaseModel):
     """
-    Response containing tokenizer configuration
+    Response containing tokenizer configuration
     equivalent to tokenizer_config.json
     """
 
@@ -2199,7 +2208,7 @@ class TranscriptionRequest(OpenAIBaseModel):
     to_language: Optional[str] = None
     """The language of the output audio we transcribe to.
 
-    Please note that this is not currently used by supported models at this
+    Please note that this is not currently used by supported models at this
    time, but it is a placeholder for future use, matching translation api.
    """
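
To illustrate how the now-enabled field surfaces in a serialized response, here is a small sketch using the same IncompleteDetails type the commit imports; the trimmed-down model and the example values are illustrative, not the full ResponsesResponse:

from typing import Optional

from openai.types.responses.response import IncompleteDetails
from pydantic import BaseModel

class ResponseSketch(BaseModel):
    # Trimmed-down stand-in for ResponsesResponse, keeping only the
    # fields needed to show the new attribute.
    id: str
    status: str
    incomplete_details: Optional[IncompleteDetails] = None

resp = ResponseSketch(
    id="resp_123",
    status="incomplete",
    incomplete_details=IncompleteDetails(reason="max_output_tokens"),
)
print(resp.model_dump())
# {'id': 'resp_123', 'status': 'incomplete',
#  'incomplete_details': {'reason': 'max_output_tokens'}}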

vllm/entrypoints/openai/serving_responses.py

Lines changed: 15 additions & 3 deletions
@@ -27,7 +27,7 @@
                                     ResponseReasoningItem,
                                     ResponseReasoningTextDeltaEvent,
                                     ResponseReasoningTextDoneEvent,
-                                    response_text_delta_event)
+                                    ResponseStatus, response_text_delta_event)
 from openai.types.responses.response_output_text import (Logprob,
                                                          LogprobTopLogprob)
 # yapf: enable
@@ -461,10 +461,22 @@ async def responses_full_generator(
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
 
+        # NOTE: Implementation of stauts is still WIP, but for now
+        # we guarantee that if the status is not "completed", it is accurate.
+        # "completed" is implemented as the "catch-all" for now.
+        status: ResponseStatus = "completed"
+
         if self.use_harmony:
             assert isinstance(context, HarmonyContext)
             output = self._make_response_output_items_with_harmony(context)
             num_tool_output_tokens = context.num_tool_output_tokens
+            if len(output) > 0:
+                if context.finish_reason == "length":
+                    status = "incomplete"
+                elif context.finish_reason == "abort":
+                    status = "cancelled"
+            else:
+                status = "incomplete"
         else:
             assert isinstance(context, SimpleContext)
             final_res = context.last_output
@@ -501,7 +513,7 @@ async def responses_full_generator(
             model_name=model_name,
             created_time=created_time,
             output=output,
-            status="completed",
+            status=status,
             usage=usage,
         )
 
@@ -658,7 +670,7 @@ def _make_response_output_items_with_harmony(
         self,
         context: HarmonyContext,
     ) -> list[ResponseOutputItem]:
-        output_items = []
+        output_items: list[ResponseOutputItem] = []
         num_init_messages = context.num_init_messages
         for msg in context.messages[num_init_messages:]:
             output_items.extend(parse_output_message(msg))
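
The status selection above can be read as a small mapping from the harmony context's finish reason to the Responses API status. A rough restatement as a standalone helper, which approximates the diff rather than quoting code from it:

from typing import Optional

def pick_status(num_output_items: int, finish_reason: Optional[str]) -> str:
    # "completed" stays the catch-all, per the NOTE in the diff above.
    status = "completed"
    if num_output_items > 0:
        if finish_reason == "length":
            # Generation was cut off (e.g. max_output_tokens reached).
            status = "incomplete"
        elif finish_reason == "abort":
            status = "cancelled"
    else:
        # No output items at all: report the response as incomplete.
        status = "incomplete"
    return status

assert pick_status(3, "stop") == "completed"
assert pick_status(3, "length") == "incomplete"
assert pick_status(3, "abort") == "cancelled"
assert pick_status(0, None) == "incomplete"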

vllm/v1/core/sched/utils.py

Lines changed: 4 additions & 4 deletions
@@ -10,19 +10,19 @@
 
 def remove_all(lst: list, items_to_remove: set) -> list:
     """Remove all items from a list that are in the items_to_remove set.
-
+
     This method optimizes for the common case of removing a single item,
     falling back to list comprehension for multiple items.
-
+
     Args:
         lst: The list to remove items from
         items_to_remove: Set of items to remove
-
+
     Returns:
         Either the modified original list (for single item removal) or
         a new list (for multiple item removal). Callers should use the
         returned value.
-
+
     Note:
         For single item removal, this modifies the original list in-place
         and returns it. For multiple items, it creates and returns a new list.

vllm/v1/engine/output_processor.py

Lines changed: 3 additions & 3 deletions
@@ -373,17 +373,17 @@ def process_outputs(
         1) Compute stats for logging
         2) Detokenize
         3) Create and handle RequestOutput objects:
-            * If there is a queue (for usage with AsyncLLM),
+            * If there is a queue (for usage with AsyncLLM),
              put the RequestOutput objects into the queue for
              handling by the per-request generate() tasks.
 
-            * If there is no queue (for usage with LLMEngine),
+            * If there is no queue (for usage with LLMEngine),
              return a list of RequestOutput objects.
 
        NOTE FOR DEVELOPERS
 
        vLLM V1 minimizes the number of python loops over the full
-        batch to ensure system overheads are minimized. This is the
+        batch to ensure system overheads are minimized. This is the
        only function that should loop over EngineCoreOutputs.
 
        If you need to touch every element of the batch, do it from
