
Commit d266c59

chore: remove deprecated inference.chat_completion implementations (#3654)
# What does this PR do?

Remove the unused `chat_completion` implementations.

vllm features ported (a hedged sketch of both behaviors follows below):
- require `max_tokens` to be set, falling back to the config value
- set `tool_choice` to none if no tools are provided

## Test Plan

ci
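The two ported behaviors can be summarized in a minimal sketch. This is not the provider's actual code: `VLLMConfig`, `default_max_tokens`, and `build_request_params` are illustrative names assumed for this example.

```python
# Hedged sketch of the two behaviors above; names are illustrative, not llama_stack APIs.
from dataclasses import dataclass
from typing import Any


@dataclass
class VLLMConfig:
    # assumed config knob used when the caller does not set max_tokens
    default_max_tokens: int = 4096


def build_request_params(
    config: VLLMConfig,
    messages: list[dict[str, Any]],
    max_tokens: int | None = None,
    tools: list[dict[str, Any]] | None = None,
) -> dict[str, Any]:
    params: dict[str, Any] = {
        "messages": messages,
        # behavior 1: max_tokens must always be set; fall back to the config value
        "max_tokens": max_tokens if max_tokens is not None else config.default_max_tokens,
    }
    if tools:
        params["tools"] = tools
    else:
        # behavior 2: with no tools provided, explicitly disable tool calling
        params["tool_choice"] = "none"
    return params


print(build_request_params(VLLMConfig(), [{"role": "user", "content": "hello"}]))
```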
1 parent 4dfbe46 commit d266c59

File tree

18 files changed: +193 -1410 lines changed

llama_stack/apis/inference/inference.py

Lines changed: 0 additions & 39 deletions
@@ -1006,45 +1006,6 @@ class InferenceProvider(Protocol):
 
     model_store: ModelStore | None = None
 
-    async def chat_completion(
-        self,
-        model_id: str,
-        messages: list[Message],
-        sampling_params: SamplingParams | None = None,
-        tools: list[ToolDefinition] | None = None,
-        tool_choice: ToolChoice | None = ToolChoice.auto,
-        tool_prompt_format: ToolPromptFormat | None = None,
-        response_format: ResponseFormat | None = None,
-        stream: bool | None = False,
-        logprobs: LogProbConfig | None = None,
-        tool_config: ToolConfig | None = None,
-    ) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
-        """Generate a chat completion for the given messages using the specified model.
-
-        :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
-        :param messages: List of messages in the conversation.
-        :param sampling_params: Parameters to control the sampling strategy.
-        :param tools: (Optional) List of tool definitions available to the model.
-        :param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
-            .. deprecated::
-               Use tool_config instead.
-        :param tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model.
-            - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
-            - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag.
-            - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls.
-            .. deprecated::
-               Use tool_config instead.
-        :param response_format: (Optional) Grammar specification for guided (structured) decoding. There are two options:
-            - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format.
-            - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it.
-        :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
-        :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
-        :param tool_config: (Optional) Configuration for tool use.
-        :returns: If stream=False, returns a ChatCompletionResponse with the full completion.
-            If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk.
-        """
-        ...
-
     @webmethod(route="/inference/rerank", method="POST", level=LLAMA_STACK_API_V1ALPHA)
     async def rerank(
         self,
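With the deprecated `chat_completion` protocol method gone, the chat path that remains in this API is the OpenAI-compatible one (the router below keeps `OpenAIChatCompletion`, `OpenAIMessageParam`, and related imports). A minimal sketch of calling it through the standard `openai` client follows; the base URL, API key, and model id are assumptions for illustration, not values taken from this PR.

```python
# Hedged sketch: drive the OpenAI-compatible chat completions path of a running
# Llama Stack server. base_url, api_key, and model below are placeholder assumptions.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8321/v1",  # assumed Llama Stack OpenAI-compatible base URL
    api_key="none",  # placeholder; a local stack may not check this
)

response = client.chat.completions.create(
    model="llama3.2-3b-instruct",  # placeholder model id registered with the stack
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=128,  # set explicitly, since providers such as vllm now expect a value
)
print(response.choices[0].message.content)
```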

llama_stack/core/routers/inference.py

Lines changed: 0 additions & 88 deletions
@@ -27,7 +27,6 @@
     CompletionResponseStreamChunk,
     Inference,
     ListOpenAIChatCompletionResponse,
-    LogProbConfig,
     Message,
     OpenAIAssistantMessageParam,
     OpenAIChatCompletion,
@@ -42,12 +41,7 @@
     OpenAIMessageParam,
     OpenAIResponseFormatParam,
     Order,
-    ResponseFormat,
-    SamplingParams,
     StopReason,
-    ToolChoice,
-    ToolConfig,
-    ToolDefinition,
     ToolPromptFormat,
 )
 from llama_stack.apis.models import Model, ModelType
@@ -185,88 +179,6 @@ async def _get_model(self, model_id: str, expected_model_type: str) -> Model:
             raise ModelTypeError(model_id, model.model_type, expected_model_type)
         return model
 
-    async def chat_completion(
-        self,
-        model_id: str,
-        messages: list[Message],
-        sampling_params: SamplingParams | None = None,
-        response_format: ResponseFormat | None = None,
-        tools: list[ToolDefinition] | None = None,
-        tool_choice: ToolChoice | None = None,
-        tool_prompt_format: ToolPromptFormat | None = None,
-        stream: bool | None = False,
-        logprobs: LogProbConfig | None = None,
-        tool_config: ToolConfig | None = None,
-    ) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
-        logger.debug(
-            f"InferenceRouter.chat_completion: {model_id=}, {stream=}, {messages=}, {tools=}, {tool_config=}, {response_format=}",
-        )
-        if sampling_params is None:
-            sampling_params = SamplingParams()
-        model = await self._get_model(model_id, ModelType.llm)
-        if tool_config:
-            if tool_choice and tool_choice != tool_config.tool_choice:
-                raise ValueError("tool_choice and tool_config.tool_choice must match")
-            if tool_prompt_format and tool_prompt_format != tool_config.tool_prompt_format:
-                raise ValueError("tool_prompt_format and tool_config.tool_prompt_format must match")
-        else:
-            params = {}
-            if tool_choice:
-                params["tool_choice"] = tool_choice
-            if tool_prompt_format:
-                params["tool_prompt_format"] = tool_prompt_format
-            tool_config = ToolConfig(**params)
-
-        tools = tools or []
-        if tool_config.tool_choice == ToolChoice.none:
-            tools = []
-        elif tool_config.tool_choice == ToolChoice.auto:
-            pass
-        elif tool_config.tool_choice == ToolChoice.required:
-            pass
-        else:
-            # verify tool_choice is one of the tools
-            tool_names = [t.tool_name if isinstance(t.tool_name, str) else t.tool_name.value for t in tools]
-            if tool_config.tool_choice not in tool_names:
-                raise ValueError(f"Tool choice {tool_config.tool_choice} is not one of the tools: {tool_names}")
-
-        params = dict(
-            model_id=model_id,
-            messages=messages,
-            sampling_params=sampling_params,
-            tools=tools,
-            tool_choice=tool_choice,
-            tool_prompt_format=tool_prompt_format,
-            response_format=response_format,
-            stream=stream,
-            logprobs=logprobs,
-            tool_config=tool_config,
-        )
-        provider = await self.routing_table.get_provider_impl(model_id)
-        prompt_tokens = await self._count_tokens(messages, tool_config.tool_prompt_format)
-
-        if stream:
-            response_stream = await provider.chat_completion(**params)
-            return self.stream_tokens_and_compute_metrics(
-                response=response_stream,
-                prompt_tokens=prompt_tokens,
-                model=model,
-                tool_prompt_format=tool_config.tool_prompt_format,
-            )
-
-        response = await provider.chat_completion(**params)
-        metrics = await self.count_tokens_and_compute_metrics(
-            response=response,
-            prompt_tokens=prompt_tokens,
-            model=model,
-            tool_prompt_format=tool_config.tool_prompt_format,
-        )
-        # these metrics will show up in the client response.
-        response.metrics = (
-            metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
-        )
-        return response
-
     async def openai_completion(
         self,
         model: str,
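For reference, the core rule that the removed router method enforced around tool choice — drop all tools when the choice is none, pass them through for auto or required, and otherwise require that a named choice actually appears among the provided tools — can be restated as a small standalone sketch. `resolve_tools` and the plain-string values are assumptions for this example, not llama_stack APIs.

```python
# Simplified restatement of the tool_choice validation the removed
# InferenceRouter.chat_completion performed; names here are illustrative.
def resolve_tools(tool_choice: str, tools: list[dict]) -> list[dict]:
    """Return the tool list to forward to a provider for a given tool_choice."""
    if tool_choice == "none":
        return []  # tool calling disabled: drop all tools
    if tool_choice in ("auto", "required"):
        return tools  # pass tools through unchanged
    # otherwise tool_choice names a specific tool and must be one of the tools
    tool_names = [t["tool_name"] for t in tools]
    if tool_choice not in tool_names:
        raise ValueError(f"Tool choice {tool_choice} is not one of the tools: {tool_names}")
    return tools


# a named choice that is not among the provided tools raises, as in the removed code
try:
    resolve_tools("get_weather", [{"tool_name": "web_search"}])
except ValueError as exc:
    print(exc)
```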

0 commit comments
