 import time
 import warnings
 from functools import partial
-from typing import Any, Callable, TypeGuard
+from typing import Any, Callable, TypeGuard, cast

 import httpx

@@ -100,6 +100,8 @@ def __init__(

         self.model_info: ModelInfo | None = None
         self._function_calling_active: bool = False
+        self._max_input_tokens: int | None = self.config.max_input_tokens
+        self._max_output_tokens: int | None = self.config.max_output_tokens
         self.retry_listener = retry_listener
         if self.config.log_completions:
             if self.config.log_completions_folder is None:
@@ -139,15 +141,6 @@ def __init__(
             # openai doesn't expose top_p, but litellm does
             kwargs["top_p"] = self.config.top_p

-        # Handle OpenHands provider - rewrite to litellm_proxy
-        if self.config.model.startswith("openhands/"):
-            model_name = self.config.model.removeprefix("openhands/")
-            self.config.model = f"litellm_proxy/{model_name}"
-            self.config.base_url = "https://llm-proxy.app.all-hands.dev/"
-            logger.debug(
-                f"Rewrote openhands/{model_name} to {self.config.model} with base URL {self.config.base_url}"  # noqa: E501
-            )
-
         features = get_features(self.config.model)
         if features.supports_reasoning_effort:
             # For Gemini models, only map 'low' to optimized thinking budget
@@ -229,7 +222,9 @@ def wrapper(*args: Any, **kwargs: Any) -> Any:
             if "stream" in kwargs and kwargs["stream"]:
                 raise ValueError("Streaming is not supported in LLM class.")

-            messages_kwarg: list[dict[str, Any]] | dict[str, Any] = []
+            messages_kwarg: (
+                dict[str, Any] | Message | list[dict[str, Any]] | list[Message]
+            ) = []
             mock_function_calling = not self.is_function_calling_active()

             # some callers might send the model and messages directly
@@ -248,9 +243,19 @@ def wrapper(*args: Any, **kwargs: Any) -> Any:
                 messages_kwarg = kwargs["messages"]

             # ensure we work with a list of messages
-            messages: list[dict[str, Any]] = (
+            messages_list = (
                 messages_kwarg if isinstance(messages_kwarg, list) else [messages_kwarg]
             )
+            # format Message objects to dict if needed
+            messages: list[dict] = []
+            if messages_list and isinstance(messages_list[0], Message):
+                messages = self.format_messages_for_llm(
+                    cast(list[Message], messages_list)
+                )
+            else:
+                messages = cast(list[dict[str, Any]], messages_list)
+
+            kwargs["messages"] = messages

             # handle conversion of to non-function calling messages if needed
             original_fncall_messages = copy.deepcopy(messages)
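The hunk above widens what the wrapper accepts: callers may now pass a single dict, a single Message, or a list of either, and the wrapper normalizes everything to the list-of-dicts shape before writing it back into kwargs["messages"]. Below is a minimal, self-contained sketch of that normalization under assumed names: SimpleMessage and its to_dict method stand in for the SDK's Message type and format_messages_for_llm, which are not reproduced here.

    # Illustrative sketch only; SimpleMessage/to_dict are hypothetical stand-ins
    # for the SDK's Message and format_messages_for_llm.
    from __future__ import annotations

    from dataclasses import dataclass
    from typing import Any, cast


    @dataclass
    class SimpleMessage:
        role: str
        content: str

        def to_dict(self) -> dict[str, Any]:
            return {"role": self.role, "content": self.content}


    def normalize_messages(
        messages_kwarg: dict[str, Any]
        | SimpleMessage
        | list[dict[str, Any]]
        | list[SimpleMessage],
    ) -> list[dict[str, Any]]:
        # single item -> list, mirroring the wrapper above
        messages_list = (
            messages_kwarg if isinstance(messages_kwarg, list) else [messages_kwarg]
        )
        # Message-like objects are formatted to dicts; plain dicts pass through
        if messages_list and isinstance(messages_list[0], SimpleMessage):
            return [m.to_dict() for m in cast(list[SimpleMessage], messages_list)]
        return cast(list[dict[str, Any]], messages_list)


    assert normalize_messages({"role": "user", "content": "hi"}) == [
        {"role": "user", "content": "hi"}
    ]
    assert normalize_messages([SimpleMessage("user", "hi")]) == [
        {"role": "user", "content": "hi"}
    ]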
@@ -408,6 +413,14 @@ def _all_choices(

         self._completion = wrapper

+    @property
+    def max_input_tokens(self) -> int | None:
+        return self._max_input_tokens
+
+    @property
+    def max_output_tokens(self) -> int | None:
+        return self._max_output_tokens
+
     @property
     def completion(self) -> Callable:
         """Decorator for the litellm completion function.
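The new max_input_tokens and max_output_tokens properties expose the limits that __init__ now caches on private attributes, so the resolved values no longer have to be written back onto the shared config object. A stripped-down sketch of that accessor pattern follows; TokenLimits is an illustrative name, not part of the SDK.

    # Illustrative only: mirrors the private-attribute-plus-property pattern above.
    from __future__ import annotations


    class TokenLimits:
        def __init__(
            self, max_input_tokens: int | None, max_output_tokens: int | None
        ) -> None:
            # cached once at construction, analogous to self._max_*_tokens in __init__
            self._max_input_tokens = max_input_tokens
            self._max_output_tokens = max_output_tokens

        @property
        def max_input_tokens(self) -> int | None:
            return self._max_input_tokens

        @property
        def max_output_tokens(self) -> int | None:
            return self._max_output_tokens


    limits = TokenLimits(None, 4096)
    assert limits.max_input_tokens is None
    assert limits.max_output_tokens == 4096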
@@ -483,41 +496,34 @@ def init_model_info(self) -> None:
             f"Model info: {json.dumps({'model': self.config.model, 'base_url': self.config.base_url}, indent=2)}"  # noqa: E501
         )

-        if self.config.model.startswith("huggingface"):
-            # HF doesn't support the OpenAI default value for top_p (1)
-            logger.debug(
-                f"Setting top_p to 0.9 for Hugging Face model: {self.config.model}"
-            )
-            self.config.top_p = 0.9 if self.config.top_p == 1 else self.config.top_p
-
         # Set max_input_tokens from model info if not explicitly set
         if (
-            self.config.max_input_tokens is None
+            self._max_input_tokens is None
             and self.model_info is not None
            and "max_input_tokens" in self.model_info
             and isinstance(self.model_info["max_input_tokens"], int)
         ):
-            self.config.max_input_tokens = self.model_info["max_input_tokens"]
+            self._max_input_tokens = self.model_info["max_input_tokens"]

         # Set max_output_tokens from model info if not explicitly set
-        if self.config.max_output_tokens is None:
+        if self._max_output_tokens is None:
             # Special case for Claude 3.7 Sonnet models
             if any(
                 model in self.config.model
                 for model in ["claude-3-7-sonnet", "claude-3.7-sonnet"]
             ):
-                self.config.max_output_tokens = 64000  # litellm set max to 128k, but that requires a header to be set  # noqa: E501
+                self._max_output_tokens = 64000  # litellm set max to 128k, but that requires a header to be set  # noqa: E501
             # Try to get from model info
             elif self.model_info is not None:
                 # max_output_tokens has precedence over max_tokens
                 if "max_output_tokens" in self.model_info and isinstance(
                     self.model_info["max_output_tokens"], int
                 ):
-                    self.config.max_output_tokens = self.model_info["max_output_tokens"]
+                    self._max_output_tokens = self.model_info["max_output_tokens"]
                 elif "max_tokens" in self.model_info and isinstance(
                     self.model_info["max_tokens"], int
                 ):
-                    self.config.max_output_tokens = self.model_info["max_tokens"]
+                    self._max_output_tokens = self.model_info["max_tokens"]

         # Initialize function calling using centralized model features
         features = get_features(self.config.model)
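With the config writes removed, init_model_info now resolves the output-token limit into self._max_output_tokens in a fixed order: an explicitly configured value wins, then the Claude 3.7 Sonnet special case, then the model info fields. Here is a self-contained sketch of that resolution order with the instance state flattened into plain arguments; resolve_max_output_tokens is an illustrative helper, not code from this PR.

    # Illustrative helper showing the fallback order described above.
    from __future__ import annotations

    from typing import Any


    def resolve_max_output_tokens(
        configured: int | None,
        model: str,
        model_info: dict[str, Any] | None,
    ) -> int | None:
        if configured is not None:
            return configured  # an explicitly configured limit always wins
        if any(m in model for m in ["claude-3-7-sonnet", "claude-3.7-sonnet"]):
            return 64000  # litellm allows up to 128k, but that needs an extra header
        if model_info is not None:
            # max_output_tokens has precedence over max_tokens
            if isinstance(model_info.get("max_output_tokens"), int):
                return model_info["max_output_tokens"]
            if isinstance(model_info.get("max_tokens"), int):
                return model_info["max_tokens"]
        return None


    assert resolve_max_output_tokens(None, "claude-3-7-sonnet-20250219", None) == 64000
    assert resolve_max_output_tokens(None, "gpt-4o", {"max_tokens": 16384}) == 16384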