Commit f971600

Better default and warnings around LM max_tokens (#8215)
* Better default and warnings around LM max_tokens
* max_tokens
* max_tokens
* fix test
1 parent c2b3007 commit f971600
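
In practice, this commit raises the default max_tokens for dspy.LM from 1000 to 4000 and logs a warning whenever a completion is cut off at that limit. Below is a minimal usage sketch of overriding the new default; the model identifier and the 8000-token value are illustrative placeholders, not values taken from this commit.

# Sketch: overriding the new 4000-token default when constructing dspy.LM.
# The model name "openai/gpt-4o-mini" and max_tokens=8000 are placeholders.
import dspy

lm = dspy.LM(
    "openai/gpt-4o-mini",
    temperature=0.0,
    max_tokens=8000,  # raise further if responses are still truncated
)
dspy.configure(lm=lm)

# After a truncation warning, the most recent LM calls can be reviewed with:
# dspy.inspect_history(n=1)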

File tree

2 files changed: +20 −1 lines changed


dspy/clients/lm.py

Lines changed: 19 additions & 1 deletion
@@ -31,7 +31,7 @@ def __init__(
         model: str,
         model_type: Literal["chat", "text"] = "chat",
         temperature: float = 0.0,
-        max_tokens: int = 1000,
+        max_tokens: int = 4000,
         cache: bool = True,
         cache_in_memory: bool = True,
         callbacks: Optional[List[BaseCallback]] = None,
@@ -131,6 +131,15 @@ def forward(self, prompt=None, messages=None, **kwargs):
             cache=litellm_cache_args,
         )

+        if any(c.finish_reason == "length" for c in results["choices"]):
+            logger.warning(
+                f"LM response was truncated due to exceeding max_tokens={self.kwargs['max_tokens']}. "
+                "You can inspect the latest LM interactions with `dspy.inspect_history()`. "
+                "To avoid truncation, consider passing a larger max_tokens when setting up dspy.LM. "
+                f"You may also consider increasing the temperature (currently {self.kwargs['temperature']}) "
+                " if the reason for truncation is repetition."
+            )
+
         if not getattr(results, "cache_hit", False) and dspy.settings.usage_tracker and hasattr(results, "usage"):
             settings.usage_tracker.add_usage(self.model, dict(results.usage))
         return results
@@ -152,6 +161,15 @@ async def aforward(self, prompt=None, messages=None, **kwargs):
             cache=litellm_cache_args,
         )

+        if any(c.finish_reason == "length" for c in results["choices"]):
+            logger.warning(
+                f"LM response was truncated due to exceeding max_tokens={self.kwargs['max_tokens']}. "
+                "You can inspect the latest LM interactions with `dspy.inspect_history()`. "
+                "To avoid truncation, consider passing a larger max_tokens when setting up dspy.LM. "
+                f"You may also consider increasing the temperature (currently {self.kwargs['temperature']}) "
+                " if the reason for truncation is repetition."
+            )
+
         if not getattr(results, "cache_hit", False) and dspy.settings.usage_tracker and hasattr(results, "usage"):
             settings.usage_tracker.add_usage(self.model, dict(results.usage))
         return results
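
For context, the condition that triggers the new warning can be exercised in isolation. The sketch below uses a stand-in response (a plain dict with SimpleNamespace choices) rather than a real LiteLLM ModelResponse; only the choices/finish_reason shape the check relies on is reproduced.

# Stand-in for the truncation check added in forward()/aforward() above.
from types import SimpleNamespace

def is_truncated(results) -> bool:
    # True if any completion choice stopped because it hit the max_tokens limit.
    return any(c.finish_reason == "length" for c in results["choices"])

truncated = {"choices": [SimpleNamespace(finish_reason="length")]}
complete = {"choices": [SimpleNamespace(finish_reason="stop")]}
print(is_truncated(truncated))  # True  -> the warning above would be logged
print(is_truncated(complete))   # False -> no warning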

tests/caching/test_litellm_cache.py

Lines changed: 1 addition & 0 deletions
@@ -88,6 +88,7 @@ def test_lm_calls_are_cached_across_interpreter_sessions(litellm_test_server, te
         model="openai/dspy-test-model",
         api_base=api_base,
         api_key="fakekey",
+        max_tokens=1000,
     )
     lm1("Example query")

0 commit comments