feat(gepa): add tool description optimization for multi-agent systems

Ju-usc · Ju-usc · commit 6412a5db719d · 2025-10-09T20:07:36.000-07:00
- Add optimize_tool_descriptions parameter (default False) to GEPA
- Extract tool descriptions from all nested modules via named_sub_modules(), keyed as `tool:<name>` (the first description found wins when a tool name repeats)
- Apply optimized descriptions in DspyAdapter.build_program()
- Enables holistic optimization of tools across main and subagent modules
- Tests: 4 new tests, all 16 pass (4 new + 12 existing)
diff --git a/dspy/teleprompt/gepa/gepa.py b/dspy/teleprompt/gepa/gepa.py
@@ -273,6 +273,9 @@ def metric(
         warn_on_score_mismatch: GEPA (currently) expects the metric to return the same module-level score when 
             called with and without the pred_name. This flag (defaults to True) determines whether a warning is 
             raised if a mismatch in module-level and predictor-level score is detected.
+        optimize_tool_descriptions: Whether to optimize tool descriptions alongside signature instructions. When
+            enabled, every sub-module exposing a `tools` mapping (e.g., ReAct agents) contributes its tools, keyed as
+            `tool:<name>`; tools that share a name across modules share one optimized description. Default is False.
         seed: The random seed to use for reproducibility. Default is 0.
         gepa_kwargs: (Optional) provide additional kwargs to be passed to [gepa.optimize](https://github.com/gepa-ai/gepa/blob/main/src/gepa/api.py) method
         
@@ -328,6 +331,7 @@ def __init__(
         wandb_init_kwargs: dict[str, Any] | None = None,
         track_best_outputs: bool = False,
         warn_on_score_mismatch: bool = True,
+        optimize_tool_descriptions: bool = False,
         use_mlflow: bool = False,
         # Reproducibility
         seed: int | None = 0,
@@ -390,6 +394,7 @@ def __init__(
         self.wandb_api_key = wandb_api_key
         self.wandb_init_kwargs = wandb_init_kwargs
         self.warn_on_score_mismatch = warn_on_score_mismatch
+        self.optimize_tool_descriptions = optimize_tool_descriptions
         self.use_mlflow = use_mlflow
 
         if track_best_outputs:
@@ -518,11 +523,25 @@ def feedback_fn(
             rng=rng,
             reflection_lm=self.reflection_lm,
             custom_instruction_proposer=self.custom_instruction_proposer,
-            warn_on_score_mismatch=self.warn_on_score_mismatch
+            warn_on_score_mismatch=self.warn_on_score_mismatch,
+            optimize_tool_descriptions=self.optimize_tool_descriptions
         )
 
         # Instantiate GEPA with the simpler adapter-based API
         base_program = {name: pred.signature.instructions for name, pred in student.named_predictors()}
+
+        if self.optimize_tool_descriptions:
+            tool_descriptions = {}
+            for _, module in student.named_sub_modules():
+                if hasattr(module, 'tools'):
+                    for tool_name, tool in module.tools.items():
+                        tool_key = f"tool:{tool_name}"
+                        if tool_key not in tool_descriptions:
+                            tool_descriptions[tool_key] = tool.desc
+            if tool_descriptions:
+                logger.info(f"Including {len(tool_descriptions)} tool descriptions for optimization")
+                base_program.update(tool_descriptions)
+
         gepa_result: GEPAResult = optimize(
             seed_candidate=base_program,
             trainset=trainset,
diff --git a/dspy/teleprompt/gepa/gepa_utils.py b/dspy/teleprompt/gepa/gepa_utils.py
@@ -76,7 +76,8 @@ def __init__(
         rng: random.Random | None = None,
         reflection_lm=None,
         custom_instruction_proposer: "ProposalFn | None" = None,
-        warn_on_score_mismatch: bool = True
+        warn_on_score_mismatch: bool = True,
+        optimize_tool_descriptions: bool = False,
     ):
         self.student = student_module
         self.metric_fn = metric_fn
@@ -88,6 +89,7 @@ def __init__(
         self.reflection_lm = reflection_lm
         self.custom_instruction_proposer = custom_instruction_proposer
         self.warn_on_score_mismatch = warn_on_score_mismatch
+        self.optimize_tool_descriptions = optimize_tool_descriptions
 
         if self.custom_instruction_proposer is not None:
             # We are only overriding the propose_new_texts method when a custom
@@ -124,6 +126,15 @@ def build_program(self, candidate: dict[str, str]):
         for name, pred in new_prog.named_predictors():
             if name in candidate:
                 pred.signature = pred.signature.with_instructions(candidate[name])
+        
+        if self.optimize_tool_descriptions:
+            for _, module in new_prog.named_sub_modules():
+                if hasattr(module, 'tools'):
+                    for tool_name, tool in module.tools.items():
+                        tool_key = f"tool:{tool_name}"
+                        if tool_key in candidate:
+                            tool.desc = candidate[tool_key]
+        
         return new_prog
 
     def evaluate(self, batch, candidate, capture_traces=False):
diff --git a/tests/teleprompt/test_gepa_tool_optimization.py b/tests/teleprompt/test_gepa_tool_optimization.py
@@ -0,0 +1,154 @@
+import dspy
+from dspy import Example
+from dspy.utils.dummies import DummyLM
+
+
+def calculator(expression: str) -> str:
+    try:
+        return str(eval(expression))
+    except Exception:
+        return "Error"
+
+
+def search(query: str) -> str:
+    return f"Search results for: {query}"
+
+
+def simple_metric(example, prediction, trace=None, pred_name=None, pred_trace=None):
+    score = 1.0 if example.answer in str(prediction.answer) else 0.0
+    return dspy.Prediction(score=score, feedback="Correct" if score == 1.0 else "Wrong")
+
+
+def test_build_program_applies_tool_descriptions():
+    """Test that build_program applies tool descriptions from candidate dict."""
+    from dspy.teleprompt.gepa.gepa_utils import DspyAdapter
+
+    calc_tool = dspy.Tool(calculator, name="calculator", desc="Old description")
+    react = dspy.ReAct("question -> answer", tools=[calc_tool])
+
+    adapter = DspyAdapter(
+        student_module=react,
+        metric_fn=simple_metric,
+        feedback_map={},
+        failure_score=0.0,
+        optimize_tool_descriptions=True,
+    )
+
+    candidate = {
+        "react": "New instruction for ReAct",
+        "tool:calculator": "Optimized calculator description",
+    }
+
+    new_prog = adapter.build_program(candidate)
+
+    assert new_prog.react.signature.instructions == "New instruction for ReAct"
+    assert new_prog.tools["calculator"].desc == "Optimized calculator description"
+
+
+def test_gepa_with_tool_optimization_enabled():
+    """Test GEPA end-to-end with optimize_tool_descriptions=True."""
+    calc_tool = dspy.Tool(calculator, name="calculator", desc="Does math")
+    react = dspy.ReAct("question -> answer", tools=[calc_tool])
+
+    lm = DummyLM(
+        [
+            {"next_thought": "Calculate", "next_tool_name": "calculator", "next_tool_args": {"expression": "2+2"}},
+            {"next_thought": "Done", "next_tool_name": "finish", "next_tool_args": {}},
+            {"reasoning": "Used calculator", "answer": "4"},
+        ]
+    )
+    reflection_lm = DummyLM([{"improved_instruction": "Better"}])
+
+    dspy.settings.configure(lm=lm)
+
+    optimizer = dspy.GEPA(
+        metric=simple_metric,
+        reflection_lm=reflection_lm,
+        max_metric_calls=3,
+        optimize_tool_descriptions=True,
+    )
+
+    trainset = [Example(question="What is 2+2?", answer="4").with_inputs("question")]
+
+    optimized = optimizer.compile(react, trainset=trainset)
+
+    assert optimized is not None
+    assert hasattr(optimized, "tools")
+    assert "calculator" in optimized.tools
+
+
+def test_gepa_with_multi_agent_architecture():
+    """Test that the named_sub_modules() traversal used for tool extraction finds tools in nested subagent modules."""
+    class MultiAgentSystem(dspy.Module):
+        def __init__(self):
+            super().__init__()
+            # Subagent as module attribute (reuse existing search function)
+            search_tool = dspy.Tool(search, name="search", desc="Searches")
+            self.subagent = dspy.ReAct("task -> result", tools=[search_tool])
+            
+            # Main agent with subagent wrapped as tool
+            def spawn_subagent(task: str) -> str:
+                return self.subagent(task=task).result
+            
+            spawn_tool = dspy.Tool(spawn_subagent, name="spawn_subagent", desc="Spawns subagent")
+            calc_tool = dspy.Tool(calculator, name="calculator", desc="Does math")
+            self.main_agent = dspy.ReAct("q -> a", tools=[spawn_tool, calc_tool])
+    
+    system = MultiAgentSystem()
+    
+    # Test extraction using named_sub_modules pattern
+    tool_descriptions = {}
+    for _, module in system.named_sub_modules():
+        if hasattr(module, 'tools'):
+            for tool_name, tool in module.tools.items():
+                tool_key = f"tool:{tool_name}"
+                if tool_key not in tool_descriptions:
+                    tool_descriptions[tool_key] = tool.desc
+    
+    # All tools from all nested agents should be discovered
+    assert "tool:calculator" in tool_descriptions
+    assert "tool:spawn_subagent" in tool_descriptions
+    assert "tool:search" in tool_descriptions
+    assert "tool:finish" in tool_descriptions
+
+
+def test_gepa_optimizes_multi_agent_system_end_to_end():
+    """Test that GEPA.compile() on a nested multi-agent system returns a program that still exposes every agent's tools."""
+    class MultiAgentSystem(dspy.Module):
+        def __init__(self):
+            super().__init__()
+            search_tool = dspy.Tool(search, name="search", desc="Searches")
+            self.subagent = dspy.ReAct("task -> result", tools=[search_tool])
+            
+            def spawn_subagent(task: str) -> str:
+                return self.subagent(task=task).result
+            
+            spawn_tool = dspy.Tool(spawn_subagent, name="spawn_subagent", desc="Spawns subagent")
+            calc_tool = dspy.Tool(calculator, name="calculator", desc="Does math")
+            self.main_agent = dspy.ReAct("q -> a", tools=[spawn_tool, calc_tool])
+        
+        def forward(self, question):
+            return self.main_agent(q=question)
+    
+    system = MultiAgentSystem()
+    
+    # Setup LMs
+    lm = DummyLM([{"q": "question", "a": "answer"}])
+    reflection_lm = DummyLM([{"improved_instruction": "Better"}])
+    dspy.settings.configure(lm=lm)
+    
+    # Run GEPA optimization
+    optimizer = dspy.GEPA(
+        metric=simple_metric,
+        reflection_lm=reflection_lm,
+        max_metric_calls=3,
+        optimize_tool_descriptions=True,
+    )
+    
+    trainset = [Example(question="test", answer="answer").with_inputs("question")]
+    optimized = optimizer.compile(system, trainset=trainset)
+    
+    # Verify optimized system preserves structure with all tools
+    assert "search" in optimized.subagent.tools
+    assert "calculator" in optimized.main_agent.tools
+    assert "spawn_subagent" in optimized.main_agent.tools