From 7f05c251de5814e3a207d256a7af19b83e93b90a Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Sat, 20 Sep 2025 14:57:09 +0900 Subject: [PATCH 1/3] Limit callback metadata to trace capture path --- dspy/teleprompt/bootstrap_trace.py | 7 ++++- dspy/teleprompt/gepa/gepa_utils.py | 7 +++-- tests/teleprompt/test_bootstrap_trace.py | 33 +++++++++++++++++++++++ tests/teleprompt/test_gepa.py | 34 ++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 3 deletions(-) diff --git a/dspy/teleprompt/bootstrap_trace.py b/dspy/teleprompt/bootstrap_trace.py index 5af24223c5..2f0cc60cd1 100644 --- a/dspy/teleprompt/bootstrap_trace.py +++ b/dspy/teleprompt/bootstrap_trace.py @@ -37,6 +37,7 @@ def bootstrap_trace_data( failure_score: float = 0, format_failure_score: float = -1, log_format_failures: bool = False, + callback_metadata: dict[str, Any] | None = None, ) -> list[TraceData]: # Return a list of dicts with the following keys: example_ind, example, prediction, trace, and score # (if metric != None) @@ -110,7 +111,11 @@ def patched_forward(program_to_use: Module, **kwargs): program.forward = MethodType(patched_forward, program) try: - results = evaluator(program, metric=wrapped_metric).results + results = evaluator( + program, + metric=wrapped_metric, + callback_metadata=callback_metadata, + ).results finally: program.forward = original_forward diff --git a/dspy/teleprompt/gepa/gepa_utils.py b/dspy/teleprompt/gepa/gepa_utils.py index 50f8b6ea7a..0e483034ea 100644 --- a/dspy/teleprompt/gepa/gepa_utils.py +++ b/dspy/teleprompt/gepa/gepa_utils.py @@ -119,8 +119,10 @@ def evaluate(self, batch, candidate, capture_traces=False): if capture_traces: # bootstrap_trace_data-like flow with trace capture - from dspy.teleprompt.bootstrap_trace import bootstrap_trace_data - trajs = bootstrap_trace_data( + from dspy.teleprompt import bootstrap_trace as bootstrap_trace_module + + eval_callback_metadata = {"disable_logging": True} + trajs = bootstrap_trace_module.bootstrap_trace_data( program=program, dataset=batch, metric=self.metric_fn, @@ -129,6 +131,7 @@ def evaluate(self, batch, candidate, capture_traces=False): capture_failed_parses=True, failure_score=self.failure_score, format_failure_score=self.failure_score, + callback_metadata=eval_callback_metadata, ) scores = [] outputs = [] diff --git a/tests/teleprompt/test_bootstrap_trace.py b/tests/teleprompt/test_bootstrap_trace.py index 3a1e3d10ea..020f4cba3b 100644 --- a/tests/teleprompt/test_bootstrap_trace.py +++ b/tests/teleprompt/test_bootstrap_trace.py @@ -1,3 +1,4 @@ +from typing import Any from unittest import mock from litellm import Choices, Message, ModelResponse @@ -118,3 +119,35 @@ def completion_side_effect(*args, **kwargs): # Each trace entry should be a tuple of (predictor, inputs, prediction) for trace_entry in result["trace"]: assert len(trace_entry) == 3, "Trace entry should have 3 elements" + + +def test_bootstrap_trace_data_passes_callback_metadata(monkeypatch): + from dspy.teleprompt import bootstrap_trace as bootstrap_trace_module + + class DummyProgram(dspy.Module): + def forward(self, **kwargs): # pragma: no cover - stub forward + return dspy.Prediction() + + captured_metadata: dict[str, Any] = {} + + class DummyEvaluate: + def __init__(self, *args, **kwargs): + pass + + def __call__(self, *args, callback_metadata=None, **kwargs): + captured_metadata["value"] = callback_metadata + + class _Result: + results: list[Any] = [] + + return _Result() + + monkeypatch.setattr(bootstrap_trace_module, "Evaluate", DummyEvaluate) + + bootstrap_trace_module.bootstrap_trace_data( + program=DummyProgram(), + dataset=[], + callback_metadata={"disable_logging": True}, + ) + + assert captured_metadata["value"] == {"disable_logging": True} diff --git a/tests/teleprompt/test_gepa.py b/tests/teleprompt/test_gepa.py index b78e808947..2afe31bb9c 100644 --- a/tests/teleprompt/test_gepa.py +++ b/tests/teleprompt/test_gepa.py @@ -43,6 +43,40 @@ def bad_metric(example, prediction): return 0.0 +def test_gepa_adapter_disables_logging_during_trace_capture(monkeypatch): + from dspy.teleprompt import bootstrap_trace as bootstrap_trace_module + from dspy.teleprompt.gepa import gepa_utils + + class DummyModule(dspy.Module): + def forward(self, **kwargs): # pragma: no cover - stub forward + return dspy.Prediction() + + # Exercise the adapter evaluate path directly. + adapter = gepa_utils.DspyAdapter( + student_module=SimpleModule("input -> output"), + metric_fn=simple_metric, + feedback_map={}, + failure_score=0.0, + ) + + captured_kwargs: dict[str, Any] = {} + + def dummy_bootstrap_trace_data(*args, **kwargs): + captured_kwargs.update(kwargs) + return [] + + monkeypatch.setattr(bootstrap_trace_module, "bootstrap_trace_data", dummy_bootstrap_trace_data) + monkeypatch.setattr( + gepa_utils.DspyAdapter, + "build_program", + lambda self, candidate: DummyModule(), + ) + + adapter.evaluate(batch=[], candidate={}, capture_traces=True) + + assert captured_kwargs["callback_metadata"] == {"disable_logging": True} + + @pytest.fixture def mock_mlflow(): mock_mlflow = mock.MagicMock() From 2c41ae0fb33d1b51bf8b7aee54b16ce0e64dc8ad Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Mon, 22 Sep 2025 10:34:43 +0900 Subject: [PATCH 2/3] use batch length for eval --- dspy/teleprompt/gepa/gepa.py | 3 ++- dspy/teleprompt/gepa/gepa_utils.py | 11 +++++++---- tests/teleprompt/test_gepa.py | 11 ++++++++--- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/dspy/teleprompt/gepa/gepa.py b/dspy/teleprompt/gepa/gepa.py index c2ab1b68f7..6758794201 100644 --- a/dspy/teleprompt/gepa/gepa.py +++ b/dspy/teleprompt/gepa/gepa.py @@ -497,7 +497,8 @@ def feedback_fn( rng=rng, reflection_lm=self.reflection_lm, custom_instruction_proposer=self.custom_instruction_proposer, - warn_on_score_mismatch=self.warn_on_score_mismatch + warn_on_score_mismatch=self.warn_on_score_mismatch, + full_eval_size=len(valset), ) # Instantiate GEPA with the simpler adapter-based API diff --git a/dspy/teleprompt/gepa/gepa_utils.py b/dspy/teleprompt/gepa/gepa_utils.py index 0e483034ea..19bf238493 100644 --- a/dspy/teleprompt/gepa/gepa_utils.py +++ b/dspy/teleprompt/gepa/gepa_utils.py @@ -64,7 +64,8 @@ def __init__( rng: random.Random | None = None, reflection_lm=None, custom_instruction_proposer: "ProposalFn | None" = None, - warn_on_score_mismatch: bool = True + warn_on_score_mismatch: bool = True, + full_eval_size: int | None = None, ): self.student = student_module self.metric_fn = metric_fn @@ -76,6 +77,7 @@ def __init__( self.reflection_lm = reflection_lm self.custom_instruction_proposer = custom_instruction_proposer self.warn_on_score_mismatch = warn_on_score_mismatch + self.full_eval_size = full_eval_size if self.custom_instruction_proposer is not None: # We are only overriding the propose_new_texts method when a custom @@ -116,12 +118,12 @@ def build_program(self, candidate: dict[str, str]): def evaluate(self, batch, candidate, capture_traces=False): program = self.build_program(candidate) + callback_metadata = {"metric_key": "eval_full"} if self.full_eval_size == len(batch) else {"disable_logging": True} if capture_traces: # bootstrap_trace_data-like flow with trace capture from dspy.teleprompt import bootstrap_trace as bootstrap_trace_module - eval_callback_metadata = {"disable_logging": True} trajs = bootstrap_trace_module.bootstrap_trace_data( program=program, dataset=batch, @@ -131,7 +133,7 @@ def evaluate(self, batch, candidate, capture_traces=False): capture_failed_parses=True, failure_score=self.failure_score, format_failure_score=self.failure_score, - callback_metadata=eval_callback_metadata, + callback_metadata=callback_metadata, ) scores = [] outputs = [] @@ -153,7 +155,8 @@ def evaluate(self, batch, candidate, capture_traces=False): return_all_scores=True, failure_score=self.failure_score, provide_traceback=True, - max_errors=len(batch) * 100 + max_errors=len(batch) * 100, + callback_metadata=callback_metadata, ) res = evaluator(program) outputs = [r[1] for r in res.results] diff --git a/tests/teleprompt/test_gepa.py b/tests/teleprompt/test_gepa.py index 2afe31bb9c..d2bd31616f 100644 --- a/tests/teleprompt/test_gepa.py +++ b/tests/teleprompt/test_gepa.py @@ -43,7 +43,11 @@ def bad_metric(example, prediction): return 0.0 -def test_gepa_adapter_disables_logging_during_trace_capture(monkeypatch): +@pytest.mark.parametrize("full_eval_size, batch, expected_callback_metadata", [ + (1, [], {"disable_logging": True}), + (1, [Example(input="What is the color of the sky?", output="blue")], {"metric_key": "eval_full"}), +]) +def test_gepa_adapter_disables_logging_on_minibatch_eval(monkeypatch, full_eval_size, batch, expected_callback_metadata): from dspy.teleprompt import bootstrap_trace as bootstrap_trace_module from dspy.teleprompt.gepa import gepa_utils @@ -57,6 +61,7 @@ def forward(self, **kwargs): # pragma: no cover - stub forward metric_fn=simple_metric, feedback_map={}, failure_score=0.0, + full_eval_size=full_eval_size, ) captured_kwargs: dict[str, Any] = {} @@ -72,9 +77,9 @@ def dummy_bootstrap_trace_data(*args, **kwargs): lambda self, candidate: DummyModule(), ) - adapter.evaluate(batch=[], candidate={}, capture_traces=True) + adapter.evaluate(batch=batch, candidate={}, capture_traces=True) - assert captured_kwargs["callback_metadata"] == {"disable_logging": True} + assert captured_kwargs["callback_metadata"] == expected_callback_metadata @pytest.fixture From 73311a165e6a3c2b05ea8cc14b7eabf15211b6cc Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Thu, 13 Nov 2025 12:10:16 +0900 Subject: [PATCH 3/3] refactor(gepa): rename full_eval_size to reflection_minibatch_size Updated the DspyAdapter and GEPA classes to replace the full_eval_size parameter with reflection_minibatch_size for improved clarity. Adjusted the evaluate method and corresponding tests to reflect this change, ensuring callback metadata is correctly generated based on the new parameter. Signed-off-by: TomuHirata --- dspy/teleprompt/gepa/gepa.py | 2 +- dspy/teleprompt/gepa/gepa_utils.py | 6 +++--- tests/teleprompt/test_gepa.py | 13 +++++++++---- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/dspy/teleprompt/gepa/gepa.py b/dspy/teleprompt/gepa/gepa.py index c5fdce68e7..c35e916691 100644 --- a/dspy/teleprompt/gepa/gepa.py +++ b/dspy/teleprompt/gepa/gepa.py @@ -546,7 +546,7 @@ def feedback_fn( reflection_lm=self.reflection_lm, custom_instruction_proposer=self.custom_instruction_proposer, warn_on_score_mismatch=self.warn_on_score_mismatch, - full_eval_size=len(valset), + reflection_minibatch_size=self.reflection_minibatch_size, ) # Instantiate GEPA with the simpler adapter-based API diff --git a/dspy/teleprompt/gepa/gepa_utils.py b/dspy/teleprompt/gepa/gepa_utils.py index 94f2624ee5..d2e6772cef 100644 --- a/dspy/teleprompt/gepa/gepa_utils.py +++ b/dspy/teleprompt/gepa/gepa_utils.py @@ -77,7 +77,7 @@ def __init__( reflection_lm=None, custom_instruction_proposer: "ProposalFn | None" = None, warn_on_score_mismatch: bool = True, - full_eval_size: int | None = None, + reflection_minibatch_size: int | None = None, ): self.student = student_module self.metric_fn = metric_fn @@ -89,7 +89,7 @@ def __init__( self.reflection_lm = reflection_lm self.custom_instruction_proposer = custom_instruction_proposer self.warn_on_score_mismatch = warn_on_score_mismatch - self.full_eval_size = full_eval_size + self.reflection_minibatch_size = reflection_minibatch_size if self.custom_instruction_proposer is not None: # We are only overriding the propose_new_texts method when a custom @@ -130,7 +130,7 @@ def build_program(self, candidate: dict[str, str]): def evaluate(self, batch, candidate, capture_traces=False): program = self.build_program(candidate) - callback_metadata = {"metric_key": "eval_full"} if self.full_eval_size == len(batch) else {"disable_logging": True} + callback_metadata = {"metric_key": "eval_full"} if self.reflection_minibatch_size is None or len(batch) > self.reflection_minibatch_size else {"disable_logging": True} if capture_traces: # bootstrap_trace_data-like flow with trace capture diff --git a/tests/teleprompt/test_gepa.py b/tests/teleprompt/test_gepa.py index 5cd2968217..afe40d082a 100644 --- a/tests/teleprompt/test_gepa.py +++ b/tests/teleprompt/test_gepa.py @@ -43,11 +43,16 @@ def bad_metric(example, prediction): return 0.0 -@pytest.mark.parametrize("full_eval_size, batch, expected_callback_metadata", [ +@pytest.mark.parametrize("reflection_minibatch_size, batch, expected_callback_metadata", [ + (None, [], {"metric_key": "eval_full"}), + (None, [Example(input="What is the color of the sky?", output="blue")], {"metric_key": "eval_full"}), (1, [], {"disable_logging": True}), - (1, [Example(input="What is the color of the sky?", output="blue")], {"metric_key": "eval_full"}), + (1, [ + Example(input="What is the color of the sky?", output="blue"), + Example(input="What does the fox say?", output="Ring-ding-ding-ding-dingeringeding!"), + ], {"metric_key": "eval_full"}), ]) -def test_gepa_adapter_disables_logging_on_minibatch_eval(monkeypatch, full_eval_size, batch, expected_callback_metadata): +def test_gepa_adapter_disables_logging_on_minibatch_eval(monkeypatch, reflection_minibatch_size, batch, expected_callback_metadata): from dspy.teleprompt import bootstrap_trace as bootstrap_trace_module from dspy.teleprompt.gepa import gepa_utils @@ -61,7 +66,7 @@ def forward(self, **kwargs): # pragma: no cover - stub forward metric_fn=simple_metric, feedback_map={}, failure_score=0.0, - full_eval_size=full_eval_size, + reflection_minibatch_size=reflection_minibatch_size, ) captured_kwargs: dict[str, Any] = {}