From 597a7b44ab09b4d4e8045329a81d61815321694b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 12 Nov 2025 04:01:37 +0000 Subject: [PATCH 1/6] Initial plan From e79dc12a7c7c7bb3d4e8799c992f6852cebd0665 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 12 Nov 2025 04:15:47 +0000 Subject: [PATCH 2/6] Fix Example.toDict() to serialize dspy.History objects properly Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com> --- dspy/evaluate/evaluate.py | 7 +- dspy/primitives/example.py | 3 + tests/evaluate/test_evaluate_with_history.py | 141 +++++++++++++++++++ tests/primitives/test_example.py | 31 ++++ 4 files changed, 180 insertions(+), 2 deletions(-) create mode 100644 tests/evaluate/test_evaluate_with_history.py diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py index 50513ce8e2..90e48ae954 100644 --- a/dspy/evaluate/evaluate.py +++ b/dspy/evaluate/evaluate.py @@ -231,9 +231,12 @@ def _prepare_results_output( ): return [ ( - merge_dicts(example, prediction) | {metric_name: score} + merge_dicts( + example.toDict(), + prediction.toDict() if hasattr(prediction, "toDict") else prediction + ) | {metric_name: score} if prediction_is_dictlike(prediction) - else dict(example) | {"prediction": prediction, metric_name: score} + else example.toDict() | {"prediction": prediction, metric_name: score} ) for example, prediction, score in results ] diff --git a/dspy/primitives/example.py b/dspy/primitives/example.py index 549da6bce7..4b8259fa33 100644 --- a/dspy/primitives/example.py +++ b/dspy/primitives/example.py @@ -193,6 +193,9 @@ def toDict(self): # noqa: N802 def convert_to_serializable(value): if hasattr(value, "toDict"): return value.toDict() + elif hasattr(value, "model_dump"): + # Handle Pydantic models (e.g., dspy.History) + return value.model_dump() elif isinstance(value, list): return [convert_to_serializable(item) for item in value] elif isinstance(value, dict): diff --git a/tests/evaluate/test_evaluate_with_history.py b/tests/evaluate/test_evaluate_with_history.py new file mode 100644 index 0000000000..7fc81f0345 --- /dev/null +++ b/tests/evaluate/test_evaluate_with_history.py @@ -0,0 +1,141 @@ +"""Test Evaluate with dspy.History objects.""" +import json +import tempfile + +import dspy +from dspy.evaluate import Evaluate +from dspy.evaluate.metrics import answer_exact_match +from dspy.predict import Predict +from dspy.utils.dummies import DummyLM + + +def test_evaluate_save_as_json_with_history(): + """Test that save_as_json works with Examples containing dspy.History objects.""" + # Setup + dspy.settings.configure( + lm=DummyLM( + { + "What is 1+1?": {"answer": "2"}, + "What is 2+2?": {"answer": "4"}, + } + ) + ) + + # Create history objects + history1 = dspy.History( + messages=[ + {"question": "Previous Q1", "answer": "Previous A1"}, + ] + ) + history2 = dspy.History( + messages=[ + {"question": "Previous Q2", "answer": "Previous A2"}, + {"question": "Previous Q3", "answer": "Previous A3"}, + ] + ) + + # Create examples with history + devset = [ + dspy.Example(question="What is 1+1?", answer="2", history=history1).with_inputs("question"), + dspy.Example(question="What is 2+2?", answer="4", history=history2).with_inputs("question"), + ] + + program = Predict("question -> answer") + + # Create evaluator with save_as_json + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + temp_json = f.name + + try: + evaluator = Evaluate( + devset=devset, + metric=answer_exact_match, + display_progress=False, + save_as_json=temp_json, + ) + + result = evaluator(program) + assert result.score == 100.0 + + # Verify JSON file was created and is valid + with open(temp_json) as f: + data = json.load(f) + + assert len(data) == 2 + + # Verify history was properly serialized in first record + assert "history" in data[0] + assert isinstance(data[0]["history"], dict) + assert "messages" in data[0]["history"] + assert len(data[0]["history"]["messages"]) == 1 + assert data[0]["history"]["messages"][0] == {"question": "Previous Q1", "answer": "Previous A1"} + + # Verify history was properly serialized in second record + assert "history" in data[1] + assert isinstance(data[1]["history"], dict) + assert "messages" in data[1]["history"] + assert len(data[1]["history"]["messages"]) == 2 + assert data[1]["history"]["messages"][0] == {"question": "Previous Q2", "answer": "Previous A2"} + assert data[1]["history"]["messages"][1] == {"question": "Previous Q3", "answer": "Previous A3"} + + finally: + import os + if os.path.exists(temp_json): + os.unlink(temp_json) + + +def test_evaluate_save_as_csv_with_history(): + """Test that save_as_csv works with Examples containing dspy.History objects.""" + # Setup + dspy.settings.configure( + lm=DummyLM( + { + "What is 1+1?": {"answer": "2"}, + } + ) + ) + + # Create history object + history = dspy.History( + messages=[ + {"question": "Previous Q", "answer": "Previous A"}, + ] + ) + + # Create example with history + devset = [ + dspy.Example(question="What is 1+1?", answer="2", history=history).with_inputs("question"), + ] + + program = Predict("question -> answer") + + # Create evaluator with save_as_csv + with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: + temp_csv = f.name + + try: + evaluator = Evaluate( + devset=devset, + metric=answer_exact_match, + display_progress=False, + save_as_csv=temp_csv, + ) + + result = evaluator(program) + assert result.score == 100.0 + + # Verify CSV file was created + import csv + with open(temp_csv) as f: + reader = csv.DictReader(f) + rows = list(reader) + + assert len(rows) == 1 + assert "history" in rows[0] + # CSV will have string representation of the dict + assert "messages" in rows[0]["history"] + + finally: + import os + if os.path.exists(temp_csv): + os.unlink(temp_csv) diff --git a/tests/primitives/test_example.py b/tests/primitives/test_example.py index e406017137..c569016dbc 100644 --- a/tests/primitives/test_example.py +++ b/tests/primitives/test_example.py @@ -123,3 +123,34 @@ def test_example_copy_without(): def test_example_to_dict(): example = Example(a=1, b=2) assert example.toDict() == {"a": 1, "b": 2} + + +def test_example_to_dict_with_history(): + """Test that Example.toDict() properly serializes dspy.History objects.""" + history = dspy.History( + messages=[ + {"question": "What is the capital of France?", "answer": "Paris"}, + {"question": "What is the capital of Germany?", "answer": "Berlin"}, + ] + ) + example = Example(question="Test question", history=history, answer="Test answer") + + result = example.toDict() + + # Verify the result is a dictionary + assert isinstance(result, dict) + assert "history" in result + + # Verify history is serialized to a dict (not a History object) + assert isinstance(result["history"], dict) + assert "messages" in result["history"] + assert result["history"]["messages"] == [ + {"question": "What is the capital of France?", "answer": "Paris"}, + {"question": "What is the capital of Germany?", "answer": "Berlin"}, + ] + + # Verify JSON serialization works + import json + json_str = json.dumps(result) + restored = json.loads(json_str) + assert restored["history"]["messages"] == result["history"]["messages"] From c67ba3336ce24a6d4bec3a792b6cbfa91dcc7b2c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 13 Nov 2025 04:15:42 +0000 Subject: [PATCH 3/6] Consolidate History tests into existing test_evaluate.py Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com> --- tests/evaluate/test_evaluate.py | 135 ++++++++++++++++++ tests/evaluate/test_evaluate_with_history.py | 141 ------------------- 2 files changed, 135 insertions(+), 141 deletions(-) delete mode 100644 tests/evaluate/test_evaluate_with_history.py diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py index 211cf25962..6d44850bae 100644 --- a/tests/evaluate/test_evaluate.py +++ b/tests/evaluate/test_evaluate.py @@ -1,4 +1,6 @@ +import json import signal +import tempfile import threading from unittest.mock import patch @@ -261,3 +263,136 @@ def on_evaluate_end( def test_evaluation_result_repr(): result = EvaluationResult(score=100.0, results=[(new_example("What is 1+1?", "2"), {"answer": "2"}, 100.0)]) assert repr(result) == "EvaluationResult(score=100.0, results=)" + + +def test_evaluate_save_as_json_with_history(): + """Test that save_as_json works with Examples containing dspy.History objects.""" + # Setup + dspy.settings.configure( + lm=DummyLM( + { + "What is 1+1?": {"answer": "2"}, + "What is 2+2?": {"answer": "4"}, + } + ) + ) + + # Create history objects + history1 = dspy.History( + messages=[ + {"question": "Previous Q1", "answer": "Previous A1"}, + ] + ) + history2 = dspy.History( + messages=[ + {"question": "Previous Q2", "answer": "Previous A2"}, + {"question": "Previous Q3", "answer": "Previous A3"}, + ] + ) + + # Create examples with history + devset = [ + dspy.Example(question="What is 1+1?", answer="2", history=history1).with_inputs("question"), + dspy.Example(question="What is 2+2?", answer="4", history=history2).with_inputs("question"), + ] + + program = Predict("question -> answer") + + # Create evaluator with save_as_json + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + temp_json = f.name + + try: + evaluator = Evaluate( + devset=devset, + metric=answer_exact_match, + display_progress=False, + save_as_json=temp_json, + ) + + result = evaluator(program) + assert result.score == 100.0 + + # Verify JSON file was created and is valid + with open(temp_json) as f: + data = json.load(f) + + assert len(data) == 2 + + # Verify history was properly serialized in first record + assert "history" in data[0] + assert isinstance(data[0]["history"], dict) + assert "messages" in data[0]["history"] + assert len(data[0]["history"]["messages"]) == 1 + assert data[0]["history"]["messages"][0] == {"question": "Previous Q1", "answer": "Previous A1"} + + # Verify history was properly serialized in second record + assert "history" in data[1] + assert isinstance(data[1]["history"], dict) + assert "messages" in data[1]["history"] + assert len(data[1]["history"]["messages"]) == 2 + assert data[1]["history"]["messages"][0] == {"question": "Previous Q2", "answer": "Previous A2"} + assert data[1]["history"]["messages"][1] == {"question": "Previous Q3", "answer": "Previous A3"} + + finally: + import os + if os.path.exists(temp_json): + os.unlink(temp_json) + + +def test_evaluate_save_as_csv_with_history(): + """Test that save_as_csv works with Examples containing dspy.History objects.""" + # Setup + dspy.settings.configure( + lm=DummyLM( + { + "What is 1+1?": {"answer": "2"}, + } + ) + ) + + # Create history object + history = dspy.History( + messages=[ + {"question": "Previous Q", "answer": "Previous A"}, + ] + ) + + # Create example with history + devset = [ + dspy.Example(question="What is 1+1?", answer="2", history=history).with_inputs("question"), + ] + + program = Predict("question -> answer") + + # Create evaluator with save_as_csv + with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: + temp_csv = f.name + + try: + evaluator = Evaluate( + devset=devset, + metric=answer_exact_match, + display_progress=False, + save_as_csv=temp_csv, + ) + + result = evaluator(program) + assert result.score == 100.0 + + # Verify CSV file was created + import csv + with open(temp_csv) as f: + reader = csv.DictReader(f) + rows = list(reader) + + assert len(rows) == 1 + assert "history" in rows[0] + # CSV will have string representation of the dict + assert "messages" in rows[0]["history"] + + finally: + import os + if os.path.exists(temp_csv): + os.unlink(temp_csv) + diff --git a/tests/evaluate/test_evaluate_with_history.py b/tests/evaluate/test_evaluate_with_history.py deleted file mode 100644 index 7fc81f0345..0000000000 --- a/tests/evaluate/test_evaluate_with_history.py +++ /dev/null @@ -1,141 +0,0 @@ -"""Test Evaluate with dspy.History objects.""" -import json -import tempfile - -import dspy -from dspy.evaluate import Evaluate -from dspy.evaluate.metrics import answer_exact_match -from dspy.predict import Predict -from dspy.utils.dummies import DummyLM - - -def test_evaluate_save_as_json_with_history(): - """Test that save_as_json works with Examples containing dspy.History objects.""" - # Setup - dspy.settings.configure( - lm=DummyLM( - { - "What is 1+1?": {"answer": "2"}, - "What is 2+2?": {"answer": "4"}, - } - ) - ) - - # Create history objects - history1 = dspy.History( - messages=[ - {"question": "Previous Q1", "answer": "Previous A1"}, - ] - ) - history2 = dspy.History( - messages=[ - {"question": "Previous Q2", "answer": "Previous A2"}, - {"question": "Previous Q3", "answer": "Previous A3"}, - ] - ) - - # Create examples with history - devset = [ - dspy.Example(question="What is 1+1?", answer="2", history=history1).with_inputs("question"), - dspy.Example(question="What is 2+2?", answer="4", history=history2).with_inputs("question"), - ] - - program = Predict("question -> answer") - - # Create evaluator with save_as_json - with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: - temp_json = f.name - - try: - evaluator = Evaluate( - devset=devset, - metric=answer_exact_match, - display_progress=False, - save_as_json=temp_json, - ) - - result = evaluator(program) - assert result.score == 100.0 - - # Verify JSON file was created and is valid - with open(temp_json) as f: - data = json.load(f) - - assert len(data) == 2 - - # Verify history was properly serialized in first record - assert "history" in data[0] - assert isinstance(data[0]["history"], dict) - assert "messages" in data[0]["history"] - assert len(data[0]["history"]["messages"]) == 1 - assert data[0]["history"]["messages"][0] == {"question": "Previous Q1", "answer": "Previous A1"} - - # Verify history was properly serialized in second record - assert "history" in data[1] - assert isinstance(data[1]["history"], dict) - assert "messages" in data[1]["history"] - assert len(data[1]["history"]["messages"]) == 2 - assert data[1]["history"]["messages"][0] == {"question": "Previous Q2", "answer": "Previous A2"} - assert data[1]["history"]["messages"][1] == {"question": "Previous Q3", "answer": "Previous A3"} - - finally: - import os - if os.path.exists(temp_json): - os.unlink(temp_json) - - -def test_evaluate_save_as_csv_with_history(): - """Test that save_as_csv works with Examples containing dspy.History objects.""" - # Setup - dspy.settings.configure( - lm=DummyLM( - { - "What is 1+1?": {"answer": "2"}, - } - ) - ) - - # Create history object - history = dspy.History( - messages=[ - {"question": "Previous Q", "answer": "Previous A"}, - ] - ) - - # Create example with history - devset = [ - dspy.Example(question="What is 1+1?", answer="2", history=history).with_inputs("question"), - ] - - program = Predict("question -> answer") - - # Create evaluator with save_as_csv - with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: - temp_csv = f.name - - try: - evaluator = Evaluate( - devset=devset, - metric=answer_exact_match, - display_progress=False, - save_as_csv=temp_csv, - ) - - result = evaluator(program) - assert result.score == 100.0 - - # Verify CSV file was created - import csv - with open(temp_csv) as f: - reader = csv.DictReader(f) - rows = list(reader) - - assert len(rows) == 1 - assert "history" in rows[0] - # CSV will have string representation of the dict - assert "messages" in rows[0]["history"] - - finally: - import os - if os.path.exists(temp_csv): - os.unlink(temp_csv) From ebb502ca79ddf4c8a3f93e938c2b118a5bf78314 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 14 Nov 2025 06:44:12 +0000 Subject: [PATCH 4/6] Refactor: move toDict() call to merge_dicts for cleaner code Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com> --- dspy/evaluate/evaluate.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py index 90e48ae954..e190ff6f36 100644 --- a/dspy/evaluate/evaluate.py +++ b/dspy/evaluate/evaluate.py @@ -231,12 +231,9 @@ def _prepare_results_output( ): return [ ( - merge_dicts( - example.toDict(), - prediction.toDict() if hasattr(prediction, "toDict") else prediction - ) | {metric_name: score} + merge_dicts(example, prediction) | {metric_name: score} if prediction_is_dictlike(prediction) - else example.toDict() | {"prediction": prediction, metric_name: score} + else (example.toDict() if hasattr(example, "toDict") else dict(example)) | {"prediction": prediction, metric_name: score} ) for example, prediction, score in results ] @@ -308,6 +305,12 @@ def prediction_is_dictlike(prediction): def merge_dicts(d1, d2) -> dict: + # Convert to dict if objects have toDict method (e.g., Example objects) + if hasattr(d1, "toDict"): + d1 = d1.toDict() + if hasattr(d2, "toDict"): + d2 = d2.toDict() + merged = {} for k, v in d1.items(): if k in d2: From 2a82f8d0155bdf3dc113ccccd97428ddaa742c37 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 14 Nov 2025 07:01:29 +0000 Subject: [PATCH 5/6] Use isinstance(BaseModel) instead of hasattr for Pydantic check Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com> --- dspy/primitives/example.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/dspy/primitives/example.py b/dspy/primitives/example.py index 4b8259fa33..11e490d1de 100644 --- a/dspy/primitives/example.py +++ b/dspy/primitives/example.py @@ -1,3 +1,9 @@ +try: + from pydantic import BaseModel +except ImportError: + BaseModel = None + + class Example: """A flexible data container for DSPy examples and training data. @@ -193,7 +199,7 @@ def toDict(self): # noqa: N802 def convert_to_serializable(value): if hasattr(value, "toDict"): return value.toDict() - elif hasattr(value, "model_dump"): + elif BaseModel is not None and isinstance(value, BaseModel): # Handle Pydantic models (e.g., dspy.History) return value.model_dump() elif isinstance(value, list): From 1d56c2984db2470d933a1da3935f249029bbb787 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 14 Nov 2025 07:08:52 +0000 Subject: [PATCH 6/6] Simplify _prepare_results_output to always use toDict() Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com> --- dspy/evaluate/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py index e190ff6f36..e42b98e706 100644 --- a/dspy/evaluate/evaluate.py +++ b/dspy/evaluate/evaluate.py @@ -233,7 +233,7 @@ def _prepare_results_output( ( merge_dicts(example, prediction) | {metric_name: score} if prediction_is_dictlike(prediction) - else (example.toDict() if hasattr(example, "toDict") else dict(example)) | {"prediction": prediction, metric_name: score} + else example.toDict() | {"prediction": prediction, metric_name: score} ) for example, prediction, score in results ]