From 597a7b44ab09b4d4e8045329a81d61815321694b Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 12 Nov 2025 04:01:37 +0000
Subject: [PATCH 1/6] Initial plan


From e79dc12a7c7c7bb3d4e8799c992f6852cebd0665 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 12 Nov 2025 04:15:47 +0000
Subject: [PATCH 2/6] Fix Example.toDict() to serialize dspy.History objects
 properly

Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>
---
 dspy/evaluate/evaluate.py                    |   7 +-
 dspy/primitives/example.py                   |   3 +
 tests/evaluate/test_evaluate_with_history.py | 141 +++++++++++++++++++
 tests/primitives/test_example.py             |  31 ++++
 4 files changed, 180 insertions(+), 2 deletions(-)
 create mode 100644 tests/evaluate/test_evaluate_with_history.py

diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
index 50513ce8e2..90e48ae954 100644
--- a/dspy/evaluate/evaluate.py
+++ b/dspy/evaluate/evaluate.py
@@ -231,9 +231,12 @@ def _prepare_results_output(
     ):
         return [
             (
-                merge_dicts(example, prediction) | {metric_name: score}
+                merge_dicts(
+                    example.toDict(),
+                    prediction.toDict() if hasattr(prediction, "toDict") else prediction
+                ) | {metric_name: score}
                 if prediction_is_dictlike(prediction)
-                else dict(example) | {"prediction": prediction, metric_name: score}
+                else example.toDict() | {"prediction": prediction, metric_name: score}
             )
             for example, prediction, score in results
         ]
diff --git a/dspy/primitives/example.py b/dspy/primitives/example.py
index 549da6bce7..4b8259fa33 100644
--- a/dspy/primitives/example.py
+++ b/dspy/primitives/example.py
@@ -193,6 +193,9 @@ def toDict(self):  # noqa: N802
         def convert_to_serializable(value):
             if hasattr(value, "toDict"):
                 return value.toDict()
+            elif hasattr(value, "model_dump"):
+                # Handle Pydantic models (e.g., dspy.History)
+                return value.model_dump()
             elif isinstance(value, list):
                 return [convert_to_serializable(item) for item in value]
             elif isinstance(value, dict):
diff --git a/tests/evaluate/test_evaluate_with_history.py b/tests/evaluate/test_evaluate_with_history.py
new file mode 100644
index 0000000000..7fc81f0345
--- /dev/null
+++ b/tests/evaluate/test_evaluate_with_history.py
@@ -0,0 +1,141 @@
+"""Test Evaluate with dspy.History objects."""
+import json
+import tempfile
+
+import dspy
+from dspy.evaluate import Evaluate
+from dspy.evaluate.metrics import answer_exact_match
+from dspy.predict import Predict
+from dspy.utils.dummies import DummyLM
+
+
+def test_evaluate_save_as_json_with_history():
+    """Test that save_as_json works with Examples containing dspy.History objects."""
+    # Setup
+    dspy.settings.configure(
+        lm=DummyLM(
+            {
+                "What is 1+1?": {"answer": "2"},
+                "What is 2+2?": {"answer": "4"},
+            }
+        )
+    )
+
+    # Create history objects
+    history1 = dspy.History(
+        messages=[
+            {"question": "Previous Q1", "answer": "Previous A1"},
+        ]
+    )
+    history2 = dspy.History(
+        messages=[
+            {"question": "Previous Q2", "answer": "Previous A2"},
+            {"question": "Previous Q3", "answer": "Previous A3"},
+        ]
+    )
+
+    # Create examples with history
+    devset = [
+        dspy.Example(question="What is 1+1?", answer="2", history=history1).with_inputs("question"),
+        dspy.Example(question="What is 2+2?", answer="4", history=history2).with_inputs("question"),
+    ]
+
+    program = Predict("question -> answer")
+
+    # Create evaluator with save_as_json
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        temp_json = f.name
+
+    try:
+        evaluator = Evaluate(
+            devset=devset,
+            metric=answer_exact_match,
+            display_progress=False,
+            save_as_json=temp_json,
+        )
+
+        result = evaluator(program)
+        assert result.score == 100.0
+
+        # Verify JSON file was created and is valid
+        with open(temp_json) as f:
+            data = json.load(f)
+
+        assert len(data) == 2
+
+        # Verify history was properly serialized in first record
+        assert "history" in data[0]
+        assert isinstance(data[0]["history"], dict)
+        assert "messages" in data[0]["history"]
+        assert len(data[0]["history"]["messages"]) == 1
+        assert data[0]["history"]["messages"][0] == {"question": "Previous Q1", "answer": "Previous A1"}
+
+        # Verify history was properly serialized in second record
+        assert "history" in data[1]
+        assert isinstance(data[1]["history"], dict)
+        assert "messages" in data[1]["history"]
+        assert len(data[1]["history"]["messages"]) == 2
+        assert data[1]["history"]["messages"][0] == {"question": "Previous Q2", "answer": "Previous A2"}
+        assert data[1]["history"]["messages"][1] == {"question": "Previous Q3", "answer": "Previous A3"}
+
+    finally:
+        import os
+        if os.path.exists(temp_json):
+            os.unlink(temp_json)
+
+
+def test_evaluate_save_as_csv_with_history():
+    """Test that save_as_csv works with Examples containing dspy.History objects."""
+    # Setup
+    dspy.settings.configure(
+        lm=DummyLM(
+            {
+                "What is 1+1?": {"answer": "2"},
+            }
+        )
+    )
+
+    # Create history object
+    history = dspy.History(
+        messages=[
+            {"question": "Previous Q", "answer": "Previous A"},
+        ]
+    )
+
+    # Create example with history
+    devset = [
+        dspy.Example(question="What is 1+1?", answer="2", history=history).with_inputs("question"),
+    ]
+
+    program = Predict("question -> answer")
+
+    # Create evaluator with save_as_csv
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
+        temp_csv = f.name
+
+    try:
+        evaluator = Evaluate(
+            devset=devset,
+            metric=answer_exact_match,
+            display_progress=False,
+            save_as_csv=temp_csv,
+        )
+
+        result = evaluator(program)
+        assert result.score == 100.0
+
+        # Verify CSV file was created
+        import csv
+        with open(temp_csv) as f:
+            reader = csv.DictReader(f)
+            rows = list(reader)
+
+        assert len(rows) == 1
+        assert "history" in rows[0]
+        # CSV will have string representation of the dict
+        assert "messages" in rows[0]["history"]
+
+    finally:
+        import os
+        if os.path.exists(temp_csv):
+            os.unlink(temp_csv)
diff --git a/tests/primitives/test_example.py b/tests/primitives/test_example.py
index e406017137..c569016dbc 100644
--- a/tests/primitives/test_example.py
+++ b/tests/primitives/test_example.py
@@ -123,3 +123,34 @@ def test_example_copy_without():
 def test_example_to_dict():
     example = Example(a=1, b=2)
     assert example.toDict() == {"a": 1, "b": 2}
+
+
+def test_example_to_dict_with_history():
+    """Example.toDict() must turn a dspy.History value into a plain, JSON-serializable dict."""
+    history = dspy.History(
+        messages=[
+            {"question": "What is the capital of France?", "answer": "Paris"},
+            {"question": "What is the capital of Germany?", "answer": "Berlin"},
+        ]
+    )
+    example = Example(question="Test question", history=history, answer="Test answer")
+
+    result = example.toDict()
+
+    # The top-level result is a dict that still has the history key
+    assert isinstance(result, dict)
+    assert "history" in result
+
+    # history must be a plain dict holding the original messages, not a History instance
+    assert isinstance(result["history"], dict)
+    assert "messages" in result["history"]
+    assert result["history"]["messages"] == [
+        {"question": "What is the capital of France?", "answer": "Paris"},
+        {"question": "What is the capital of Germany?", "answer": "Berlin"},
+    ]
+
+    # json.dumps raises TypeError on unsupported objects, so a clean round trip proves serializability
+    import json
+    json_str = json.dumps(result)
+    restored = json.loads(json_str)
+    assert restored["history"]["messages"] == result["history"]["messages"]

From c67ba3336ce24a6d4bec3a792b6cbfa91dcc7b2c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 13 Nov 2025 04:15:42 +0000
Subject: [PATCH 3/6] Consolidate History tests into existing test_evaluate.py

Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>
---
 tests/evaluate/test_evaluate.py              | 135 ++++++++++++++++++
 tests/evaluate/test_evaluate_with_history.py | 141 -------------------
 2 files changed, 135 insertions(+), 141 deletions(-)
 delete mode 100644 tests/evaluate/test_evaluate_with_history.py

diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py
index 211cf25962..6d44850bae 100644
--- a/tests/evaluate/test_evaluate.py
+++ b/tests/evaluate/test_evaluate.py
@@ -1,4 +1,6 @@
+import json
 import signal
+import tempfile
 import threading
 from unittest.mock import patch
 
@@ -261,3 +263,136 @@ def on_evaluate_end(
 def test_evaluation_result_repr():
     result = EvaluationResult(score=100.0, results=[(new_example("What is 1+1?", "2"), {"answer": "2"}, 100.0)])
     assert repr(result) == "EvaluationResult(score=100.0, results=<list of 1 results>)"
+
+
+def test_evaluate_save_as_json_with_history():
+    """Evaluate with save_as_json must write each Example's dspy.History as a JSON object with a messages list."""
+    # Configure a DummyLM keyed by the two devset questions, each paired with the expected answer
+    dspy.settings.configure(
+        lm=DummyLM(
+            {
+                "What is 1+1?": {"answer": "2"},
+                "What is 2+2?": {"answer": "4"},
+            }
+        )
+    )
+
+    # Two histories of different lengths (1 and 2 messages) so the saved records can be told apart
+    history1 = dspy.History(
+        messages=[
+            {"question": "Previous Q1", "answer": "Previous A1"},
+        ]
+    )
+    history2 = dspy.History(
+        messages=[
+            {"question": "Previous Q2", "answer": "Previous A2"},
+            {"question": "Previous Q3", "answer": "Previous A3"},
+        ]
+    )
+
+    # history rides along on each Example; only "question" is marked as an input
+    devset = [
+        dspy.Example(question="What is 1+1?", answer="2", history=history1).with_inputs("question"),
+        dspy.Example(question="What is 2+2?", answer="4", history=history2).with_inputs("question"),
+    ]
+
+    program = Predict("question -> answer")
+
+    # Reserve a temp path; delete=False keeps the file on disk after the with-block closes it
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        temp_json = f.name
+
+    try:
+        evaluator = Evaluate(
+            devset=devset,
+            metric=answer_exact_match,
+            display_progress=False,
+            save_as_json=temp_json,
+        )
+
+        result = evaluator(program)
+        assert result.score == 100.0
+
+        # The output file must parse as JSON and hold one record per devset example
+        with open(temp_json) as f:
+            data = json.load(f)
+
+        assert len(data) == 2
+
+        # Record 0 is expected to carry history1 as a plain dict with its single message
+        assert "history" in data[0]
+        assert isinstance(data[0]["history"], dict)
+        assert "messages" in data[0]["history"]
+        assert len(data[0]["history"]["messages"]) == 1
+        assert data[0]["history"]["messages"][0] == {"question": "Previous Q1", "answer": "Previous A1"}
+
+        # Record 1 is expected to carry history2 with both messages in their original order
+        assert "history" in data[1]
+        assert isinstance(data[1]["history"], dict)
+        assert "messages" in data[1]["history"]
+        assert len(data[1]["history"]["messages"]) == 2
+        assert data[1]["history"]["messages"][0] == {"question": "Previous Q2", "answer": "Previous A2"}
+        assert data[1]["history"]["messages"][1] == {"question": "Previous Q3", "answer": "Previous A3"}
+
+    finally:  # runs even when an assertion above fails, so the temp file is not left behind
+        import os
+        if os.path.exists(temp_json):
+            os.unlink(temp_json)
+
+
+def test_evaluate_save_as_csv_with_history():
+    """Evaluate with save_as_csv must emit a history column for Examples that carry a dspy.History."""
+    # Configure a DummyLM keyed by the single devset question, paired with the expected answer
+    dspy.settings.configure(
+        lm=DummyLM(
+            {
+                "What is 1+1?": {"answer": "2"},
+            }
+        )
+    )
+
+    # One prior question/answer turn is enough to detect the history column
+    history = dspy.History(
+        messages=[
+            {"question": "Previous Q", "answer": "Previous A"},
+        ]
+    )
+
+    # history rides along on the Example; only "question" is marked as an input
+    devset = [
+        dspy.Example(question="What is 1+1?", answer="2", history=history).with_inputs("question"),
+    ]
+
+    program = Predict("question -> answer")
+
+    # Reserve a temp path; delete=False keeps the file on disk after the with-block closes it
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
+        temp_csv = f.name
+
+    try:
+        evaluator = Evaluate(
+            devset=devset,
+            metric=answer_exact_match,
+            display_progress=False,
+            save_as_csv=temp_csv,
+        )
+
+        result = evaluator(program)
+        assert result.score == 100.0
+
+        # Read the file back; csv.DictReader yields one dict of strings per data row
+        import csv
+        with open(temp_csv) as f:
+            reader = csv.DictReader(f)
+            rows = list(reader)
+
+        assert len(rows) == 1
+        assert "history" in rows[0]
+        # The cell is a string, so this is a substring check, not a dict key lookup
+        assert "messages" in rows[0]["history"]
+
+    finally:  # runs even when an assertion above fails, so the temp file is not left behind
+        import os
+        if os.path.exists(temp_csv):
+            os.unlink(temp_csv)
+
diff --git a/tests/evaluate/test_evaluate_with_history.py b/tests/evaluate/test_evaluate_with_history.py
deleted file mode 100644
index 7fc81f0345..0000000000
--- a/tests/evaluate/test_evaluate_with_history.py
+++ /dev/null
@@ -1,141 +0,0 @@
-"""Test Evaluate with dspy.History objects."""
-import json
-import tempfile
-
-import dspy
-from dspy.evaluate import Evaluate
-from dspy.evaluate.metrics import answer_exact_match
-from dspy.predict import Predict
-from dspy.utils.dummies import DummyLM
-
-
-def test_evaluate_save_as_json_with_history():
-    """Test that save_as_json works with Examples containing dspy.History objects."""
-    # Setup
-    dspy.settings.configure(
-        lm=DummyLM(
-            {
-                "What is 1+1?": {"answer": "2"},
-                "What is 2+2?": {"answer": "4"},
-            }
-        )
-    )
-
-    # Create history objects
-    history1 = dspy.History(
-        messages=[
-            {"question": "Previous Q1", "answer": "Previous A1"},
-        ]
-    )
-    history2 = dspy.History(
-        messages=[
-            {"question": "Previous Q2", "answer": "Previous A2"},
-            {"question": "Previous Q3", "answer": "Previous A3"},
-        ]
-    )
-
-    # Create examples with history
-    devset = [
-        dspy.Example(question="What is 1+1?", answer="2", history=history1).with_inputs("question"),
-        dspy.Example(question="What is 2+2?", answer="4", history=history2).with_inputs("question"),
-    ]
-
-    program = Predict("question -> answer")
-
-    # Create evaluator with save_as_json
-    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
-        temp_json = f.name
-
-    try:
-        evaluator = Evaluate(
-            devset=devset,
-            metric=answer_exact_match,
-            display_progress=False,
-            save_as_json=temp_json,
-        )
-
-        result = evaluator(program)
-        assert result.score == 100.0
-
-        # Verify JSON file was created and is valid
-        with open(temp_json) as f:
-            data = json.load(f)
-
-        assert len(data) == 2
-
-        # Verify history was properly serialized in first record
-        assert "history" in data[0]
-        assert isinstance(data[0]["history"], dict)
-        assert "messages" in data[0]["history"]
-        assert len(data[0]["history"]["messages"]) == 1
-        assert data[0]["history"]["messages"][0] == {"question": "Previous Q1", "answer": "Previous A1"}
-
-        # Verify history was properly serialized in second record
-        assert "history" in data[1]
-        assert isinstance(data[1]["history"], dict)
-        assert "messages" in data[1]["history"]
-        assert len(data[1]["history"]["messages"]) == 2
-        assert data[1]["history"]["messages"][0] == {"question": "Previous Q2", "answer": "Previous A2"}
-        assert data[1]["history"]["messages"][1] == {"question": "Previous Q3", "answer": "Previous A3"}
-
-    finally:
-        import os
-        if os.path.exists(temp_json):
-            os.unlink(temp_json)
-
-
-def test_evaluate_save_as_csv_with_history():
-    """Test that save_as_csv works with Examples containing dspy.History objects."""
-    # Setup
-    dspy.settings.configure(
-        lm=DummyLM(
-            {
-                "What is 1+1?": {"answer": "2"},
-            }
-        )
-    )
-
-    # Create history object
-    history = dspy.History(
-        messages=[
-            {"question": "Previous Q", "answer": "Previous A"},
-        ]
-    )
-
-    # Create example with history
-    devset = [
-        dspy.Example(question="What is 1+1?", answer="2", history=history).with_inputs("question"),
-    ]
-
-    program = Predict("question -> answer")
-
-    # Create evaluator with save_as_csv
-    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
-        temp_csv = f.name
-
-    try:
-        evaluator = Evaluate(
-            devset=devset,
-            metric=answer_exact_match,
-            display_progress=False,
-            save_as_csv=temp_csv,
-        )
-
-        result = evaluator(program)
-        assert result.score == 100.0
-
-        # Verify CSV file was created
-        import csv
-        with open(temp_csv) as f:
-            reader = csv.DictReader(f)
-            rows = list(reader)
-
-        assert len(rows) == 1
-        assert "history" in rows[0]
-        # CSV will have string representation of the dict
-        assert "messages" in rows[0]["history"]
-
-    finally:
-        import os
-        if os.path.exists(temp_csv):
-            os.unlink(temp_csv)

From ebb502ca79ddf4c8a3f93e938c2b118a5bf78314 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 14 Nov 2025 06:44:12 +0000
Subject: [PATCH 4/6] Refactor: move toDict() call to merge_dicts for cleaner
 code

Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>
---
 dspy/evaluate/evaluate.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
index 90e48ae954..e190ff6f36 100644
--- a/dspy/evaluate/evaluate.py
+++ b/dspy/evaluate/evaluate.py
@@ -231,12 +231,9 @@ def _prepare_results_output(
     ):
         return [
             (
-                merge_dicts(
-                    example.toDict(),
-                    prediction.toDict() if hasattr(prediction, "toDict") else prediction
-                ) | {metric_name: score}
+                merge_dicts(example, prediction) | {metric_name: score}
                 if prediction_is_dictlike(prediction)
-                else example.toDict() | {"prediction": prediction, metric_name: score}
+                else (example.toDict() if hasattr(example, "toDict") else dict(example)) | {"prediction": prediction, metric_name: score}
             )
             for example, prediction, score in results
         ]
@@ -308,6 +305,12 @@ def prediction_is_dictlike(prediction):
 
 
 def merge_dicts(d1, d2) -> dict:
+    # Convert to dict if objects have toDict method (e.g., Example objects)
+    if hasattr(d1, "toDict"):
+        d1 = d1.toDict()
+    if hasattr(d2, "toDict"):
+        d2 = d2.toDict()
+
     merged = {}
     for k, v in d1.items():
         if k in d2:

From 2a82f8d0155bdf3dc113ccccd97428ddaa742c37 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 14 Nov 2025 07:01:29 +0000
Subject: [PATCH 5/6] Use isinstance(BaseModel) instead of hasattr for Pydantic
 check

Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>
---
 dspy/primitives/example.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/dspy/primitives/example.py b/dspy/primitives/example.py
index 4b8259fa33..11e490d1de 100644
--- a/dspy/primitives/example.py
+++ b/dspy/primitives/example.py
@@ -1,3 +1,9 @@
+try:
+    from pydantic import BaseModel
+except ImportError:
+    BaseModel = None
+
+
 class Example:
     """A flexible data container for DSPy examples and training data.
 
@@ -193,7 +199,7 @@ def toDict(self):  # noqa: N802
         def convert_to_serializable(value):
             if hasattr(value, "toDict"):
                 return value.toDict()
-            elif hasattr(value, "model_dump"):
+            elif BaseModel is not None and isinstance(value, BaseModel):
                 # Handle Pydantic models (e.g., dspy.History)
                 return value.model_dump()
             elif isinstance(value, list):

From 1d56c2984db2470d933a1da3935f249029bbb787 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 14 Nov 2025 07:08:52 +0000
Subject: [PATCH 6/6] Simplify _prepare_results_output to always use toDict()

Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>
---
 dspy/evaluate/evaluate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
index e190ff6f36..e42b98e706 100644
--- a/dspy/evaluate/evaluate.py
+++ b/dspy/evaluate/evaluate.py
@@ -233,7 +233,7 @@ def _prepare_results_output(
             (
                 merge_dicts(example, prediction) | {metric_name: score}
                 if prediction_is_dictlike(prediction)
-                else (example.toDict() if hasattr(example, "toDict") else dict(example)) | {"prediction": prediction, metric_name: score}
+                else example.toDict() | {"prediction": prediction, metric_name: score}
             )
             for example, prediction, score in results
         ]