diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
index 50513ce8e2..e42b98e706 100644
--- a/dspy/evaluate/evaluate.py
+++ b/dspy/evaluate/evaluate.py
@@ -233,7 +233,8 @@ def _prepare_results_output(
             (
                 merge_dicts(example, prediction) | {metric_name: score}
                 if prediction_is_dictlike(prediction)
-                else dict(example) | {"prediction": prediction, metric_name: score}
+                else (example.toDict() if hasattr(example, "toDict") else dict(example))
+                | {"prediction": prediction, metric_name: score}
             )
             for example, prediction, score in results
         ]
@@ -305,6 +306,12 @@ def prediction_is_dictlike(prediction):
 
 
 def merge_dicts(d1, d2) -> dict:
+    # Convert to dict if objects have toDict method (e.g., Example objects)
+    if hasattr(d1, "toDict"):
+        d1 = d1.toDict()
+    if hasattr(d2, "toDict"):
+        d2 = d2.toDict()
+
     merged = {}
     for k, v in d1.items():
         if k in d2:
diff --git a/dspy/primitives/example.py b/dspy/primitives/example.py
index 549da6bce7..11e490d1de 100644
--- a/dspy/primitives/example.py
+++ b/dspy/primitives/example.py
@@ -1,3 +1,9 @@
+try:
+    from pydantic import BaseModel
+except ImportError:
+    BaseModel = None
+
+
 class Example:
     """A flexible data container for DSPy examples and training data.
 
@@ -193,6 +199,9 @@ def toDict(self):  # noqa: N802
         def convert_to_serializable(value):
             if hasattr(value, "toDict"):
                 return value.toDict()
+            elif BaseModel is not None and isinstance(value, BaseModel):
+                # Handle Pydantic models (e.g., dspy.History)
+                return value.model_dump()
             elif isinstance(value, list):
                 return [convert_to_serializable(item) for item in value]
             elif isinstance(value, dict):
diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py
index 211cf25962..6d44850bae 100644
--- a/tests/evaluate/test_evaluate.py
+++ b/tests/evaluate/test_evaluate.py
@@ -1,4 +1,6 @@
+import csv
+import json
 import signal
 import threading
 from unittest.mock import patch
 
@@ -261,3 +263,111 @@ def on_evaluate_end(
 def test_evaluation_result_repr():
     result = EvaluationResult(score=100.0, results=[(new_example("What is 1+1?", "2"), {"answer": "2"}, 100.0)])
     assert repr(result) == "EvaluationResult(score=100.0, results=)"
+
+
+def test_evaluate_save_as_json_with_history(tmp_path):
+    """Test that save_as_json works with Examples containing dspy.History objects."""
+    dspy.settings.configure(
+        lm=DummyLM(
+            {
+                "What is 1+1?": {"answer": "2"},
+                "What is 2+2?": {"answer": "4"},
+            }
+        )
+    )
+
+    history1 = dspy.History(
+        messages=[
+            {"question": "Previous Q1", "answer": "Previous A1"},
+        ]
+    )
+    history2 = dspy.History(
+        messages=[
+            {"question": "Previous Q2", "answer": "Previous A2"},
+            {"question": "Previous Q3", "answer": "Previous A3"},
+        ]
+    )
+
+    devset = [
+        dspy.Example(question="What is 1+1?", answer="2", history=history1).with_inputs("question"),
+        dspy.Example(question="What is 2+2?", answer="4", history=history2).with_inputs("question"),
+    ]
+
+    program = Predict("question -> answer")
+
+    # tmp_path is removed by pytest, so no manual cleanup is needed.
+    json_path = tmp_path / "results.json"
+    evaluator = Evaluate(
+        devset=devset,
+        metric=answer_exact_match,
+        display_progress=False,
+        save_as_json=str(json_path),
+    )
+
+    result = evaluator(program)
+    assert result.score == 100.0
+
+    # The evaluator must have written a valid JSON file with one record per example.
+    with open(json_path, encoding="utf-8") as f:
+        data = json.load(f)
+
+    assert len(data) == 2
+
+    # History must be serialized to a plain dict in the first record.
+    assert "history" in data[0]
+    assert isinstance(data[0]["history"], dict)
+    assert "messages" in data[0]["history"]
+    assert len(data[0]["history"]["messages"]) == 1
+    assert data[0]["history"]["messages"][0] == {"question": "Previous Q1", "answer": "Previous A1"}
+
+    # ...and in the second record.
+    assert "history" in data[1]
+    assert isinstance(data[1]["history"], dict)
+    assert "messages" in data[1]["history"]
+    assert len(data[1]["history"]["messages"]) == 2
+    assert data[1]["history"]["messages"][0] == {"question": "Previous Q2", "answer": "Previous A2"}
+    assert data[1]["history"]["messages"][1] == {"question": "Previous Q3", "answer": "Previous A3"}
+
+
+def test_evaluate_save_as_csv_with_history(tmp_path):
+    """Test that save_as_csv works with Examples containing dspy.History objects."""
+    dspy.settings.configure(
+        lm=DummyLM(
+            {
+                "What is 1+1?": {"answer": "2"},
+            }
+        )
+    )
+
+    history = dspy.History(
+        messages=[
+            {"question": "Previous Q", "answer": "Previous A"},
+        ]
+    )
+
+    devset = [
+        dspy.Example(question="What is 1+1?", answer="2", history=history).with_inputs("question"),
+    ]
+
+    program = Predict("question -> answer")
+
+    # tmp_path is removed by pytest, so no manual cleanup is needed.
+    csv_path = tmp_path / "results.csv"
+    evaluator = Evaluate(
+        devset=devset,
+        metric=answer_exact_match,
+        display_progress=False,
+        save_as_csv=str(csv_path),
+    )
+
+    result = evaluator(program)
+    assert result.score == 100.0
+
+    # newline="" is what the csv module expects when reading files.
+    with open(csv_path, newline="", encoding="utf-8") as f:
+        rows = list(csv.DictReader(f))
+
+    assert len(rows) == 1
+    assert "history" in rows[0]
+    # CSV stores the string representation of the serialized history dict.
+    assert "messages" in rows[0]["history"]
diff --git a/tests/primitives/test_example.py b/tests/primitives/test_example.py
index e406017137..c569016dbc 100644
--- a/tests/primitives/test_example.py
+++ b/tests/primitives/test_example.py
@@ -123,3 +123,34 @@ def test_example_copy_without():
 def test_example_to_dict():
     example = Example(a=1, b=2)
     assert example.toDict() == {"a": 1, "b": 2}
+
+
+def test_example_to_dict_with_history():
+    """Test that Example.toDict() properly serializes dspy.History objects."""
+    history = dspy.History(
+        messages=[
+            {"question": "What is the capital of France?", "answer": "Paris"},
+            {"question": "What is the capital of Germany?", "answer": "Berlin"},
+        ]
+    )
+    example = Example(question="Test question", history=history, answer="Test answer")
+
+    result = example.toDict()
+
+    # Verify the result is a dictionary
+    assert isinstance(result, dict)
+    assert "history" in result
+
+    # Verify history is serialized to a dict (not a History object)
+    assert isinstance(result["history"], dict)
+    assert "messages" in result["history"]
+    assert result["history"]["messages"] == [
+        {"question": "What is the capital of France?", "answer": "Paris"},
+        {"question": "What is the capital of Germany?", "answer": "Berlin"},
+    ]
+
+    # Verify JSON serialization works
+    import json
+    json_str = json.dumps(result)
+    restored = json.loads(json_str)
+    assert restored["history"]["messages"] == result["history"]["messages"]