Commit 868f4ae

use real LLM for unit tests
1 parent 252191d commit 868f4ae

File tree

.github/workflows/run_tests.yml
tests/conftest.py
tests/primitives/test_base_module.py
tests/streaming/test_streaming.py
tests/utils/test_usage_tracker.py

5 files changed, +45 -44 lines changed

.github/workflows/run_tests.yml

Lines changed: 5 additions & 0 deletions

@@ -81,6 +81,11 @@ jobs:
         uses: chartboost/ruff-action@v1
         with:
           args: check --fix-only
+      - name: Set LLM model
+        run: |
+          echo "LLM_MODEL=${{ secrets.LLM_MODEL }}" >> $GITHUB_ENV
+          echo "DATABRICKS_API_BASE=${{ secrets.DATABRICKS_API_BASE }}" >> $GITHUB_ENV
+          echo "DATABRICKS_API_KEY=${{ secrets.DATABRICKS_API_KEY }}" >> $GITHUB_ENV
       - name: Run tests with pytest
         run: uv run -p .venv pytest tests/
       - name: Install optional dependencies
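
For orientation, this step only exports repository secrets into the job environment; the tests then build the model from LLM_MODEL. A minimal sketch of how those variables are consumed, assuming LLM_MODEL holds a LiteLLM-style model string (the Databricks values below are hypothetical placeholders, not part of this commit):

import os

import dspy

# Hypothetical local stand-ins for the CI secrets exported above.
os.environ.setdefault("LLM_MODEL", "databricks/databricks-meta-llama-3-1-70b-instruct")
os.environ.setdefault("DATABRICKS_API_BASE", "https://<workspace>.cloud.databricks.com/serving-endpoints")
os.environ.setdefault("DATABRICKS_API_KEY", "<personal-access-token>")

# dspy.LM delegates to LiteLLM, which resolves the "databricks/" provider
# from DATABRICKS_API_BASE / DATABRICKS_API_KEY in the environment.
lm = dspy.LM(os.environ["LLM_MODEL"], cache=False)
dspy.settings.configure(lm=lm)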

tests/conftest.py

Lines changed: 9 additions & 0 deletions

@@ -1,4 +1,5 @@
 import copy
+import os
 
 import pytest
 
@@ -49,3 +50,11 @@ def pytest_collection_modifyitems(config, items):
     for item in items:
         if flag in item.keywords:
            item.add_marker(skip_mark)
+
+
+@pytest.fixture
+def llm_model():
+    model = os.environ.get("LLM_MODEL", None)
+    if model is None:
+        pytest.skip("LLM_MODEL is not set in the environment variables")
+    return model
tests/primitives/test_base_module.py

Lines changed: 13 additions & 14 deletions

@@ -230,30 +230,28 @@ def emit(self, record):
     logger.removeHandler(handler)
 
 
-@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.")
-def test_single_module_call_with_usage_tracker():
-    dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=False), track_usage=True)
+def test_single_module_call_with_usage_tracker(llm_model):
+    dspy.settings.configure(lm=dspy.LM(llm_model, cache=False), track_usage=True)
 
     predict = dspy.ChainOfThought("question -> answer")
     output = predict(question="What is the capital of France?")
 
     lm_usage = output.get_lm_usage()
     assert len(lm_usage) == 1
-    assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0
-    assert lm_usage["openai/gpt-4o-mini"]["completion_tokens"] > 0
-    assert lm_usage["openai/gpt-4o-mini"]["total_tokens"] > 0
+    assert lm_usage[llm_model]["prompt_tokens"] > 0
+    assert lm_usage[llm_model]["completion_tokens"] > 0
+    assert lm_usage[llm_model]["total_tokens"] > 0
 
     # Test no usage being tracked when cache is enabled
-    dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=True), track_usage=True)
+    dspy.settings.configure(lm=dspy.LM(llm_model, cache=True), track_usage=True)
     for _ in range(2):
         output = predict(question="What is the capital of France?")
 
     assert len(output.get_lm_usage()) == 0
 
 
-@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.")
-def test_multi_module_call_with_usage_tracker():
-    dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=False), track_usage=True)
+def test_multi_module_call_with_usage_tracker(llm_model):
+    dspy.settings.configure(lm=dspy.LM(llm_model, cache=False), track_usage=True)
 
     class MyProgram(dspy.Module):
         def __init__(self):
@@ -270,12 +268,13 @@ def __call__(self, question: str) -> str:
 
     lm_usage = output.get_lm_usage()
     assert len(lm_usage) == 1
-    assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0
-    assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0
-    assert lm_usage["openai/gpt-4o-mini"]["completion_tokens"] > 0
-    assert lm_usage["openai/gpt-4o-mini"]["total_tokens"] > 0
+    assert lm_usage[llm_model]["prompt_tokens"] > 0
+    assert lm_usage[llm_model]["prompt_tokens"] > 0
+    assert lm_usage[llm_model]["completion_tokens"] > 0
+    assert lm_usage[llm_model]["total_tokens"] > 0
 
 
+# TODO: prepare second model for testing this unit test in ci
 @pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.")
 def test_usage_tracker_in_parallel():
     class MyProgram(dspy.Module):

tests/streaming/test_streaming.py

Lines changed: 13 additions & 17 deletions

@@ -1,5 +1,4 @@
 import asyncio
-import os
 import time
 from unittest import mock
 from unittest.mock import AsyncMock
@@ -131,9 +130,8 @@ def module_start_status_message(self, instance, inputs):
     assert status_messages[2].message == "Predict starting!"
 
 
-@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables")
 @pytest.mark.anyio
-async def test_stream_listener_chat_adapter():
+async def test_stream_listener_chat_adapter(llm_model):
     class MyProgram(dspy.Module):
         def __init__(self):
             self.predict1 = dspy.Predict("question->answer")
@@ -154,7 +152,7 @@ def __call__(self, x: str, **kwargs):
         include_final_prediction_in_output_stream=False,
     )
     # Turn off the cache to ensure the stream is produced.
-    with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False)):
+    with dspy.context(lm=dspy.LM(llm_model, cache=False)):
         output = program(x="why did a chicken cross the kitchen?")
     all_chunks = []
     async for value in output:
@@ -194,9 +192,8 @@ async def acall(self, x: str):
     assert status_messages[1].message == "Tool calling finished! Querying the LLM with tool calling results..."
 
 
-@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables")
 @pytest.mark.anyio
-async def test_stream_listener_json_adapter():
+async def test_stream_listener_json_adapter(llm_model):
     class MyProgram(dspy.Module):
         def __init__(self):
             self.predict1 = dspy.Predict("question->answer")
@@ -217,7 +214,7 @@ def __call__(self, x: str, **kwargs):
         include_final_prediction_in_output_stream=False,
     )
     # Turn off the cache to ensure the stream is produced.
-    with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False), adapter=dspy.JSONAdapter()):
+    with dspy.context(lm=dspy.LM(llm_model, cache=False), adapter=dspy.JSONAdapter()):
         output = program(x="why did a chicken cross the kitchen?")
     all_chunks = []
     async for value in output:
@@ -232,22 +229,22 @@ def __call__(self, x: str, **kwargs):
 
 
 @pytest.mark.anyio
-async def test_streaming_handles_space_correctly():
+async def test_streaming_handles_space_correctly(llm_model):
     my_program = dspy.Predict("question->answer")
     program = dspy.streamify(
         my_program, stream_listeners=[dspy.streaming.StreamListener(signature_field_name="answer")]
     )
 
     async def gpt_4o_mini_stream(*args, **kwargs):
         yield ModelResponseStream(
-            model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="[[ ## answer ## ]]\n"))]
+            model=llm_model, choices=[StreamingChoices(delta=Delta(content="[[ ## answer ## ]]\n"))]
         )
-        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="How "))])
-        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="are "))])
-        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="you "))])
-        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="doing?"))])
+        yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="How "))])
+        yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="are "))])
+        yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="you "))])
+        yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="doing?"))])
         yield ModelResponseStream(
-            model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="\n\n[[ ## completed ## ]]"))]
+            model=llm_model, choices=[StreamingChoices(delta=Delta(content="\n\n[[ ## completed ## ]]"))]
        )
 
     with mock.patch("litellm.acompletion", side_effect=gpt_4o_mini_stream):
@@ -261,8 +258,7 @@ async def gpt_4o_mini_stream(*args, **kwargs):
     assert all_chunks[0].chunk == "How are you doing?"
 
 
-@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables")
-def test_sync_streaming():
+def test_sync_streaming(llm_model):
     class MyProgram(dspy.Module):
         def __init__(self):
             self.predict1 = dspy.Predict("question->answer")
@@ -284,7 +280,7 @@ def __call__(self, x: str, **kwargs):
         async_streaming=False,
     )
     # Turn off the cache to ensure the stream is produced.
-    with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False)):
+    with dspy.context(lm=dspy.LM(llm_model, cache=False)):
         output = program(x="why did a chicken cross the kitchen?")
     all_chunks = []
     for value in output:

tests/utils/test_usage_tracker.py

Lines changed: 5 additions & 13 deletions

@@ -1,7 +1,3 @@
-import os
-
-import pytest
-
 import dspy
 from dspy.utils.usage_tracker import UsageTracker, track_usage
 
@@ -137,12 +133,8 @@ def test_track_usage_with_multiple_models():
     assert total_usage["gpt-3.5-turbo"]["total_tokens"] == 900
 
 
-@pytest.mark.skipif(
-    not os.getenv("OPENAI_API_KEY"),
-    reason="Skip the test if OPENAI_API_KEY is not set.",
-)
-def test_track_usage_context_manager():
-    lm = dspy.LM("openai/gpt-4o-mini", cache=False)
+def test_track_usage_context_manager(llm_model):
+    lm = dspy.LM(llm_model, cache=False)
     dspy.settings.configure(lm=lm)
 
     predict = dspy.ChainOfThought("question -> answer")
@@ -151,12 +143,12 @@ def test_track_usage_context_manager():
         predict(question="What is the capital of Italy?")
 
     assert len(tracker.usage_data) > 0
-    assert len(tracker.usage_data["openai/gpt-4o-mini"]) == 2
+    assert len(tracker.usage_data[llm_model]) == 2
 
     total_usage = tracker.get_total_tokens()
-    assert "openai/gpt-4o-mini" in total_usage
+    assert llm_model in total_usage
     assert len(total_usage.keys()) == 1
-    assert isinstance(total_usage["openai/gpt-4o-mini"], dict)
+    assert isinstance(total_usage[llm_model], dict)
 
 
 def test_merge_usage_entries_with_new_keys():
