From 868f4ae30594c0f9f00308cf5ceea3126b8c62d8 Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Thu, 3 Jul 2025 14:31:36 +0900 Subject: [PATCH 1/8] use real LLM for unit tests --- .github/workflows/run_tests.yml | 5 +++++ tests/conftest.py | 9 +++++++++ tests/primitives/test_base_module.py | 27 ++++++++++++------------- tests/streaming/test_streaming.py | 30 ++++++++++++---------------- tests/utils/test_usage_tracker.py | 18 +++++------------ 5 files changed, 45 insertions(+), 44 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 6510c69abf..7975361e93 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -81,6 +81,11 @@ jobs: uses: chartboost/ruff-action@v1 with: args: check --fix-only + - name: Set LLM model + run: | + echo "LLM_MODEL=${{ secrets.LLM_MODEL }}" >> $GITHUB_ENV + echo "DATABRICKS_API_BASE=${{ secrets.DATABRICKS_API_BASE }}" >> $GITHUB_ENV + echo "DATABRICKS_API_KEY=${{ secrets.DATABRICKS_API_KEY }}" >> $GITHUB_ENV - name: Run tests with pytest run: uv run -p .venv pytest tests/ - name: Install optional dependencies diff --git a/tests/conftest.py b/tests/conftest.py index dd57d03f19..fe80b5ecaa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ import copy +import os import pytest @@ -49,3 +50,11 @@ def pytest_collection_modifyitems(config, items): for item in items: if flag in item.keywords: item.add_marker(skip_mark) + + +@pytest.fixture +def llm_model(): + model = os.environ.get("LLM_MODEL", None) + if model is None: + pytest.skip("LLM_MODEL is not set in the environment variables") + return model diff --git a/tests/primitives/test_base_module.py b/tests/primitives/test_base_module.py index ee6255bb06..dbe8c1a42c 100644 --- a/tests/primitives/test_base_module.py +++ b/tests/primitives/test_base_module.py @@ -230,30 +230,28 @@ def emit(self, record): logger.removeHandler(handler) -@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.") -def test_single_module_call_with_usage_tracker(): - dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=False), track_usage=True) +def test_single_module_call_with_usage_tracker(llm_model): + dspy.settings.configure(lm=dspy.LM(llm_model, cache=False), track_usage=True) predict = dspy.ChainOfThought("question -> answer") output = predict(question="What is the capital of France?") lm_usage = output.get_lm_usage() assert len(lm_usage) == 1 - assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0 - assert lm_usage["openai/gpt-4o-mini"]["completion_tokens"] > 0 - assert lm_usage["openai/gpt-4o-mini"]["total_tokens"] > 0 + assert lm_usage[llm_model]["prompt_tokens"] > 0 + assert lm_usage[llm_model]["completion_tokens"] > 0 + assert lm_usage[llm_model]["total_tokens"] > 0 # Test no usage being tracked when cache is enabled - dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=True), track_usage=True) + dspy.settings.configure(lm=dspy.LM(llm_model, cache=True), track_usage=True) for _ in range(2): output = predict(question="What is the capital of France?") assert len(output.get_lm_usage()) == 0 -@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.") -def test_multi_module_call_with_usage_tracker(): - dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=False), track_usage=True) +def test_multi_module_call_with_usage_tracker(llm_model): + dspy.settings.configure(lm=dspy.LM(llm_model, cache=False), track_usage=True) 
class MyProgram(dspy.Module): def __init__(self): @@ -270,12 +268,13 @@ def __call__(self, question: str) -> str: lm_usage = output.get_lm_usage() assert len(lm_usage) == 1 - assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0 - assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0 - assert lm_usage["openai/gpt-4o-mini"]["completion_tokens"] > 0 - assert lm_usage["openai/gpt-4o-mini"]["total_tokens"] > 0 + assert lm_usage[llm_model]["prompt_tokens"] > 0 + assert lm_usage[llm_model]["prompt_tokens"] > 0 + assert lm_usage[llm_model]["completion_tokens"] > 0 + assert lm_usage[llm_model]["total_tokens"] > 0 +# TODO: prepare second model for testing this unit test in ci @pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.") def test_usage_tracker_in_parallel(): class MyProgram(dspy.Module): diff --git a/tests/streaming/test_streaming.py b/tests/streaming/test_streaming.py index d79402d2ef..3b83d56789 100644 --- a/tests/streaming/test_streaming.py +++ b/tests/streaming/test_streaming.py @@ -1,5 +1,4 @@ import asyncio -import os import time from unittest import mock from unittest.mock import AsyncMock @@ -131,9 +130,8 @@ def module_start_status_message(self, instance, inputs): assert status_messages[2].message == "Predict starting!" -@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables") @pytest.mark.anyio -async def test_stream_listener_chat_adapter(): +async def test_stream_listener_chat_adapter(llm_model): class MyProgram(dspy.Module): def __init__(self): self.predict1 = dspy.Predict("question->answer") @@ -154,7 +152,7 @@ def __call__(self, x: str, **kwargs): include_final_prediction_in_output_stream=False, ) # Turn off the cache to ensure the stream is produced. - with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False)): + with dspy.context(lm=dspy.LM(llm_model, cache=False)): output = program(x="why did a chicken cross the kitchen?") all_chunks = [] async for value in output: @@ -194,9 +192,8 @@ async def acall(self, x: str): assert status_messages[1].message == "Tool calling finished! Querying the LLM with tool calling results..." -@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables") @pytest.mark.anyio -async def test_stream_listener_json_adapter(): +async def test_stream_listener_json_adapter(llm_model): class MyProgram(dspy.Module): def __init__(self): self.predict1 = dspy.Predict("question->answer") @@ -217,7 +214,7 @@ def __call__(self, x: str, **kwargs): include_final_prediction_in_output_stream=False, ) # Turn off the cache to ensure the stream is produced. 
- with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False), adapter=dspy.JSONAdapter()): + with dspy.context(lm=dspy.LM(llm_model, cache=False), adapter=dspy.JSONAdapter()): output = program(x="why did a chicken cross the kitchen?") all_chunks = [] async for value in output: @@ -232,7 +229,7 @@ def __call__(self, x: str, **kwargs): @pytest.mark.anyio -async def test_streaming_handles_space_correctly(): +async def test_streaming_handles_space_correctly(llm_model): my_program = dspy.Predict("question->answer") program = dspy.streamify( my_program, stream_listeners=[dspy.streaming.StreamListener(signature_field_name="answer")] @@ -240,14 +237,14 @@ async def test_streaming_handles_space_correctly(): async def gpt_4o_mini_stream(*args, **kwargs): yield ModelResponseStream( - model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="[[ ## answer ## ]]\n"))] + model=llm_model, choices=[StreamingChoices(delta=Delta(content="[[ ## answer ## ]]\n"))] ) - yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="How "))]) - yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="are "))]) - yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="you "))]) - yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="doing?"))]) + yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="How "))]) + yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="are "))]) + yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="you "))]) + yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="doing?"))]) yield ModelResponseStream( - model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="\n\n[[ ## completed ## ]]"))] + model=llm_model, choices=[StreamingChoices(delta=Delta(content="\n\n[[ ## completed ## ]]"))] ) with mock.patch("litellm.acompletion", side_effect=gpt_4o_mini_stream): @@ -261,8 +258,7 @@ async def gpt_4o_mini_stream(*args, **kwargs): assert all_chunks[0].chunk == "How are you doing?" -@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables") -def test_sync_streaming(): +def test_sync_streaming(llm_model): class MyProgram(dspy.Module): def __init__(self): self.predict1 = dspy.Predict("question->answer") @@ -284,7 +280,7 @@ def __call__(self, x: str, **kwargs): async_streaming=False, ) # Turn off the cache to ensure the stream is produced. 
- with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False)): + with dspy.context(lm=dspy.LM(llm_model, cache=False)): output = program(x="why did a chicken cross the kitchen?") all_chunks = [] for value in output: diff --git a/tests/utils/test_usage_tracker.py b/tests/utils/test_usage_tracker.py index 70244b3283..0a6c106850 100644 --- a/tests/utils/test_usage_tracker.py +++ b/tests/utils/test_usage_tracker.py @@ -1,7 +1,3 @@ -import os - -import pytest - import dspy from dspy.utils.usage_tracker import UsageTracker, track_usage @@ -137,12 +133,8 @@ def test_track_usage_with_multiple_models(): assert total_usage["gpt-3.5-turbo"]["total_tokens"] == 900 -@pytest.mark.skipif( - not os.getenv("OPENAI_API_KEY"), - reason="Skip the test if OPENAI_API_KEY is not set.", -) -def test_track_usage_context_manager(): - lm = dspy.LM("openai/gpt-4o-mini", cache=False) +def test_track_usage_context_manager(llm_model): + lm = dspy.LM(llm_model, cache=False) dspy.settings.configure(lm=lm) predict = dspy.ChainOfThought("question -> answer") @@ -151,12 +143,12 @@ def test_track_usage_context_manager(): predict(question="What is the capital of Italy?") assert len(tracker.usage_data) > 0 - assert len(tracker.usage_data["openai/gpt-4o-mini"]) == 2 + assert len(tracker.usage_data[llm_model]) == 2 total_usage = tracker.get_total_tokens() - assert "openai/gpt-4o-mini" in total_usage + assert llm_model in total_usage assert len(total_usage.keys()) == 1 - assert isinstance(total_usage["openai/gpt-4o-mini"], dict) + assert isinstance(total_usage[llm_model], dict) def test_merge_usage_entries_with_new_keys(): From 0cd6132375c0a917deccb768006d6ab8d8e6c216 Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Thu, 3 Jul 2025 15:20:18 +0900 Subject: [PATCH 2/8] use ollama --- .github/workflows/run_tests.yml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 7975361e93..caa59e9d73 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -48,6 +48,11 @@ jobs: test: name: Run Tests runs-on: ubuntu-latest + services: + ollama: + image: ollama/ollama:latest + ports: + - 11434:11434 strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13"] @@ -81,11 +86,13 @@ jobs: uses: chartboost/ruff-action@v1 with: args: check --fix-only - - name: Set LLM model + - name: Pull LLM run: | - echo "LLM_MODEL=${{ secrets.LLM_MODEL }}" >> $GITHUB_ENV - echo "DATABRICKS_API_BASE=${{ secrets.DATABRICKS_API_BASE }}" >> $GITHUB_ENV - echo "DATABRICKS_API_KEY=${{ secrets.DATABRICKS_API_KEY }}" >> $GITHUB_ENV + timeout 60 bash -c 'until curl -f http://localhost:11434/api/version; do sleep 2; done' + curl -X POST http://localhost:11434/api/pull \ + -H "Content-Type: application/json" \ + -d '{"name": "llama3.2:1b"}' + echo "LLM_MODEL=ollama/llama3.2:1b" >> $GITHUB_ENV - name: Run tests with pytest run: uv run -p .venv pytest tests/ - name: Install optional dependencies From d21eba68be744ed5cafff356319ec139642041a8 Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Thu, 3 Jul 2025 15:34:18 +0900 Subject: [PATCH 3/8] use Llama 3.2 3b --- .github/workflows/run_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index caa59e9d73..84fc4387ea 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -91,8 +91,8 @@ jobs: timeout 60 bash -c 'until curl -f http://localhost:11434/api/version; do sleep 2; 
done' curl -X POST http://localhost:11434/api/pull \ -H "Content-Type: application/json" \ - -d '{"name": "llama3.2:1b"}' - echo "LLM_MODEL=ollama/llama3.2:1b" >> $GITHUB_ENV + -d '{"name": "llama3.2:3b"}' + echo "LLM_MODEL=ollama/llama3.2:3b" >> $GITHUB_ENV - name: Run tests with pytest run: uv run -p .venv pytest tests/ - name: Install optional dependencies From 5e9a6a64deab3e24f8c3fd9e763c68031f6144d6 Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Fri, 4 Jul 2025 09:52:34 +0900 Subject: [PATCH 4/8] add verbose option --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 84fc4387ea..9bae49403e 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -94,7 +94,7 @@ jobs: -d '{"name": "llama3.2:3b"}' echo "LLM_MODEL=ollama/llama3.2:3b" >> $GITHUB_ENV - name: Run tests with pytest - run: uv run -p .venv pytest tests/ + run: uv run -p .venv pytest -vv --durations=3 tests/ - name: Install optional dependencies run: uv sync -p .venv --extra dev --extra test_extras - name: Run extra tests From cecf88af5560591c8d45e302d38d531f8f595f53 Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Fri, 4 Jul 2025 10:19:06 +0900 Subject: [PATCH 5/8] split test into a separate job --- .github/workflows/run_tests.yml | 45 ++++++++++++++++++++++++---- tests/conftest.py | 2 +- tests/primitives/test_base_module.py | 2 ++ tests/streaming/test_streaming.py | 4 +++ 4 files changed, 46 insertions(+), 7 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 9bae49403e..23da732213 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -48,11 +48,6 @@ jobs: test: name: Run Tests runs-on: ubuntu-latest - services: - ollama: - image: ollama/ollama:latest - ports: - - 11434:11434 strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13"] @@ -94,11 +89,49 @@ jobs: -d '{"name": "llama3.2:3b"}' echo "LLM_MODEL=ollama/llama3.2:3b" >> $GITHUB_ENV - name: Run tests with pytest - run: uv run -p .venv pytest -vv --durations=3 tests/ + run: uv run -p .venv pytest -vv tests/ - name: Install optional dependencies run: uv sync -p .venv --extra dev --extra test_extras - name: Run extra tests run: uv run -p .venv pytest tests/ -m extra --extra + + llm_call_test: + name: Run Tests with Real LM + runs-on: ubuntu-latest + services: + ollama: + image: ollama/ollama:latest + ports: + - 11434:11434 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.11 + - name: Install uv with caching + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + cache-dependency-glob: | + **/pyproject.toml + **/uv.lock + - name: Create and activate virtual environment + run: | + uv venv .venv + echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH + - name: Install dependencies + run: | + uv sync --dev -p .venv --extra dev + uv pip list + - name: Pull LLM + run: | + timeout 60 bash -c 'until curl -f http://localhost:11434/api/version; do sleep 2; done' + curl -X POST http://localhost:11434/api/pull \ + -H "Content-Type: application/json" \ + -d '{"name": "llama3.2:3b"}' + echo "LLM_MODEL=ollama/llama3.2:3b" >> $GITHUB_ENV + - name: Run tests + run: uv run -p .venv pytest -m llm_call --llm_call -vv --duration=5 tests/ build_package: name: Build Package diff --git a/tests/conftest.py b/tests/conftest.py index fe80b5ecaa..1a685f5757 100644 --- a/tests/conftest.py +++ b/tests/conftest.py 
@@ -5,7 +5,7 @@ from tests.test_utils.server import litellm_test_server, read_litellm_test_server_request_logs # noqa: F401 -SKIP_DEFAULT_FLAGS = ["reliability", "extra"] +SKIP_DEFAULT_FLAGS = ["reliability", "extra", "llm_call"] @pytest.fixture(autouse=True) diff --git a/tests/primitives/test_base_module.py b/tests/primitives/test_base_module.py index dbe8c1a42c..65190ee996 100644 --- a/tests/primitives/test_base_module.py +++ b/tests/primitives/test_base_module.py @@ -230,6 +230,7 @@ def emit(self, record): logger.removeHandler(handler) +@pytest.mark.llm_call def test_single_module_call_with_usage_tracker(llm_model): dspy.settings.configure(lm=dspy.LM(llm_model, cache=False), track_usage=True) @@ -250,6 +251,7 @@ def test_single_module_call_with_usage_tracker(llm_model): assert len(output.get_lm_usage()) == 0 +@pytest.mark.llm_call def test_multi_module_call_with_usage_tracker(llm_model): dspy.settings.configure(lm=dspy.LM(llm_model, cache=False), track_usage=True) diff --git a/tests/streaming/test_streaming.py b/tests/streaming/test_streaming.py index 3b83d56789..6aa0dcf384 100644 --- a/tests/streaming/test_streaming.py +++ b/tests/streaming/test_streaming.py @@ -130,6 +130,7 @@ def module_start_status_message(self, instance, inputs): assert status_messages[2].message == "Predict starting!" +@pytest.mark.llm_call @pytest.mark.anyio async def test_stream_listener_chat_adapter(llm_model): class MyProgram(dspy.Module): @@ -192,6 +193,7 @@ async def acall(self, x: str): assert status_messages[1].message == "Tool calling finished! Querying the LLM with tool calling results..." +@pytest.mark.llm_call @pytest.mark.anyio async def test_stream_listener_json_adapter(llm_model): class MyProgram(dspy.Module): @@ -228,6 +230,7 @@ def __call__(self, x: str, **kwargs): assert all_chunks[-1].signature_field_name == "judgement" +@pytest.mark.llm_call @pytest.mark.anyio async def test_streaming_handles_space_correctly(llm_model): my_program = dspy.Predict("question->answer") @@ -258,6 +261,7 @@ async def gpt_4o_mini_stream(*args, **kwargs): assert all_chunks[0].chunk == "How are you doing?" 
+@pytest.mark.llm_call def test_sync_streaming(llm_model): class MyProgram(dspy.Module): def __init__(self): From 061c58e96c05c18c400e56f045bc58fbc0d0e8f1 Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Fri, 4 Jul 2025 10:21:51 +0900 Subject: [PATCH 6/8] remove LLM pulling --- .github/workflows/run_tests.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 23da732213..d22216cc62 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -81,13 +81,6 @@ jobs: uses: chartboost/ruff-action@v1 with: args: check --fix-only - - name: Pull LLM - run: | - timeout 60 bash -c 'until curl -f http://localhost:11434/api/version; do sleep 2; done' - curl -X POST http://localhost:11434/api/pull \ - -H "Content-Type: application/json" \ - -d '{"name": "llama3.2:3b"}' - echo "LLM_MODEL=ollama/llama3.2:3b" >> $GITHUB_ENV - name: Run tests with pytest run: uv run -p .venv pytest -vv tests/ - name: Install optional dependencies From f17ef5358a892b71ce36cd5026f170bb9444faa9 Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Fri, 4 Jul 2025 10:25:06 +0900 Subject: [PATCH 7/8] fix option name --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index d22216cc62..1a64382b75 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -124,7 +124,7 @@ jobs: -d '{"name": "llama3.2:3b"}' echo "LLM_MODEL=ollama/llama3.2:3b" >> $GITHUB_ENV - name: Run tests - run: uv run -p .venv pytest -m llm_call --llm_call -vv --duration=5 tests/ + run: uv run -p .venv pytest -m llm_call --llm_call -vv --durations=5 tests/ build_package: name: Build Package From db6552e7a01ce60d23348f218dc8ca09d7d9f751 Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Tue, 8 Jul 2025 08:46:19 +0900 Subject: [PATCH 8/8] rename env var --- .github/workflows/run_tests.yml | 2 +- tests/conftest.py | 6 +++--- tests/primitives/test_base_module.py | 24 ++++++++++++------------ tests/streaming/test_streaming.py | 27 +++++++++++++-------------- tests/utils/test_usage_tracker.py | 10 +++++----- 5 files changed, 34 insertions(+), 35 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 1a64382b75..4003781e4d 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -122,7 +122,7 @@ jobs: curl -X POST http://localhost:11434/api/pull \ -H "Content-Type: application/json" \ -d '{"name": "llama3.2:3b"}' - echo "LLM_MODEL=ollama/llama3.2:3b" >> $GITHUB_ENV + echo "LM_FOR_TEST=ollama/llama3.2:3b" >> $GITHUB_ENV - name: Run tests run: uv run -p .venv pytest -m llm_call --llm_call -vv --durations=5 tests/ diff --git a/tests/conftest.py b/tests/conftest.py index 1a685f5757..372585df20 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -53,8 +53,8 @@ def pytest_collection_modifyitems(config, items): @pytest.fixture -def llm_model(): - model = os.environ.get("LLM_MODEL", None) +def lm_for_test(): + model = os.environ.get("LM_FOR_TEST", None) if model is None: - pytest.skip("LLM_MODEL is not set in the environment variables") + pytest.skip("LM_FOR_TEST is not set in the environment variables") return model diff --git a/tests/primitives/test_base_module.py b/tests/primitives/test_base_module.py index 65190ee996..5706d24846 100644 --- a/tests/primitives/test_base_module.py +++ b/tests/primitives/test_base_module.py @@ -231,20 +231,20 @@ def emit(self, 
record): @pytest.mark.llm_call -def test_single_module_call_with_usage_tracker(llm_model): - dspy.settings.configure(lm=dspy.LM(llm_model, cache=False), track_usage=True) +def test_single_module_call_with_usage_tracker(lm_for_test): + dspy.settings.configure(lm=dspy.LM(lm_for_test, cache=False), track_usage=True) predict = dspy.ChainOfThought("question -> answer") output = predict(question="What is the capital of France?") lm_usage = output.get_lm_usage() assert len(lm_usage) == 1 - assert lm_usage[llm_model]["prompt_tokens"] > 0 - assert lm_usage[llm_model]["completion_tokens"] > 0 - assert lm_usage[llm_model]["total_tokens"] > 0 + assert lm_usage[lm_for_test]["prompt_tokens"] > 0 + assert lm_usage[lm_for_test]["completion_tokens"] > 0 + assert lm_usage[lm_for_test]["total_tokens"] > 0 # Test no usage being tracked when cache is enabled - dspy.settings.configure(lm=dspy.LM(llm_model, cache=True), track_usage=True) + dspy.settings.configure(lm=dspy.LM(lm_for_test, cache=True), track_usage=True) for _ in range(2): output = predict(question="What is the capital of France?") @@ -252,8 +252,8 @@ def test_single_module_call_with_usage_tracker(llm_model): @pytest.mark.llm_call -def test_multi_module_call_with_usage_tracker(llm_model): - dspy.settings.configure(lm=dspy.LM(llm_model, cache=False), track_usage=True) +def test_multi_module_call_with_usage_tracker(lm_for_test): + dspy.settings.configure(lm=dspy.LM(lm_for_test, cache=False), track_usage=True) class MyProgram(dspy.Module): def __init__(self): @@ -270,10 +270,10 @@ def __call__(self, question: str) -> str: lm_usage = output.get_lm_usage() assert len(lm_usage) == 1 - assert lm_usage[llm_model]["prompt_tokens"] > 0 - assert lm_usage[llm_model]["prompt_tokens"] > 0 - assert lm_usage[llm_model]["completion_tokens"] > 0 - assert lm_usage[llm_model]["total_tokens"] > 0 + assert lm_usage[lm_for_test]["prompt_tokens"] > 0 + assert lm_usage[lm_for_test]["prompt_tokens"] > 0 + assert lm_usage[lm_for_test]["completion_tokens"] > 0 + assert lm_usage[lm_for_test]["total_tokens"] > 0 # TODO: prepare second model for testing this unit test in ci diff --git a/tests/streaming/test_streaming.py b/tests/streaming/test_streaming.py index 6aa0dcf384..56c5426bb8 100644 --- a/tests/streaming/test_streaming.py +++ b/tests/streaming/test_streaming.py @@ -132,7 +132,7 @@ def module_start_status_message(self, instance, inputs): @pytest.mark.llm_call @pytest.mark.anyio -async def test_stream_listener_chat_adapter(llm_model): +async def test_stream_listener_chat_adapter(lm_for_test): class MyProgram(dspy.Module): def __init__(self): self.predict1 = dspy.Predict("question->answer") @@ -153,7 +153,7 @@ def __call__(self, x: str, **kwargs): include_final_prediction_in_output_stream=False, ) # Turn off the cache to ensure the stream is produced. - with dspy.context(lm=dspy.LM(llm_model, cache=False)): + with dspy.context(lm=dspy.LM(lm_for_test, cache=False)): output = program(x="why did a chicken cross the kitchen?") all_chunks = [] async for value in output: @@ -195,7 +195,7 @@ async def acall(self, x: str): @pytest.mark.llm_call @pytest.mark.anyio -async def test_stream_listener_json_adapter(llm_model): +async def test_stream_listener_json_adapter(lm_for_test): class MyProgram(dspy.Module): def __init__(self): self.predict1 = dspy.Predict("question->answer") @@ -216,7 +216,7 @@ def __call__(self, x: str, **kwargs): include_final_prediction_in_output_stream=False, ) # Turn off the cache to ensure the stream is produced. 
- with dspy.context(lm=dspy.LM(llm_model, cache=False), adapter=dspy.JSONAdapter()): + with dspy.context(lm=dspy.LM(lm_for_test, cache=False), adapter=dspy.JSONAdapter()): output = program(x="why did a chicken cross the kitchen?") all_chunks = [] async for value in output: @@ -230,9 +230,8 @@ def __call__(self, x: str, **kwargs): assert all_chunks[-1].signature_field_name == "judgement" -@pytest.mark.llm_call @pytest.mark.anyio -async def test_streaming_handles_space_correctly(llm_model): +async def test_streaming_handles_space_correctly(): my_program = dspy.Predict("question->answer") program = dspy.streamify( my_program, stream_listeners=[dspy.streaming.StreamListener(signature_field_name="answer")] @@ -240,14 +239,14 @@ async def test_streaming_handles_space_correctly(llm_model): async def gpt_4o_mini_stream(*args, **kwargs): yield ModelResponseStream( - model=llm_model, choices=[StreamingChoices(delta=Delta(content="[[ ## answer ## ]]\n"))] + model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="[[ ## answer ## ]]\n"))] ) - yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="How "))]) - yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="are "))]) - yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="you "))]) - yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="doing?"))]) + yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="How "))]) + yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="are "))]) + yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="you "))]) + yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="doing?"))]) yield ModelResponseStream( - model=llm_model, choices=[StreamingChoices(delta=Delta(content="\n\n[[ ## completed ## ]]"))] + model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="\n\n[[ ## completed ## ]]"))] ) with mock.patch("litellm.acompletion", side_effect=gpt_4o_mini_stream): @@ -262,7 +261,7 @@ async def gpt_4o_mini_stream(*args, **kwargs): @pytest.mark.llm_call -def test_sync_streaming(llm_model): +def test_sync_streaming(lm_for_test): class MyProgram(dspy.Module): def __init__(self): self.predict1 = dspy.Predict("question->answer") @@ -284,7 +283,7 @@ def __call__(self, x: str, **kwargs): async_streaming=False, ) # Turn off the cache to ensure the stream is produced. 
- with dspy.context(lm=dspy.LM(llm_model, cache=False)): + with dspy.context(lm=dspy.LM(lm_for_test, cache=False)): output = program(x="why did a chicken cross the kitchen?") all_chunks = [] for value in output: diff --git a/tests/utils/test_usage_tracker.py b/tests/utils/test_usage_tracker.py index 0a6c106850..00c9e43b57 100644 --- a/tests/utils/test_usage_tracker.py +++ b/tests/utils/test_usage_tracker.py @@ -133,8 +133,8 @@ def test_track_usage_with_multiple_models(): assert total_usage["gpt-3.5-turbo"]["total_tokens"] == 900 -def test_track_usage_context_manager(llm_model): - lm = dspy.LM(llm_model, cache=False) +def test_track_usage_context_manager(lm_for_test): + lm = dspy.LM(lm_for_test, cache=False) dspy.settings.configure(lm=lm) predict = dspy.ChainOfThought("question -> answer") @@ -143,12 +143,12 @@ def test_track_usage_context_manager(llm_model): predict(question="What is the capital of Italy?") assert len(tracker.usage_data) > 0 - assert len(tracker.usage_data[llm_model]) == 2 + assert len(tracker.usage_data[lm_for_test]) == 2 total_usage = tracker.get_total_tokens() - assert llm_model in total_usage + assert lm_for_test in total_usage assert len(total_usage.keys()) == 1 - assert isinstance(total_usage[llm_model], dict) + assert isinstance(total_usage[lm_for_test], dict) def test_merge_usage_entries_with_new_keys():