From 868f4ae30594c0f9f00308cf5ceea3126b8c62d8 Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Thu, 3 Jul 2025 14:31:36 +0900 Subject: [PATCH 1/8] use real LLM for unit tests --- .github/workflows/run_tests.yml | 5 +++++ tests/conftest.py | 9 +++++++++ tests/primitives/test_base_module.py | 27 ++++++++++++------------- tests/streaming/test_streaming.py | 30 ++++++++++++---------------- tests/utils/test_usage_tracker.py | 18 +++++------------ 5 files changed, 45 insertions(+), 44 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 6510c69abf..7975361e93 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -81,6 +81,11 @@ jobs: uses: chartboost/ruff-action@v1 with: args: check --fix-only + - name: Set LLM model + run: | + echo "LLM_MODEL=${{ secrets.LLM_MODEL }}" >> $GITHUB_ENV + echo "DATABRICKS_API_BASE=${{ secrets.DATABRICKS_API_BASE }}" >> $GITHUB_ENV + echo "DATABRICKS_API_KEY=${{ secrets.DATABRICKS_API_KEY }}" >> $GITHUB_ENV - name: Run tests with pytest run: uv run -p .venv pytest tests/ - name: Install optional dependencies diff --git a/tests/conftest.py b/tests/conftest.py index dd57d03f19..fe80b5ecaa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ import copy +import os import pytest @@ -49,3 +50,11 @@ def pytest_collection_modifyitems(config, items): for item in items: if flag in item.keywords: item.add_marker(skip_mark) + + +@pytest.fixture +def llm_model(): + model = os.environ.get("LLM_MODEL", None) + if model is None: + pytest.skip("LLM_MODEL is not set in the environment variables") + return model diff --git a/tests/primitives/test_base_module.py b/tests/primitives/test_base_module.py index ee6255bb06..dbe8c1a42c 100644 --- a/tests/primitives/test_base_module.py +++ b/tests/primitives/test_base_module.py @@ -230,30 +230,28 @@ def emit(self, record): logger.removeHandler(handler) -@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.") -def test_single_module_call_with_usage_tracker(): - dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=False), track_usage=True) +def test_single_module_call_with_usage_tracker(llm_model): + dspy.settings.configure(lm=dspy.LM(llm_model, cache=False), track_usage=True) predict = dspy.ChainOfThought("question -> answer") output = predict(question="What is the capital of France?") lm_usage = output.get_lm_usage() assert len(lm_usage) == 1 - assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0 - assert lm_usage["openai/gpt-4o-mini"]["completion_tokens"] > 0 - assert lm_usage["openai/gpt-4o-mini"]["total_tokens"] > 0 + assert lm_usage[llm_model]["prompt_tokens"] > 0 + assert lm_usage[llm_model]["completion_tokens"] > 0 + assert lm_usage[llm_model]["total_tokens"] > 0 # Test no usage being tracked when cache is enabled - dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=True), track_usage=True) + dspy.settings.configure(lm=dspy.LM(llm_model, cache=True), track_usage=True) for _ in range(2): output = predict(question="What is the capital of France?") assert len(output.get_lm_usage()) == 0 -@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.") -def test_multi_module_call_with_usage_tracker(): - dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=False), track_usage=True) +def test_multi_module_call_with_usage_tracker(llm_model): + dspy.settings.configure(lm=dspy.LM(llm_model, cache=False), track_usage=True) 
class MyProgram(dspy.Module): def __init__(self): @@ -270,12 +268,13 @@ def __call__(self, question: str) -> str: lm_usage = output.get_lm_usage() assert len(lm_usage) == 1 - assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0 - assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0 - assert lm_usage["openai/gpt-4o-mini"]["completion_tokens"] > 0 - assert lm_usage["openai/gpt-4o-mini"]["total_tokens"] > 0 + assert lm_usage[llm_model]["prompt_tokens"] > 0 + assert lm_usage[llm_model]["prompt_tokens"] > 0 + assert lm_usage[llm_model]["completion_tokens"] > 0 + assert lm_usage[llm_model]["total_tokens"] > 0 +# TODO: prepare second model for testing this unit test in ci @pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.") def test_usage_tracker_in_parallel(): class MyProgram(dspy.Module): diff --git a/tests/streaming/test_streaming.py b/tests/streaming/test_streaming.py index d79402d2ef..3b83d56789 100644 --- a/tests/streaming/test_streaming.py +++ b/tests/streaming/test_streaming.py @@ -1,5 +1,4 @@ import asyncio -import os import time from unittest import mock from unittest.mock import AsyncMock @@ -131,9 +130,8 @@ def module_start_status_message(self, instance, inputs): assert status_messages[2].message == "Predict starting!" -@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables") @pytest.mark.anyio -async def test_stream_listener_chat_adapter(): +async def test_stream_listener_chat_adapter(llm_model): class MyProgram(dspy.Module): def __init__(self): self.predict1 = dspy.Predict("question->answer") @@ -154,7 +152,7 @@ def __call__(self, x: str, **kwargs): include_final_prediction_in_output_stream=False, ) # Turn off the cache to ensure the stream is produced. - with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False)): + with dspy.context(lm=dspy.LM(llm_model, cache=False)): output = program(x="why did a chicken cross the kitchen?") all_chunks = [] async for value in output: @@ -194,9 +192,8 @@ async def acall(self, x: str): assert status_messages[1].message == "Tool calling finished! Querying the LLM with tool calling results..." -@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables") @pytest.mark.anyio -async def test_stream_listener_json_adapter(): +async def test_stream_listener_json_adapter(llm_model): class MyProgram(dspy.Module): def __init__(self): self.predict1 = dspy.Predict("question->answer") @@ -217,7 +214,7 @@ def __call__(self, x: str, **kwargs): include_final_prediction_in_output_stream=False, ) # Turn off the cache to ensure the stream is produced. 
- with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False), adapter=dspy.JSONAdapter()): + with dspy.context(lm=dspy.LM(llm_model, cache=False), adapter=dspy.JSONAdapter()): output = program(x="why did a chicken cross the kitchen?") all_chunks = [] async for value in output: @@ -232,7 +229,7 @@ def __call__(self, x: str, **kwargs): @pytest.mark.anyio -async def test_streaming_handles_space_correctly(): +async def test_streaming_handles_space_correctly(llm_model): my_program = dspy.Predict("question->answer") program = dspy.streamify( my_program, stream_listeners=[dspy.streaming.StreamListener(signature_field_name="answer")] @@ -240,14 +237,14 @@ async def test_streaming_handles_space_correctly(): async def gpt_4o_mini_stream(*args, **kwargs): yield ModelResponseStream( - model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="[[ ## answer ## ]]\n"))] + model=llm_model, choices=[StreamingChoices(delta=Delta(content="[[ ## answer ## ]]\n"))] ) - yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="How "))]) - yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="are "))]) - yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="you "))]) - yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="doing?"))]) + yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="How "))]) + yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="are "))]) + yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="you "))]) + yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="doing?"))]) yield ModelResponseStream( - model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="\n\n[[ ## completed ## ]]"))] + model=llm_model, choices=[StreamingChoices(delta=Delta(content="\n\n[[ ## completed ## ]]"))] ) with mock.patch("litellm.acompletion", side_effect=gpt_4o_mini_stream): @@ -261,8 +258,7 @@ async def gpt_4o_mini_stream(*args, **kwargs): assert all_chunks[0].chunk == "How are you doing?" -@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables") -def test_sync_streaming(): +def test_sync_streaming(llm_model): class MyProgram(dspy.Module): def __init__(self): self.predict1 = dspy.Predict("question->answer") @@ -284,7 +280,7 @@ def __call__(self, x: str, **kwargs): async_streaming=False, ) # Turn off the cache to ensure the stream is produced. 
- with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False)): + with dspy.context(lm=dspy.LM(llm_model, cache=False)): output = program(x="why did a chicken cross the kitchen?") all_chunks = [] for value in output: diff --git a/tests/utils/test_usage_tracker.py b/tests/utils/test_usage_tracker.py index 70244b3283..0a6c106850 100644 --- a/tests/utils/test_usage_tracker.py +++ b/tests/utils/test_usage_tracker.py @@ -1,7 +1,3 @@ -import os - -import pytest - import dspy from dspy.utils.usage_tracker import UsageTracker, track_usage @@ -137,12 +133,8 @@ def test_track_usage_with_multiple_models(): assert total_usage["gpt-3.5-turbo"]["total_tokens"] == 900 -@pytest.mark.skipif( - not os.getenv("OPENAI_API_KEY"), - reason="Skip the test if OPENAI_API_KEY is not set.", -) -def test_track_usage_context_manager(): - lm = dspy.LM("openai/gpt-4o-mini", cache=False) +def test_track_usage_context_manager(llm_model): + lm = dspy.LM(llm_model, cache=False) dspy.settings.configure(lm=lm) predict = dspy.ChainOfThought("question -> answer") @@ -151,12 +143,12 @@ def test_track_usage_context_manager(): predict(question="What is the capital of Italy?") assert len(tracker.usage_data) > 0 - assert len(tracker.usage_data["openai/gpt-4o-mini"]) == 2 + assert len(tracker.usage_data[llm_model]) == 2 total_usage = tracker.get_total_tokens() - assert "openai/gpt-4o-mini" in total_usage + assert llm_model in total_usage assert len(total_usage.keys()) == 1 - assert isinstance(total_usage["openai/gpt-4o-mini"], dict) + assert isinstance(total_usage[llm_model], dict) def test_merge_usage_entries_with_new_keys(): From 0cd6132375c0a917deccb768006d6ab8d8e6c216 Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Thu, 3 Jul 2025 15:20:18 +0900 Subject: [PATCH 2/8] use ollama --- .github/workflows/run_tests.yml | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 7975361e93..caa59e9d73 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -48,6 +48,11 @@ jobs: test: name: Run Tests runs-on: ubuntu-latest + services: + ollama: + image: ollama/ollama:latest + ports: + - 11434:11434 strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13"] @@ -81,11 +86,13 @@ jobs: uses: chartboost/ruff-action@v1 with: args: check --fix-only - - name: Set LLM model + - name: Pull LLM run: | - echo "LLM_MODEL=${{ secrets.LLM_MODEL }}" >> $GITHUB_ENV - echo "DATABRICKS_API_BASE=${{ secrets.DATABRICKS_API_BASE }}" >> $GITHUB_ENV - echo "DATABRICKS_API_KEY=${{ secrets.DATABRICKS_API_KEY }}" >> $GITHUB_ENV + timeout 60 bash -c 'until curl -f http://localhost:11434/api/version; do sleep 2; done' + curl -X POST http://localhost:11434/api/pull \ + -H "Content-Type: application/json" \ + -d '{"name": "llama3.2:1b"}' + echo "LLM_MODEL=ollama/llama3.2:1b" >> $GITHUB_ENV - name: Run tests with pytest run: uv run -p .venv pytest tests/ - name: Install optional dependencies From d21eba68be744ed5cafff356319ec139642041a8 Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Thu, 3 Jul 2025 15:34:18 +0900 Subject: [PATCH 3/8] use Llama 3.2 3b --- .github/workflows/run_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index caa59e9d73..84fc4387ea 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -91,8 +91,8 @@ jobs: timeout 60 bash -c 'until curl -f http://localhost:11434/api/version; do sleep 2; 
done' curl -X POST http://localhost:11434/api/pull \ -H "Content-Type: application/json" \ - -d '{"name": "llama3.2:1b"}' - echo "LLM_MODEL=ollama/llama3.2:1b" >> $GITHUB_ENV + -d '{"name": "llama3.2:3b"}' + echo "LLM_MODEL=ollama/llama3.2:3b" >> $GITHUB_ENV - name: Run tests with pytest run: uv run -p .venv pytest tests/ - name: Install optional dependencies From 5e9a6a64deab3e24f8c3fd9e763c68031f6144d6 Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Fri, 4 Jul 2025 09:52:34 +0900 Subject: [PATCH 4/8] add verbose option --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 84fc4387ea..9bae49403e 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -94,7 +94,7 @@ jobs: -d '{"name": "llama3.2:3b"}' echo "LLM_MODEL=ollama/llama3.2:3b" >> $GITHUB_ENV - name: Run tests with pytest - run: uv run -p .venv pytest tests/ + run: uv run -p .venv pytest -vv --durations=3 tests/ - name: Install optional dependencies run: uv sync -p .venv --extra dev --extra test_extras - name: Run extra tests From cecf88af5560591c8d45e302d38d531f8f595f53 Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Fri, 4 Jul 2025 10:19:06 +0900 Subject: [PATCH 5/8] split test into a separate job --- .github/workflows/run_tests.yml | 45 ++++++++++++++++++++++++---- tests/conftest.py | 2 +- tests/primitives/test_base_module.py | 2 ++ tests/streaming/test_streaming.py | 4 +++ 4 files changed, 46 insertions(+), 7 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 9bae49403e..23da732213 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -48,11 +48,6 @@ jobs: test: name: Run Tests runs-on: ubuntu-latest - services: - ollama: - image: ollama/ollama:latest - ports: - - 11434:11434 strategy: matrix: python-version: ["3.10", "3.11", "3.12", "3.13"] @@ -94,11 +89,49 @@ jobs: -d '{"name": "llama3.2:3b"}' echo "LLM_MODEL=ollama/llama3.2:3b" >> $GITHUB_ENV - name: Run tests with pytest - run: uv run -p .venv pytest -vv --durations=3 tests/ + run: uv run -p .venv pytest -vv tests/ - name: Install optional dependencies run: uv sync -p .venv --extra dev --extra test_extras - name: Run extra tests run: uv run -p .venv pytest tests/ -m extra --extra + + llm_call_test: + name: Run Tests with Real LM + runs-on: ubuntu-latest + services: + ollama: + image: ollama/ollama:latest + ports: + - 11434:11434 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.11 + - name: Install uv with caching + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + cache-dependency-glob: | + **/pyproject.toml + **/uv.lock + - name: Create and activate virtual environment + run: | + uv venv .venv + echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH + - name: Install dependencies + run: | + uv sync --dev -p .venv --extra dev + uv pip list + - name: Pull LLM + run: | + timeout 60 bash -c 'until curl -f http://localhost:11434/api/version; do sleep 2; done' + curl -X POST http://localhost:11434/api/pull \ + -H "Content-Type: application/json" \ + -d '{"name": "llama3.2:3b"}' + echo "LLM_MODEL=ollama/llama3.2:3b" >> $GITHUB_ENV + - name: Run tests + run: uv run -p .venv pytest -m llm_call --llm_call -vv --duration=5 tests/ build_package: name: Build Package diff --git a/tests/conftest.py b/tests/conftest.py index fe80b5ecaa..1a685f5757 100644 --- a/tests/conftest.py +++ b/tests/conftest.py 
@@ -5,7 +5,7 @@ from tests.test_utils.server import litellm_test_server, read_litellm_test_server_request_logs # noqa: F401 -SKIP_DEFAULT_FLAGS = ["reliability", "extra"] +SKIP_DEFAULT_FLAGS = ["reliability", "extra", "llm_call"] @pytest.fixture(autouse=True) diff --git a/tests/primitives/test_base_module.py b/tests/primitives/test_base_module.py index dbe8c1a42c..65190ee996 100644 --- a/tests/primitives/test_base_module.py +++ b/tests/primitives/test_base_module.py @@ -230,6 +230,7 @@ def emit(self, record): logger.removeHandler(handler) +@pytest.mark.llm_call def test_single_module_call_with_usage_tracker(llm_model): dspy.settings.configure(lm=dspy.LM(llm_model, cache=False), track_usage=True) @@ -250,6 +251,7 @@ def test_single_module_call_with_usage_tracker(llm_model): assert len(output.get_lm_usage()) == 0 +@pytest.mark.llm_call def test_multi_module_call_with_usage_tracker(llm_model): dspy.settings.configure(lm=dspy.LM(llm_model, cache=False), track_usage=True) diff --git a/tests/streaming/test_streaming.py b/tests/streaming/test_streaming.py index 3b83d56789..6aa0dcf384 100644 --- a/tests/streaming/test_streaming.py +++ b/tests/streaming/test_streaming.py @@ -130,6 +130,7 @@ def module_start_status_message(self, instance, inputs): assert status_messages[2].message == "Predict starting!" +@pytest.mark.llm_call @pytest.mark.anyio async def test_stream_listener_chat_adapter(llm_model): class MyProgram(dspy.Module): @@ -192,6 +193,7 @@ async def acall(self, x: str): assert status_messages[1].message == "Tool calling finished! Querying the LLM with tool calling results..." +@pytest.mark.llm_call @pytest.mark.anyio async def test_stream_listener_json_adapter(llm_model): class MyProgram(dspy.Module): @@ -228,6 +230,7 @@ def __call__(self, x: str, **kwargs): assert all_chunks[-1].signature_field_name == "judgement" +@pytest.mark.llm_call @pytest.mark.anyio async def test_streaming_handles_space_correctly(llm_model): my_program = dspy.Predict("question->answer") @@ -258,6 +261,7 @@ async def gpt_4o_mini_stream(*args, **kwargs): assert all_chunks[0].chunk == "How are you doing?" 
+@pytest.mark.llm_call def test_sync_streaming(llm_model): class MyProgram(dspy.Module): def __init__(self): From 061c58e96c05c18c400e56f045bc58fbc0d0e8f1 Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Fri, 4 Jul 2025 10:21:51 +0900 Subject: [PATCH 6/8] remove LLM pulling --- .github/workflows/run_tests.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 23da732213..d22216cc62 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -81,13 +81,6 @@ jobs: uses: chartboost/ruff-action@v1 with: args: check --fix-only - - name: Pull LLM - run: | - timeout 60 bash -c 'until curl -f http://localhost:11434/api/version; do sleep 2; done' - curl -X POST http://localhost:11434/api/pull \ - -H "Content-Type: application/json" \ - -d '{"name": "llama3.2:3b"}' - echo "LLM_MODEL=ollama/llama3.2:3b" >> $GITHUB_ENV - name: Run tests with pytest run: uv run -p .venv pytest -vv tests/ - name: Install optional dependencies From f17ef5358a892b71ce36cd5026f170bb9444faa9 Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Fri, 4 Jul 2025 10:25:06 +0900 Subject: [PATCH 7/8] fix option name --- .github/workflows/run_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index d22216cc62..1a64382b75 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -124,7 +124,7 @@ jobs: -d '{"name": "llama3.2:3b"}' echo "LLM_MODEL=ollama/llama3.2:3b" >> $GITHUB_ENV - name: Run tests - run: uv run -p .venv pytest -m llm_call --llm_call -vv --duration=5 tests/ + run: uv run -p .venv pytest -m llm_call --llm_call -vv --durations=5 tests/ build_package: name: Build Package From db6552e7a01ce60d23348f218dc8ca09d7d9f751 Mon Sep 17 00:00:00 2001 From: TomuHirata Date: Tue, 8 Jul 2025 08:46:19 +0900 Subject: [PATCH 8/8] rename env var --- .github/workflows/run_tests.yml | 2 +- tests/conftest.py | 6 +++--- tests/primitives/test_base_module.py | 24 ++++++++++++------------ tests/streaming/test_streaming.py | 27 +++++++++++++-------------- tests/utils/test_usage_tracker.py | 10 +++++----- 5 files changed, 34 insertions(+), 35 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 1a64382b75..4003781e4d 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -122,7 +122,7 @@ jobs: curl -X POST http://localhost:11434/api/pull \ -H "Content-Type: application/json" \ -d '{"name": "llama3.2:3b"}' - echo "LLM_MODEL=ollama/llama3.2:3b" >> $GITHUB_ENV + echo "LM_FOR_TEST=ollama/llama3.2:3b" >> $GITHUB_ENV - name: Run tests run: uv run -p .venv pytest -m llm_call --llm_call -vv --durations=5 tests/ diff --git a/tests/conftest.py b/tests/conftest.py index 1a685f5757..372585df20 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -53,8 +53,8 @@ def pytest_collection_modifyitems(config, items): @pytest.fixture -def llm_model(): - model = os.environ.get("LLM_MODEL", None) +def lm_for_test(): + model = os.environ.get("LM_FOR_TEST", None) if model is None: - pytest.skip("LLM_MODEL is not set in the environment variables") + pytest.skip("LM_FOR_TEST is not set in the environment variables") return model diff --git a/tests/primitives/test_base_module.py b/tests/primitives/test_base_module.py index 65190ee996..5706d24846 100644 --- a/tests/primitives/test_base_module.py +++ b/tests/primitives/test_base_module.py @@ -231,20 +231,20 @@ def emit(self, 
record): @pytest.mark.llm_call -def test_single_module_call_with_usage_tracker(llm_model): - dspy.settings.configure(lm=dspy.LM(llm_model, cache=False), track_usage=True) +def test_single_module_call_with_usage_tracker(lm_for_test): + dspy.settings.configure(lm=dspy.LM(lm_for_test, cache=False), track_usage=True) predict = dspy.ChainOfThought("question -> answer") output = predict(question="What is the capital of France?") lm_usage = output.get_lm_usage() assert len(lm_usage) == 1 - assert lm_usage[llm_model]["prompt_tokens"] > 0 - assert lm_usage[llm_model]["completion_tokens"] > 0 - assert lm_usage[llm_model]["total_tokens"] > 0 + assert lm_usage[lm_for_test]["prompt_tokens"] > 0 + assert lm_usage[lm_for_test]["completion_tokens"] > 0 + assert lm_usage[lm_for_test]["total_tokens"] > 0 # Test no usage being tracked when cache is enabled - dspy.settings.configure(lm=dspy.LM(llm_model, cache=True), track_usage=True) + dspy.settings.configure(lm=dspy.LM(lm_for_test, cache=True), track_usage=True) for _ in range(2): output = predict(question="What is the capital of France?") @@ -252,8 +252,8 @@ def test_single_module_call_with_usage_tracker(llm_model): @pytest.mark.llm_call -def test_multi_module_call_with_usage_tracker(llm_model): - dspy.settings.configure(lm=dspy.LM(llm_model, cache=False), track_usage=True) +def test_multi_module_call_with_usage_tracker(lm_for_test): + dspy.settings.configure(lm=dspy.LM(lm_for_test, cache=False), track_usage=True) class MyProgram(dspy.Module): def __init__(self): @@ -270,10 +270,10 @@ def __call__(self, question: str) -> str: lm_usage = output.get_lm_usage() assert len(lm_usage) == 1 - assert lm_usage[llm_model]["prompt_tokens"] > 0 - assert lm_usage[llm_model]["prompt_tokens"] > 0 - assert lm_usage[llm_model]["completion_tokens"] > 0 - assert lm_usage[llm_model]["total_tokens"] > 0 + assert lm_usage[lm_for_test]["prompt_tokens"] > 0 + assert lm_usage[lm_for_test]["prompt_tokens"] > 0 + assert lm_usage[lm_for_test]["completion_tokens"] > 0 + assert lm_usage[lm_for_test]["total_tokens"] > 0 # TODO: prepare second model for testing this unit test in ci diff --git a/tests/streaming/test_streaming.py b/tests/streaming/test_streaming.py index 6aa0dcf384..56c5426bb8 100644 --- a/tests/streaming/test_streaming.py +++ b/tests/streaming/test_streaming.py @@ -132,7 +132,7 @@ def module_start_status_message(self, instance, inputs): @pytest.mark.llm_call @pytest.mark.anyio -async def test_stream_listener_chat_adapter(llm_model): +async def test_stream_listener_chat_adapter(lm_for_test): class MyProgram(dspy.Module): def __init__(self): self.predict1 = dspy.Predict("question->answer") @@ -153,7 +153,7 @@ def __call__(self, x: str, **kwargs): include_final_prediction_in_output_stream=False, ) # Turn off the cache to ensure the stream is produced. - with dspy.context(lm=dspy.LM(llm_model, cache=False)): + with dspy.context(lm=dspy.LM(lm_for_test, cache=False)): output = program(x="why did a chicken cross the kitchen?") all_chunks = [] async for value in output: @@ -195,7 +195,7 @@ async def acall(self, x: str): @pytest.mark.llm_call @pytest.mark.anyio -async def test_stream_listener_json_adapter(llm_model): +async def test_stream_listener_json_adapter(lm_for_test): class MyProgram(dspy.Module): def __init__(self): self.predict1 = dspy.Predict("question->answer") @@ -216,7 +216,7 @@ def __call__(self, x: str, **kwargs): include_final_prediction_in_output_stream=False, ) # Turn off the cache to ensure the stream is produced. 
- with dspy.context(lm=dspy.LM(llm_model, cache=False), adapter=dspy.JSONAdapter()): + with dspy.context(lm=dspy.LM(lm_for_test, cache=False), adapter=dspy.JSONAdapter()): output = program(x="why did a chicken cross the kitchen?") all_chunks = [] async for value in output: @@ -230,9 +230,8 @@ def __call__(self, x: str, **kwargs): assert all_chunks[-1].signature_field_name == "judgement" -@pytest.mark.llm_call @pytest.mark.anyio -async def test_streaming_handles_space_correctly(llm_model): +async def test_streaming_handles_space_correctly(): my_program = dspy.Predict("question->answer") program = dspy.streamify( my_program, stream_listeners=[dspy.streaming.StreamListener(signature_field_name="answer")] @@ -240,14 +239,14 @@ async def test_streaming_handles_space_correctly(llm_model): async def gpt_4o_mini_stream(*args, **kwargs): yield ModelResponseStream( - model=llm_model, choices=[StreamingChoices(delta=Delta(content="[[ ## answer ## ]]\n"))] + model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="[[ ## answer ## ]]\n"))] ) - yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="How "))]) - yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="are "))]) - yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="you "))]) - yield ModelResponseStream(model=llm_model, choices=[StreamingChoices(delta=Delta(content="doing?"))]) + yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="How "))]) + yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="are "))]) + yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="you "))]) + yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="doing?"))]) yield ModelResponseStream( - model=llm_model, choices=[StreamingChoices(delta=Delta(content="\n\n[[ ## completed ## ]]"))] + model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="\n\n[[ ## completed ## ]]"))] ) with mock.patch("litellm.acompletion", side_effect=gpt_4o_mini_stream): @@ -262,7 +261,7 @@ async def gpt_4o_mini_stream(*args, **kwargs): @pytest.mark.llm_call -def test_sync_streaming(llm_model): +def test_sync_streaming(lm_for_test): class MyProgram(dspy.Module): def __init__(self): self.predict1 = dspy.Predict("question->answer") @@ -284,7 +283,7 @@ def __call__(self, x: str, **kwargs): async_streaming=False, ) # Turn off the cache to ensure the stream is produced. 
- with dspy.context(lm=dspy.LM(llm_model, cache=False)): + with dspy.context(lm=dspy.LM(lm_for_test, cache=False)): output = program(x="why did a chicken cross the kitchen?") all_chunks = [] for value in output: diff --git a/tests/utils/test_usage_tracker.py b/tests/utils/test_usage_tracker.py index 0a6c106850..00c9e43b57 100644 --- a/tests/utils/test_usage_tracker.py +++ b/tests/utils/test_usage_tracker.py @@ -133,8 +133,8 @@ def test_track_usage_with_multiple_models(): assert total_usage["gpt-3.5-turbo"]["total_tokens"] == 900 -def test_track_usage_context_manager(llm_model): - lm = dspy.LM(llm_model, cache=False) +def test_track_usage_context_manager(lm_for_test): + lm = dspy.LM(lm_for_test, cache=False) dspy.settings.configure(lm=lm) predict = dspy.ChainOfThought("question -> answer") @@ -143,12 +143,12 @@ def test_track_usage_context_manager(llm_model): predict(question="What is the capital of Italy?") assert len(tracker.usage_data) > 0 - assert len(tracker.usage_data[llm_model]) == 2 + assert len(tracker.usage_data[lm_for_test]) == 2 total_usage = tracker.get_total_tokens() - assert llm_model in total_usage + assert lm_for_test in total_usage assert len(total_usage.keys()) == 1 - assert isinstance(total_usage[llm_model], dict) + assert isinstance(total_usage[lm_for_test], dict) def test_merge_usage_entries_with_new_keys():