diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index 6510c69abf..4003781e4d 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -82,11 +82,49 @@ jobs:
         with:
           args: check --fix-only
       - name: Run tests with pytest
-        run: uv run -p .venv pytest tests/
+        run: uv run -p .venv pytest -vv tests/
       - name: Install optional dependencies
         run: uv sync -p .venv --extra dev --extra test_extras
       - name: Run extra tests
         run: uv run -p .venv pytest tests/ -m extra --extra
+
+  llm_call_test:
+    name: Run Tests with Real LM
+    runs-on: ubuntu-latest
+    services:
+      ollama:
+        image: ollama/ollama:latest
+        ports:
+          - 11434:11434
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: 3.11
+      - name: Install uv with caching
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+          cache-dependency-glob: |
+            **/pyproject.toml
+            **/uv.lock
+      - name: Create and activate virtual environment
+        run: |
+          uv venv .venv
+          echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH
+      - name: Install dependencies
+        run: |
+          uv sync --dev -p .venv --extra dev
+          uv pip list
+      - name: Pull LLM
+        run: |
+          timeout 60 bash -c 'until curl -f http://localhost:11434/api/version; do sleep 2; done'
+          curl -X POST http://localhost:11434/api/pull \
+            -H "Content-Type: application/json" \
+            -d '{"name": "llama3.2:3b"}'
+          echo "LM_FOR_TEST=ollama/llama3.2:3b" >> $GITHUB_ENV
+      - name: Run tests
+        run: uv run -p .venv pytest -m llm_call --llm_call -vv --durations=5 tests/
 
   build_package:
     name: Build Package
diff --git a/tests/conftest.py b/tests/conftest.py
index dd57d03f19..372585df20 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,10 +1,11 @@
 import copy
+import os
 
 import pytest
 
 from tests.test_utils.server import litellm_test_server, read_litellm_test_server_request_logs  # noqa: F401
 
-SKIP_DEFAULT_FLAGS = ["reliability", "extra"]
+SKIP_DEFAULT_FLAGS = ["reliability", "extra", "llm_call"]
 
 
 @pytest.fixture(autouse=True)
@@ -49,3 +50,11 @@ def pytest_collection_modifyitems(config, items):
         for item in items:
             if flag in item.keywords:
                 item.add_marker(skip_mark)
+
+
+@pytest.fixture
+def lm_for_test():
+    model = os.environ.get("LM_FOR_TEST", None)
+    if model is None:
+        pytest.skip("LM_FOR_TEST is not set in the environment variables")
+    return model
diff --git a/tests/primitives/test_base_module.py b/tests/primitives/test_base_module.py
index ee6255bb06..5706d24846 100644
--- a/tests/primitives/test_base_module.py
+++ b/tests/primitives/test_base_module.py
@@ -230,30 +230,30 @@ def emit(self, record):
     logger.removeHandler(handler)
 
 
-@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.")
-def test_single_module_call_with_usage_tracker():
-    dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=False), track_usage=True)
+@pytest.mark.llm_call
+def test_single_module_call_with_usage_tracker(lm_for_test):
+    dspy.settings.configure(lm=dspy.LM(lm_for_test, cache=False), track_usage=True)
 
     predict = dspy.ChainOfThought("question -> answer")
     output = predict(question="What is the capital of France?")
 
     lm_usage = output.get_lm_usage()
     assert len(lm_usage) == 1
-    assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0
-    assert lm_usage["openai/gpt-4o-mini"]["completion_tokens"] > 0
-    assert lm_usage["openai/gpt-4o-mini"]["total_tokens"] > 0
+    assert lm_usage[lm_for_test]["prompt_tokens"] > 0
+    assert lm_usage[lm_for_test]["completion_tokens"] > 0
+    assert lm_usage[lm_for_test]["total_tokens"] > 0
 
     # Test no usage being tracked when cache is enabled
-    dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=True), track_usage=True)
+    dspy.settings.configure(lm=dspy.LM(lm_for_test, cache=True), track_usage=True)
     for _ in range(2):
         output = predict(question="What is the capital of France?")
 
     assert len(output.get_lm_usage()) == 0
 
 
-@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.")
-def test_multi_module_call_with_usage_tracker():
-    dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=False), track_usage=True)
+@pytest.mark.llm_call
+def test_multi_module_call_with_usage_tracker(lm_for_test):
+    dspy.settings.configure(lm=dspy.LM(lm_for_test, cache=False), track_usage=True)
 
     class MyProgram(dspy.Module):
         def __init__(self):
@@ -270,12 +270,13 @@ def __call__(self, question: str) -> str:
 
     lm_usage = output.get_lm_usage()
     assert len(lm_usage) == 1
-    assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0
-    assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0
-    assert lm_usage["openai/gpt-4o-mini"]["completion_tokens"] > 0
-    assert lm_usage["openai/gpt-4o-mini"]["total_tokens"] > 0
+    assert lm_usage[lm_for_test]["prompt_tokens"] > 0
+    assert lm_usage[lm_for_test]["prompt_tokens"] > 0
+    assert lm_usage[lm_for_test]["completion_tokens"] > 0
+    assert lm_usage[lm_for_test]["total_tokens"] > 0
 
 
+# TODO: prepare second model for testing this unit test in ci
 @pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.")
 def test_usage_tracker_in_parallel():
     class MyProgram(dspy.Module):
diff --git a/tests/streaming/test_streaming.py b/tests/streaming/test_streaming.py
index d79402d2ef..56c5426bb8 100644
--- a/tests/streaming/test_streaming.py
+++ b/tests/streaming/test_streaming.py
@@ -1,5 +1,4 @@
 import asyncio
-import os
 import time
 from unittest import mock
 from unittest.mock import AsyncMock
@@ -131,9 +130,9 @@ def module_start_status_message(self, instance, inputs):
     assert status_messages[2].message == "Predict starting!"
 
 
-@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables")
+@pytest.mark.llm_call
 @pytest.mark.anyio
-async def test_stream_listener_chat_adapter():
+async def test_stream_listener_chat_adapter(lm_for_test):
     class MyProgram(dspy.Module):
         def __init__(self):
             self.predict1 = dspy.Predict("question->answer")
@@ -154,7 +153,7 @@ def __call__(self, x: str, **kwargs):
         include_final_prediction_in_output_stream=False,
     )
     # Turn off the cache to ensure the stream is produced.
-    with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False)):
+    with dspy.context(lm=dspy.LM(lm_for_test, cache=False)):
         output = program(x="why did a chicken cross the kitchen?")
         all_chunks = []
         async for value in output:
@@ -194,9 +193,9 @@ async def acall(self, x: str):
     assert status_messages[1].message == "Tool calling finished! Querying the LLM with tool calling results..."
 
 
-@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables")
+@pytest.mark.llm_call
 @pytest.mark.anyio
-async def test_stream_listener_json_adapter():
+async def test_stream_listener_json_adapter(lm_for_test):
     class MyProgram(dspy.Module):
         def __init__(self):
             self.predict1 = dspy.Predict("question->answer")
@@ -217,7 +216,7 @@ def __call__(self, x: str, **kwargs):
         include_final_prediction_in_output_stream=False,
     )
     # Turn off the cache to ensure the stream is produced.
-    with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False), adapter=dspy.JSONAdapter()):
+    with dspy.context(lm=dspy.LM(lm_for_test, cache=False), adapter=dspy.JSONAdapter()):
         output = program(x="why did a chicken cross the kitchen?")
         all_chunks = []
         async for value in output:
@@ -261,8 +260,8 @@ async def gpt_4o_mini_stream(*args, **kwargs):
     assert all_chunks[0].chunk == "How are you doing?"
 
 
-@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables")
-def test_sync_streaming():
+@pytest.mark.llm_call
+def test_sync_streaming(lm_for_test):
     class MyProgram(dspy.Module):
         def __init__(self):
             self.predict1 = dspy.Predict("question->answer")
@@ -284,7 +283,7 @@ def __call__(self, x: str, **kwargs):
         async_streaming=False,
     )
     # Turn off the cache to ensure the stream is produced.
-    with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False)):
+    with dspy.context(lm=dspy.LM(lm_for_test, cache=False)):
         output = program(x="why did a chicken cross the kitchen?")
         all_chunks = []
         for value in output:
diff --git a/tests/utils/test_usage_tracker.py b/tests/utils/test_usage_tracker.py
index 70244b3283..00c9e43b57 100644
--- a/tests/utils/test_usage_tracker.py
+++ b/tests/utils/test_usage_tracker.py
@@ -1,7 +1,3 @@
-import os
-
-import pytest
-
 import dspy
 from dspy.utils.usage_tracker import UsageTracker, track_usage
 
@@ -137,12 +133,8 @@ def test_track_usage_with_multiple_models():
     assert total_usage["gpt-3.5-turbo"]["total_tokens"] == 900
 
 
-@pytest.mark.skipif(
-    not os.getenv("OPENAI_API_KEY"),
-    reason="Skip the test if OPENAI_API_KEY is not set.",
-)
-def test_track_usage_context_manager():
-    lm = dspy.LM("openai/gpt-4o-mini", cache=False)
+def test_track_usage_context_manager(lm_for_test):
+    lm = dspy.LM(lm_for_test, cache=False)
     dspy.settings.configure(lm=lm)
 
     predict = dspy.ChainOfThought("question -> answer")
@@ -151,12 +143,12 @@ def test_track_usage_context_manager():
         predict(question="What is the capital of Italy?")
 
     assert len(tracker.usage_data) > 0
-    assert len(tracker.usage_data["openai/gpt-4o-mini"]) == 2
+    assert len(tracker.usage_data[lm_for_test]) == 2
 
     total_usage = tracker.get_total_tokens()
-    assert "openai/gpt-4o-mini" in total_usage
+    assert lm_for_test in total_usage
     assert len(total_usage.keys()) == 1
-    assert isinstance(total_usage["openai/gpt-4o-mini"], dict)
+    assert isinstance(total_usage[lm_for_test], dict)
 
 
 def test_merge_usage_entries_with_new_keys():
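The new `llm_call` tests are doubly gated: adding `"llm_call"` to `SKIP_DEFAULT_FLAGS` keeps them skipped unless pytest is invoked with `--llm_call`, and the `lm_for_test` fixture additionally skips them when `LM_FOR_TEST` is not set. A minimal sketch of a test written against this pattern is shown below; the test name and final assertion are illustrative, while the marker, fixture, and `dspy` calls mirror the diff above.

```python
import pytest

import dspy


@pytest.mark.llm_call
def test_answers_with_real_lm(lm_for_test):
    # lm_for_test resolves LM_FOR_TEST (e.g. "ollama/llama3.2:3b" in CI)
    # or skips the test when the variable is absent.
    dspy.settings.configure(lm=dspy.LM(lm_for_test, cache=False))
    prediction = dspy.Predict("question -> answer")(question="What is the capital of France?")
    assert prediction.answer  # illustrative check: the real LM produced a non-empty answer
```

Locally such tests would be run the same way the workflow does, e.g. `LM_FOR_TEST=ollama/llama3.2:3b uv run pytest -m llm_call --llm_call tests/`.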