40 changes: 39 additions & 1 deletion .github/workflows/run_tests.yml
@@ -82,11 +82,49 @@ jobs:
with:
args: check --fix-only
- name: Run tests with pytest
run: uv run -p .venv pytest tests/
run: uv run -p .venv pytest -vv tests/
- name: Install optional dependencies
run: uv sync -p .venv --extra dev --extra test_extras
- name: Run extra tests
run: uv run -p .venv pytest tests/ -m extra --extra

llm_call_test:
name: Run Tests with Real LM
runs-on: ubuntu-latest
services:
ollama:
image: ollama/ollama:latest
ports:
- 11434:11434
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.11
- name: Install uv with caching
uses: astral-sh/setup-uv@v5
with:
enable-cache: true
cache-dependency-glob: |
**/pyproject.toml
**/uv.lock
- name: Create and activate virtual environment
run: |
uv venv .venv
echo "${{ github.workspace }}/.venv/bin" >> $GITHUB_PATH
- name: Install dependencies
run: |
uv sync --dev -p .venv --extra dev
uv pip list
- name: Pull LLM
run: |
timeout 60 bash -c 'until curl -f http://localhost:11434/api/version; do sleep 2; done'
curl -X POST http://localhost:11434/api/pull \
-H "Content-Type: application/json" \
-d '{"name": "llama3.2:3b"}'
echo "LM_FOR_TEST=ollama/llama3.2:3b" >> $GITHUB_ENV
- name: Run tests
run: uv run -p .venv pytest -m llm_call --llm_call -vv --durations=5 tests/

build_package:
name: Build Package
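For local runs outside CI, the "Pull LLM" step the workflow performs with curl can be approximated in Python. This is only a sketch: it assumes a local Ollama server on the default port and the requests package, neither of which is part of this PR.

# Rough local equivalent of the "Pull LLM" workflow step.
import os
import time

import requests

# Wait up to ~60 seconds for the Ollama server, mirroring the curl retry loop.
for _ in range(30):
    try:
        requests.get("http://localhost:11434/api/version", timeout=2).raise_for_status()
        break
    except requests.RequestException:
        time.sleep(2)
else:
    raise RuntimeError("Ollama server did not become ready")

# Pull the same model the workflow pulls.
requests.post(
    "http://localhost:11434/api/pull",
    json={"name": "llama3.2:3b"},
    timeout=600,
).raise_for_status()

# Point the test suite at it, as the workflow does via GITHUB_ENV.
os.environ["LM_FOR_TEST"] = "ollama/llama3.2:3b"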
11 changes: 10 additions & 1 deletion tests/conftest.py
@@ -1,10 +1,11 @@
import copy
import os

import pytest

from tests.test_utils.server import litellm_test_server, read_litellm_test_server_request_logs # noqa: F401

SKIP_DEFAULT_FLAGS = ["reliability", "extra"]
SKIP_DEFAULT_FLAGS = ["reliability", "extra", "llm_call"]


@pytest.fixture(autouse=True)
@@ -49,3 +50,11 @@ def pytest_collection_modifyitems(config, items):
for item in items:
if flag in item.keywords:
item.add_marker(skip_mark)


@pytest.fixture
def lm_for_test():
model = os.environ.get("LM_FOR_TEST", None)
if model is None:
pytest.skip("LM_FOR_TEST is not set in the environment variables")
return model
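
Note that this hunk only extends SKIP_DEFAULT_FLAGS and adds the lm_for_test fixture; the registration of the matching --llm_call command-line option lives in conftest code not shown in this diff. A hypothetical sketch of that wiring, consistent with the pytest_collection_modifyitems logic visible above (names and defaults are assumptions, not the repository's exact code):

# Hypothetical sketch of how a marker/flag pair such as "llm_call" is
# typically wired in conftest.py; the real implementation is not shown here.
import pytest

SKIP_DEFAULT_FLAGS = ["reliability", "extra", "llm_call"]


def pytest_addoption(parser):
    for flag in SKIP_DEFAULT_FLAGS:
        parser.addoption(
            f"--{flag}",
            action="store_true",
            default=False,
            help=f"run tests marked with @pytest.mark.{flag}",
        )


def pytest_collection_modifyitems(config, items):
    for flag in SKIP_DEFAULT_FLAGS:
        if config.getoption(f"--{flag}"):
            continue  # flag was passed on the command line, so do not skip
        skip_mark = pytest.mark.skip(reason=f"need --{flag} option to run")
        for item in items:
            if flag in item.keywords:
                item.add_marker(skip_mark)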
29 changes: 15 additions & 14 deletions tests/primitives/test_base_module.py
@@ -230,30 +230,30 @@ def emit(self, record):
logger.removeHandler(handler)


@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.")
def test_single_module_call_with_usage_tracker():
dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=False), track_usage=True)
@pytest.mark.llm_call
def test_single_module_call_with_usage_tracker(lm_for_test):
dspy.settings.configure(lm=dspy.LM(lm_for_test, cache=False), track_usage=True)

predict = dspy.ChainOfThought("question -> answer")
output = predict(question="What is the capital of France?")

lm_usage = output.get_lm_usage()
assert len(lm_usage) == 1
assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0
assert lm_usage["openai/gpt-4o-mini"]["completion_tokens"] > 0
assert lm_usage["openai/gpt-4o-mini"]["total_tokens"] > 0
assert lm_usage[lm_for_test]["prompt_tokens"] > 0
assert lm_usage[lm_for_test]["completion_tokens"] > 0
assert lm_usage[lm_for_test]["total_tokens"] > 0

# Test no usage being tracked when cache is enabled
dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=True), track_usage=True)
dspy.settings.configure(lm=dspy.LM(lm_for_test, cache=True), track_usage=True)
for _ in range(2):
output = predict(question="What is the capital of France?")

assert len(output.get_lm_usage()) == 0


@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.")
def test_multi_module_call_with_usage_tracker():
dspy.settings.configure(lm=dspy.LM("openai/gpt-4o-mini", cache=False), track_usage=True)
@pytest.mark.llm_call
def test_multi_module_call_with_usage_tracker(lm_for_test):
dspy.settings.configure(lm=dspy.LM(lm_for_test, cache=False), track_usage=True)

class MyProgram(dspy.Module):
def __init__(self):
@@ -270,12 +270,13 @@ def __call__(self, question: str) -> str:

lm_usage = output.get_lm_usage()
assert len(lm_usage) == 1
assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0
assert lm_usage["openai/gpt-4o-mini"]["prompt_tokens"] > 0
assert lm_usage["openai/gpt-4o-mini"]["completion_tokens"] > 0
assert lm_usage["openai/gpt-4o-mini"]["total_tokens"] > 0
assert lm_usage[lm_for_test]["prompt_tokens"] > 0
assert lm_usage[lm_for_test]["prompt_tokens"] > 0
assert lm_usage[lm_for_test]["completion_tokens"] > 0
assert lm_usage[lm_for_test]["total_tokens"] > 0


# TODO: prepare second model for testing this unit test in ci
@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Skip the test if OPENAI_API_KEY is not set.")
def test_usage_tracker_in_parallel():
class MyProgram(dspy.Module):
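For context, swapping the hard-coded "openai/gpt-4o-mini" key for the lm_for_test fixture works because get_lm_usage() returns a dict keyed by the model string passed to dspy.LM. An illustrative shape, with invented token counts and the model name the workflow exports as LM_FOR_TEST:

# Illustrative shape of output.get_lm_usage() when track_usage=True
# (token counts below are made up; extra keys may also be present).
lm_usage = {
    "ollama/llama3.2:3b": {  # whatever LM_FOR_TEST resolves to
        "prompt_tokens": 123,
        "completion_tokens": 45,
        "total_tokens": 168,
    }
}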
19 changes: 9 additions & 10 deletions tests/streaming/test_streaming.py
@@ -1,5 +1,4 @@
import asyncio
import os
import time
from unittest import mock
from unittest.mock import AsyncMock
@@ -131,9 +130,9 @@ def module_start_status_message(self, instance, inputs):
assert status_messages[2].message == "Predict starting!"


@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables")
@pytest.mark.llm_call
@pytest.mark.anyio
async def test_stream_listener_chat_adapter():
async def test_stream_listener_chat_adapter(lm_for_test):
class MyProgram(dspy.Module):
def __init__(self):
self.predict1 = dspy.Predict("question->answer")
@@ -154,7 +153,7 @@ def __call__(self, x: str, **kwargs):
include_final_prediction_in_output_stream=False,
)
# Turn off the cache to ensure the stream is produced.
with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False)):
with dspy.context(lm=dspy.LM(lm_for_test, cache=False)):
output = program(x="why did a chicken cross the kitchen?")
all_chunks = []
async for value in output:
@@ -194,9 +193,9 @@ async def acall(self, x: str):
assert status_messages[1].message == "Tool calling finished! Querying the LLM with tool calling results..."


@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables")
@pytest.mark.llm_call
@pytest.mark.anyio
async def test_stream_listener_json_adapter():
async def test_stream_listener_json_adapter(lm_for_test):
class MyProgram(dspy.Module):
def __init__(self):
self.predict1 = dspy.Predict("question->answer")
@@ -217,7 +216,7 @@ def __call__(self, x: str, **kwargs):
include_final_prediction_in_output_stream=False,
)
# Turn off the cache to ensure the stream is produced.
with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False), adapter=dspy.JSONAdapter()):
with dspy.context(lm=dspy.LM(lm_for_test, cache=False), adapter=dspy.JSONAdapter()):
output = program(x="why did a chicken cross the kitchen?")
all_chunks = []
async for value in output:
@@ -261,8 +260,8 @@ async def gpt_4o_mini_stream(*args, **kwargs):
assert all_chunks[0].chunk == "How are you doing?"


@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OpenAI API key not found in environment variables")
def test_sync_streaming():
@pytest.mark.llm_call
def test_sync_streaming(lm_for_test):
class MyProgram(dspy.Module):
def __init__(self):
self.predict1 = dspy.Predict("question->answer")
@@ -284,7 +283,7 @@ def __call__(self, x: str, **kwargs):
async_streaming=False,
)
# Turn off the cache to ensure the stream is produced.
with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False)):
with dspy.context(lm=dspy.LM(lm_for_test, cache=False)):
output = program(x="why did a chicken cross the kitchen?")
all_chunks = []
for value in output:
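The streamify setup these tests build is mostly elided by the diff. A minimal synchronous sketch of the same pattern, assuming dspy's public streamify/StreamListener API and an illustrative Ollama model name (the tests' exact arguments may differ):

# Minimal sketch of a synchronous streaming run like the ones exercised above.
import dspy

predict = dspy.Predict("question -> answer")
program = dspy.streamify(
    predict,
    stream_listeners=[dspy.streaming.StreamListener(signature_field_name="answer")],
    include_final_prediction_in_output_stream=False,
    async_streaming=False,
)

# Turn off the cache so a stream is actually produced, as the tests do.
with dspy.context(lm=dspy.LM("ollama/llama3.2:3b", cache=False)):
    for chunk in program(question="why did a chicken cross the kitchen?"):
        print(chunk)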
18 changes: 5 additions & 13 deletions tests/utils/test_usage_tracker.py
@@ -1,7 +1,3 @@
import os

import pytest

import dspy
from dspy.utils.usage_tracker import UsageTracker, track_usage

@@ -137,12 +133,8 @@ def test_track_usage_with_multiple_models():
assert total_usage["gpt-3.5-turbo"]["total_tokens"] == 900


@pytest.mark.skipif(
not os.getenv("OPENAI_API_KEY"),
reason="Skip the test if OPENAI_API_KEY is not set.",
)
def test_track_usage_context_manager():
lm = dspy.LM("openai/gpt-4o-mini", cache=False)
def test_track_usage_context_manager(lm_for_test):
lm = dspy.LM(lm_for_test, cache=False)
dspy.settings.configure(lm=lm)

predict = dspy.ChainOfThought("question -> answer")
@@ -151,12 +143,12 @@ def test_track_usage_context_manager():
predict(question="What is the capital of Italy?")

assert len(tracker.usage_data) > 0
assert len(tracker.usage_data["openai/gpt-4o-mini"]) == 2
assert len(tracker.usage_data[lm_for_test]) == 2

total_usage = tracker.get_total_tokens()
assert "openai/gpt-4o-mini" in total_usage
assert lm_for_test in total_usage
assert len(total_usage.keys()) == 1
assert isinstance(total_usage["openai/gpt-4o-mini"], dict)
assert isinstance(total_usage[lm_for_test], dict)


def test_merge_usage_entries_with_new_keys():
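The context-manager path covered by test_track_usage_context_manager can be exercised outside pytest roughly as follows; the sketch assumes the track_usage helper imported above, and the model name is only illustrative of an LM_FOR_TEST value.

# Rough standalone version of the flow the test asserts on.
import dspy
from dspy.utils.usage_tracker import track_usage

dspy.settings.configure(lm=dspy.LM("ollama/llama3.2:3b", cache=False))
predict = dspy.ChainOfThought("question -> answer")

with track_usage() as tracker:
    predict(question="What is the capital of France?")
    predict(question="What is the capital of Italy?")

# tracker.usage_data is keyed by model name; two calls yield two entries.
print(tracker.get_total_tokens())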