From fd8a72d8422c34133901d2dedf54cf9dc26cfbd3 Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Tue, 20 May 2025 13:16:53 -0700 Subject: [PATCH 01/12] added test utils --- src/mcp_foundry/mcp_foundry_model/tools.py | 1 + tests/test_mcp.py | 12 ++---- tests/utils.py | 50 ++++++++++++++++++++++ 3 files changed, 55 insertions(+), 8 deletions(-) create mode 100644 tests/utils.py diff --git a/src/mcp_foundry/mcp_foundry_model/tools.py b/src/mcp_foundry/mcp_foundry_model/tools.py index f300d2f..4a08103 100644 --- a/src/mcp_foundry/mcp_foundry_model/tools.py +++ b/src/mcp_foundry/mcp_foundry_model/tools.py @@ -35,6 +35,7 @@ ) logger = logging.getLogger("mcp_foundry_model") + @mcp.tool() async def list_models_from_model_catalog(ctx: Context, search_for_free_playground: bool = False, publisher_name = "", license_name = "") -> str: """ diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 4c27b40..7318302 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -1,13 +1,16 @@ +from typing import Any +from mcp.types import ListToolsResult import pytest from mcp import ClientSession, StdioServerParameters from mcp.client.stdio import stdio_client + @pytest.mark.integration @pytest.mark.asyncio async def test_mcp_client_lists_tools(): server_params = StdioServerParameters( command="pipx", - args=["run", "--no-cache", "--spec", "..", "run-azure-foundry-mcp"], + args=["run", "--no-cache", "--spec", "..", "run-azure-ai-foundry-mcp"], ) async with stdio_client(server_params) as (stdio, write): @@ -16,10 +19,3 @@ async def test_mcp_client_lists_tools(): response = await session.list_tools() tools = response.tools assert tools, "Expected at least one tool from the MCP server" - - -#TODO: Add tools that take prompts and test that the correct tool(s) are selected -#TODO: Find way to only create client once per test module or make it faster -#TODO: Add LLM to client -##TODO: Make LLM easily configurable -##TODO: Make it so we can test against multiple LLMs diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..af337a0 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,50 @@ +""" +Utilities for helping run tests. +""" + +from openai import AzureOpenAI +from openai.types.chat import ChatCompletion +from mcp.types import ListToolsResult + + +def construct_openai_tools_from_mcp_tools(mcp_tools: ListToolsResult) -> list[dict]: + """ + Given a tools list from MCP server, convert it to the format required to feed into Azure OpenAI chat completion. 
+ """ + final_tools = [ + { + "type": "function", + "function": { + "name": tool.name, + "description": tool.description, + "parameters": tool.inputSchema, + }, + } + for tool in mcp_tools.tools + ] + return final_tools + + +def invoke_llm_with_tools( + user_message: str, + tools: list[dict], + aoai_client: AzureOpenAI, + model: str, +) -> ChatCompletion: + """ + Invoke a single LLM inference step on a user message, including the specified tools, and return the response + """ + messages = [ + dict( + role="user", + message=user_message, + ) + ] + + completion = aoai_client.chat.completions.create( + model=model, + messages=messages, + tools=tools, + tool_choice="auto", + ) + return completion From 16f7e719f6ccbd4efa25f34bfe33cb82d0c33fc7 Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Wed, 21 May 2025 13:57:45 -0700 Subject: [PATCH 02/12] working test of tool call --- tests/test_mcp.py | 51 +++++++++++++++++++++++++++++++++++++++++++++-- tests/utils.py | 2 +- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 7318302..95e6f68 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -1,8 +1,17 @@ -from typing import Any -from mcp.types import ListToolsResult +import os import pytest from mcp import ClientSession, StdioServerParameters from mcp.client.stdio import stdio_client +from .utils import construct_openai_tools_from_mcp_tools, invoke_llm_with_tools +from openai import AzureOpenAI +from openai.types.chat import ChatCompletionMessageToolCall + +from azure.identity import DefaultAzureCredential, get_bearer_token_provider +from dotenv import load_dotenv + +load_dotenv() + +token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default") @pytest.mark.integration @@ -19,3 +28,41 @@ async def test_mcp_client_lists_tools(): response = await session.list_tools() tools = response.tools assert tools, "Expected at least one tool from the MCP server" + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_mcp_client_message_1(): + # In the args, we omit "--no-cache" to reduce latency + server_params = StdioServerParameters( + command="pipx", + args=["run", "--spec", "..", "run-azure-ai-foundry-mcp"], + ) + + async with stdio_client(server_params) as (stdio, write): + async with ClientSession(stdio, write) as session: + await session.initialize() + tools_response = await session.list_tools() + openai_tools = construct_openai_tools_from_mcp_tools( + mcp_tools=tools_response, + ) + aoai_client = AzureOpenAI( + azure_endpoint=os.environ["AOAI_ENDPOINT"], + api_version=os.environ["AOAI_API_VERSION"], + azure_ad_token_provider=token_provider, + ) + + completion = invoke_llm_with_tools( + user_message="Tell me the Azure AI Foundry Labs projects", + aoai_client=aoai_client, + model=os.environ["AOAI_MODEL"], + tools=openai_tools, + ) + response_message = completion.choices[0].message + + # TODO dennis + expected_tool_call_name = "list_azure_ai_foundry_labs_projects" + actual_tool_calls = response_message.tool_calls + assert len(actual_tool_calls) > 0 + assert isinstance(actual_tool_calls[0], ChatCompletionMessageToolCall) + assert actual_tool_calls[0].function.name == expected_tool_call_name diff --git a/tests/utils.py b/tests/utils.py index af337a0..d3bcf61 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -37,7 +37,7 @@ def invoke_llm_with_tools( messages = [ dict( role="user", - message=user_message, + content=user_message, ) ] From f859d03e69b3f32e578c248dbeb6576f0d0fb864 Mon Sep 
17 00:00:00 2001 From: Dennis Feng Date: Wed, 21 May 2025 14:14:39 -0700 Subject: [PATCH 03/12] added a few automated tests for tool call verifications --- tests/test_mcp.py | 55 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 95e6f68..63b480e 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -30,13 +30,22 @@ async def test_mcp_client_lists_tools(): assert tools, "Expected at least one tool from the MCP server" -@pytest.mark.integration -@pytest.mark.asyncio -async def test_mcp_client_message_1(): - # In the args, we omit "--no-cache" to reduce latency +async def verify_mcp_tool_call(user_message: str, expected_tool_call_name: str, no_cache: bool = False): + """ + Helper function to test MCP tool calling functionality with different messages and expected tools. + + Args: + user_message: The query to send to the model + expected_tool_call_name: The name of the tool we expect the model to call + no_cache: Whether to use --no-cache flag when running the MCP server + """ + args = ["run", "--spec", "..", "run-azure-ai-foundry-mcp"] + if no_cache: + args.insert(1, "--no-cache") + server_params = StdioServerParameters( command="pipx", - args=["run", "--spec", "..", "run-azure-ai-foundry-mcp"], + args=args, ) async with stdio_client(server_params) as (stdio, write): @@ -53,16 +62,44 @@ async def test_mcp_client_message_1(): ) completion = invoke_llm_with_tools( - user_message="Tell me the Azure AI Foundry Labs projects", + user_message=user_message, aoai_client=aoai_client, model=os.environ["AOAI_MODEL"], tools=openai_tools, ) response_message = completion.choices[0].message - # TODO dennis - expected_tool_call_name = "list_azure_ai_foundry_labs_projects" actual_tool_calls = response_message.tool_calls - assert len(actual_tool_calls) > 0 + assert len(actual_tool_calls) > 0, "Expected at least one tool call but got none" assert isinstance(actual_tool_calls[0], ChatCompletionMessageToolCall) assert actual_tool_calls[0].function.name == expected_tool_call_name + + return completion + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_mcp_client_message_1(): + """Test that the model correctly calls list_azure_ai_foundry_labs_projects tool""" + await verify_mcp_tool_call( + user_message="Tell me the Azure AI Foundry Labs projects", + expected_tool_call_name="list_azure_ai_foundry_labs_projects", + ) + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_mcp_client_message_2(): + await verify_mcp_tool_call( + user_message="I want to prototype an app with Azure AI Foundry Labs. 
Where do I start?", + expected_tool_call_name="get_prototyping_instructions_for_github_and_labs", + ) + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_mcp_client_message_3(): + await verify_mcp_tool_call( + user_message="I want to use the Aurora model from Azure AI Foundry Labs; fetch details on how to implement it.", + expected_tool_call_name="list_azure_ai_foundry_labs_projects", + ) From 9d9caf45be85a647aaae40dec939ec9785ba600b Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Wed, 21 May 2025 15:23:29 -0700 Subject: [PATCH 04/12] minor comment --- tests/test_mcp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 63b480e..24f378e 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -99,6 +99,8 @@ async def test_mcp_client_message_2(): @pytest.mark.integration @pytest.mark.asyncio async def test_mcp_client_message_3(): + # TODO: Create a more sophisticated tool call verification step that handles the stochosticity. + # This is because this pytest sometimes gives different tool calls which is expected of a vague freeform input await verify_mcp_tool_call( user_message="I want to use the Aurora model from Azure AI Foundry Labs; fetch details on how to implement it.", expected_tool_call_name="list_azure_ai_foundry_labs_projects", From f3e13a5cb90d3a6d1a93b35ea10641b580e28e07 Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Wed, 28 May 2025 14:16:01 -0700 Subject: [PATCH 05/12] added tool-usage-evals to pyproject.toml --- pyproject.toml | 6 +++++- uv.lock | 34 ++++++++++++++++++++++++---------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f41bdaf..3260325 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,8 @@ dependencies = [ "azure-search-documents>=11.5.2", "azure-cli>=2.60.0", "azure-ai-evaluation>=1.3.0", - "azure-ai-projects==1.0.0b10" + "azure-ai-projects==1.0.0b10", + "tool-usage-evals", ] [dependency-groups] @@ -25,5 +26,8 @@ test = [ asyncio_default_fixture_loop_scope = "function" # or "module", "session" based on my use case pythonpath = ["src"] +[tool.uv.sources] +tool-usage-evals = { git = "https://github.com/dennischenfeng/tool-usage-evals", branch = "Feature/first" } + [project.scripts] run-azure-ai-foundry-mcp = "mcp_foundry.__main__:main" diff --git a/uv.lock b/uv.lock index 6663508..aa1d5b7 100644 --- a/uv.lock +++ b/uv.lock @@ -1920,7 +1920,7 @@ name = "exceptiongroup" version = "1.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" } wheels = [ @@ -2599,7 +2599,7 @@ wheels = [ [[package]] name = "mcp" -version = "1.9.0" +version = "1.9.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -2612,9 +2612,9 @@ dependencies = [ { name = "starlette" }, { name = "uvicorn", marker = "sys_platform != 'emscripten'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/bc/8d/0f4468582e9e97b0a24604b585c651dfd2144300ecffd1c06a680f5c8861/mcp-1.9.0.tar.gz", hash = "sha256:905d8d208baf7e3e71d70c82803b89112e321581bcd2530f9de0fe4103d28749", size = 
281432, upload-time = "2025-05-15T18:51:06.615Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e7/bc/54aec2c334698cc575ca3b3481eed627125fb66544152fa1af927b1a495c/mcp-1.9.1.tar.gz", hash = "sha256:19879cd6dde3d763297617242888c2f695a95dfa854386a6a68676a646ce75e4", size = 316247, upload-time = "2025-05-22T15:52:21.26Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a5/d5/22e36c95c83c80eb47c83f231095419cf57cf5cca5416f1c960032074c78/mcp-1.9.0-py3-none-any.whl", hash = "sha256:9dfb89c8c56f742da10a5910a1f64b0d2ac2c3ed2bd572ddb1cfab7f35957178", size = 125082, upload-time = "2025-05-15T18:51:04.916Z" }, + { url = "https://files.pythonhosted.org/packages/a6/c0/4ac795585a22a0a2d09cd2b1187b0252d2afcdebd01e10a68bbac4d34890/mcp-1.9.1-py3-none-any.whl", hash = "sha256:2900ded8ffafc3c8a7bfcfe8bc5204037e988e753ec398f371663e6a06ecd9a9", size = 130261, upload-time = "2025-05-22T15:52:19.702Z" }, ] [[package]] @@ -2631,6 +2631,7 @@ dependencies = [ { name = "jinja2" }, { name = "mcp" }, { name = "requests" }, + { name = "tool-usage-evals" }, ] [package.dev-dependencies] @@ -2650,6 +2651,7 @@ requires-dist = [ { name = "jinja2", specifier = "~=3.0" }, { name = "mcp", specifier = ">=1.8.0" }, { name = "requests", specifier = ">=2.32.3" }, + { name = "tool-usage-evals", git = "https://github.com/dennischenfeng/tool-usage-evals?branch=Feature%2Ffirst" }, ] [package.metadata.requires-dev] @@ -2923,7 +2925,7 @@ wheels = [ [[package]] name = "openai" -version = "1.79.0" +version = "1.82.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -2935,9 +2937,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/52/cf/4901077dbbfd0d82a814d721600fa0c3a61a093d7f0bf84d0e4732448dc9/openai-1.79.0.tar.gz", hash = "sha256:e3b627aa82858d3e42d16616edc22aa9f7477ee5eb3e6819e9f44a961d899a4c", size = 444736, upload-time = "2025-05-16T19:49:59.738Z" } +sdist = { url = "https://files.pythonhosted.org/packages/3f/19/6b09bb3132f7e1a7a2291fd46fb33659bbccca041f863abd682e14ba86d7/openai-1.82.0.tar.gz", hash = "sha256:b0a009b9a58662d598d07e91e4219ab4b1e3d8ba2db3f173896a92b9b874d1a7", size = 461092, upload-time = "2025-05-22T20:08:07.282Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/81/d2/e3992bb7c6641b765c1008e3c96e076e0b50381be2cce344e6ff177bad80/openai-1.79.0-py3-none-any.whl", hash = "sha256:d5050b92d5ef83f869cb8dcd0aca0b2291c3413412500eec40c66981b3966992", size = 683334, upload-time = "2025-05-16T19:49:57.445Z" }, + { url = "https://files.pythonhosted.org/packages/51/4b/a59464ee5f77822a81ee069b4021163a0174940a92685efc3cf8b4c443a3/openai-1.82.0-py3-none-any.whl", hash = "sha256:8c40647fea1816516cb3de5189775b30b5f4812777e40b8768f361f232b61b30", size = 720412, upload-time = "2025-05-22T20:08:05.637Z" }, ] [[package]] @@ -3404,7 +3406,7 @@ wheels = [ [[package]] name = "pydantic" -version = "2.11.4" +version = "2.11.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "annotated-types" }, @@ -3412,9 +3414,9 @@ dependencies = [ { name = "typing-extensions" }, { name = "typing-inspection" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/77/ab/5250d56ad03884ab5efd07f734203943c8a8ab40d551e208af81d0257bf2/pydantic-2.11.4.tar.gz", hash = "sha256:32738d19d63a226a52eed76645a98ee07c1f410ee41d93b4afbfa85ed8111c2d", size = 786540, upload-time = "2025-04-29T20:38:55.02Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/f0/86/8ce9040065e8f924d642c58e4a344e33163a07f6b57f836d0d734e0ad3fb/pydantic-2.11.5.tar.gz", hash = "sha256:7f853db3d0ce78ce8bbb148c401c2cdd6431b3473c0cdff2755c7690952a7b7a", size = 787102, upload-time = "2025-05-22T21:18:08.761Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e7/12/46b65f3534d099349e38ef6ec98b1a5a81f42536d17e0ba382c28c67ba67/pydantic-2.11.4-py3-none-any.whl", hash = "sha256:d9615eaa9ac5a063471da949c8fc16376a84afb5024688b3ff885693506764eb", size = 443900, upload-time = "2025-04-29T20:38:52.724Z" }, + { url = "https://files.pythonhosted.org/packages/b5/69/831ed22b38ff9b4b64b66569f0e5b7b97cf3638346eb95a2147fdb49ad5f/pydantic-2.11.5-py3-none-any.whl", hash = "sha256:f9c26ba06f9747749ca1e5c94d6a85cb84254577553c8785576fd38fa64dc0f7", size = 444229, upload-time = "2025-05-22T21:18:06.329Z" }, ] [[package]] @@ -4294,6 +4296,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257, upload-time = "2024-11-27T22:38:35.385Z" }, ] +[[package]] +name = "tool-usage-evals" +version = "0.1.0" +source = { git = "https://github.com/dennischenfeng/tool-usage-evals?branch=Feature%2Ffirst#bd22edfdc0a2ca27bd9925911620bcad849b9e8b" } +dependencies = [ + { name = "azure-identity" }, + { name = "mcp" }, + { name = "openai" }, + { name = "pydantic" }, + { name = "python-dotenv" }, +] + [[package]] name = "tqdm" version = "4.67.1" From 6ee2365d2e4063d27d59fd68a4dd7d9441ab2033 Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Wed, 28 May 2025 17:10:21 -0700 Subject: [PATCH 06/12] adding 1 pytest to test tool-usage-evals library --- pyproject.toml | 13 ++- src/mcp_foundry/__main__.py | 14 ++- tests/test_mcp.py | 182 ++++++++++++++++++++++-------------- uv.lock | 10 +- 4 files changed, 135 insertions(+), 84 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3260325..ee29993 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ dependencies = [ "azure-cli>=2.60.0", "azure-ai-evaluation>=1.3.0", "azure-ai-projects==1.0.0b10", - "tool-usage-evals", + "tool-usage-evals>=0.1.0", ] [dependency-groups] @@ -22,12 +22,17 @@ test = [ "pytest>=8.3.5", "pytest-asyncio>=0.26.0", ] + [tool.pytest.ini_options] asyncio_default_fixture_loop_scope = "function" # or "module", "session" based on my use case pythonpath = ["src"] -[tool.uv.sources] -tool-usage-evals = { git = "https://github.com/dennischenfeng/tool-usage-evals", branch = "Feature/first" } - [project.scripts] run-azure-ai-foundry-mcp = "mcp_foundry.__main__:main" + +[build-system] +requires = ["hatchling", "uv-dynamic-versioning"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/mcp_foundry"] \ No newline at end of file diff --git a/src/mcp_foundry/__main__.py b/src/mcp_foundry/__main__.py index 1fd17dd..5b14cde 100644 --- a/src/mcp_foundry/__main__.py +++ b/src/mcp_foundry/__main__.py @@ -5,7 +5,7 @@ from typing import Literal from dotenv import load_dotenv -from .mcp_server import mcp, auto_import_modules +from mcp_foundry.mcp_server import mcp, auto_import_modules # Configure logging @@ -16,15 +16,19 @@ ) logger = logging.getLogger("__main__") + def main() -> None: """Runs the MCP server""" parser = ArgumentParser(description="Start the MCP service with provided or default configuration.") - parser.add_argument('--transport', 
required=False, default='stdio', - help='Transport protocol (sse | stdio | streamable-http) (default: stdio)') - parser.add_argument('--envFile', required=False, default='.env', - help='Path to .env file (default: .env)') + parser.add_argument( + "--transport", + required=False, + default="stdio", + help="Transport protocol (sse | stdio | streamable-http) (default: stdio)", + ) + parser.add_argument("--envFile", required=False, default=".env", help="Path to .env file (default: .env)") # Parse the application arguments args = parser.parse_args() diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 24f378e..8338f54 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -1,3 +1,10 @@ +from tool_usage_evals.multi_step import run_agent_turn +from pathlib import Path +from tool_usage_evals.mcp_handling import ( + mcp_session_context_manager, + extract_tool_definitions, + build_mcp_tool_caller, +) import os import pytest from mcp import ClientSession, StdioServerParameters @@ -10,8 +17,19 @@ from dotenv import load_dotenv load_dotenv() +MCP_SERVER_SCRIPT = Path(__file__).parent / "../src/mcp_foundry/__main__.py" -token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default") + +@pytest.fixture(scope="session") +def aoai_client() -> AzureOpenAI: + """Azure OpenAI client""" + token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default") + client = AzureOpenAI( + azure_ad_token_provider=token_provider, + azure_endpoint=os.environ["AOAI_ENDPOINT"], + api_version=os.environ["AOAI_API_VERSION"], + ) + return client @pytest.mark.integration @@ -30,78 +48,98 @@ async def test_mcp_client_lists_tools(): assert tools, "Expected at least one tool from the MCP server" -async def verify_mcp_tool_call(user_message: str, expected_tool_call_name: str, no_cache: bool = False): - """ - Helper function to test MCP tool calling functionality with different messages and expected tools. 
- - Args: - user_message: The query to send to the model - expected_tool_call_name: The name of the tool we expect the model to call - no_cache: Whether to use --no-cache flag when running the MCP server - """ - args = ["run", "--spec", "..", "run-azure-ai-foundry-mcp"] - if no_cache: - args.insert(1, "--no-cache") - - server_params = StdioServerParameters( - command="pipx", - args=args, - ) - - async with stdio_client(server_params) as (stdio, write): - async with ClientSession(stdio, write) as session: - await session.initialize() - tools_response = await session.list_tools() - openai_tools = construct_openai_tools_from_mcp_tools( - mcp_tools=tools_response, - ) - aoai_client = AzureOpenAI( - azure_endpoint=os.environ["AOAI_ENDPOINT"], - api_version=os.environ["AOAI_API_VERSION"], - azure_ad_token_provider=token_provider, - ) - - completion = invoke_llm_with_tools( - user_message=user_message, - aoai_client=aoai_client, - model=os.environ["AOAI_MODEL"], - tools=openai_tools, - ) - response_message = completion.choices[0].message - - actual_tool_calls = response_message.tool_calls - assert len(actual_tool_calls) > 0, "Expected at least one tool call but got none" - assert isinstance(actual_tool_calls[0], ChatCompletionMessageToolCall) - assert actual_tool_calls[0].function.name == expected_tool_call_name - - return completion - - -@pytest.mark.integration -@pytest.mark.asyncio -async def test_mcp_client_message_1(): - """Test that the model correctly calls list_azure_ai_foundry_labs_projects tool""" - await verify_mcp_tool_call( - user_message="Tell me the Azure AI Foundry Labs projects", - expected_tool_call_name="list_azure_ai_foundry_labs_projects", - ) +# async def verify_mcp_tool_call(user_message: str, expected_tool_call_name: str, no_cache: bool = False): +# """ +# Helper function to test MCP tool calling functionality with different messages and expected tools. 
+ +# Args: +# user_message: The query to send to the model +# expected_tool_call_name: The name of the tool we expect the model to call +# no_cache: Whether to use --no-cache flag when running the MCP server +# """ +# args = ["run", "--spec", "..", "run-azure-ai-foundry-mcp"] +# if no_cache: +# args.insert(1, "--no-cache") + +# server_params = StdioServerParameters( +# command="pipx", +# args=args, +# ) + +# async with stdio_client(server_params) as (stdio, write): +# async with ClientSession(stdio, write) as session: +# await session.initialize() +# tools_response = await session.list_tools() +# openai_tools = construct_openai_tools_from_mcp_tools( +# mcp_tools=tools_response, +# ) +# aoai_client = AzureOpenAI( +# azure_endpoint=os.environ["AOAI_ENDPOINT"], +# api_version=os.environ["AOAI_API_VERSION"], +# azure_ad_token_provider=token_provider, +# ) + +# completion = invoke_llm_with_tools( +# user_message=user_message, +# aoai_client=aoai_client, +# model=os.environ["AOAI_MODEL"], +# tools=openai_tools, +# ) +# response_message = completion.choices[0].message + +# actual_tool_calls = response_message.tool_calls +# assert len(actual_tool_calls) > 0, "Expected at least one tool call but got none" +# assert isinstance(actual_tool_calls[0], ChatCompletionMessageToolCall) +# assert actual_tool_calls[0].function.name == expected_tool_call_name + +# return completion + + +# @pytest.mark.integration +# @pytest.mark.asyncio +# async def test_mcp_client_message_1(): +# """Test that the model correctly calls list_azure_ai_foundry_labs_projects tool""" +# await verify_mcp_tool_call( +# user_message="Tell me the Azure AI Foundry Labs projects", +# expected_tool_call_name="list_azure_ai_foundry_labs_projects", +# ) + + +# @pytest.mark.integration +# @pytest.mark.asyncio +# async def test_mcp_client_message_2(): +# await verify_mcp_tool_call( +# user_message="I want to prototype an app with Azure AI Foundry Labs. Where do I start?", +# expected_tool_call_name="get_prototyping_instructions_for_github_and_labs", +# ) + + +# @pytest.mark.integration +# @pytest.mark.asyncio +# async def test_mcp_client_message_3(): +# # TODO: Create a more sophisticated tool call verification step that handles the stochosticity. +# # This is because this pytest sometimes gives different tool calls which is expected of a vague freeform input +# await verify_mcp_tool_call( +# user_message="I want to use the Aurora model from Azure AI Foundry Labs; fetch details on how to implement it.", +# expected_tool_call_name="list_azure_ai_foundry_labs_projects", +# ) @pytest.mark.integration @pytest.mark.asyncio -async def test_mcp_client_message_2(): - await verify_mcp_tool_call( - user_message="I want to prototype an app with Azure AI Foundry Labs. Where do I start?", - expected_tool_call_name="get_prototyping_instructions_for_github_and_labs", - ) - - -@pytest.mark.integration -@pytest.mark.asyncio -async def test_mcp_client_message_3(): - # TODO: Create a more sophisticated tool call verification step that handles the stochosticity. - # This is because this pytest sometimes gives different tool calls which is expected of a vague freeform input - await verify_mcp_tool_call( - user_message="I want to use the Aurora model from Azure AI Foundry Labs; fetch details on how to implement it.", - expected_tool_call_name="list_azure_ai_foundry_labs_projects", - ) +async def test_mcp_client_message_10(aoai_client) -> None: + """test""" + user_message = "What are the projects in Azure AI Foundry Labs?" 
+ async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: + tools = await extract_tool_definitions(session) + call_tool_fn = await build_mcp_tool_caller(session) + + result = await run_agent_turn( + aoai_client=aoai_client, + tools=tools, + call_tool_fn=call_tool_fn, + user_message=user_message, + ) + + tool_call_names = [t.name for t in result.tool_calls] + assert "list_azure_ai_foundry_labs_projects" in tool_call_names diff --git a/uv.lock b/uv.lock index aa1d5b7..278f691 100644 --- a/uv.lock +++ b/uv.lock @@ -2620,7 +2620,7 @@ wheels = [ [[package]] name = "mcp-foundry" version = "0.1.0" -source = { virtual = "." } +source = { editable = "." } dependencies = [ { name = "azure-ai-evaluation" }, { name = "azure-ai-projects" }, @@ -2651,7 +2651,7 @@ requires-dist = [ { name = "jinja2", specifier = "~=3.0" }, { name = "mcp", specifier = ">=1.8.0" }, { name = "requests", specifier = ">=2.32.3" }, - { name = "tool-usage-evals", git = "https://github.com/dennischenfeng/tool-usage-evals?branch=Feature%2Ffirst" }, + { name = "tool-usage-evals", specifier = ">=0.1.0" }, ] [package.metadata.requires-dev] @@ -4299,7 +4299,7 @@ wheels = [ [[package]] name = "tool-usage-evals" version = "0.1.0" -source = { git = "https://github.com/dennischenfeng/tool-usage-evals?branch=Feature%2Ffirst#bd22edfdc0a2ca27bd9925911620bcad849b9e8b" } +source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "azure-identity" }, { name = "mcp" }, @@ -4307,6 +4307,10 @@ dependencies = [ { name = "pydantic" }, { name = "python-dotenv" }, ] +sdist = { url = "https://files.pythonhosted.org/packages/fe/eb/52307358325344969d3b12957df06214ba2b25597f78c872f4eda7552094/tool_usage_evals-0.1.0.tar.gz", hash = "sha256:b808ecbd74c9456580bb9c0a7a4b3ffc7a0ea2ecabf900cfcdabd4a796c776b1", size = 48529, upload-time = "2025-05-28T22:11:35.89Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/65/2d/ebc363bbbd1b87891c1f44f0947df4e98bdade593eafcc23bc507be873f1/tool_usage_evals-0.1.0-py3-none-any.whl", hash = "sha256:f4b1be40afa2b2f9f7e1114b23176f483a316bf4bfd5bc0c156f6c62ef051840", size = 5394, upload-time = "2025-05-28T22:11:34.354Z" }, +] [[package]] name = "tqdm" From d3eb84336d440714b6d704ca4c3e13ac442354e2 Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Wed, 28 May 2025 18:08:01 -0700 Subject: [PATCH 07/12] first test is working, using tool-usage-evals library --- pyproject.toml | 4 ++-- src/mcp_foundry/mcp_foundry_model/tools.py | 2 +- uv.lock | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ee29993..e44e0de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "MCP Server for Azure AI Foundry (experimental)" readme = "README.md" requires-python = ">=3.10" dependencies = [ - "mcp>=1.8.0", + "mcp>=1.9.1", "requests>=2.32.3", "azure-mgmt-cognitiveservices>=13.0.0", "azure-identity>=1.0", @@ -14,7 +14,7 @@ dependencies = [ "azure-cli>=2.60.0", "azure-ai-evaluation>=1.3.0", "azure-ai-projects==1.0.0b10", - "tool-usage-evals>=0.1.0", + "tool-usage-evals>=0.1.1", ] [dependency-groups] diff --git a/src/mcp_foundry/mcp_foundry_model/tools.py b/src/mcp_foundry/mcp_foundry_model/tools.py index 57c80e0..a88fc79 100644 --- a/src/mcp_foundry/mcp_foundry_model/tools.py +++ b/src/mcp_foundry/mcp_foundry_model/tools.py @@ -37,7 +37,7 @@ @mcp.tool() -async def list_models_from_model_catalog(ctx: Context, search_for_free_playground: bool = False, publisher_name = "", license_name = "") -> str: +async 
def list_models_from_model_catalog(ctx: Context, search_for_free_playground: bool = False, publisher_name: str = "", license_name: str = "") -> str: """ Retrieves a list of supported models from the Azure AI Foundry catalog. diff --git a/uv.lock b/uv.lock index 278f691..c048b3a 100644 --- a/uv.lock +++ b/uv.lock @@ -2649,9 +2649,9 @@ requires-dist = [ { name = "azure-mgmt-cognitiveservices", specifier = ">=13.0.0" }, { name = "azure-search-documents", specifier = ">=11.5.2" }, { name = "jinja2", specifier = "~=3.0" }, - { name = "mcp", specifier = ">=1.8.0" }, + { name = "mcp", specifier = ">=1.9.1" }, { name = "requests", specifier = ">=2.32.3" }, - { name = "tool-usage-evals", specifier = ">=0.1.0" }, + { name = "tool-usage-evals", specifier = ">=0.1.1" }, ] [package.metadata.requires-dev] @@ -4298,7 +4298,7 @@ wheels = [ [[package]] name = "tool-usage-evals" -version = "0.1.0" +version = "0.1.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "azure-identity" }, @@ -4307,9 +4307,9 @@ dependencies = [ { name = "pydantic" }, { name = "python-dotenv" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/fe/eb/52307358325344969d3b12957df06214ba2b25597f78c872f4eda7552094/tool_usage_evals-0.1.0.tar.gz", hash = "sha256:b808ecbd74c9456580bb9c0a7a4b3ffc7a0ea2ecabf900cfcdabd4a796c776b1", size = 48529, upload-time = "2025-05-28T22:11:35.89Z" } +sdist = { url = "https://files.pythonhosted.org/packages/0c/ba/150dfce6eaa2a1167f366fbc91b95ed05f68f3a972e83207b38c3dc81dcc/tool_usage_evals-0.1.1.tar.gz", hash = "sha256:8a666b8252623d9014b3d4f304b7ed512a7a8bf1e4fee22a921c565f799871dd", size = 48669, upload-time = "2025-05-29T01:00:48.58Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/65/2d/ebc363bbbd1b87891c1f44f0947df4e98bdade593eafcc23bc507be873f1/tool_usage_evals-0.1.0-py3-none-any.whl", hash = "sha256:f4b1be40afa2b2f9f7e1114b23176f483a316bf4bfd5bc0c156f6c62ef051840", size = 5394, upload-time = "2025-05-28T22:11:34.354Z" }, + { url = "https://files.pythonhosted.org/packages/43/e6/bb7678b08c0d45f78ef9323de2757180dfb11d8eb091690067d6fba100b1/tool_usage_evals-0.1.1-py3-none-any.whl", hash = "sha256:bf1fa5010bc1797e807064e096346d82042ba6a9d0110e81c0a160f5404b508f", size = 5414, upload-time = "2025-05-29T01:00:46.803Z" }, ] [[package]] From 016229e81640515a07ce4d7a85f644f58898fd2b Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Thu, 29 May 2025 14:48:51 -0700 Subject: [PATCH 08/12] tests all working because added tenacity for retries --- pyproject.toml | 3 ++- tests/test_mcp.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++- uv.lock | 11 +++++++++ 3 files changed, 69 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e44e0de..921e3b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ test = [ "pytest>=8.3.5", "pytest-asyncio>=0.26.0", + "tenacity>=9.1.2", ] [tool.pytest.ini_options] @@ -35,4 +36,4 @@ requires = ["hatchling", "uv-dynamic-versioning"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] -packages = ["src/mcp_foundry"] \ No newline at end of file +packages = ["src/mcp_foundry"] diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 8338f54..b994a0f 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -15,6 +15,21 @@ from azure.identity import DefaultAzureCredential, get_bearer_token_provider from dotenv import load_dotenv +import openai +from tenacity import ( + retry, + wait_random_exponential, + stop_after_attempt, + retry_if_exception_type, +) + + 
+retry_decorator = retry( + retry=retry_if_exception_type(openai.RateLimitError), + wait=wait_random_exponential(min=10, max=90), + stop=stop_after_attempt(6), + reraise=True, +) load_dotenv() MCP_SERVER_SCRIPT = Path(__file__).parent / "../src/mcp_foundry/__main__.py" @@ -134,7 +149,47 @@ async def test_mcp_client_message_10(aoai_client) -> None: tools = await extract_tool_definitions(session) call_tool_fn = await build_mcp_tool_caller(session) - result = await run_agent_turn( + result = await retry_decorator(run_agent_turn)( + aoai_client=aoai_client, + tools=tools, + call_tool_fn=call_tool_fn, + user_message=user_message, + ) + + tool_call_names = [t.name for t in result.tool_calls] + assert "list_azure_ai_foundry_labs_projects" in tool_call_names + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_mcp_client_message_20(aoai_client) -> None: + """test""" + user_message = "I want to prototype an app with Azure AI Foundry Labs. Where do I start?" + async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: + tools = await extract_tool_definitions(session) + call_tool_fn = await build_mcp_tool_caller(session) + + result = await retry_decorator(run_agent_turn)( + aoai_client=aoai_client, + tools=tools, + call_tool_fn=call_tool_fn, + user_message=user_message, + ) + + tool_call_names = [t.name for t in result.tool_calls] + assert "get_prototyping_instructions_for_github_and_labs" in tool_call_names + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_mcp_client_message_30(aoai_client) -> None: + """test""" + user_message = "I want to use the Aurora model from Azure AI Foundry Labs; fetch details on how to implement it." + async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: + tools = await extract_tool_definitions(session) + call_tool_fn = await build_mcp_tool_caller(session) + + result = await retry_decorator(run_agent_turn)( aoai_client=aoai_client, tools=tools, call_tool_fn=call_tool_fn, diff --git a/uv.lock b/uv.lock index c048b3a..0fd8132 100644 --- a/uv.lock +++ b/uv.lock @@ -2638,6 +2638,7 @@ dependencies = [ test = [ { name = "pytest" }, { name = "pytest-asyncio" }, + { name = "tenacity" }, ] [package.metadata] @@ -2658,6 +2659,7 @@ requires-dist = [ test = [ { name = "pytest", specifier = ">=8.3.5" }, { name = "pytest-asyncio", specifier = ">=0.26.0" }, + { name = "tenacity", specifier = ">=9.1.2" }, ] [[package]] @@ -4221,6 +4223,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, ] +[[package]] +name = "tenacity" +version = "9.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0a/d4/2b0cd0fe285e14b36db076e78c93766ff1d529d70408bd1d2a5a84f1d929/tenacity-9.1.2.tar.gz", hash = "sha256:1169d376c297e7de388d18b4481760d478b0e99a777cad3a9c86e556f4b697cb", size = 48036, upload-time = "2025-04-02T08:25:09.966Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" }, +] + [[package]] name = "tiktoken" version = "0.9.0" From 
f36cf12241a96648361326e28132f27415d56357 Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Thu, 29 May 2025 16:42:35 -0700 Subject: [PATCH 09/12] updated pytest to use multiple trials --- pyproject.toml | 1 + tests/test_mcp.py | 114 +++++++++------------------------------------- uv.lock | 2 + 3 files changed, 25 insertions(+), 92 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 921e3b7..05662af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "azure-ai-evaluation>=1.3.0", "azure-ai-projects==1.0.0b10", "tool-usage-evals>=0.1.1", + "tqdm>=4.67.1", ] [dependency-groups] diff --git a/tests/test_mcp.py b/tests/test_mcp.py index b994a0f..9df61f3 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -9,9 +9,7 @@ import pytest from mcp import ClientSession, StdioServerParameters from mcp.client.stdio import stdio_client -from .utils import construct_openai_tools_from_mcp_tools, invoke_llm_with_tools from openai import AzureOpenAI -from openai.types.chat import ChatCompletionMessageToolCall from azure.identity import DefaultAzureCredential, get_bearer_token_provider from dotenv import load_dotenv @@ -22,6 +20,7 @@ stop_after_attempt, retry_if_exception_type, ) +from tqdm import tqdm retry_decorator = retry( @@ -49,7 +48,7 @@ def aoai_client() -> AzureOpenAI: @pytest.mark.integration @pytest.mark.asyncio -async def test_mcp_client_lists_tools(): +async def test_mcp_client_lists_tools_using_pipx(): server_params = StdioServerParameters( command="pipx", args=["run", "--no-cache", "--spec", "..", "run-azure-ai-foundry-mcp"], @@ -63,86 +62,9 @@ async def test_mcp_client_lists_tools(): assert tools, "Expected at least one tool from the MCP server" -# async def verify_mcp_tool_call(user_message: str, expected_tool_call_name: str, no_cache: bool = False): -# """ -# Helper function to test MCP tool calling functionality with different messages and expected tools. 
- -# Args: -# user_message: The query to send to the model -# expected_tool_call_name: The name of the tool we expect the model to call -# no_cache: Whether to use --no-cache flag when running the MCP server -# """ -# args = ["run", "--spec", "..", "run-azure-ai-foundry-mcp"] -# if no_cache: -# args.insert(1, "--no-cache") - -# server_params = StdioServerParameters( -# command="pipx", -# args=args, -# ) - -# async with stdio_client(server_params) as (stdio, write): -# async with ClientSession(stdio, write) as session: -# await session.initialize() -# tools_response = await session.list_tools() -# openai_tools = construct_openai_tools_from_mcp_tools( -# mcp_tools=tools_response, -# ) -# aoai_client = AzureOpenAI( -# azure_endpoint=os.environ["AOAI_ENDPOINT"], -# api_version=os.environ["AOAI_API_VERSION"], -# azure_ad_token_provider=token_provider, -# ) - -# completion = invoke_llm_with_tools( -# user_message=user_message, -# aoai_client=aoai_client, -# model=os.environ["AOAI_MODEL"], -# tools=openai_tools, -# ) -# response_message = completion.choices[0].message - -# actual_tool_calls = response_message.tool_calls -# assert len(actual_tool_calls) > 0, "Expected at least one tool call but got none" -# assert isinstance(actual_tool_calls[0], ChatCompletionMessageToolCall) -# assert actual_tool_calls[0].function.name == expected_tool_call_name - -# return completion - - -# @pytest.mark.integration -# @pytest.mark.asyncio -# async def test_mcp_client_message_1(): -# """Test that the model correctly calls list_azure_ai_foundry_labs_projects tool""" -# await verify_mcp_tool_call( -# user_message="Tell me the Azure AI Foundry Labs projects", -# expected_tool_call_name="list_azure_ai_foundry_labs_projects", -# ) - - -# @pytest.mark.integration -# @pytest.mark.asyncio -# async def test_mcp_client_message_2(): -# await verify_mcp_tool_call( -# user_message="I want to prototype an app with Azure AI Foundry Labs. Where do I start?", -# expected_tool_call_name="get_prototyping_instructions_for_github_and_labs", -# ) - - -# @pytest.mark.integration -# @pytest.mark.asyncio -# async def test_mcp_client_message_3(): -# # TODO: Create a more sophisticated tool call verification step that handles the stochosticity. -# # This is because this pytest sometimes gives different tool calls which is expected of a vague freeform input -# await verify_mcp_tool_call( -# user_message="I want to use the Aurora model from Azure AI Foundry Labs; fetch details on how to implement it.", -# expected_tool_call_name="list_azure_ai_foundry_labs_projects", -# ) - - @pytest.mark.integration @pytest.mark.asyncio -async def test_mcp_client_message_10(aoai_client) -> None: +async def test_mcp_client_message_1(aoai_client) -> None: """test""" user_message = "What are the projects in Azure AI Foundry Labs?" async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: @@ -162,7 +84,7 @@ async def test_mcp_client_message_10(aoai_client) -> None: @pytest.mark.integration @pytest.mark.asyncio -async def test_mcp_client_message_20(aoai_client) -> None: +async def test_mcp_client_message_2(aoai_client) -> None: """test""" user_message = "I want to prototype an app with Azure AI Foundry Labs. Where do I start?" 
async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: @@ -182,19 +104,27 @@ async def test_mcp_client_message_20(aoai_client) -> None: @pytest.mark.integration @pytest.mark.asyncio -async def test_mcp_client_message_30(aoai_client) -> None: +async def test_mcp_client_message_3(aoai_client) -> None: """test""" - user_message = "I want to use the Aurora model from Azure AI Foundry Labs; fetch details on how to implement it." + user_message = "Give me code and implementation details for the Aurora model." + n_trials = 5 async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: tools = await extract_tool_definitions(session) call_tool_fn = await build_mcp_tool_caller(session) - result = await retry_decorator(run_agent_turn)( - aoai_client=aoai_client, - tools=tools, - call_tool_fn=call_tool_fn, - user_message=user_message, - ) + results = [] + for trial in tqdm(range(n_trials)): + result = await retry_decorator(run_agent_turn)( + aoai_client=aoai_client, + tools=tools, + call_tool_fn=call_tool_fn, + user_message=user_message, + ) + results.append(result) - tool_call_names = [t.name for t in result.tool_calls] - assert "list_azure_ai_foundry_labs_projects" in tool_call_names + all_tool_call_names = [[t.name for t in result.tool_calls] for result in results] + + n_found_correct_tool = sum(["get_model_details_and_code_samples" in names for names in all_tool_call_names]) + accuracy = n_found_correct_tool / n_trials + + assert accuracy > 0.5 diff --git a/uv.lock b/uv.lock index 0fd8132..c80d583 100644 --- a/uv.lock +++ b/uv.lock @@ -2632,6 +2632,7 @@ dependencies = [ { name = "mcp" }, { name = "requests" }, { name = "tool-usage-evals" }, + { name = "tqdm" }, ] [package.dev-dependencies] @@ -2653,6 +2654,7 @@ requires-dist = [ { name = "mcp", specifier = ">=1.9.1" }, { name = "requests", specifier = ">=2.32.3" }, { name = "tool-usage-evals", specifier = ">=0.1.1" }, + { name = "tqdm", specifier = ">=4.67.1" }, ] [package.metadata.requires-dev] From c54aabe1cba712c83084359fb7947b20ac8a07be Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Thu, 29 May 2025 16:57:33 -0700 Subject: [PATCH 10/12] clean up --- pyproject.toml | 8 ++++---- tests/utils.py | 50 -------------------------------------------------- 2 files changed, 4 insertions(+), 54 deletions(-) delete mode 100644 tests/utils.py diff --git a/pyproject.toml b/pyproject.toml index 05662af..cc24e50 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "MCP Server for Azure AI Foundry (experimental)" readme = "README.md" requires-python = ">=3.10" dependencies = [ - "mcp>=1.9.1", + "mcp>=1.8.0", "requests>=2.32.3", "azure-mgmt-cognitiveservices>=13.0.0", "azure-identity>=1.0", @@ -14,8 +14,6 @@ dependencies = [ "azure-cli>=2.60.0", "azure-ai-evaluation>=1.3.0", "azure-ai-projects==1.0.0b10", - "tool-usage-evals>=0.1.1", - "tqdm>=4.67.1", ] [dependency-groups] @@ -23,6 +21,8 @@ test = [ "pytest>=8.3.5", "pytest-asyncio>=0.26.0", "tenacity>=9.1.2", + "tqdm>=4.67.1", + "tool-usage-evals>=0.1.1", ] [tool.pytest.ini_options] @@ -33,7 +33,7 @@ pythonpath = ["src"] run-azure-ai-foundry-mcp = "mcp_foundry.__main__:main" [build-system] -requires = ["hatchling", "uv-dynamic-versioning"] +requires = ["hatchling"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] diff --git a/tests/utils.py b/tests/utils.py deleted file mode 100644 index d3bcf61..0000000 --- a/tests/utils.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -Utilities for helping run tests. 
-""" - -from openai import AzureOpenAI -from openai.types.chat import ChatCompletion -from mcp.types import ListToolsResult - - -def construct_openai_tools_from_mcp_tools(mcp_tools: ListToolsResult) -> list[dict]: - """ - Given a tools list from MCP server, convert it to the format required to feed into Azure OpenAI chat completion. - """ - final_tools = [ - { - "type": "function", - "function": { - "name": tool.name, - "description": tool.description, - "parameters": tool.inputSchema, - }, - } - for tool in mcp_tools.tools - ] - return final_tools - - -def invoke_llm_with_tools( - user_message: str, - tools: list[dict], - aoai_client: AzureOpenAI, - model: str, -) -> ChatCompletion: - """ - Invoke a single LLM inference step on a user message, including the specified tools, and return the response - """ - messages = [ - dict( - role="user", - content=user_message, - ) - ] - - completion = aoai_client.chat.completions.create( - model=model, - messages=messages, - tools=tools, - tool_choice="auto", - ) - return completion From 765891765c32b38702721fd1fccb778aa8a78f41 Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Thu, 29 May 2025 17:02:44 -0700 Subject: [PATCH 11/12] added more to docstring of tests --- tests/test_mcp.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 9df61f3..d3c2afb 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -65,7 +65,7 @@ async def test_mcp_client_lists_tools_using_pipx(): @pytest.mark.integration @pytest.mark.asyncio async def test_mcp_client_message_1(aoai_client) -> None: - """test""" + """Test tool usage for a user message asking about foundry labs projects.""" user_message = "What are the projects in Azure AI Foundry Labs?" async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: tools = await extract_tool_definitions(session) @@ -85,7 +85,7 @@ async def test_mcp_client_message_1(aoai_client) -> None: @pytest.mark.integration @pytest.mark.asyncio async def test_mcp_client_message_2(aoai_client) -> None: - """test""" + """Test tool usage for a user message asking about prototyping with foundry labs projects.""" user_message = "I want to prototype an app with Azure AI Foundry Labs. Where do I start?" async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: tools = await extract_tool_definitions(session) @@ -105,7 +105,11 @@ async def test_mcp_client_message_2(aoai_client) -> None: @pytest.mark.integration @pytest.mark.asyncio async def test_mcp_client_message_3(aoai_client) -> None: - """test""" + """ + Test tool usage for a user message asking code/implementation details. + Because of stochasticity of response (sometimes uses the prototyping tool or list-projects + tool instead of intended code-samples tool), we do n repeated trials. + """ user_message = "Give me code and implementation details for the Aurora model." 
n_trials = 5 async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: From d9318bf6d5c4d5d2e660156ebda718b1afa24942 Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Thu, 29 May 2025 18:13:28 -0700 Subject: [PATCH 12/12] reduced n_trials from 5 to 3 --- pyproject.toml | 2 +- tests/test_mcp.py | 9 ++++++--- uv.lock | 17 +++++++++-------- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cc24e50..48dea49 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ test = [ "pytest-asyncio>=0.26.0", "tenacity>=9.1.2", "tqdm>=4.67.1", - "tool-usage-evals>=0.1.1", + "tool-usage-evals>=0.1.4", ] [tool.pytest.ini_options] diff --git a/tests/test_mcp.py b/tests/test_mcp.py index d3c2afb..940ecca 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -40,8 +40,8 @@ def aoai_client() -> AzureOpenAI: token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default") client = AzureOpenAI( azure_ad_token_provider=token_provider, - azure_endpoint=os.environ["AOAI_ENDPOINT"], - api_version=os.environ["AOAI_API_VERSION"], + azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"], + api_version=os.environ["AZURE_OPENAI_API_VERSION"], ) return client @@ -73,6 +73,7 @@ async def test_mcp_client_message_1(aoai_client) -> None: result = await retry_decorator(run_agent_turn)( aoai_client=aoai_client, + model=os.environ["AZURE_OPENAI_DEPLOYMENT"], tools=tools, call_tool_fn=call_tool_fn, user_message=user_message, @@ -93,6 +94,7 @@ async def test_mcp_client_message_2(aoai_client) -> None: result = await retry_decorator(run_agent_turn)( aoai_client=aoai_client, + model=os.environ["AZURE_OPENAI_DEPLOYMENT"], tools=tools, call_tool_fn=call_tool_fn, user_message=user_message, @@ -111,7 +113,7 @@ async def test_mcp_client_message_3(aoai_client) -> None: tool instead of intended code-samples tool), we do n repeated trials. """ user_message = "Give me code and implementation details for the Aurora model." 
- n_trials = 5 + n_trials = 3 async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: tools = await extract_tool_definitions(session) call_tool_fn = await build_mcp_tool_caller(session) @@ -120,6 +122,7 @@ async def test_mcp_client_message_3(aoai_client) -> None: for trial in tqdm(range(n_trials)): result = await retry_decorator(run_agent_turn)( aoai_client=aoai_client, + model=os.environ["AZURE_OPENAI_DEPLOYMENT"], tools=tools, call_tool_fn=call_tool_fn, user_message=user_message, diff --git a/uv.lock b/uv.lock index c80d583..b73f619 100644 --- a/uv.lock +++ b/uv.lock @@ -2631,8 +2631,6 @@ dependencies = [ { name = "jinja2" }, { name = "mcp" }, { name = "requests" }, - { name = "tool-usage-evals" }, - { name = "tqdm" }, ] [package.dev-dependencies] @@ -2640,6 +2638,8 @@ test = [ { name = "pytest" }, { name = "pytest-asyncio" }, { name = "tenacity" }, + { name = "tool-usage-evals" }, + { name = "tqdm" }, ] [package.metadata] @@ -2651,10 +2651,8 @@ requires-dist = [ { name = "azure-mgmt-cognitiveservices", specifier = ">=13.0.0" }, { name = "azure-search-documents", specifier = ">=11.5.2" }, { name = "jinja2", specifier = "~=3.0" }, - { name = "mcp", specifier = ">=1.9.1" }, + { name = "mcp", specifier = ">=1.8.0" }, { name = "requests", specifier = ">=2.32.3" }, - { name = "tool-usage-evals", specifier = ">=0.1.1" }, - { name = "tqdm", specifier = ">=4.67.1" }, ] [package.metadata.requires-dev] @@ -2662,6 +2660,8 @@ test = [ { name = "pytest", specifier = ">=8.3.5" }, { name = "pytest-asyncio", specifier = ">=0.26.0" }, { name = "tenacity", specifier = ">=9.1.2" }, + { name = "tool-usage-evals", specifier = ">=0.1.4" }, + { name = "tqdm", specifier = ">=4.67.1" }, ] [[package]] @@ -4311,7 +4311,7 @@ wheels = [ [[package]] name = "tool-usage-evals" -version = "0.1.1" +version = "0.1.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "azure-identity" }, @@ -4319,10 +4319,11 @@ dependencies = [ { name = "openai" }, { name = "pydantic" }, { name = "python-dotenv" }, + { name = "tenacity" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/0c/ba/150dfce6eaa2a1167f366fbc91b95ed05f68f3a972e83207b38c3dc81dcc/tool_usage_evals-0.1.1.tar.gz", hash = "sha256:8a666b8252623d9014b3d4f304b7ed512a7a8bf1e4fee22a921c565f799871dd", size = 48669, upload-time = "2025-05-29T01:00:48.58Z" } +sdist = { url = "https://files.pythonhosted.org/packages/68/5c/57e6940cbb191a982dee71863c125aad943a84b4f30d4ce08a522642f764/tool_usage_evals-0.1.4.tar.gz", hash = "sha256:7abbeb257183a8fbe818b7f0e793a5afe669ff96e2dfc3ae28f87a4ae41b8731", size = 49210, upload-time = "2025-05-30T00:49:51.715Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/43/e6/bb7678b08c0d45f78ef9323de2757180dfb11d8eb091690067d6fba100b1/tool_usage_evals-0.1.1-py3-none-any.whl", hash = "sha256:bf1fa5010bc1797e807064e096346d82042ba6a9d0110e81c0a160f5404b508f", size = 5414, upload-time = "2025-05-29T01:00:46.803Z" }, + { url = "https://files.pythonhosted.org/packages/09/73/298e98da166fbd16820affbed4c213bc22af77fd2e68df3381add4f186db/tool_usage_evals-0.1.4-py3-none-any.whl", hash = "sha256:732b167927697bf19bb924c2341f27056e591ad6496023b288884e8081d11ce1", size = 5578, upload-time = "2025-05-30T00:49:50.49Z" }, ] [[package]]
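
Note on running the integration tests introduced in this series: they load settings from a .env file via load_dotenv(), authenticate with DefaultAzureCredential, and are marked with @pytest.mark.integration. As a minimal sketch only — the variable names come from the final test code, the values are placeholders, an Azure credential usable by DefaultAzureCredential (e.g. from `az login`) is assumed, and the exact pytest/uv invocation is an assumption rather than something specified in the patches — a local run might look like:

    # .env (placeholder values; names taken from tests/test_mcp.py)
    AZURE_OPENAI_ENDPOINT=https://<your-resource>.openai.azure.com/
    AZURE_OPENAI_API_VERSION=2024-10-21
    AZURE_OPENAI_DEPLOYMENT=<your-chat-model-deployment>

    # run only the integration-marked tests (assumed invocation)
    uv run pytest -m integration tests/test_mcp.py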