From fd8a72d8422c34133901d2dedf54cf9dc26cfbd3 Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Tue, 20 May 2025 13:16:53 -0700 Subject: [PATCH 01/12] added test utils --- src/mcp_foundry/mcp_foundry_model/tools.py | 1 + tests/test_mcp.py | 12 ++---- tests/utils.py | 50 ++++++++++++++++++++++ 3 files changed, 55 insertions(+), 8 deletions(-) create mode 100644 tests/utils.py diff --git a/src/mcp_foundry/mcp_foundry_model/tools.py b/src/mcp_foundry/mcp_foundry_model/tools.py index f300d2f..4a08103 100644 --- a/src/mcp_foundry/mcp_foundry_model/tools.py +++ b/src/mcp_foundry/mcp_foundry_model/tools.py @@ -35,6 +35,7 @@ ) logger = logging.getLogger("mcp_foundry_model") + @mcp.tool() async def list_models_from_model_catalog(ctx: Context, search_for_free_playground: bool = False, publisher_name = "", license_name = "") -> str: """ diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 4c27b40..7318302 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -1,13 +1,16 @@ +from typing import Any +from mcp.types import ListToolsResult import pytest from mcp import ClientSession, StdioServerParameters from mcp.client.stdio import stdio_client + @pytest.mark.integration @pytest.mark.asyncio async def test_mcp_client_lists_tools(): server_params = StdioServerParameters( command="pipx", - args=["run", "--no-cache", "--spec", "..", "run-azure-foundry-mcp"], + args=["run", "--no-cache", "--spec", "..", "run-azure-ai-foundry-mcp"], ) async with stdio_client(server_params) as (stdio, write): @@ -16,10 +19,3 @@ async def test_mcp_client_lists_tools(): response = await session.list_tools() tools = response.tools assert tools, "Expected at least one tool from the MCP server" - - -#TODO: Add tools that take prompts and test that the correct tool(s) are selected -#TODO: Find way to only create client once per test module or make it faster -#TODO: Add LLM to client -##TODO: Make LLM easily configurable -##TODO: Make it so we can test against multiple LLMs diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..af337a0 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,50 @@ +""" +Utilities for helping run tests. +""" + +from openai import AzureOpenAI +from openai.types.chat import ChatCompletion +from mcp.types import ListToolsResult + + +def construct_openai_tools_from_mcp_tools(mcp_tools: ListToolsResult) -> list[dict]: + """ + Given a tools list from MCP server, convert it to the format required to feed into Azure OpenAI chat completion. 
+ """ + final_tools = [ + { + "type": "function", + "function": { + "name": tool.name, + "description": tool.description, + "parameters": tool.inputSchema, + }, + } + for tool in mcp_tools.tools + ] + return final_tools + + +def invoke_llm_with_tools( + user_message: str, + tools: list[dict], + aoai_client: AzureOpenAI, + model: str, +) -> ChatCompletion: + """ + Invoke a single LLM inference step on a user message, including the specified tools, and return the response + """ + messages = [ + dict( + role="user", + message=user_message, + ) + ] + + completion = aoai_client.chat.completions.create( + model=model, + messages=messages, + tools=tools, + tool_choice="auto", + ) + return completion From 16f7e719f6ccbd4efa25f34bfe33cb82d0c33fc7 Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Wed, 21 May 2025 13:57:45 -0700 Subject: [PATCH 02/12] working test of tool call --- tests/test_mcp.py | 51 +++++++++++++++++++++++++++++++++++++++++++++-- tests/utils.py | 2 +- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 7318302..95e6f68 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -1,8 +1,17 @@ -from typing import Any -from mcp.types import ListToolsResult +import os import pytest from mcp import ClientSession, StdioServerParameters from mcp.client.stdio import stdio_client +from .utils import construct_openai_tools_from_mcp_tools, invoke_llm_with_tools +from openai import AzureOpenAI +from openai.types.chat import ChatCompletionMessageToolCall + +from azure.identity import DefaultAzureCredential, get_bearer_token_provider +from dotenv import load_dotenv + +load_dotenv() + +token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default") @pytest.mark.integration @@ -19,3 +28,41 @@ async def test_mcp_client_lists_tools(): response = await session.list_tools() tools = response.tools assert tools, "Expected at least one tool from the MCP server" + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_mcp_client_message_1(): + # In the args, we omit "--no-cache" to reduce latency + server_params = StdioServerParameters( + command="pipx", + args=["run", "--spec", "..", "run-azure-ai-foundry-mcp"], + ) + + async with stdio_client(server_params) as (stdio, write): + async with ClientSession(stdio, write) as session: + await session.initialize() + tools_response = await session.list_tools() + openai_tools = construct_openai_tools_from_mcp_tools( + mcp_tools=tools_response, + ) + aoai_client = AzureOpenAI( + azure_endpoint=os.environ["AOAI_ENDPOINT"], + api_version=os.environ["AOAI_API_VERSION"], + azure_ad_token_provider=token_provider, + ) + + completion = invoke_llm_with_tools( + user_message="Tell me the Azure AI Foundry Labs projects", + aoai_client=aoai_client, + model=os.environ["AOAI_MODEL"], + tools=openai_tools, + ) + response_message = completion.choices[0].message + + # TODO dennis + expected_tool_call_name = "list_azure_ai_foundry_labs_projects" + actual_tool_calls = response_message.tool_calls + assert len(actual_tool_calls) > 0 + assert isinstance(actual_tool_calls[0], ChatCompletionMessageToolCall) + assert actual_tool_calls[0].function.name == expected_tool_call_name diff --git a/tests/utils.py b/tests/utils.py index af337a0..d3bcf61 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -37,7 +37,7 @@ def invoke_llm_with_tools( messages = [ dict( role="user", - message=user_message, + content=user_message, ) ] From f859d03e69b3f32e578c248dbeb6576f0d0fb864 Mon Sep 
17 00:00:00 2001 From: Dennis Feng Date: Wed, 21 May 2025 14:14:39 -0700 Subject: [PATCH 03/12] added a few automated tests for tool call verifications --- tests/test_mcp.py | 55 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 95e6f68..63b480e 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -30,13 +30,22 @@ async def test_mcp_client_lists_tools(): assert tools, "Expected at least one tool from the MCP server" -@pytest.mark.integration -@pytest.mark.asyncio -async def test_mcp_client_message_1(): - # In the args, we omit "--no-cache" to reduce latency +async def verify_mcp_tool_call(user_message: str, expected_tool_call_name: str, no_cache: bool = False): + """ + Helper function to test MCP tool calling functionality with different messages and expected tools. + + Args: + user_message: The query to send to the model + expected_tool_call_name: The name of the tool we expect the model to call + no_cache: Whether to use --no-cache flag when running the MCP server + """ + args = ["run", "--spec", "..", "run-azure-ai-foundry-mcp"] + if no_cache: + args.insert(1, "--no-cache") + server_params = StdioServerParameters( command="pipx", - args=["run", "--spec", "..", "run-azure-ai-foundry-mcp"], + args=args, ) async with stdio_client(server_params) as (stdio, write): @@ -53,16 +62,44 @@ async def test_mcp_client_message_1(): ) completion = invoke_llm_with_tools( - user_message="Tell me the Azure AI Foundry Labs projects", + user_message=user_message, aoai_client=aoai_client, model=os.environ["AOAI_MODEL"], tools=openai_tools, ) response_message = completion.choices[0].message - # TODO dennis - expected_tool_call_name = "list_azure_ai_foundry_labs_projects" actual_tool_calls = response_message.tool_calls - assert len(actual_tool_calls) > 0 + assert len(actual_tool_calls) > 0, "Expected at least one tool call but got none" assert isinstance(actual_tool_calls[0], ChatCompletionMessageToolCall) assert actual_tool_calls[0].function.name == expected_tool_call_name + + return completion + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_mcp_client_message_1(): + """Test that the model correctly calls list_azure_ai_foundry_labs_projects tool""" + await verify_mcp_tool_call( + user_message="Tell me the Azure AI Foundry Labs projects", + expected_tool_call_name="list_azure_ai_foundry_labs_projects", + ) + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_mcp_client_message_2(): + await verify_mcp_tool_call( + user_message="I want to prototype an app with Azure AI Foundry Labs. 
Where do I start?", + expected_tool_call_name="get_prototyping_instructions_for_github_and_labs", + ) + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_mcp_client_message_3(): + await verify_mcp_tool_call( + user_message="I want to use the Aurora model from Azure AI Foundry Labs; fetch details on how to implement it.", + expected_tool_call_name="list_azure_ai_foundry_labs_projects", + ) From 9d9caf45be85a647aaae40dec939ec9785ba600b Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Wed, 21 May 2025 15:23:29 -0700 Subject: [PATCH 04/12] minor comment --- tests/test_mcp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 63b480e..24f378e 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -99,6 +99,8 @@ async def test_mcp_client_message_2(): @pytest.mark.integration @pytest.mark.asyncio async def test_mcp_client_message_3(): + # TODO: Create a more sophisticated tool call verification step that handles the stochosticity. + # This is because this pytest sometimes gives different tool calls which is expected of a vague freeform input await verify_mcp_tool_call( user_message="I want to use the Aurora model from Azure AI Foundry Labs; fetch details on how to implement it.", expected_tool_call_name="list_azure_ai_foundry_labs_projects", From f3e13a5cb90d3a6d1a93b35ea10641b580e28e07 Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Wed, 28 May 2025 14:16:01 -0700 Subject: [PATCH 05/12] added tool-usage-evals to pyproject.toml --- pyproject.toml | 6 +++++- uv.lock | 34 ++++++++++++++++++++++++---------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f41bdaf..3260325 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,8 @@ dependencies = [ "azure-search-documents>=11.5.2", "azure-cli>=2.60.0", "azure-ai-evaluation>=1.3.0", - "azure-ai-projects==1.0.0b10" + "azure-ai-projects==1.0.0b10", + "tool-usage-evals", ] [dependency-groups] @@ -25,5 +26,8 @@ test = [ asyncio_default_fixture_loop_scope = "function" # or "module", "session" based on my use case pythonpath = ["src"] +[tool.uv.sources] +tool-usage-evals = { git = "https://github.com/dennischenfeng/tool-usage-evals", branch = "Feature/first" } + [project.scripts] run-azure-ai-foundry-mcp = "mcp_foundry.__main__:main" diff --git a/uv.lock b/uv.lock index 6663508..aa1d5b7 100644 --- a/uv.lock +++ b/uv.lock @@ -1920,7 +1920,7 @@ name = "exceptiongroup" version = "1.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" } wheels = [ @@ -2599,7 +2599,7 @@ wheels = [ [[package]] name = "mcp" -version = "1.9.0" +version = "1.9.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -2612,9 +2612,9 @@ dependencies = [ { name = "starlette" }, { name = "uvicorn", marker = "sys_platform != 'emscripten'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/bc/8d/0f4468582e9e97b0a24604b585c651dfd2144300ecffd1c06a680f5c8861/mcp-1.9.0.tar.gz", hash = "sha256:905d8d208baf7e3e71d70c82803b89112e321581bcd2530f9de0fe4103d28749", size = 
281432, upload-time = "2025-05-15T18:51:06.615Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e7/bc/54aec2c334698cc575ca3b3481eed627125fb66544152fa1af927b1a495c/mcp-1.9.1.tar.gz", hash = "sha256:19879cd6dde3d763297617242888c2f695a95dfa854386a6a68676a646ce75e4", size = 316247, upload-time = "2025-05-22T15:52:21.26Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a5/d5/22e36c95c83c80eb47c83f231095419cf57cf5cca5416f1c960032074c78/mcp-1.9.0-py3-none-any.whl", hash = "sha256:9dfb89c8c56f742da10a5910a1f64b0d2ac2c3ed2bd572ddb1cfab7f35957178", size = 125082, upload-time = "2025-05-15T18:51:04.916Z" }, + { url = "https://files.pythonhosted.org/packages/a6/c0/4ac795585a22a0a2d09cd2b1187b0252d2afcdebd01e10a68bbac4d34890/mcp-1.9.1-py3-none-any.whl", hash = "sha256:2900ded8ffafc3c8a7bfcfe8bc5204037e988e753ec398f371663e6a06ecd9a9", size = 130261, upload-time = "2025-05-22T15:52:19.702Z" }, ] [[package]] @@ -2631,6 +2631,7 @@ dependencies = [ { name = "jinja2" }, { name = "mcp" }, { name = "requests" }, + { name = "tool-usage-evals" }, ] [package.dev-dependencies] @@ -2650,6 +2651,7 @@ requires-dist = [ { name = "jinja2", specifier = "~=3.0" }, { name = "mcp", specifier = ">=1.8.0" }, { name = "requests", specifier = ">=2.32.3" }, + { name = "tool-usage-evals", git = "https://github.com/dennischenfeng/tool-usage-evals?branch=Feature%2Ffirst" }, ] [package.metadata.requires-dev] @@ -2923,7 +2925,7 @@ wheels = [ [[package]] name = "openai" -version = "1.79.0" +version = "1.82.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -2935,9 +2937,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/52/cf/4901077dbbfd0d82a814d721600fa0c3a61a093d7f0bf84d0e4732448dc9/openai-1.79.0.tar.gz", hash = "sha256:e3b627aa82858d3e42d16616edc22aa9f7477ee5eb3e6819e9f44a961d899a4c", size = 444736, upload-time = "2025-05-16T19:49:59.738Z" } +sdist = { url = "https://files.pythonhosted.org/packages/3f/19/6b09bb3132f7e1a7a2291fd46fb33659bbccca041f863abd682e14ba86d7/openai-1.82.0.tar.gz", hash = "sha256:b0a009b9a58662d598d07e91e4219ab4b1e3d8ba2db3f173896a92b9b874d1a7", size = 461092, upload-time = "2025-05-22T20:08:07.282Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/81/d2/e3992bb7c6641b765c1008e3c96e076e0b50381be2cce344e6ff177bad80/openai-1.79.0-py3-none-any.whl", hash = "sha256:d5050b92d5ef83f869cb8dcd0aca0b2291c3413412500eec40c66981b3966992", size = 683334, upload-time = "2025-05-16T19:49:57.445Z" }, + { url = "https://files.pythonhosted.org/packages/51/4b/a59464ee5f77822a81ee069b4021163a0174940a92685efc3cf8b4c443a3/openai-1.82.0-py3-none-any.whl", hash = "sha256:8c40647fea1816516cb3de5189775b30b5f4812777e40b8768f361f232b61b30", size = 720412, upload-time = "2025-05-22T20:08:05.637Z" }, ] [[package]] @@ -3404,7 +3406,7 @@ wheels = [ [[package]] name = "pydantic" -version = "2.11.4" +version = "2.11.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "annotated-types" }, @@ -3412,9 +3414,9 @@ dependencies = [ { name = "typing-extensions" }, { name = "typing-inspection" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/77/ab/5250d56ad03884ab5efd07f734203943c8a8ab40d551e208af81d0257bf2/pydantic-2.11.4.tar.gz", hash = "sha256:32738d19d63a226a52eed76645a98ee07c1f410ee41d93b4afbfa85ed8111c2d", size = 786540, upload-time = "2025-04-29T20:38:55.02Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/f0/86/8ce9040065e8f924d642c58e4a344e33163a07f6b57f836d0d734e0ad3fb/pydantic-2.11.5.tar.gz", hash = "sha256:7f853db3d0ce78ce8bbb148c401c2cdd6431b3473c0cdff2755c7690952a7b7a", size = 787102, upload-time = "2025-05-22T21:18:08.761Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e7/12/46b65f3534d099349e38ef6ec98b1a5a81f42536d17e0ba382c28c67ba67/pydantic-2.11.4-py3-none-any.whl", hash = "sha256:d9615eaa9ac5a063471da949c8fc16376a84afb5024688b3ff885693506764eb", size = 443900, upload-time = "2025-04-29T20:38:52.724Z" }, + { url = "https://files.pythonhosted.org/packages/b5/69/831ed22b38ff9b4b64b66569f0e5b7b97cf3638346eb95a2147fdb49ad5f/pydantic-2.11.5-py3-none-any.whl", hash = "sha256:f9c26ba06f9747749ca1e5c94d6a85cb84254577553c8785576fd38fa64dc0f7", size = 444229, upload-time = "2025-05-22T21:18:06.329Z" }, ] [[package]] @@ -4294,6 +4296,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257, upload-time = "2024-11-27T22:38:35.385Z" }, ] +[[package]] +name = "tool-usage-evals" +version = "0.1.0" +source = { git = "https://github.com/dennischenfeng/tool-usage-evals?branch=Feature%2Ffirst#bd22edfdc0a2ca27bd9925911620bcad849b9e8b" } +dependencies = [ + { name = "azure-identity" }, + { name = "mcp" }, + { name = "openai" }, + { name = "pydantic" }, + { name = "python-dotenv" }, +] + [[package]] name = "tqdm" version = "4.67.1" From 6ee2365d2e4063d27d59fd68a4dd7d9441ab2033 Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Wed, 28 May 2025 17:10:21 -0700 Subject: [PATCH 06/12] adding 1 pytest to test tool-usage-evals library --- pyproject.toml | 13 ++- src/mcp_foundry/__main__.py | 14 ++- tests/test_mcp.py | 182 ++++++++++++++++++++++-------------- uv.lock | 10 +- 4 files changed, 135 insertions(+), 84 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3260325..ee29993 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ dependencies = [ "azure-cli>=2.60.0", "azure-ai-evaluation>=1.3.0", "azure-ai-projects==1.0.0b10", - "tool-usage-evals", + "tool-usage-evals>=0.1.0", ] [dependency-groups] @@ -22,12 +22,17 @@ test = [ "pytest>=8.3.5", "pytest-asyncio>=0.26.0", ] + [tool.pytest.ini_options] asyncio_default_fixture_loop_scope = "function" # or "module", "session" based on my use case pythonpath = ["src"] -[tool.uv.sources] -tool-usage-evals = { git = "https://github.com/dennischenfeng/tool-usage-evals", branch = "Feature/first" } - [project.scripts] run-azure-ai-foundry-mcp = "mcp_foundry.__main__:main" + +[build-system] +requires = ["hatchling", "uv-dynamic-versioning"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/mcp_foundry"] \ No newline at end of file diff --git a/src/mcp_foundry/__main__.py b/src/mcp_foundry/__main__.py index 1fd17dd..5b14cde 100644 --- a/src/mcp_foundry/__main__.py +++ b/src/mcp_foundry/__main__.py @@ -5,7 +5,7 @@ from typing import Literal from dotenv import load_dotenv -from .mcp_server import mcp, auto_import_modules +from mcp_foundry.mcp_server import mcp, auto_import_modules # Configure logging @@ -16,15 +16,19 @@ ) logger = logging.getLogger("__main__") + def main() -> None: """Runs the MCP server""" parser = ArgumentParser(description="Start the MCP service with provided or default configuration.") - parser.add_argument('--transport', 
required=False, default='stdio', - help='Transport protocol (sse | stdio | streamable-http) (default: stdio)') - parser.add_argument('--envFile', required=False, default='.env', - help='Path to .env file (default: .env)') + parser.add_argument( + "--transport", + required=False, + default="stdio", + help="Transport protocol (sse | stdio | streamable-http) (default: stdio)", + ) + parser.add_argument("--envFile", required=False, default=".env", help="Path to .env file (default: .env)") # Parse the application arguments args = parser.parse_args() diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 24f378e..8338f54 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -1,3 +1,10 @@ +from tool_usage_evals.multi_step import run_agent_turn +from pathlib import Path +from tool_usage_evals.mcp_handling import ( + mcp_session_context_manager, + extract_tool_definitions, + build_mcp_tool_caller, +) import os import pytest from mcp import ClientSession, StdioServerParameters @@ -10,8 +17,19 @@ from dotenv import load_dotenv load_dotenv() +MCP_SERVER_SCRIPT = Path(__file__).parent / "../src/mcp_foundry/__main__.py" -token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default") + +@pytest.fixture(scope="session") +def aoai_client() -> AzureOpenAI: + """Azure OpenAI client""" + token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default") + client = AzureOpenAI( + azure_ad_token_provider=token_provider, + azure_endpoint=os.environ["AOAI_ENDPOINT"], + api_version=os.environ["AOAI_API_VERSION"], + ) + return client @pytest.mark.integration @@ -30,78 +48,98 @@ async def test_mcp_client_lists_tools(): assert tools, "Expected at least one tool from the MCP server" -async def verify_mcp_tool_call(user_message: str, expected_tool_call_name: str, no_cache: bool = False): - """ - Helper function to test MCP tool calling functionality with different messages and expected tools. 
- - Args: - user_message: The query to send to the model - expected_tool_call_name: The name of the tool we expect the model to call - no_cache: Whether to use --no-cache flag when running the MCP server - """ - args = ["run", "--spec", "..", "run-azure-ai-foundry-mcp"] - if no_cache: - args.insert(1, "--no-cache") - - server_params = StdioServerParameters( - command="pipx", - args=args, - ) - - async with stdio_client(server_params) as (stdio, write): - async with ClientSession(stdio, write) as session: - await session.initialize() - tools_response = await session.list_tools() - openai_tools = construct_openai_tools_from_mcp_tools( - mcp_tools=tools_response, - ) - aoai_client = AzureOpenAI( - azure_endpoint=os.environ["AOAI_ENDPOINT"], - api_version=os.environ["AOAI_API_VERSION"], - azure_ad_token_provider=token_provider, - ) - - completion = invoke_llm_with_tools( - user_message=user_message, - aoai_client=aoai_client, - model=os.environ["AOAI_MODEL"], - tools=openai_tools, - ) - response_message = completion.choices[0].message - - actual_tool_calls = response_message.tool_calls - assert len(actual_tool_calls) > 0, "Expected at least one tool call but got none" - assert isinstance(actual_tool_calls[0], ChatCompletionMessageToolCall) - assert actual_tool_calls[0].function.name == expected_tool_call_name - - return completion - - -@pytest.mark.integration -@pytest.mark.asyncio -async def test_mcp_client_message_1(): - """Test that the model correctly calls list_azure_ai_foundry_labs_projects tool""" - await verify_mcp_tool_call( - user_message="Tell me the Azure AI Foundry Labs projects", - expected_tool_call_name="list_azure_ai_foundry_labs_projects", - ) +# async def verify_mcp_tool_call(user_message: str, expected_tool_call_name: str, no_cache: bool = False): +# """ +# Helper function to test MCP tool calling functionality with different messages and expected tools. 
+ +# Args: +# user_message: The query to send to the model +# expected_tool_call_name: The name of the tool we expect the model to call +# no_cache: Whether to use --no-cache flag when running the MCP server +# """ +# args = ["run", "--spec", "..", "run-azure-ai-foundry-mcp"] +# if no_cache: +# args.insert(1, "--no-cache") + +# server_params = StdioServerParameters( +# command="pipx", +# args=args, +# ) + +# async with stdio_client(server_params) as (stdio, write): +# async with ClientSession(stdio, write) as session: +# await session.initialize() +# tools_response = await session.list_tools() +# openai_tools = construct_openai_tools_from_mcp_tools( +# mcp_tools=tools_response, +# ) +# aoai_client = AzureOpenAI( +# azure_endpoint=os.environ["AOAI_ENDPOINT"], +# api_version=os.environ["AOAI_API_VERSION"], +# azure_ad_token_provider=token_provider, +# ) + +# completion = invoke_llm_with_tools( +# user_message=user_message, +# aoai_client=aoai_client, +# model=os.environ["AOAI_MODEL"], +# tools=openai_tools, +# ) +# response_message = completion.choices[0].message + +# actual_tool_calls = response_message.tool_calls +# assert len(actual_tool_calls) > 0, "Expected at least one tool call but got none" +# assert isinstance(actual_tool_calls[0], ChatCompletionMessageToolCall) +# assert actual_tool_calls[0].function.name == expected_tool_call_name + +# return completion + + +# @pytest.mark.integration +# @pytest.mark.asyncio +# async def test_mcp_client_message_1(): +# """Test that the model correctly calls list_azure_ai_foundry_labs_projects tool""" +# await verify_mcp_tool_call( +# user_message="Tell me the Azure AI Foundry Labs projects", +# expected_tool_call_name="list_azure_ai_foundry_labs_projects", +# ) + + +# @pytest.mark.integration +# @pytest.mark.asyncio +# async def test_mcp_client_message_2(): +# await verify_mcp_tool_call( +# user_message="I want to prototype an app with Azure AI Foundry Labs. Where do I start?", +# expected_tool_call_name="get_prototyping_instructions_for_github_and_labs", +# ) + + +# @pytest.mark.integration +# @pytest.mark.asyncio +# async def test_mcp_client_message_3(): +# # TODO: Create a more sophisticated tool call verification step that handles the stochosticity. +# # This is because this pytest sometimes gives different tool calls which is expected of a vague freeform input +# await verify_mcp_tool_call( +# user_message="I want to use the Aurora model from Azure AI Foundry Labs; fetch details on how to implement it.", +# expected_tool_call_name="list_azure_ai_foundry_labs_projects", +# ) @pytest.mark.integration @pytest.mark.asyncio -async def test_mcp_client_message_2(): - await verify_mcp_tool_call( - user_message="I want to prototype an app with Azure AI Foundry Labs. Where do I start?", - expected_tool_call_name="get_prototyping_instructions_for_github_and_labs", - ) - - -@pytest.mark.integration -@pytest.mark.asyncio -async def test_mcp_client_message_3(): - # TODO: Create a more sophisticated tool call verification step that handles the stochosticity. - # This is because this pytest sometimes gives different tool calls which is expected of a vague freeform input - await verify_mcp_tool_call( - user_message="I want to use the Aurora model from Azure AI Foundry Labs; fetch details on how to implement it.", - expected_tool_call_name="list_azure_ai_foundry_labs_projects", - ) +async def test_mcp_client_message_10(aoai_client) -> None: + """test""" + user_message = "What are the projects in Azure AI Foundry Labs?" 
+ async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: + tools = await extract_tool_definitions(session) + call_tool_fn = await build_mcp_tool_caller(session) + + result = await run_agent_turn( + aoai_client=aoai_client, + tools=tools, + call_tool_fn=call_tool_fn, + user_message=user_message, + ) + + tool_call_names = [t.name for t in result.tool_calls] + assert "list_azure_ai_foundry_labs_projects" in tool_call_names diff --git a/uv.lock b/uv.lock index aa1d5b7..278f691 100644 --- a/uv.lock +++ b/uv.lock @@ -2620,7 +2620,7 @@ wheels = [ [[package]] name = "mcp-foundry" version = "0.1.0" -source = { virtual = "." } +source = { editable = "." } dependencies = [ { name = "azure-ai-evaluation" }, { name = "azure-ai-projects" }, @@ -2651,7 +2651,7 @@ requires-dist = [ { name = "jinja2", specifier = "~=3.0" }, { name = "mcp", specifier = ">=1.8.0" }, { name = "requests", specifier = ">=2.32.3" }, - { name = "tool-usage-evals", git = "https://github.com/dennischenfeng/tool-usage-evals?branch=Feature%2Ffirst" }, + { name = "tool-usage-evals", specifier = ">=0.1.0" }, ] [package.metadata.requires-dev] @@ -4299,7 +4299,7 @@ wheels = [ [[package]] name = "tool-usage-evals" version = "0.1.0" -source = { git = "https://github.com/dennischenfeng/tool-usage-evals?branch=Feature%2Ffirst#bd22edfdc0a2ca27bd9925911620bcad849b9e8b" } +source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "azure-identity" }, { name = "mcp" }, @@ -4307,6 +4307,10 @@ dependencies = [ { name = "pydantic" }, { name = "python-dotenv" }, ] +sdist = { url = "https://files.pythonhosted.org/packages/fe/eb/52307358325344969d3b12957df06214ba2b25597f78c872f4eda7552094/tool_usage_evals-0.1.0.tar.gz", hash = "sha256:b808ecbd74c9456580bb9c0a7a4b3ffc7a0ea2ecabf900cfcdabd4a796c776b1", size = 48529, upload-time = "2025-05-28T22:11:35.89Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/65/2d/ebc363bbbd1b87891c1f44f0947df4e98bdade593eafcc23bc507be873f1/tool_usage_evals-0.1.0-py3-none-any.whl", hash = "sha256:f4b1be40afa2b2f9f7e1114b23176f483a316bf4bfd5bc0c156f6c62ef051840", size = 5394, upload-time = "2025-05-28T22:11:34.354Z" }, +] [[package]] name = "tqdm" From d3eb84336d440714b6d704ca4c3e13ac442354e2 Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Wed, 28 May 2025 18:08:01 -0700 Subject: [PATCH 07/12] first test is working, using tool-usage-evals library --- pyproject.toml | 4 ++-- src/mcp_foundry/mcp_foundry_model/tools.py | 2 +- uv.lock | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ee29993..e44e0de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "MCP Server for Azure AI Foundry (experimental)" readme = "README.md" requires-python = ">=3.10" dependencies = [ - "mcp>=1.8.0", + "mcp>=1.9.1", "requests>=2.32.3", "azure-mgmt-cognitiveservices>=13.0.0", "azure-identity>=1.0", @@ -14,7 +14,7 @@ dependencies = [ "azure-cli>=2.60.0", "azure-ai-evaluation>=1.3.0", "azure-ai-projects==1.0.0b10", - "tool-usage-evals>=0.1.0", + "tool-usage-evals>=0.1.1", ] [dependency-groups] diff --git a/src/mcp_foundry/mcp_foundry_model/tools.py b/src/mcp_foundry/mcp_foundry_model/tools.py index 57c80e0..a88fc79 100644 --- a/src/mcp_foundry/mcp_foundry_model/tools.py +++ b/src/mcp_foundry/mcp_foundry_model/tools.py @@ -37,7 +37,7 @@ @mcp.tool() -async def list_models_from_model_catalog(ctx: Context, search_for_free_playground: bool = False, publisher_name = "", license_name = "") -> str: +async 
def list_models_from_model_catalog(ctx: Context, search_for_free_playground: bool = False, publisher_name: str = "", license_name: str = "") -> str: """ Retrieves a list of supported models from the Azure AI Foundry catalog. diff --git a/uv.lock b/uv.lock index 278f691..c048b3a 100644 --- a/uv.lock +++ b/uv.lock @@ -2649,9 +2649,9 @@ requires-dist = [ { name = "azure-mgmt-cognitiveservices", specifier = ">=13.0.0" }, { name = "azure-search-documents", specifier = ">=11.5.2" }, { name = "jinja2", specifier = "~=3.0" }, - { name = "mcp", specifier = ">=1.8.0" }, + { name = "mcp", specifier = ">=1.9.1" }, { name = "requests", specifier = ">=2.32.3" }, - { name = "tool-usage-evals", specifier = ">=0.1.0" }, + { name = "tool-usage-evals", specifier = ">=0.1.1" }, ] [package.metadata.requires-dev] @@ -4298,7 +4298,7 @@ wheels = [ [[package]] name = "tool-usage-evals" -version = "0.1.0" +version = "0.1.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "azure-identity" }, @@ -4307,9 +4307,9 @@ dependencies = [ { name = "pydantic" }, { name = "python-dotenv" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/fe/eb/52307358325344969d3b12957df06214ba2b25597f78c872f4eda7552094/tool_usage_evals-0.1.0.tar.gz", hash = "sha256:b808ecbd74c9456580bb9c0a7a4b3ffc7a0ea2ecabf900cfcdabd4a796c776b1", size = 48529, upload-time = "2025-05-28T22:11:35.89Z" } +sdist = { url = "https://files.pythonhosted.org/packages/0c/ba/150dfce6eaa2a1167f366fbc91b95ed05f68f3a972e83207b38c3dc81dcc/tool_usage_evals-0.1.1.tar.gz", hash = "sha256:8a666b8252623d9014b3d4f304b7ed512a7a8bf1e4fee22a921c565f799871dd", size = 48669, upload-time = "2025-05-29T01:00:48.58Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/65/2d/ebc363bbbd1b87891c1f44f0947df4e98bdade593eafcc23bc507be873f1/tool_usage_evals-0.1.0-py3-none-any.whl", hash = "sha256:f4b1be40afa2b2f9f7e1114b23176f483a316bf4bfd5bc0c156f6c62ef051840", size = 5394, upload-time = "2025-05-28T22:11:34.354Z" }, + { url = "https://files.pythonhosted.org/packages/43/e6/bb7678b08c0d45f78ef9323de2757180dfb11d8eb091690067d6fba100b1/tool_usage_evals-0.1.1-py3-none-any.whl", hash = "sha256:bf1fa5010bc1797e807064e096346d82042ba6a9d0110e81c0a160f5404b508f", size = 5414, upload-time = "2025-05-29T01:00:46.803Z" }, ] [[package]] From 016229e81640515a07ce4d7a85f644f58898fd2b Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Thu, 29 May 2025 14:48:51 -0700 Subject: [PATCH 08/12] tests all working because added tenacity for retries --- pyproject.toml | 3 ++- tests/test_mcp.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++- uv.lock | 11 +++++++++ 3 files changed, 69 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e44e0de..921e3b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ test = [ "pytest>=8.3.5", "pytest-asyncio>=0.26.0", + "tenacity>=9.1.2", ] [tool.pytest.ini_options] @@ -35,4 +36,4 @@ requires = ["hatchling", "uv-dynamic-versioning"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] -packages = ["src/mcp_foundry"] \ No newline at end of file +packages = ["src/mcp_foundry"] diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 8338f54..b994a0f 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -15,6 +15,21 @@ from azure.identity import DefaultAzureCredential, get_bearer_token_provider from dotenv import load_dotenv +import openai +from tenacity import ( + retry, + wait_random_exponential, + stop_after_attempt, + retry_if_exception_type, +) + + 
+retry_decorator = retry( + retry=retry_if_exception_type(openai.RateLimitError), + wait=wait_random_exponential(min=10, max=90), + stop=stop_after_attempt(6), + reraise=True, +) load_dotenv() MCP_SERVER_SCRIPT = Path(__file__).parent / "../src/mcp_foundry/__main__.py" @@ -134,7 +149,47 @@ async def test_mcp_client_message_10(aoai_client) -> None: tools = await extract_tool_definitions(session) call_tool_fn = await build_mcp_tool_caller(session) - result = await run_agent_turn( + result = await retry_decorator(run_agent_turn)( + aoai_client=aoai_client, + tools=tools, + call_tool_fn=call_tool_fn, + user_message=user_message, + ) + + tool_call_names = [t.name for t in result.tool_calls] + assert "list_azure_ai_foundry_labs_projects" in tool_call_names + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_mcp_client_message_20(aoai_client) -> None: + """test""" + user_message = "I want to prototype an app with Azure AI Foundry Labs. Where do I start?" + async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: + tools = await extract_tool_definitions(session) + call_tool_fn = await build_mcp_tool_caller(session) + + result = await retry_decorator(run_agent_turn)( + aoai_client=aoai_client, + tools=tools, + call_tool_fn=call_tool_fn, + user_message=user_message, + ) + + tool_call_names = [t.name for t in result.tool_calls] + assert "get_prototyping_instructions_for_github_and_labs" in tool_call_names + + +@pytest.mark.integration +@pytest.mark.asyncio +async def test_mcp_client_message_30(aoai_client) -> None: + """test""" + user_message = "I want to use the Aurora model from Azure AI Foundry Labs; fetch details on how to implement it." + async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: + tools = await extract_tool_definitions(session) + call_tool_fn = await build_mcp_tool_caller(session) + + result = await retry_decorator(run_agent_turn)( aoai_client=aoai_client, tools=tools, call_tool_fn=call_tool_fn, diff --git a/uv.lock b/uv.lock index c048b3a..0fd8132 100644 --- a/uv.lock +++ b/uv.lock @@ -2638,6 +2638,7 @@ dependencies = [ test = [ { name = "pytest" }, { name = "pytest-asyncio" }, + { name = "tenacity" }, ] [package.metadata] @@ -2658,6 +2659,7 @@ requires-dist = [ test = [ { name = "pytest", specifier = ">=8.3.5" }, { name = "pytest-asyncio", specifier = ">=0.26.0" }, + { name = "tenacity", specifier = ">=9.1.2" }, ] [[package]] @@ -4221,6 +4223,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, ] +[[package]] +name = "tenacity" +version = "9.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0a/d4/2b0cd0fe285e14b36db076e78c93766ff1d529d70408bd1d2a5a84f1d929/tenacity-9.1.2.tar.gz", hash = "sha256:1169d376c297e7de388d18b4481760d478b0e99a777cad3a9c86e556f4b697cb", size = 48036, upload-time = "2025-04-02T08:25:09.966Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" }, +] + [[package]] name = "tiktoken" version = "0.9.0" From 
f36cf12241a96648361326e28132f27415d56357 Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Thu, 29 May 2025 16:42:35 -0700 Subject: [PATCH 09/12] updated pytest to use multiple trials --- pyproject.toml | 1 + tests/test_mcp.py | 114 +++++++++------------------------------------- uv.lock | 2 + 3 files changed, 25 insertions(+), 92 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 921e3b7..05662af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "azure-ai-evaluation>=1.3.0", "azure-ai-projects==1.0.0b10", "tool-usage-evals>=0.1.1", + "tqdm>=4.67.1", ] [dependency-groups] diff --git a/tests/test_mcp.py b/tests/test_mcp.py index b994a0f..9df61f3 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -9,9 +9,7 @@ import pytest from mcp import ClientSession, StdioServerParameters from mcp.client.stdio import stdio_client -from .utils import construct_openai_tools_from_mcp_tools, invoke_llm_with_tools from openai import AzureOpenAI -from openai.types.chat import ChatCompletionMessageToolCall from azure.identity import DefaultAzureCredential, get_bearer_token_provider from dotenv import load_dotenv @@ -22,6 +20,7 @@ stop_after_attempt, retry_if_exception_type, ) +from tqdm import tqdm retry_decorator = retry( @@ -49,7 +48,7 @@ def aoai_client() -> AzureOpenAI: @pytest.mark.integration @pytest.mark.asyncio -async def test_mcp_client_lists_tools(): +async def test_mcp_client_lists_tools_using_pipx(): server_params = StdioServerParameters( command="pipx", args=["run", "--no-cache", "--spec", "..", "run-azure-ai-foundry-mcp"], @@ -63,86 +62,9 @@ async def test_mcp_client_lists_tools(): assert tools, "Expected at least one tool from the MCP server" -# async def verify_mcp_tool_call(user_message: str, expected_tool_call_name: str, no_cache: bool = False): -# """ -# Helper function to test MCP tool calling functionality with different messages and expected tools. 
- -# Args: -# user_message: The query to send to the model -# expected_tool_call_name: The name of the tool we expect the model to call -# no_cache: Whether to use --no-cache flag when running the MCP server -# """ -# args = ["run", "--spec", "..", "run-azure-ai-foundry-mcp"] -# if no_cache: -# args.insert(1, "--no-cache") - -# server_params = StdioServerParameters( -# command="pipx", -# args=args, -# ) - -# async with stdio_client(server_params) as (stdio, write): -# async with ClientSession(stdio, write) as session: -# await session.initialize() -# tools_response = await session.list_tools() -# openai_tools = construct_openai_tools_from_mcp_tools( -# mcp_tools=tools_response, -# ) -# aoai_client = AzureOpenAI( -# azure_endpoint=os.environ["AOAI_ENDPOINT"], -# api_version=os.environ["AOAI_API_VERSION"], -# azure_ad_token_provider=token_provider, -# ) - -# completion = invoke_llm_with_tools( -# user_message=user_message, -# aoai_client=aoai_client, -# model=os.environ["AOAI_MODEL"], -# tools=openai_tools, -# ) -# response_message = completion.choices[0].message - -# actual_tool_calls = response_message.tool_calls -# assert len(actual_tool_calls) > 0, "Expected at least one tool call but got none" -# assert isinstance(actual_tool_calls[0], ChatCompletionMessageToolCall) -# assert actual_tool_calls[0].function.name == expected_tool_call_name - -# return completion - - -# @pytest.mark.integration -# @pytest.mark.asyncio -# async def test_mcp_client_message_1(): -# """Test that the model correctly calls list_azure_ai_foundry_labs_projects tool""" -# await verify_mcp_tool_call( -# user_message="Tell me the Azure AI Foundry Labs projects", -# expected_tool_call_name="list_azure_ai_foundry_labs_projects", -# ) - - -# @pytest.mark.integration -# @pytest.mark.asyncio -# async def test_mcp_client_message_2(): -# await verify_mcp_tool_call( -# user_message="I want to prototype an app with Azure AI Foundry Labs. Where do I start?", -# expected_tool_call_name="get_prototyping_instructions_for_github_and_labs", -# ) - - -# @pytest.mark.integration -# @pytest.mark.asyncio -# async def test_mcp_client_message_3(): -# # TODO: Create a more sophisticated tool call verification step that handles the stochosticity. -# # This is because this pytest sometimes gives different tool calls which is expected of a vague freeform input -# await verify_mcp_tool_call( -# user_message="I want to use the Aurora model from Azure AI Foundry Labs; fetch details on how to implement it.", -# expected_tool_call_name="list_azure_ai_foundry_labs_projects", -# ) - - @pytest.mark.integration @pytest.mark.asyncio -async def test_mcp_client_message_10(aoai_client) -> None: +async def test_mcp_client_message_1(aoai_client) -> None: """test""" user_message = "What are the projects in Azure AI Foundry Labs?" async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: @@ -162,7 +84,7 @@ async def test_mcp_client_message_10(aoai_client) -> None: @pytest.mark.integration @pytest.mark.asyncio -async def test_mcp_client_message_20(aoai_client) -> None: +async def test_mcp_client_message_2(aoai_client) -> None: """test""" user_message = "I want to prototype an app with Azure AI Foundry Labs. Where do I start?" 
async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: @@ -182,19 +104,27 @@ async def test_mcp_client_message_20(aoai_client) -> None: @pytest.mark.integration @pytest.mark.asyncio -async def test_mcp_client_message_30(aoai_client) -> None: +async def test_mcp_client_message_3(aoai_client) -> None: """test""" - user_message = "I want to use the Aurora model from Azure AI Foundry Labs; fetch details on how to implement it." + user_message = "Give me code and implementation details for the Aurora model." + n_trials = 5 async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: tools = await extract_tool_definitions(session) call_tool_fn = await build_mcp_tool_caller(session) - result = await retry_decorator(run_agent_turn)( - aoai_client=aoai_client, - tools=tools, - call_tool_fn=call_tool_fn, - user_message=user_message, - ) + results = [] + for trial in tqdm(range(n_trials)): + result = await retry_decorator(run_agent_turn)( + aoai_client=aoai_client, + tools=tools, + call_tool_fn=call_tool_fn, + user_message=user_message, + ) + results.append(result) - tool_call_names = [t.name for t in result.tool_calls] - assert "list_azure_ai_foundry_labs_projects" in tool_call_names + all_tool_call_names = [[t.name for t in result.tool_calls] for result in results] + + n_found_correct_tool = sum(["get_model_details_and_code_samples" in names for names in all_tool_call_names]) + accuracy = n_found_correct_tool / n_trials + + assert accuracy > 0.5 diff --git a/uv.lock b/uv.lock index 0fd8132..c80d583 100644 --- a/uv.lock +++ b/uv.lock @@ -2632,6 +2632,7 @@ dependencies = [ { name = "mcp" }, { name = "requests" }, { name = "tool-usage-evals" }, + { name = "tqdm" }, ] [package.dev-dependencies] @@ -2653,6 +2654,7 @@ requires-dist = [ { name = "mcp", specifier = ">=1.9.1" }, { name = "requests", specifier = ">=2.32.3" }, { name = "tool-usage-evals", specifier = ">=0.1.1" }, + { name = "tqdm", specifier = ">=4.67.1" }, ] [package.metadata.requires-dev] From c54aabe1cba712c83084359fb7947b20ac8a07be Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Thu, 29 May 2025 16:57:33 -0700 Subject: [PATCH 10/12] clean up --- pyproject.toml | 8 ++++---- tests/utils.py | 50 -------------------------------------------------- 2 files changed, 4 insertions(+), 54 deletions(-) delete mode 100644 tests/utils.py diff --git a/pyproject.toml b/pyproject.toml index 05662af..cc24e50 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "MCP Server for Azure AI Foundry (experimental)" readme = "README.md" requires-python = ">=3.10" dependencies = [ - "mcp>=1.9.1", + "mcp>=1.8.0", "requests>=2.32.3", "azure-mgmt-cognitiveservices>=13.0.0", "azure-identity>=1.0", @@ -14,8 +14,6 @@ dependencies = [ "azure-cli>=2.60.0", "azure-ai-evaluation>=1.3.0", "azure-ai-projects==1.0.0b10", - "tool-usage-evals>=0.1.1", - "tqdm>=4.67.1", ] [dependency-groups] @@ -23,6 +21,8 @@ test = [ "pytest>=8.3.5", "pytest-asyncio>=0.26.0", "tenacity>=9.1.2", + "tqdm>=4.67.1", + "tool-usage-evals>=0.1.1", ] [tool.pytest.ini_options] @@ -33,7 +33,7 @@ pythonpath = ["src"] run-azure-ai-foundry-mcp = "mcp_foundry.__main__:main" [build-system] -requires = ["hatchling", "uv-dynamic-versioning"] +requires = ["hatchling"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] diff --git a/tests/utils.py b/tests/utils.py deleted file mode 100644 index d3bcf61..0000000 --- a/tests/utils.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -Utilities for helping run tests. 
-""" - -from openai import AzureOpenAI -from openai.types.chat import ChatCompletion -from mcp.types import ListToolsResult - - -def construct_openai_tools_from_mcp_tools(mcp_tools: ListToolsResult) -> list[dict]: - """ - Given a tools list from MCP server, convert it to the format required to feed into Azure OpenAI chat completion. - """ - final_tools = [ - { - "type": "function", - "function": { - "name": tool.name, - "description": tool.description, - "parameters": tool.inputSchema, - }, - } - for tool in mcp_tools.tools - ] - return final_tools - - -def invoke_llm_with_tools( - user_message: str, - tools: list[dict], - aoai_client: AzureOpenAI, - model: str, -) -> ChatCompletion: - """ - Invoke a single LLM inference step on a user message, including the specified tools, and return the response - """ - messages = [ - dict( - role="user", - content=user_message, - ) - ] - - completion = aoai_client.chat.completions.create( - model=model, - messages=messages, - tools=tools, - tool_choice="auto", - ) - return completion From 765891765c32b38702721fd1fccb778aa8a78f41 Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Thu, 29 May 2025 17:02:44 -0700 Subject: [PATCH 11/12] added more to docstring of tests --- tests/test_mcp.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_mcp.py b/tests/test_mcp.py index 9df61f3..d3c2afb 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -65,7 +65,7 @@ async def test_mcp_client_lists_tools_using_pipx(): @pytest.mark.integration @pytest.mark.asyncio async def test_mcp_client_message_1(aoai_client) -> None: - """test""" + """Test tool usage for a user message asking about foundry labs projects.""" user_message = "What are the projects in Azure AI Foundry Labs?" async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: tools = await extract_tool_definitions(session) @@ -85,7 +85,7 @@ async def test_mcp_client_message_1(aoai_client) -> None: @pytest.mark.integration @pytest.mark.asyncio async def test_mcp_client_message_2(aoai_client) -> None: - """test""" + """Test tool usage for a user message asking about prototyping with foundry labs projects.""" user_message = "I want to prototype an app with Azure AI Foundry Labs. Where do I start?" async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: tools = await extract_tool_definitions(session) @@ -105,7 +105,11 @@ async def test_mcp_client_message_2(aoai_client) -> None: @pytest.mark.integration @pytest.mark.asyncio async def test_mcp_client_message_3(aoai_client) -> None: - """test""" + """ + Test tool usage for a user message asking code/implementation details. + Because of stochasticity of response (sometimes uses the prototyping tool or list-projects + tool instead of intended code-samples tool), we do n repeated trials. + """ user_message = "Give me code and implementation details for the Aurora model." 
n_trials = 5 async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: From d9318bf6d5c4d5d2e660156ebda718b1afa24942 Mon Sep 17 00:00:00 2001 From: Dennis Feng Date: Thu, 29 May 2025 18:13:28 -0700 Subject: [PATCH 12/12] reduced n_trials from 5 to 3 --- pyproject.toml | 2 +- tests/test_mcp.py | 9 ++++++--- uv.lock | 17 +++++++++-------- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cc24e50..48dea49 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ test = [ "pytest-asyncio>=0.26.0", "tenacity>=9.1.2", "tqdm>=4.67.1", - "tool-usage-evals>=0.1.1", + "tool-usage-evals>=0.1.4", ] [tool.pytest.ini_options] diff --git a/tests/test_mcp.py b/tests/test_mcp.py index d3c2afb..940ecca 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -40,8 +40,8 @@ def aoai_client() -> AzureOpenAI: token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default") client = AzureOpenAI( azure_ad_token_provider=token_provider, - azure_endpoint=os.environ["AOAI_ENDPOINT"], - api_version=os.environ["AOAI_API_VERSION"], + azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"], + api_version=os.environ["AZURE_OPENAI_API_VERSION"], ) return client @@ -73,6 +73,7 @@ async def test_mcp_client_message_1(aoai_client) -> None: result = await retry_decorator(run_agent_turn)( aoai_client=aoai_client, + model=os.environ["AZURE_OPENAI_DEPLOYMENT"], tools=tools, call_tool_fn=call_tool_fn, user_message=user_message, @@ -93,6 +94,7 @@ async def test_mcp_client_message_2(aoai_client) -> None: result = await retry_decorator(run_agent_turn)( aoai_client=aoai_client, + model=os.environ["AZURE_OPENAI_DEPLOYMENT"], tools=tools, call_tool_fn=call_tool_fn, user_message=user_message, @@ -111,7 +113,7 @@ async def test_mcp_client_message_3(aoai_client) -> None: tool instead of intended code-samples tool), we do n repeated trials. """ user_message = "Give me code and implementation details for the Aurora model." 
- n_trials = 5 + n_trials = 3 async with mcp_session_context_manager("python", [str(MCP_SERVER_SCRIPT)]) as session: tools = await extract_tool_definitions(session) call_tool_fn = await build_mcp_tool_caller(session) @@ -120,6 +122,7 @@ async def test_mcp_client_message_3(aoai_client) -> None: for trial in tqdm(range(n_trials)): result = await retry_decorator(run_agent_turn)( aoai_client=aoai_client, + model=os.environ["AZURE_OPENAI_DEPLOYMENT"], tools=tools, call_tool_fn=call_tool_fn, user_message=user_message, diff --git a/uv.lock b/uv.lock index c80d583..b73f619 100644 --- a/uv.lock +++ b/uv.lock @@ -2631,8 +2631,6 @@ dependencies = [ { name = "jinja2" }, { name = "mcp" }, { name = "requests" }, - { name = "tool-usage-evals" }, - { name = "tqdm" }, ] [package.dev-dependencies] @@ -2640,6 +2638,8 @@ test = [ { name = "pytest" }, { name = "pytest-asyncio" }, { name = "tenacity" }, + { name = "tool-usage-evals" }, + { name = "tqdm" }, ] [package.metadata] @@ -2651,10 +2651,8 @@ requires-dist = [ { name = "azure-mgmt-cognitiveservices", specifier = ">=13.0.0" }, { name = "azure-search-documents", specifier = ">=11.5.2" }, { name = "jinja2", specifier = "~=3.0" }, - { name = "mcp", specifier = ">=1.9.1" }, + { name = "mcp", specifier = ">=1.8.0" }, { name = "requests", specifier = ">=2.32.3" }, - { name = "tool-usage-evals", specifier = ">=0.1.1" }, - { name = "tqdm", specifier = ">=4.67.1" }, ] [package.metadata.requires-dev] @@ -2662,6 +2660,8 @@ test = [ { name = "pytest", specifier = ">=8.3.5" }, { name = "pytest-asyncio", specifier = ">=0.26.0" }, { name = "tenacity", specifier = ">=9.1.2" }, + { name = "tool-usage-evals", specifier = ">=0.1.4" }, + { name = "tqdm", specifier = ">=4.67.1" }, ] [[package]] @@ -4311,7 +4311,7 @@ wheels = [ [[package]] name = "tool-usage-evals" -version = "0.1.1" +version = "0.1.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "azure-identity" }, @@ -4319,10 +4319,11 @@ dependencies = [ { name = "openai" }, { name = "pydantic" }, { name = "python-dotenv" }, + { name = "tenacity" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/0c/ba/150dfce6eaa2a1167f366fbc91b95ed05f68f3a972e83207b38c3dc81dcc/tool_usage_evals-0.1.1.tar.gz", hash = "sha256:8a666b8252623d9014b3d4f304b7ed512a7a8bf1e4fee22a921c565f799871dd", size = 48669, upload-time = "2025-05-29T01:00:48.58Z" } +sdist = { url = "https://files.pythonhosted.org/packages/68/5c/57e6940cbb191a982dee71863c125aad943a84b4f30d4ce08a522642f764/tool_usage_evals-0.1.4.tar.gz", hash = "sha256:7abbeb257183a8fbe818b7f0e793a5afe669ff96e2dfc3ae28f87a4ae41b8731", size = 49210, upload-time = "2025-05-30T00:49:51.715Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/43/e6/bb7678b08c0d45f78ef9323de2757180dfb11d8eb091690067d6fba100b1/tool_usage_evals-0.1.1-py3-none-any.whl", hash = "sha256:bf1fa5010bc1797e807064e096346d82042ba6a9d0110e81c0a160f5404b508f", size = 5414, upload-time = "2025-05-29T01:00:46.803Z" }, + { url = "https://files.pythonhosted.org/packages/09/73/298e98da166fbd16820affbed4c213bc22af77fd2e68df3381add4f186db/tool_usage_evals-0.1.4-py3-none-any.whl", hash = "sha256:732b167927697bf19bb924c2341f27056e591ad6496023b288884e8081d11ce1", size = 5578, upload-time = "2025-05-30T00:49:50.49Z" }, ] [[package]]
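
Note on running the integration tests introduced in this series: they load settings from a .env file via load_dotenv(), authenticate with DefaultAzureCredential, and are marked with @pytest.mark.integration. As a minimal sketch only — the variable names come from the final test code, the values are placeholders, an Azure credential usable by DefaultAzureCredential (e.g. from `az login`) is assumed, and the exact pytest/uv invocation is an assumption rather than something specified in the patches — a local run might look like:

    # .env (placeholder values; names taken from tests/test_mcp.py)
    AZURE_OPENAI_ENDPOINT=https://<your-resource>.openai.azure.com/
    AZURE_OPENAI_API_VERSION=2024-10-21
    AZURE_OPENAI_DEPLOYMENT=<your-chat-model-deployment>

    # run only the integration-marked tests (assumed invocation)
    uv run pytest -m integration tests/test_mcp.py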