fix: accuracy test fixes (#651)

nirinchev · web-flow · commit 1cf6f6dbed30 · 2025-10-15T16:19:24.000+02:00
diff --git a/scripts/accuracy/generateTestSummary.ts b/scripts/accuracy/generateTestSummary.ts
@@ -73,7 +73,8 @@ function formatToolCallsWithTooltip(toolCalls: ExpectedToolCall[] | LLMToolCall[
     return toolCalls
         .map((call) => {
             const params = JSON.stringify(call.parameters, null, 2);
-            return `<span class="tool-call" title="${params.replace(/"/g, "&quot;")}">${call.toolName}</span>`;
+            const isOptional = "optional" in call && call.optional;
+            return `<span class="tool-call" title="${params.replace(/"/g, "&quot;")}">${isOptional ? "(" : ""}${call.toolName}${isOptional ? ")" : ""}</span>`;
         })
         .join(", ");
 }
diff --git a/tests/accuracy/createCollection.test.ts b/tests/accuracy/createCollection.test.ts
@@ -28,6 +28,11 @@ describeAccuracyTests([
     {
         prompt: "If and only if, the namespace 'mflix.documentaries' does not exist, then create it",
         expectedToolCalls: [
+            {
+                toolName: "list-databases",
+                parameters: {},
+                optional: true,
+            },
             {
                 toolName: "list-collections",
                 parameters: {
diff --git a/tests/accuracy/createIndex.test.ts b/tests/accuracy/createIndex.test.ts
@@ -1,19 +1,8 @@
-import { afterAll, beforeAll } from "vitest";
 import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js";
 import { Matcher } from "./sdk/matcher.js";
 
-let originalApiKey: string | undefined;
-beforeAll(() => {
-    originalApiKey = process.env.MDB_VOYAGE_API_KEY;
-
-    // We just need a valid key when registering the tool, the actual value is not important
-    if (!originalApiKey) {
-        process.env.MDB_VOYAGE_API_KEY = "valid-key";
-    }
-});
-afterAll(() => {
-    process.env.MDB_VOYAGE_API_KEY = originalApiKey;
-});
+// TODO: supply this with a proper config API once we refactor describeAccuracyTests to support it
+process.env.MDB_VOYAGE_API_KEY = "valid-key";
 
 describeAccuracyTests([
     {
diff --git a/tests/accuracy/dropCollection.test.ts b/tests/accuracy/dropCollection.test.ts
@@ -4,6 +4,18 @@ describeAccuracyTests([
     {
         prompt: "Remove mflix.movies namespace from my cluster.",
         expectedToolCalls: [
+            {
+                toolName: "list-databases",
+                parameters: {},
+                optional: true,
+            },
+            {
+                toolName: "list-collections",
+                parameters: {
+                    database: "mflix",
+                },
+                optional: true,
+            },
             {
                 toolName: "drop-collection",
                 parameters: {
diff --git a/tests/accuracy/dropDatabase.test.ts b/tests/accuracy/dropDatabase.test.ts
@@ -4,6 +4,11 @@ describeAccuracyTests([
     {
         prompt: "Remove mflix database from my cluster.",
         expectedToolCalls: [
+            {
+                toolName: "list-databases",
+                parameters: {},
+                optional: true,
+            },
             {
                 toolName: "drop-database",
                 parameters: {
diff --git a/tests/accuracy/export.test.ts b/tests/accuracy/export.test.ts
@@ -114,12 +114,20 @@ describeAccuracyTests([
                             arguments: {
                                 pipeline: [
                                     {
-                                        $group: {
-                                            _id: "$release_year",
-                                            titles: {
-                                                $push: "$title",
-                                            },
-                                        },
+                                        $group: Matcher.anyOf(
+                                            Matcher.value({
+                                                _id: "$release_year",
+                                                titles: {
+                                                    $push: "$title",
+                                                },
+                                            }),
+                                            Matcher.value({
+                                                _id: "$release_year",
+                                                movies: {
+                                                    $push: "$title",
+                                                },
+                                            })
+                                        ),
                                     },
                                 ],
                             },
diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts
@@ -124,6 +124,7 @@ describeAccuracyTests([
                     limit: Matcher.anyValue,
                     sort: Matcher.anyValue,
                 },
+                optional: true,
             },
             {
                 toolName: "export",
@@ -137,14 +138,19 @@ describeAccuracyTests([
                             arguments: Matcher.anyOf(
                                 Matcher.emptyObjectOrUndefined,
                                 Matcher.value({
-                                    filter: Matcher.anyValue,
+                                    filter: Matcher.emptyObjectOrUndefined,
                                     projection: Matcher.anyValue,
                                     limit: Matcher.anyValue,
                                     sort: Matcher.anyValue,
                                 })
                             ),
                         },
                     ],
+                    jsonExportFormat: Matcher.anyOf(
+                        Matcher.undefined,
+                        Matcher.value("relaxed"),
+                        Matcher.value("canonical")
+                    ),
                 },
             },
         ],
diff --git a/tests/accuracy/getPerformanceAdvisor.test.ts b/tests/accuracy/getPerformanceAdvisor.test.ts
@@ -35,21 +35,27 @@ const mockedTools = {
     },
 };
 
+const listProjectsAndClustersToolCalls = [
+    {
+        toolName: "atlas-list-projects",
+        parameters: {},
+        optional: true,
+    },
+    {
+        toolName: "atlas-list-clusters",
+        parameters: {
+            projectId: "mflix",
+        },
+        optional: true,
+    },
+];
+
 describeAccuracyTests([
     // Test for Suggested Indexes operation
     {
         prompt: "Can you give me index suggestions for the database 'mflix' in the project 'mflix' and cluster 'mflix-cluster'?",
         expectedToolCalls: [
-            {
-                toolName: "atlas-list-projects",
-                parameters: {},
-            },
-            {
-                toolName: "atlas-list-clusters",
-                parameters: {
-                    projectId: "mflix",
-                },
-            },
+            ...listProjectsAndClustersToolCalls,
             {
                 toolName: "atlas-get-performance-advisor",
                 parameters: {
@@ -65,16 +71,7 @@ describeAccuracyTests([
     {
         prompt: "Show me drop index suggestions for the 'mflix' project and 'mflix-cluster' cluster",
         expectedToolCalls: [
-            {
-                toolName: "atlas-list-projects",
-                parameters: {},
-            },
-            {
-                toolName: "atlas-list-clusters",
-                parameters: {
-                    projectId: "mflix",
-                },
-            },
+            ...listProjectsAndClustersToolCalls,
             {
                 toolName: "atlas-get-performance-advisor",
                 parameters: {
@@ -88,26 +85,17 @@ describeAccuracyTests([
     },
     // Test for Slow Query Logs operation
     {
-        prompt: "Show me the slow query logs for the 'mflix' project and 'mflix-cluster' cluster for the namespaces 'mflix.movies' and 'mflix.shows' since January 1st, 2025.",
+        prompt: "Show me the slow query logs for the 'mflix' project and 'mflix-cluster' cluster for the namespaces 'mflix.movies' and 'mflix.shows' since January 1st, 2023",
         expectedToolCalls: [
-            {
-                toolName: "atlas-list-projects",
-                parameters: {},
-            },
-            {
-                toolName: "atlas-list-clusters",
-                parameters: {
-                    projectId: "mflix",
-                },
-            },
+            ...listProjectsAndClustersToolCalls,
             {
                 toolName: "atlas-get-performance-advisor",
                 parameters: {
                     projectId: "mflix",
                     clusterName: "mflix-cluster",
                     operations: ["slowQueryLogs"],
                     namespaces: ["mflix.movies", "mflix.shows"],
-                    since: "2025-01-01T00:00:00Z",
+                    since: "2023-01-01T00:00:00Z",
                 },
             },
         ],
@@ -117,16 +105,7 @@ describeAccuracyTests([
     {
         prompt: "Give me schema suggestions for the 'mflix' project and 'mflix-cluster' cluster",
         expectedToolCalls: [
-            {
-                toolName: "atlas-list-projects",
-                parameters: {},
-            },
-            {
-                toolName: "atlas-list-clusters",
-                parameters: {
-                    projectId: "mflix",
-                },
-            },
+            ...listProjectsAndClustersToolCalls,
             {
                 toolName: "atlas-get-performance-advisor",
                 parameters: {
@@ -142,16 +121,7 @@ describeAccuracyTests([
     {
         prompt: "Show me all performance advisor recommendations for the 'mflix' project and 'mflix-cluster' cluster",
         expectedToolCalls: [
-            {
-                toolName: "atlas-list-projects",
-                parameters: {},
-            },
-            {
-                toolName: "atlas-list-clusters",
-                parameters: {
-                    projectId: "mflix",
-                },
-            },
+            ...listProjectsAndClustersToolCalls,
             {
                 toolName: "atlas-get-performance-advisor",
                 parameters: {
diff --git a/tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts b/tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts
@@ -4,7 +4,9 @@ export interface LLMToolCall {
     parameters: Record<string, unknown>;
 }
 
-export type ExpectedToolCall = Omit<LLMToolCall, "toolCallId">;
+export type ExpectedToolCall = Omit<LLMToolCall, "toolCallId"> & {
+    optional?: boolean;
+};
 
 export const AccuracyRunStatus = {
     Done: "done",
diff --git a/tests/accuracy/sdk/accuracyScorer.ts b/tests/accuracy/sdk/accuracyScorer.ts
@@ -81,12 +81,15 @@ export function calculateToolCallingAccuracy(
             .sort((a, b) => b.score - a.score || a.index - b.index);
 
         const bestMatch = candidates[0];
-        if (!bestMatch || bestMatch.score === 0) {
-            return 0; // No matching tool call found, return 0
+        if (bestMatch) {
+            checkedActualToolCallIndexes.add(bestMatch.index);
+            currentScore = Math.min(currentScore, bestMatch.score);
+        } else if (expectedCall.optional) {
+            // Optional expected tool call not found, but it's okay, continue
+            continue;
+        } else {
+            return 0; // Required expected tool call not found, return 0
         }
-
-        checkedActualToolCallIndexes.add(bestMatch.index);
-        currentScore = Math.min(currentScore, bestMatch.score);
     }
 
     return currentScore;

Original file line number	Diff line number	Diff line change
`@@ -28,6 +28,11 @@ describeAccuracyTests([`
`28`	`28`	`{`
`29`	`29`	`prompt: "If and only if, the namespace 'mflix.documentaries' does not exist, then create it",`
`30`	`30`	`expectedToolCalls: [`
	`31`	`+ {`
	`32`	`+ toolName: "list-databases",`
	`33`	`+ parameters: {},`
	`34`	`+ optional: true,`
	`35`	`+ },`
`31`	`36`	`{`
`32`	`37`	`toolName: "list-collections",`
`33`	`38`	`parameters: {`
Original file line number	Diff line number	Diff line change
`@@ -4,6 +4,11 @@ describeAccuracyTests([`
`4`	`4`	`{`
`5`	`5`	`prompt: "Remove mflix database from my cluster.",`
`6`	`6`	`expectedToolCalls: [`
	`7`	`+ {`
	`8`	`+ toolName: "list-databases",`
	`9`	`+ parameters: {},`
	`10`	`+ optional: true,`
	`11`	`+ },`
`7`	`12`	`{`
`8`	`13`	`toolName: "drop-database",`
`9`	`14`	`parameters: {`
Original file line number	Diff line number	Diff line change
`@@ -4,7 +4,9 @@ export interface LLMToolCall {`
`4`	`4`	`parameters: Record<string, unknown>;`
`5`	`5`	`}`
`6`	`6`
`7`		`-export type ExpectedToolCall = Omit<LLMToolCall, "toolCallId">;`
	`7`	`+export type ExpectedToolCall = Omit<LLMToolCall, "toolCallId"> & {`
	`8`	`+ optional?: boolean;`
	`9`	`+};`
`8`	`10`
`9`	`11`	`export const AccuracyRunStatus = {`
`10`	`12`	`Done: "done",`