Skip to content

Commit 1cf6f6d

Browse files
authored
fix: accuracy test fixes (#651)
1 parent 18fe549 commit 1cf6f6d

File tree

10 files changed, with 80 additions and 79 deletions.

10 files changed, with 80 additions and 79 deletions.

scripts/accuracy/generateTestSummary.ts

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,8 @@ function formatToolCallsWithTooltip(toolCalls: ExpectedToolCall[] | LLMToolCall[
7373
return toolCalls
7474
.map((call) => {
7575
const params = JSON.stringify(call.parameters, null, 2);
76-
return `<span class="tool-call" title="${params.replace(/"/g, "&quot;")}">${call.toolName}</span>`;
76+
const isOptional = "optional" in call && call.optional;
77+
return `<span class="tool-call" title="${params.replace(/"/g, "&quot;")}">${isOptional ? "(" : ""}${call.toolName}${isOptional ? ")" : ""}</span>`;
7778
})
7879
.join(", ");
7980
}

tests/accuracy/createCollection.test.ts

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,11 @@ describeAccuracyTests([
2828
{
2929
prompt: "If and only if, the namespace 'mflix.documentaries' does not exist, then create it",
3030
expectedToolCalls: [
31+
{
32+
toolName: "list-databases",
33+
parameters: {},
34+
optional: true,
35+
},
3136
{
3237
toolName: "list-collections",
3338
parameters: {

tests/accuracy/createIndex.test.ts

Lines changed: 2 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,19 +1,8 @@
1-
import { afterAll, beforeAll } from "vitest";
21
import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js";
32
import { Matcher } from "./sdk/matcher.js";
43

5-
let originalApiKey: string | undefined;
6-
beforeAll(() => {
7-
originalApiKey = process.env.MDB_VOYAGE_API_KEY;
8-
9-
// We just need a valid key when registering the tool, the actual value is not important
10-
if (!originalApiKey) {
11-
process.env.MDB_VOYAGE_API_KEY = "valid-key";
12-
}
13-
});
14-
afterAll(() => {
15-
process.env.MDB_VOYAGE_API_KEY = originalApiKey;
16-
});
4+
// TODO: supply this with a proper config API once we refactor describeAccuracyTests to support it
5+
process.env.MDB_VOYAGE_API_KEY = "valid-key";
176

187
describeAccuracyTests([
198
{

tests/accuracy/dropCollection.test.ts

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,18 @@ describeAccuracyTests([
44
{
55
prompt: "Remove mflix.movies namespace from my cluster.",
66
expectedToolCalls: [
7+
{
8+
toolName: "list-databases",
9+
parameters: {},
10+
optional: true,
11+
},
12+
{
13+
toolName: "list-collections",
14+
parameters: {
15+
database: "mflix",
16+
},
17+
optional: true,
18+
},
719
{
820
toolName: "drop-collection",
921
parameters: {

tests/accuracy/dropDatabase.test.ts

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,11 @@ describeAccuracyTests([
44
{
55
prompt: "Remove mflix database from my cluster.",
66
expectedToolCalls: [
7+
{
8+
toolName: "list-databases",
9+
parameters: {},
10+
optional: true,
11+
},
712
{
813
toolName: "drop-database",
914
parameters: {

tests/accuracy/export.test.ts

Lines changed: 14 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -114,12 +114,20 @@ describeAccuracyTests([
114114
arguments: {
115115
pipeline: [
116116
{
117-
$group: {
118-
_id: "$release_year",
119-
titles: {
120-
$push: "$title",
121-
},
122-
},
117+
$group: Matcher.anyOf(
118+
Matcher.value({
119+
_id: "$release_year",
120+
titles: {
121+
$push: "$title",
122+
},
123+
}),
124+
Matcher.value({
125+
_id: "$release_year",
126+
movies: {
127+
$push: "$title",
128+
},
129+
})
130+
),
123131
},
124132
],
125133
},

tests/accuracy/find.test.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ describeAccuracyTests([
124124
limit: Matcher.anyValue,
125125
sort: Matcher.anyValue,
126126
},
127+
optional: true,
127128
},
128129
{
129130
toolName: "export",
@@ -137,14 +138,19 @@ describeAccuracyTests([
137138
arguments: Matcher.anyOf(
138139
Matcher.emptyObjectOrUndefined,
139140
Matcher.value({
140-
filter: Matcher.anyValue,
141+
filter: Matcher.emptyObjectOrUndefined,
141142
projection: Matcher.anyValue,
142143
limit: Matcher.anyValue,
143144
sort: Matcher.anyValue,
144145
})
145146
),
146147
},
147148
],
149+
jsonExportFormat: Matcher.anyOf(
150+
Matcher.undefined,
151+
Matcher.value("relaxed"),
152+
Matcher.value("canonical")
153+
),
148154
},
149155
},
150156
],

tests/accuracy/getPerformanceAdvisor.test.ts

Lines changed: 22 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -35,21 +35,27 @@ const mockedTools = {
3535
},
3636
};
3737

38+
const listProjectsAndClustersToolCalls = [
39+
{
40+
toolName: "atlas-list-projects",
41+
parameters: {},
42+
optional: true,
43+
},
44+
{
45+
toolName: "atlas-list-clusters",
46+
parameters: {
47+
projectId: "mflix",
48+
},
49+
optional: true,
50+
},
51+
];
52+
3853
describeAccuracyTests([
3954
// Test for Suggested Indexes operation
4055
{
4156
prompt: "Can you give me index suggestions for the database 'mflix' in the project 'mflix' and cluster 'mflix-cluster'?",
4257
expectedToolCalls: [
43-
{
44-
toolName: "atlas-list-projects",
45-
parameters: {},
46-
},
47-
{
48-
toolName: "atlas-list-clusters",
49-
parameters: {
50-
projectId: "mflix",
51-
},
52-
},
58+
...listProjectsAndClustersToolCalls,
5359
{
5460
toolName: "atlas-get-performance-advisor",
5561
parameters: {
@@ -65,16 +71,7 @@ describeAccuracyTests([
6571
{
6672
prompt: "Show me drop index suggestions for the 'mflix' project and 'mflix-cluster' cluster",
6773
expectedToolCalls: [
68-
{
69-
toolName: "atlas-list-projects",
70-
parameters: {},
71-
},
72-
{
73-
toolName: "atlas-list-clusters",
74-
parameters: {
75-
projectId: "mflix",
76-
},
77-
},
74+
...listProjectsAndClustersToolCalls,
7875
{
7976
toolName: "atlas-get-performance-advisor",
8077
parameters: {
@@ -88,26 +85,17 @@ describeAccuracyTests([
8885
},
8986
// Test for Slow Query Logs operation
9087
{
91-
prompt: "Show me the slow query logs for the 'mflix' project and 'mflix-cluster' cluster for the namespaces 'mflix.movies' and 'mflix.shows' since January 1st, 2025.",
88+
prompt: "Show me the slow query logs for the 'mflix' project and 'mflix-cluster' cluster for the namespaces 'mflix.movies' and 'mflix.shows' since January 1st, 2023",
9289
expectedToolCalls: [
93-
{
94-
toolName: "atlas-list-projects",
95-
parameters: {},
96-
},
97-
{
98-
toolName: "atlas-list-clusters",
99-
parameters: {
100-
projectId: "mflix",
101-
},
102-
},
90+
...listProjectsAndClustersToolCalls,
10391
{
10492
toolName: "atlas-get-performance-advisor",
10593
parameters: {
10694
projectId: "mflix",
10795
clusterName: "mflix-cluster",
10896
operations: ["slowQueryLogs"],
10997
namespaces: ["mflix.movies", "mflix.shows"],
110-
since: "2025-01-01T00:00:00Z",
98+
since: "2023-01-01T00:00:00Z",
11199
},
112100
},
113101
],
@@ -117,16 +105,7 @@ describeAccuracyTests([
117105
{
118106
prompt: "Give me schema suggestions for the 'mflix' project and 'mflix-cluster' cluster",
119107
expectedToolCalls: [
120-
{
121-
toolName: "atlas-list-projects",
122-
parameters: {},
123-
},
124-
{
125-
toolName: "atlas-list-clusters",
126-
parameters: {
127-
projectId: "mflix",
128-
},
129-
},
108+
...listProjectsAndClustersToolCalls,
130109
{
131110
toolName: "atlas-get-performance-advisor",
132111
parameters: {
@@ -142,16 +121,7 @@ describeAccuracyTests([
142121
{
143122
prompt: "Show me all performance advisor recommendations for the 'mflix' project and 'mflix-cluster' cluster",
144123
expectedToolCalls: [
145-
{
146-
toolName: "atlas-list-projects",
147-
parameters: {},
148-
},
149-
{
150-
toolName: "atlas-list-clusters",
151-
parameters: {
152-
projectId: "mflix",
153-
},
154-
},
124+
...listProjectsAndClustersToolCalls,
155125
{
156126
toolName: "atlas-get-performance-advisor",
157127
parameters: {

tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,9 @@ export interface LLMToolCall {
44
parameters: Record<string, unknown>;
55
}
66

7-
export type ExpectedToolCall = Omit<LLMToolCall, "toolCallId">;
7+
export type ExpectedToolCall = Omit<LLMToolCall, "toolCallId"> & {
8+
optional?: boolean;
9+
};
810

911
export const AccuracyRunStatus = {
1012
Done: "done",

tests/accuracy/sdk/accuracyScorer.ts

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -81,12 +81,15 @@ export function calculateToolCallingAccuracy(
8181
.sort((a, b) => b.score - a.score || a.index - b.index);
8282

8383
const bestMatch = candidates[0];
84-
if (!bestMatch || bestMatch.score === 0) {
85-
return 0; // No matching tool call found, return 0
84+
if (bestMatch) {
85+
checkedActualToolCallIndexes.add(bestMatch.index);
86+
currentScore = Math.min(currentScore, bestMatch.score);
87+
} else if (expectedCall.optional) {
88+
// Optional expected tool call not found, but it's okay, continue
89+
continue;
90+
} else {
91+
return 0; // Required expected tool call not found, return 0
8692
}
87-
88-
checkedActualToolCallIndexes.add(bestMatch.index);
89-
currentScore = Math.min(currentScore, bestMatch.score);
9093
}
9194

9295
return currentScore;

0 commit comments

Comments
 (0)