Skip to content

Commit 0456698

Browse files
authored
chore: support for pre-filters in $vectorSearch MCP-240 (#689)
1 parent 95b7b25 commit 0456698

File tree

6 files changed

+333
-36
lines changed

6 files changed

+333
-36
lines changed

src/common/search/embeddingsProvider.ts

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,33 @@ export const zVoyageModels = z
2727
.enum(["voyage-3-large", "voyage-3.5", "voyage-3.5-lite", "voyage-code-3"])
2828
.default("voyage-3-large");
2929

30+
// Zod does not understand JS boxed numbers (like Int32) as integer literals,
31+
// so we unwrap them in a preprocess step before Zod validates the value.
32+
function unboxNumber(v: unknown): number {
33+
if (v && typeof v === "object" && typeof v.valueOf === "function") {
34+
const n = Number(v.valueOf());
35+
if (!Number.isNaN(n)) return n;
36+
}
37+
return v as number;
38+
}
39+
3040
export const zVoyageEmbeddingParameters = z.object({
3141
outputDimension: z
32-
.union([z.literal(256), z.literal(512), z.literal(1024), z.literal(2048), z.literal(4096)])
42+
.preprocess(
43+
unboxNumber,
44+
z.union([z.literal(256), z.literal(512), z.literal(1024), z.literal(2048), z.literal(4096)])
45+
)
3346
.optional()
3447
.default(1024),
35-
outputDType: z.enum(["float", "int8", "uint8", "binary", "ubinary"]).optional().default("float"),
48+
outputDtype: z.enum(["float", "int8", "uint8", "binary", "ubinary"]).optional().default("float"),
3649
});
3750

51+
const zVoyageAPIParameters = zVoyageEmbeddingParameters
52+
.extend({
53+
inputType: z.enum(["query", "document"]),
54+
})
55+
.strip();
56+
3857
type VoyageModels = z.infer<typeof zVoyageModels>;
3958
type VoyageEmbeddingParameters = z.infer<typeof zVoyageEmbeddingParameters> & EmbeddingParameters;
4059

@@ -62,11 +81,15 @@ class VoyageEmbeddingsProvider implements EmbeddingsProvider<VoyageModels, Voyag
6281
content: EmbeddingsInput[],
6382
parameters: VoyageEmbeddingParameters
6483
): Promise<Embeddings[]> {
84+
// This ensures that if we receive any random parameter from the outside (agent or us)
85+
// it's stripped before sending it to Voyage, as Voyage will reject the request on
86+
// a single unknown parameter.
87+
const voyage = zVoyageAPIParameters.parse(parameters);
6588
const model = this.voyage.textEmbeddingModel(modelId);
6689
const { embeddings } = await embedMany({
6790
model,
6891
values: content,
69-
providerOptions: { voyage: parameters },
92+
providerOptions: { voyage },
7093
});
7194

7295
return embeddings;

src/tools/mongodb/read/aggregate.ts

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ const VectorSearchStage = z.object({
4747
filter: zEJSON()
4848
.optional()
4949
.describe(
50-
"MQL filter that can only use pre-filter fields from the index definition. Note to LLM: If unsure, use the `collection-indexes` tool to learn which fields can be used for pre-filtering."
50+
"MQL filter that can only use filter fields from the index definition. Note to LLM: If unsure, use the `collection-indexes` tool to learn which fields can be used for filtering."
5151
),
5252
embeddingParameters: zSupportedEmbeddingParameters
5353
.optional()
@@ -59,11 +59,21 @@ const VectorSearchStage = z.object({
5959
});
6060

6161
export const AggregateArgs = {
62-
pipeline: z
63-
.array(z.union([AnyStage, VectorSearchStage]))
64-
.describe(
65-
"An array of aggregation stages to execute. $vectorSearch can only appear as the first stage of the aggregation pipeline or as the first stage of a $unionWith subpipeline. When using $vectorSearch, unless the user explicitly asks for the embeddings, $unset any embedding field to avoid reaching context limits."
66-
),
62+
pipeline: z.array(z.union([AnyStage, VectorSearchStage])).describe(
63+
`An array of aggregation stages to execute.
64+
\`$vectorSearch\` **MUST** be the first stage of the pipeline, or the first stage of a \`$unionWith\` subpipeline.
65+
### Usage Rules for \`$vectorSearch\`
66+
- **Unset embeddings:**
67+
Unless the user explicitly requests the embeddings, add an \`$unset\` stage **at the end of the pipeline** to remove the embedding field and avoid context limits. **The $unset stage in this situation is mandatory**.
68+
- **Pre-filtering:**
69+
If the user requests additional filtering, include filters in \`$vectorSearch.filter\` only for pre-filter fields in the vector index.
70+
NEVER include fields in $vectorSearch.filter that are not part of the vector index.
71+
- **Post-filtering:**
72+
For all remaining filters, add a $match stage after $vectorSearch.
73+
### Note to LLM
74+
- If unsure which fields are filterable, use the collection-indexes tool to determine valid prefilter fields.
75+
- If no requested filters are valid prefilters, omit the filter key from $vectorSearch.`
76+
),
6777
responseBytesLimit: z.number().optional().default(ONE_MB).describe(`\
6878
The maximum number of bytes to return in the response. This value is capped by the server's configured maxBytesPerQuery and cannot be exceeded. \
6979
Note to LLM: If the entire aggregation result is required, use the "export" tool instead of increasing this limit.\

tests/accuracy/aggregate.test.ts

Lines changed: 231 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,24 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js";
22
import { Matcher } from "./sdk/matcher.js";
33
import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js";
44

5+
function doesUnset(field: string): Matcher {
6+
return Matcher.anyOf(
7+
// { $unset: "<field>" } || { $unset: ["<field>"] }
8+
Matcher.value({ $unset: Matcher.arrayOrSingle(Matcher.value(field)) }),
9+
// { $unset: { "<field>": "" } }
10+
Matcher.value({ $unset: { [field]: "" } })
11+
);
12+
}
13+
14+
const embeddingParameters = {
15+
model: "voyage-3-large",
16+
outputDimension: Matcher.anyOf(
17+
Matcher.undefined,
18+
Matcher.number((n) => n === 1024)
19+
),
20+
outputDType: Matcher.anyOf(Matcher.undefined, Matcher.value("float")),
21+
};
22+
523
describeAccuracyTests([
624
{
725
prompt: "Group all the movies in 'mflix.movies' namespace by 'release_year' and give me a count of them",
@@ -48,16 +66,71 @@ describeAccuracyTests([
4866
index: "titles",
4967
path: "title_embeddings",
5068
queryVector: "hammer of justice",
51-
embeddingParameters: {
52-
model: "voyage-3-large",
53-
outputDimension: Matcher.anyOf(
54-
Matcher.undefined,
55-
Matcher.number((n) => n === 1024)
56-
),
69+
embeddingParameters,
70+
filter: Matcher.emptyObjectOrUndefined,
71+
limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
72+
},
73+
},
74+
doesUnset("title_embeddings"),
75+
],
76+
responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
77+
},
78+
},
79+
],
80+
mockedTools: {
81+
"collection-indexes": (): CallToolResult => {
82+
return {
83+
content: [
84+
{
85+
type: "text",
86+
text: JSON.stringify({
87+
name: "titles",
88+
type: "vectorSearch",
89+
status: "READY",
90+
queryable: true,
91+
latestDefinition: {
92+
type: "vector",
93+
path: "title_embeddings",
94+
numDimensions: 1024,
95+
quantization: "none",
96+
similarity: "euclidean",
5797
},
98+
}),
99+
},
100+
],
101+
};
102+
},
103+
},
104+
},
105+
{
106+
prompt: "Run a vectorSearch query on musicfy.songs on path 'title_embeddings' using the index 'titles' with the model voyage-3-large to find all 'hammer of justice' songs. Keep the embedding field, do not remove it.",
107+
expectedToolCalls: [
108+
{
109+
toolName: "collection-indexes",
110+
parameters: {
111+
database: "musicfy",
112+
collection: "songs",
113+
},
114+
optional: true,
115+
},
116+
{
117+
toolName: "aggregate",
118+
parameters: {
119+
database: "musicfy",
120+
collection: "songs",
121+
pipeline: [
122+
{
123+
$vectorSearch: {
124+
exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(false)),
125+
index: "titles",
126+
path: "title_embeddings",
127+
queryVector: "hammer of justice",
128+
embeddingParameters,
58129
filter: Matcher.emptyObjectOrUndefined,
130+
limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
59131
},
60132
},
133+
Matcher.not(doesUnset("title_embeddings")),
61134
],
62135
responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
63136
},
@@ -107,21 +180,16 @@ describeAccuracyTests([
107180
pipeline: [
108181
{
109182
$vectorSearch: {
110-
exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(true)),
183+
exact: true,
111184
index: "titles",
112185
path: "title_embeddings",
113186
queryVector: "hammer of justice",
114-
limit: 10,
115-
embeddingParameters: {
116-
model: "voyage-3-large",
117-
outputDimension: Matcher.anyOf(
118-
Matcher.undefined,
119-
Matcher.number((n) => n === 1024)
120-
),
121-
},
187+
embeddingParameters,
122188
filter: Matcher.emptyObjectOrUndefined,
189+
limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
123190
},
124191
},
192+
doesUnset("title_embeddings"),
125193
],
126194
responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
127195
},
@@ -153,7 +221,7 @@ describeAccuracyTests([
153221
},
154222
},
155223
{
156-
prompt: "Run an approximate vectorSearch query on mflix.movies on path 'plot_embeddings' with the model voyage-3-large to find all 'sci-fy' movies.",
224+
prompt: "Run an approximate vectorSearch query on mflix.movies on path 'plot_embeddings' with the model voyage-3-large to find all 'sci-fi' movies.",
157225
expectedToolCalls: [
158226
{
159227
toolName: "collection-indexes",
@@ -173,17 +241,13 @@ describeAccuracyTests([
173241
exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(false)),
174242
index: "my-index",
175243
path: "plot_embeddings",
176-
queryVector: "sci-fy",
177-
embeddingParameters: {
178-
model: "voyage-3-large",
179-
outputDimension: Matcher.anyOf(
180-
Matcher.undefined,
181-
Matcher.number((n) => n === 1024)
182-
),
183-
},
244+
queryVector: "sci-fi",
245+
embeddingParameters,
184246
filter: Matcher.emptyObjectOrUndefined,
247+
limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
185248
},
186249
},
250+
doesUnset("plot_embeddings"),
187251
],
188252
responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
189253
},
@@ -214,4 +278,147 @@ describeAccuracyTests([
214278
},
215279
},
216280
},
281+
{
282+
prompt: "(Pre-filter) Run a vectorSearch query on mflix.movies on path 'plot_embeddings' with the model voyage-3-large to find all 'sci-fi' movies. I only want movies with the `released` after 1993 (included) and are published in catalan.",
283+
expectedToolCalls: [
284+
{
285+
toolName: "collection-indexes",
286+
parameters: {
287+
database: "mflix",
288+
collection: "movies",
289+
},
290+
},
291+
{
292+
toolName: "aggregate",
293+
parameters: {
294+
database: "mflix",
295+
collection: "movies",
296+
pipeline: [
297+
{
298+
$vectorSearch: {
299+
exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(false)),
300+
index: "my-index",
301+
path: "plot_embeddings",
302+
queryVector: "sci-fi",
303+
numCandidates: Matcher.anyOf(Matcher.number(), Matcher.undefined),
304+
limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
305+
embeddingParameters,
306+
filter: {
307+
released: { $gte: 1993 },
308+
language: Matcher.caseInsensitiveString("catalan"),
309+
},
310+
},
311+
},
312+
doesUnset("plot_embeddings"),
313+
],
314+
responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
315+
},
316+
},
317+
],
318+
mockedTools: {
319+
"collection-indexes": (): CallToolResult => {
320+
return {
321+
content: [
322+
{
323+
type: "text",
324+
text: JSON.stringify({
325+
name: "my-index",
326+
type: "vectorSearch",
327+
status: "READY",
328+
queryable: true,
329+
latestDefinition: {
330+
fields: [
331+
{
332+
type: "vector",
333+
path: "plot_embeddings",
334+
numDimensions: 1024,
335+
quantization: "none",
336+
similarity: "euclidean",
337+
},
338+
{
339+
type: "filter",
340+
path: "language",
341+
},
342+
{
343+
type: "filter",
344+
path: "released",
345+
},
346+
],
347+
},
348+
}),
349+
},
350+
],
351+
};
352+
},
353+
},
354+
},
355+
{
356+
prompt: "(No-prefilter) Run a vectorSearch query on mflix.movies on path 'plot_embeddings' with the model voyage-3-large to find all 'sci-fi' movies. I only want movies with `released` after 1993 (included) and are published in catalan.",
357+
expectedToolCalls: [
358+
{
359+
toolName: "collection-indexes",
360+
parameters: {
361+
database: "mflix",
362+
collection: "movies",
363+
},
364+
},
365+
{
366+
toolName: "aggregate",
367+
parameters: {
368+
database: "mflix",
369+
collection: "movies",
370+
pipeline: [
371+
{
372+
$vectorSearch: {
373+
exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(false)),
374+
index: "my-index",
375+
path: "plot_embeddings",
376+
queryVector: "sci-fi",
377+
numCandidates: Matcher.anyOf(Matcher.number(), Matcher.undefined),
378+
limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
379+
embeddingParameters,
380+
filter: Matcher.emptyObjectOrUndefined,
381+
},
382+
},
383+
{
384+
$match: {
385+
released: { $gte: 1993 },
386+
language: Matcher.caseInsensitiveString("catalan"),
387+
},
388+
},
389+
doesUnset("plot_embeddings"),
390+
],
391+
responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
392+
},
393+
},
394+
],
395+
mockedTools: {
396+
"collection-indexes": (): CallToolResult => {
397+
return {
398+
content: [
399+
{
400+
type: "text",
401+
text: JSON.stringify({
402+
name: "my-index",
403+
type: "vectorSearch",
404+
status: "READY",
405+
queryable: true,
406+
latestDefinition: {
407+
fields: [
408+
{
409+
type: "vector",
410+
path: "plot_embeddings",
411+
numDimensions: 1024,
412+
quantization: "none",
413+
similarity: "euclidean",
414+
},
415+
],
416+
},
417+
}),
418+
},
419+
],
420+
};
421+
},
422+
},
423+
},
217424
]);

0 commit comments

Comments
 (0)