mongodb-js
diff --git a/src/common/search/embeddingsProvider.ts b/src/common/search/embeddingsProvider.ts
Lines changed: 26 additions & 3 deletions
diff --git a/src/tools/mongodb/read/aggregate.ts b/src/tools/mongodb/read/aggregate.ts
Lines changed: 16 additions & 6 deletions
diff --git a/tests/accuracy/aggregate.test.ts b/tests/accuracy/aggregate.test.ts
Lines changed: 231 additions & 24 deletions
@@ -27,14 +27,33 @@ export const zVoyageModels = z
     .enum(["voyage-3-large", "voyage-3.5", "voyage-3.5-lite", "voyage-code-3"])
     .default("voyage-3-large");
 
+// Zod does not understand JS boxed numbers (like Int32) as integer literals,
+// so we preprocess them to unwrap them into plain numbers before Zod validates them.
+function unboxNumber(v: unknown): number {
+    if (v && typeof v === "object" && typeof v.valueOf === "function") { // only non-null objects that expose valueOf()
+        const n = Number(v.valueOf()); // NOTE(review): Number() also coerces non-number results (e.g. a one-element array) — confirm intended
+        if (!Number.isNaN(n)) return n; // keep the coerced value only when it is not NaN
+    }
+    return v as number; // everything else is returned unchanged; the cast converts nothing, validation is left to Zod
+}
+
 export const zVoyageEmbeddingParameters = z.object({
     outputDimension: z
-        .union([z.literal(256), z.literal(512), z.literal(1024), z.literal(2048), z.literal(4096)])
+        .preprocess(
+            unboxNumber,
+            z.union([z.literal(256), z.literal(512), z.literal(1024), z.literal(2048), z.literal(4096)])
+        )
         .optional()
         .default(1024),
-    outputDType: z.enum(["float", "int8", "uint8", "binary", "ubinary"]).optional().default("float"),
+    outputDtype: z.enum(["float", "int8", "uint8", "binary", "ubinary"]).optional().default("float"),
 });
 
+const zVoyageAPIParameters = zVoyageEmbeddingParameters // schema used to parse the parameters before they are sent to Voyage
+    .extend({
+        inputType: z.enum(["query", "document"]), // not optional: only "query" or "document" is accepted
+    })
+    .strip(); // keys that are not declared in the schema are dropped from the parsed result
+
 type VoyageModels = z.infer<typeof zVoyageModels>;
 type VoyageEmbeddingParameters = z.infer<typeof zVoyageEmbeddingParameters> & EmbeddingParameters;
 
@@ -62,11 +81,15 @@ class VoyageEmbeddingsProvider implements EmbeddingsProvider<VoyageModels, Voyag
         content: EmbeddingsInput[],
         parameters: VoyageEmbeddingParameters
     ): Promise<Embeddings[]> {
+        // This ensures that if we receive any random parameter from the outside (agent or us)
+        // it's stripped before sending it to Voyage, as Voyage will reject the request on
+        // a single unknown parameter.
+        const voyage = zVoyageAPIParameters.parse(parameters);
         const model = this.voyage.textEmbeddingModel(modelId);
         const { embeddings } = await embedMany({
             model,
             values: content,
-            providerOptions: { voyage: parameters },
+            providerOptions: { voyage },
         });
 
         return embeddings;
 
@@ -47,7 +47,7 @@ const VectorSearchStage = z.object({
             filter: zEJSON()
                 .optional()
                 .describe(
-                    "MQL filter that can only use pre-filter fields from the index definition. Note to LLM: If unsure, use the `collection-indexes` tool to learn which fields can be used for pre-filtering."
+                    "MQL filter that can only use filter fields from the index definition. Note to LLM: If unsure, use the `collection-indexes` tool to learn which fields can be used for filtering."
                 ),
             embeddingParameters: zSupportedEmbeddingParameters
                 .optional()
@@ -59,11 +59,21 @@ const VectorSearchStage = z.object({
 });
 
 export const AggregateArgs = {
-    pipeline: z
-        .array(z.union([AnyStage, VectorSearchStage]))
-        .describe(
-            "An array of aggregation stages to execute. $vectorSearch can only appear as the first stage of the aggregation pipeline or as the first stage of a $unionWith subpipeline. When using $vectorSearch, unless the user explicitly asks for the embeddings, $unset any embedding field to avoid reaching context limits."
-        ),
+    pipeline: z.array(z.union([AnyStage, VectorSearchStage])).describe(
+        `An array of aggregation stages to execute.  
+\`$vectorSearch\` **MUST** be the first stage of the pipeline, or the first stage of a \`$unionWith\` subpipeline.
+### Usage Rules for \`$vectorSearch\`
+- **Unset embeddings:**  
+  Unless the user explicitly requests the embeddings, add an \`$unset\` stage **at the end of the pipeline** to remove the embedding field and avoid context limits. **The $unset stage in this situation is mandatory**.
+- **Pre-filtering:**
+If the user requests additional filtering, include filters in \`$vectorSearch.filter\` only for pre-filter fields in the vector index.
+    NEVER include fields in $vectorSearch.filter that are not part of the vector index.
+- **Post-filtering:**
+    For all remaining filters, add a $match stage after $vectorSearch.
+### Note to LLM
+- If unsure which fields are filterable, use the collection-indexes tool to determine valid prefilter fields.
+- If no requested filters are valid prefilters, omit the filter key from $vectorSearch.`
+    ),
     responseBytesLimit: z.number().optional().default(ONE_MB).describe(`\
 The maximum number of bytes to return in the response. This value is capped by the server's configured maxBytesPerQuery and cannot be exceeded. \
 Note to LLM: If the entire aggregation result is required, use the "export" tool instead of increasing this limit.\
 
@@ -2,6 +2,24 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js";
 import { Matcher } from "./sdk/matcher.js";
 import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js";
 
+function doesUnset(field: string): Matcher { // builds a matcher for a pipeline stage that $unset-s `field`
+    return Matcher.anyOf( // either of the two stage shapes below is accepted
+        // { $unset: "<field>" } || { $unset: ["<field>"] }
+        Matcher.value({ $unset: Matcher.arrayOrSingle(Matcher.value(field)) }),
+        // { $unset: { "<field>": "" } }
+        Matcher.value({ $unset: { [field]: "" } })
+    );
+}
+
+const embeddingParameters = { // expected `embeddingParameters`, shared by the $vectorSearch expectations in this file
+    model: "voyage-3-large",
+    outputDimension: Matcher.anyOf( // may be omitted; when present it must equal 1024
+        Matcher.undefined,
+        Matcher.number((n) => n === 1024)
+    ),
+    outputDtype: Matcher.anyOf(Matcher.undefined, Matcher.value("float")), // key spelled as in the embeddings schema (`outputDtype`); may be omitted
+};
+
 describeAccuracyTests([
     {
         prompt: "Group all the movies in 'mflix.movies' namespace by 'release_year' and give me a count of them",
@@ -48,16 +66,71 @@ describeAccuracyTests([
                                 index: "titles",
                                 path: "title_embeddings",
                                 queryVector: "hammer of justice",
-                                embeddingParameters: {
-                                    model: "voyage-3-large",
-                                    outputDimension: Matcher.anyOf(
-                                        Matcher.undefined,
-                                        Matcher.number((n) => n === 1024)
-                                    ),
+                                embeddingParameters,
+                                filter: Matcher.emptyObjectOrUndefined,
+                                limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
+                            },
+                        },
+                        doesUnset("title_embeddings"),
+                    ],
+                    responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
+                },
+            },
+        ],
+        mockedTools: {
+            "collection-indexes": (): CallToolResult => {
+                return {
+                    content: [
+                        {
+                            type: "text",
+                            text: JSON.stringify({
+                                name: "titles",
+                                type: "vectorSearch",
+                                status: "READY",
+                                queryable: true,
+                                latestDefinition: {
+                                    type: "vector",
+                                    path: "title_embeddings",
+                                    numDimensions: 1024,
+                                    quantization: "none",
+                                    similarity: "euclidean",
                                 },
+                            }),
+                        },
+                    ],
+                };
+            },
+        },
+    },
+    {
+        prompt: "Run a vectorSearch query on musicfy.songs on path 'title_embeddings' using the index 'titles' with the model voyage-3-large to find all 'hammer of justice' songs. Keep the embedding field, do not remove it.",
+        expectedToolCalls: [
+            {
+                toolName: "collection-indexes",
+                parameters: {
+                    database: "musicfy",
+                    collection: "songs",
+                },
+                optional: true,
+            },
+            {
+                toolName: "aggregate",
+                parameters: {
+                    database: "musicfy",
+                    collection: "songs",
+                    pipeline: [
+                        {
+                            $vectorSearch: {
+                                exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(false)),
+                                index: "titles",
+                                path: "title_embeddings",
+                                queryVector: "hammer of justice",
+                                embeddingParameters,
                                 filter: Matcher.emptyObjectOrUndefined,
+                                limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
                             },
                         },
+                        Matcher.not(doesUnset("title_embeddings")),
                     ],
                     responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
                 },
@@ -107,21 +180,16 @@ describeAccuracyTests([
                     pipeline: [
                         {
                             $vectorSearch: {
-                                exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(true)),
+                                exact: true,
                                 index: "titles",
                                 path: "title_embeddings",
                                 queryVector: "hammer of justice",
-                                limit: 10,
-                                embeddingParameters: {
-                                    model: "voyage-3-large",
-                                    outputDimension: Matcher.anyOf(
-                                        Matcher.undefined,
-                                        Matcher.number((n) => n === 1024)
-                                    ),
-                                },
+                                embeddingParameters,
                                 filter: Matcher.emptyObjectOrUndefined,
+                                limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
                             },
                         },
+                        doesUnset("title_embeddings"),
                     ],
                     responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
                 },
@@ -153,7 +221,7 @@ describeAccuracyTests([
         },
     },
     {
-        prompt: "Run an approximate vectorSearch query on mflix.movies on path 'plot_embeddings' with the model voyage-3-large to find all 'sci-fy' movies.",
+        prompt: "Run an approximate vectorSearch query on mflix.movies on path 'plot_embeddings' with the model voyage-3-large to find all 'sci-fi' movies.",
         expectedToolCalls: [
             {
                 toolName: "collection-indexes",
@@ -173,17 +241,13 @@ describeAccuracyTests([
                                 exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(false)),
                                 index: "my-index",
                                 path: "plot_embeddings",
-                                queryVector: "sci-fy",
-                                embeddingParameters: {
-                                    model: "voyage-3-large",
-                                    outputDimension: Matcher.anyOf(
-                                        Matcher.undefined,
-                                        Matcher.number((n) => n === 1024)
-                                    ),
-                                },
+                                queryVector: "sci-fi",
+                                embeddingParameters,
                                 filter: Matcher.emptyObjectOrUndefined,
+                                limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
                             },
                         },
+                        doesUnset("plot_embeddings"),
                     ],
                     responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
                 },
@@ -214,4 +278,147 @@ describeAccuracyTests([
             },
         },
     },
+    {
+        prompt: "(Pre-filter) Run a vectorSearch query on mflix.movies on path 'plot_embeddings' with the model voyage-3-large to find all 'sci-fi' movies. I only want movies with the `released` after 1993 (included) and are published in catalan.",
+        expectedToolCalls: [
+            {
+                toolName: "collection-indexes",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                },
+            },
+            {
+                toolName: "aggregate",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                    pipeline: [
+                        {
+                            $vectorSearch: {
+                                exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(false)),
+                                index: "my-index",
+                                path: "plot_embeddings",
+                                queryVector: "sci-fi",
+                                numCandidates: Matcher.anyOf(Matcher.number(), Matcher.undefined),
+                                limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
+                                embeddingParameters,
+                                filter: {
+                                    released: { $gte: 1993 },
+                                    language: Matcher.caseInsensitiveString("catalan"),
+                                },
+                            },
+                        },
+                        doesUnset("plot_embeddings"),
+                    ],
+                    responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
+                },
+            },
+        ],
+        mockedTools: {
+            "collection-indexes": (): CallToolResult => {
+                return {
+                    content: [
+                        {
+                            type: "text",
+                            text: JSON.stringify({
+                                name: "my-index",
+                                type: "vectorSearch",
+                                status: "READY",
+                                queryable: true,
+                                latestDefinition: {
+                                    fields: [
+                                        {
+                                            type: "vector",
+                                            path: "plot_embeddings",
+                                            numDimensions: 1024,
+                                            quantization: "none",
+                                            similarity: "euclidean",
+                                        },
+                                        {
+                                            type: "filter",
+                                            path: "language",
+                                        },
+                                        {
+                                            type: "filter",
+                                            path: "released",
+                                        },
+                                    ],
+                                },
+                            }),
+                        },
+                    ],
+                };
+            },
+        },
+    },
+    {
+        prompt: "(No-prefilter) Run a vectorSearch query on mflix.movies on path 'plot_embeddings' with the model voyage-3-large to find all 'sci-fi' movies. I only want movies with `released` after 1993 (included) and are published in catalan.",
+        expectedToolCalls: [
+            {
+                toolName: "collection-indexes",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                },
+            },
+            {
+                toolName: "aggregate",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                    pipeline: [
+                        {
+                            $vectorSearch: {
+                                exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(false)),
+                                index: "my-index",
+                                path: "plot_embeddings",
+                                queryVector: "sci-fi",
+                                numCandidates: Matcher.anyOf(Matcher.number(), Matcher.undefined),
+                                limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
+                                embeddingParameters,
+                                filter: Matcher.emptyObjectOrUndefined,
+                            },
+                        },
+                        {
+                            $match: {
+                                released: { $gte: 1993 },
+                                language: Matcher.caseInsensitiveString("catalan"),
+                            },
+                        },
+                        doesUnset("plot_embeddings"),
+                    ],
+                    responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
+                },
+            },
+        ],
+        mockedTools: {
+            "collection-indexes": (): CallToolResult => {
+                return {
+                    content: [
+                        {
+                            type: "text",
+                            text: JSON.stringify({
+                                name: "my-index",
+                                type: "vectorSearch",
+                                status: "READY",
+                                queryable: true,
+                                latestDefinition: {
+                                    fields: [
+                                        {
+                                            type: "vector",
+                                            path: "plot_embeddings",
+                                            numDimensions: 1024,
+                                            quantization: "none",
+                                            similarity: "euclidean",
+                                        },
+                                    ],
+                                },
+                            }),
+                        },
+                    ],
+                };
+            },
+        },
+    },
 ]);