Merge pull request stanfordnlp#1026 from ofermend/vectara_retriever_update

arnavsinghvi11 · web-flow · commit 17d76b77999b · 2024-05-15T14:17:49.000-07:00
support multiple corpora in vectara retriever
diff --git a/dspy/retrieve/vectara_rm.py b/dspy/retrieve/vectara_rm.py
@@ -18,7 +18,7 @@ class VectaraRM(dspy.Retrieve):
     """
     A retrieval module that uses Vectara to return the top passages for a given query.
 
-    Assumes that a Vectara corpus has been created and populated with the following payload:
+    Assumes that one or more Vectara corpora have been created and populated with the following payload:
         - document: The text of the passage
 
     Args:
@@ -67,17 +67,21 @@ def __init__(
     def _vectara_query(
         self,
         query: str,
-        limit: int = 3,
+        limit: int = 5,
     ) -> List[str]:
         """Query Vectara index to get for top k matching passages.
         Args:
             query: query string
         """
-        corpus_key = {
-            "customerId": self._vectara_customer_id,
-            "corpusId": self._vectara_corpus_id,
-            "lexicalInterpolationConfig": {"lambda": 0.025 },
-        }
+        # If multiple corpus ids are provided (comma-separated), build one corpus key per id;
+        # for a single id, `split(',')` returns a one-element list, so that single corpus id is retained
+        corpus_key = [
+            {
+                "customerId": self._vectara_customer_id,
+                "corpusId": corpus_id,
+                "lexicalInterpolationConfig": {"lambda": 0.025 },
+            } for corpus_id in self._vectara_corpus_id.split(',')
+        ]
 
         data = {
             "query": [
@@ -91,7 +95,7 @@ def _vectara_query(
                         "startTag": START_SNIPPET,
                         "endTag": END_SNIPPET,
                     },
-                    "corpusKey": [corpus_key],
+                    "corpusKey": corpus_key,
                 },
             ],
         }