Implement CosmosDB vector store (#1587)

KennyZhang1 · web-flow · commit 4637270e9afd · 2025-01-14T02:47:08.000-05:00
diff --git a/.semversioner/next-release/patch-20250106222701371588.json b/.semversioner/next-release/patch-20250106222701371588.json
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "add cosmosdb vector store"
+}
diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py
@@ -93,16 +93,16 @@
 UPDATE_STORAGE_BASE_DIR = "update_output"
 
 VECTOR_STORE = f"""
-    type: {VectorStoreType.LanceDB.value}
+    type: {VectorStoreType.LanceDB.value} # one of [lancedb, azure_ai_search, cosmosdb]
     db_uri: '{(Path(STORAGE_BASE_DIR) / "lancedb")!s}'
-    container_name: default
+    collection_name: default
     overwrite: true\
 """
 
 VECTOR_STORE_DICT = {
     "type": VectorStoreType.LanceDB.value,
     "db_uri": str(Path(STORAGE_BASE_DIR) / "lancedb"),
-    "container_name": "default",
+    "collection_name": "default",
     "overwrite": True,
 }
 
diff --git a/graphrag/index/create_pipeline_config.py b/graphrag/index/create_pipeline_config.py
@@ -379,8 +379,8 @@ def _get_storage_config(
             connection_string = storage_settings.connection_string
             base_dir = storage_settings.base_dir
             container_name = storage_settings.container_name
-            if cosmosdb_account_url is None:
-                msg = "CosmosDB account url must be provided for cosmosdb storage."
+            if connection_string is None and cosmosdb_account_url is None:
+                msg = "Connection string or cosmosDB account url must be provided for cosmosdb storage."
                 raise ValueError(msg)
             if base_dir is None:
                 msg = "Base directory must be provided for cosmosdb storage."
diff --git a/graphrag/storage/cosmosdb_pipeline_storage.py b/graphrag/storage/cosmosdb_pipeline_storage.py
@@ -34,6 +34,7 @@ class CosmosDBPipelineStorage(PipelineStorage):
     _database_name: str
     _container_name: str
     _encoding: str
+    _no_id_prefixes: list[str]
 
     def __init__(
         self,
@@ -66,6 +67,7 @@ def __init__(
             if cosmosdb_account_url
             else None
         )
+        self._no_id_prefixes = []
         log.info(
             "creating cosmosdb storage with account: %s and database: %s and container: %s",
             self._cosmosdb_account_name,
@@ -208,6 +210,12 @@ async def get(
                 items_df = pd.read_json(
                     StringIO(items_json_str), orient="records", lines=False
                 )
+
+                # Drop the "id" column if the original dataframe does not include it
+                # TODO: Figure out optimal way to handle missing id keys in input dataframes
+                if prefix in self._no_id_prefixes:
+                    items_df.drop(columns=["id"], axis=1, inplace=True)
+
                 return items_df.to_parquet()
             item = self._container_client.read_item(item=key, partition_key=key)
             item_body = item.get("body")
@@ -236,9 +244,14 @@ async def set(self, key: str, value: Any, encoding: str | None = None) -> None:
                     log.exception("Error converting output %s to json", key)
                 else:
                     cosmosdb_item_list = json.loads(value_json)
-                    for cosmosdb_item in cosmosdb_item_list:
+                    for index, cosmosdb_item in enumerate(cosmosdb_item_list):
+                        # If the id key does not exist in the input dataframe json, create a unique id using the prefix and item index
+                        # TODO: Figure out optimal way to handle missing id keys in input dataframes
+                        if "id" not in cosmosdb_item:
+                            prefixed_id = f"{prefix}:{index}"
+                            self._no_id_prefixes.append(prefix)
                         # Append an additional prefix to the id to force a unique identifier for the create_final_nodes rows
-                        if prefix == "create_final_nodes":
+                        elif prefix == "create_final_nodes":
                             prefixed_id = f"{prefix}-community_{cosmosdb_item['community']}:{cosmosdb_item['id']}"
                         else:
                             prefixed_id = f"{prefix}:{cosmosdb_item['id']}"
diff --git a/graphrag/vector_stores/azure_ai_search.py b/graphrag/vector_stores/azure_ai_search.py
@@ -33,7 +33,7 @@
 )
 
 
-class AzureAISearch(BaseVectorStore):
+class AzureAISearchVectorStore(BaseVectorStore):
     """Azure AI Search vector storage implementation."""
 
     index_client: SearchIndexClient
diff --git a/graphrag/vector_stores/cosmosdb.py b/graphrag/vector_stores/cosmosdb.py
@@ -0,0 +1,216 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+"""A package containing the CosmosDB vector store implementation."""
+
+import json
+from typing import Any
+
+from azure.cosmos import ContainerProxy, CosmosClient, DatabaseProxy
+from azure.cosmos.partition_key import PartitionKey
+from azure.identity import DefaultAzureCredential
+
+from graphrag.model.types import TextEmbedder
+from graphrag.vector_stores.base import (
+    DEFAULT_VECTOR_SIZE,
+    BaseVectorStore,
+    VectorStoreDocument,
+    VectorStoreSearchResult,
+)
+
+
+class CosmosDBVectoreStore(BaseVectorStore):
+    """Azure CosmosDB vector storage implementation.
+
+    Each document is stored as one item holding ``id``, ``vector``, ``text`` and a
+    JSON-encoded ``attributes`` string, in a container partitioned on ``/id``.
+    ``connect`` must be called before any load or search operation.
+    """
+
+    _cosmos_client: CosmosClient
+    _database_client: DatabaseProxy
+    _container_client: ContainerProxy
+    _database_name: str
+    _container_name: str
+
+    def __init__(self, **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+
+    def connect(self, **kwargs: Any) -> Any:
+        """Connect to CosmosDB vector storage.
+
+        Creates the database and the container if they do not exist yet.
+
+        Keyword Args:
+            connection_string: CosmosDB connection string; takes precedence over ``url``.
+            url: Account endpoint, authenticated with ``DefaultAzureCredential``.
+            database_name: Name of the database to use.
+            vector_size: Embedding dimensions; defaults to ``DEFAULT_VECTOR_SIZE``.
+
+        Raises:
+            ValueError: If neither ``connection_string`` nor ``url`` is provided, or
+                if the database name or the collection name is empty or missing.
+        """
+        connection_string = kwargs.get("connection_string")
+        if connection_string:
+            self._cosmos_client = CosmosClient.from_connection_string(connection_string)
+        else:
+            url = kwargs.get("url")
+            if not url:
+                msg = "Either connection_string or url must be provided."
+                raise ValueError(msg)
+            self._cosmos_client = CosmosClient(
+                url=url, credential=DefaultAzureCredential()
+            )
+
+        # Reject empty strings as well as None: an empty name can never identify
+        # a database or container.
+        database_name = kwargs.get("database_name")
+        if not database_name:
+            msg = "Database name must be provided."
+            raise ValueError(msg)
+        self._database_name = database_name
+        collection_name = self.collection_name
+        if not collection_name:
+            msg = "Collection name is empty or not provided."
+            raise ValueError(msg)
+        self._container_name = collection_name
+
+        self.vector_size = kwargs.get("vector_size", DEFAULT_VECTOR_SIZE)
+        self._create_database()
+        self._create_container()
+
+    def _require_container(self) -> ContainerProxy:
+        """Return the container client, raising ValueError if it is not set."""
+        # The attribute is only annotated on the class, never assigned there, so a
+        # plain attribute access on an unconnected store would raise
+        # AttributeError instead of the intended error below.
+        container_client = getattr(self, "_container_client", None)
+        if container_client is None:
+            msg = "Container client is not initialized."
+            raise ValueError(msg)
+        return container_client
+
+    def _create_database(self) -> None:
+        """Create the database if it doesn't exist."""
+        self._cosmos_client.create_database_if_not_exists(id=self._database_name)
+        self._database_client = self._cosmos_client.get_database_client(
+            self._database_name
+        )
+
+    def _delete_database(self) -> None:
+        """Delete the database if it exists."""
+        if self._database_exists():
+            self._cosmos_client.delete_database(self._database_name)
+
+    def _database_exists(self) -> bool:
+        """Check if the database exists."""
+        return any(
+            database["id"] == self._database_name
+            for database in self._cosmos_client.list_databases()
+        )
+
+    def _create_container(self) -> None:
+        """Create the container if it doesn't exist."""
+        partition_key = PartitionKey(path="/id", kind="Hash")
+
+        # Define the container vector policy: float32 embeddings stored under
+        # /vector and compared with cosine distance.
+        vector_embedding_policy = {
+            "vectorEmbeddings": [
+                {
+                    "path": "/vector",
+                    "dataType": "float32",
+                    "distanceFunction": "cosine",
+                    "dimensions": self.vector_size,
+                }
+            ]
+        }
+
+        # Define the vector indexing policy: /vector is excluded from the regular
+        # index and covered by a dedicated diskANN vector index instead.
+        indexing_policy = {
+            "indexingMode": "consistent",
+            "automatic": True,
+            "includedPaths": [{"path": "/*"}],
+            "excludedPaths": [{"path": "/_etag/?"}, {"path": "/vector/*"}],
+            "vectorIndexes": [{"path": "/vector", "type": "diskANN"}],
+        }
+
+        # Create the container and container client
+        self._database_client.create_container_if_not_exists(
+            id=self._container_name,
+            partition_key=partition_key,
+            indexing_policy=indexing_policy,
+            vector_embedding_policy=vector_embedding_policy,
+        )
+        self._container_client = self._database_client.get_container_client(
+            self._container_name
+        )
+
+    def _delete_container(self) -> None:
+        """Delete the vector store container in the database if it exists."""
+        if self._container_exists():
+            self._database_client.delete_container(self._container_name)
+
+    def _container_exists(self) -> bool:
+        """Check if the container name exists in the database."""
+        return any(
+            container["id"] == self._container_name
+            for container in self._database_client.list_containers()
+        )
+
+    def load_documents(
+        self, documents: list[VectorStoreDocument], overwrite: bool = True
+    ) -> None:
+        """Load documents into CosmosDB.
+
+        Documents without a vector are skipped. When ``overwrite`` is true the
+        container is deleted and recreated before the upload.
+
+        Raises:
+            ValueError: If the store has not been connected.
+        """
+        # Validate before touching the container so an unconnected store fails
+        # with a clear error rather than midway through the delete/recreate.
+        self._require_container()
+
+        # Recreate the CosmosDB container on overwrite
+        if overwrite:
+            self._delete_container()
+            self._create_container()
+
+        # Re-read the client: _create_container replaces it.
+        container_client = self._require_container()
+
+        # Upload documents to CosmosDB
+        for doc in documents:
+            if doc.vector is None:
+                continue
+            container_client.upsert_item({
+                "id": doc.id,
+                "vector": doc.vector,
+                "text": doc.text,
+                "attributes": json.dumps(doc.attributes),
+            })
+
+    def similarity_search_by_vector(
+        self, query_embedding: list[float], k: int = 10, **kwargs: Any
+    ) -> list[VectorStoreSearchResult]:
+        """Perform a vector-based similarity search.
+
+        Returns up to ``k`` results ordered by ``VectorDistance`` to
+        ``query_embedding``; the distance is reported as each result's score.
+
+        Raises:
+            ValueError: If the store has not been connected.
+        """
+        container_client = self._require_container()
+
+        # k is interpolated into the query text, so force it to an integer to
+        # keep arbitrary text out of the statement.
+        top_k = int(k)
+        query = f"SELECT TOP {top_k} c.id, c.text, c.vector, c.attributes, VectorDistance(c.vector, @embedding) AS SimilarityScore FROM c ORDER BY VectorDistance(c.vector, @embedding)"  # noqa: S608
+        query_params = [{"name": "@embedding", "value": query_embedding}]
+        items = container_client.query_items(
+            query=query,
+            parameters=query_params,
+            enable_cross_partition_query=True,
+        )
+
+        return [
+            VectorStoreSearchResult(
+                document=VectorStoreDocument(
+                    id=item.get("id", ""),
+                    text=item.get("text", ""),
+                    vector=item.get("vector", []),
+                    attributes=(json.loads(item.get("attributes", "{}"))),
+                ),
+                score=item.get("SimilarityScore", 0.0),
+            )
+            for item in items
+        ]
+
+    def similarity_search_by_text(
+        self, text: str, text_embedder: TextEmbedder, k: int = 10, **kwargs: Any
+    ) -> list[VectorStoreSearchResult]:
+        """Perform a text-based similarity search.
+
+        Embeds ``text`` with ``text_embedder`` and delegates to
+        ``similarity_search_by_vector``; returns an empty list when the embedder
+        yields no embedding.
+        """
+        query_embedding = text_embedder(text)
+        if query_embedding:
+            return self.similarity_search_by_vector(
+                query_embedding=query_embedding, k=k
+            )
+        return []
+
+    @staticmethod
+    def _format_id(item_id: str | int) -> str:
+        """Render one id as a query literal, quoting and escaping strings."""
+        if isinstance(item_id, str):
+            # Escape backslashes first, then quotes, so an id cannot terminate
+            # the string literal early.
+            escaped = item_id.replace("\\", "\\\\").replace("'", "\\'")
+            return f"'{escaped}'"
+        return str(item_id)
+
+    def filter_by_id(self, include_ids: list[str] | list[int]) -> Any:
+        """Build a query filter to filter documents by a list of ids.
+
+        Stores the filter on ``self.query_filter`` and returns it; the filter is
+        ``None`` when ``include_ids`` is empty or ``None``.
+        """
+        if not include_ids:
+            self.query_filter = None
+        else:
+            id_filter = ", ".join(self._format_id(item_id) for item_id in include_ids)
+            self.query_filter = f"SELECT * FROM c WHERE c.id IN ({id_filter})"  # noqa: S608
+        return self.query_filter
+
+    def search_by_id(self, id: str) -> VectorStoreDocument:
+        """Search for a document by id.
+
+        Raises:
+            ValueError: If the store has not been connected.
+        """
+        container_client = self._require_container()
+
+        # The container is partitioned on /id, so the id is also the partition key.
+        item = container_client.read_item(item=id, partition_key=id)
+        return VectorStoreDocument(
+            id=item.get("id", ""),
+            vector=item.get("vector", []),
+            text=item.get("text", ""),
+            attributes=(json.loads(item.get("attributes", "{}"))),
+        )
+
+
+# Correctly spelled alias; the original class name is kept so existing imports
+# continue to work.
+CosmosDBVectorStore = CosmosDBVectoreStore
diff --git a/graphrag/vector_stores/factory.py b/graphrag/vector_stores/factory.py
@@ -6,8 +6,9 @@
 from enum import Enum
 from typing import ClassVar
 
-from graphrag.vector_stores.azure_ai_search import AzureAISearch
+from graphrag.vector_stores.azure_ai_search import AzureAISearchVectorStore
 from graphrag.vector_stores.base import BaseVectorStore
+from graphrag.vector_stores.cosmosdb import CosmosDBVectoreStore
 from graphrag.vector_stores.lancedb import LanceDBVectorStore
 
 
@@ -16,6 +17,7 @@ class VectorStoreType(str, Enum):
 
     LanceDB = "lancedb"
     AzureAISearch = "azure_ai_search"
+    CosmosDB = "cosmosdb"
 
 
 class VectorStoreFactory:
@@ -40,7 +42,9 @@ def create_vector_store(
             case VectorStoreType.LanceDB:
                 return LanceDBVectorStore(**kwargs)
             case VectorStoreType.AzureAISearch:
-                return AzureAISearch(**kwargs)
+                return AzureAISearchVectorStore(**kwargs)
+            case VectorStoreType.CosmosDB:
+                return CosmosDBVectoreStore(**kwargs)
             case _:
                 if vector_store_type in cls.vector_store_types:
                     return cls.vector_store_types[vector_store_type](**kwargs)

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +{
 +  "type": "patch",
 +  "description": "add cosmosdb vector store"
 +}
Original file line number	Diff line number	Diff line change
`@@ -33,7 +33,7 @@`
`33`	`33`	`)`
`34`	`34`
`35`	`35`
`36`		`-class AzureAISearch(BaseVectorStore):`
	`36`	`+class AzureAISearchVectorStore(BaseVectorStore):`
`37`	`37`	`"""Azure AI Search vector storage implementation."""`
`38`	`38`
`39`	`39`	`index_client: SearchIndexClient`