Skip to content

Commit b5c561f

Browse files
Merge pull request #831 from Anush008/fastembed-support
feat: FastEmbedVectorizer
2 parents d7dace6 + 7738d8b commit b5c561f

File tree

5 files changed

+96
-2
lines changed

5 files changed

+96
-2
lines changed

dsp/modules/sentence_vectorizer.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,3 +203,48 @@ def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
203203

204204
embeddings = np.array(embeddings_list, dtype=np.float32)
205205
return embeddings
206+
207+
class FastEmbedVectorizer(BaseSentenceVectorizer):
    """Sentence vectorizer implementation using FastEmbed - https://qdrant.github.io/fastembed."""

    def __init__(
        self,
        model_name: str = "BAAI/bge-small-en-v1.5",
        batch_size: int = 256,
        cache_dir: Optional[str] = None,
        threads: Optional[int] = None,
        parallel: Optional[int] = None,
        **kwargs,
    ):
        """Initialize fastembed.TextEmbedding.

        Args:
            model_name (str): The name of the model to use. Defaults to `"BAAI/bge-small-en-v1.5"`.
            batch_size (int): Batch size for encoding. Higher values will use more memory, but be faster.\
                Defaults to 256.
            cache_dir (str, optional): The path to the model cache directory.\
                Can also be set using the `FASTEMBED_CACHE_PATH` env variable.
            threads (int, optional): The number of threads single onnxruntime session can use.
            parallel (int, optional): If `>1`, data-parallel encoding will be used, recommended for large datasets.\
                If `0`, use all available cores.\
                If `None`, don't use data-parallel processing, use default onnxruntime threading.\
                Defaults to None.
            **kwargs: Additional options to pass to fastembed.TextEmbedding.

        Raises:
            ValueError: If the 'fastembed' package is not installed, or if the model_name is not
                in the format <org>/<model> e.g. BAAI/bge-small-en-v1.5.
        """
        # Import lazily so the module remains importable when the optional
        # 'fastembed' extra is not installed.
        try:
            from fastembed import TextEmbedding
        except ImportError as e:
            # Kept as ValueError (not re-raising ImportError) for backward
            # compatibility with existing callers that catch ValueError.
            raise ValueError(
                "The 'fastembed' package is not installed. Please install it with `pip install fastembed`",
            ) from e
        self._batch_size = batch_size
        self._parallel = parallel
        self._model = TextEmbedding(model_name=model_name, cache_dir=cache_dir, threads=threads, **kwargs)

    def __call__(self, inp_examples: List["Example"]) -> np.ndarray:
        """Embed the text of each input into a single float32 matrix.

        Args:
            inp_examples: Examples (or plain strings, as handled by the base
                class's `_extract_text_from_examples`) to vectorize.

        Returns:
            np.ndarray: Array of shape (len(inp_examples), embedding_dim) with dtype float32.
        """
        texts_to_vectorize = self._extract_text_from_examples(inp_examples)
        # TextEmbedding.embed yields one vector per input text; materialize the
        # stream into a dense 2-D array.
        embeddings = self._model.embed(texts_to_vectorize, batch_size=self._batch_size, parallel=self._parallel)

        return np.array([embedding.tolist() for embedding in embeddings], dtype=np.float32)

poetry.lock

Lines changed: 4 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ docs = [
6363
"sphinx-automodapi==0.16.0",
6464
]
6565
dev = ["pytest>=6.2.5"]
66+
fastembed = ["fastembed>=0.2.0"]
6667

6768
[project.urls]
6869
homepage = "https://github.com/stanfordnlp/dspy"
@@ -96,7 +97,7 @@ requests = "^2.31.0"
9697
optuna = "^3.4.0"
9798
anthropic = { version = "^0.18.0", optional = true }
9899
chromadb = { version = "^0.4.14", optional = true }
99-
fastembed = { version = "^0.2.0", optional = true }
100+
fastembed = { version = ">=0.2.0", optional = true }
100101
marqo = { version = "*", optional = true }
101102
qdrant-client = { version = "^1.6.2", optional = true }
102103
pinecone-client = { version = "^2.2.4", optional = true }
@@ -153,6 +154,7 @@ docs = [
153154
"sphinx-reredirects",
154155
"sphinx-automodapi",
155156
]
157+
fastembed = ["fastembed"]
156158

157159
[tool.poetry.group.doc.dependencies]
158160
mkdocs = ">=1.5.3"

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
"faiss-cpu": ["sentence_transformers", "faiss-cpu"],
3232
"milvus": ["pymilvus~=2.3.7"],
3333
"google-vertex-ai": ["google-cloud-aiplatform==1.43.0"],
34+
"fastembed": ["fastembed"],
3435
},
3536
classifiers=[
3637
"Development Status :: 3 - Alpha",
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
from dsp.modules.sentence_vectorizer import FastEmbedVectorizer
2+
import pytest
3+
4+
from dspy.primitives.example import Example
5+
6+
# Skip the test if the 'fastembed' package is not installed
7+
pytest.importorskip("fastembed", reason="'fastembed' is not installed. Use `pip install fastembed` to install it.")
8+
9+
10+
@pytest.mark.parametrize(
    "n_dims,model_name", [(384, "BAAI/bge-small-en-v1.5"), (512, "jinaai/jina-embeddings-v2-small-en")]
)
def test_fastembed_with_examples(n_dims, model_name):
    """Vectorizing dspy Examples yields a matrix of shape (num_examples, n_dims)."""
    qa_pairs = [
        ("What's the price today?", "The price is $10.00"),
        ("What's the weather today?", "The weather is sunny"),
        ("Who was leading the team?", "It was Jim. Rather enthusiastic guy."),
    ]
    dataset = [
        Example(query=query, response=response).with_inputs("query", "response")
        for query, response in qa_pairs
    ]

    vectorized = FastEmbedVectorizer(model_name)(dataset)

    assert vectorized.shape == (len(dataset), n_dims)
27+
28+
29+
@pytest.mark.parametrize(
    "n_dims,model_name", [(384, "BAAI/bge-small-en-v1.5"), (512, "jinaai/jina-embeddings-v2-small-en")]
)
def test_fastembed_with_strings(n_dims, model_name):
    """Vectorizing plain strings yields a matrix of shape (num_inputs, n_dims)."""
    sentences = [
        "Jonathan Kent is a fictional character appearing in American comic books published by DC Comics.",
        "Clark Kent is a fictional character appearing in American comic books published by DC Comics.",
        "Martha Kent is a fictional character appearing in American comic books published by DC Comics.",
    ]

    matrix = FastEmbedVectorizer(model_name)(sentences)

    assert matrix.shape == (len(sentences), n_dims)

0 commit comments

Comments
 (0)