Commit fc0bdea

OSNeuralSparseDocV3GTE embedding business logic
Why these changes are being introduced:

With the rest of the moving parts mostly complete, our first embedding class OSNeuralSparseDocV3GTE was ready for a real create_embedding() method.

How this addresses that need:

The HuggingFace model card, https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte, contains example code for using this model with the transformers library. That logic was ported over to our class; model download and loading were already handled. The biggest addition is the method _decode_sparse_vectors(), which converts the numerical sparse vector into a dictionary of token:weight pairs. This decoded token-weight form is what we'll pass directly to OpenSearch.

With the new functionality in place, the tests associated with this class were also updated. Fixtures were moved into the test file itself, a pattern we could adopt for any future models to keep them out of the shared conftest.py.

Side effects of this change:

* CLI is capable of producing embeddings for our first model, OSNeuralSparseDocV3GTE

Relevant ticket(s):

* https://mitlibraries.atlassian.net/browse/USE-136
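For a concrete sense of the decoded form: _decode_sparse_vectors() yields a flat token:weight dictionary, which is the shape OpenSearch expects for sparse (rank_features-style) fields. A hypothetical sketch, with the document layout and all values invented for illustration:

# Hypothetical sketch -- the dict shape matches _decode_sparse_vectors() output,
# but the OpenSearch document layout and all values here are invented.
token_weights = {"cat": 1.2, "dog": 1.19}

opensearch_doc = {
    "timdex_record_id": "abc:123",
    "embedding_token_weights": token_weights,  # sparse token:weight map, ready to index
}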
1 parent c563d2a commit fc0bdea

File tree

5 files changed: +414 -138 lines changed

embeddings/cli.py

Lines changed: 1 addition & 0 deletions
@@ -210,6 +210,7 @@ def create_embeddings(
 ) -> None:
     """Create embeddings for TIMDEX records."""
     model: BaseEmbeddingModel = ctx.obj["model"]
+    model.load()
 
     # init TIMDEXDataset
     timdex_dataset = TIMDEXDataset(dataset_location)
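With this change, load() must run before any embeddings are created. A minimal sketch of the resulting flow, assuming the classes in this commit (the path and field values are invented):

# Minimal sketch -- paths and field values are invented for illustration.
from embeddings.embedding import EmbeddingInput
from embeddings.models.os_neural_sparse_doc_v3_gte import OSNeuralSparseDocV3GTE

model = OSNeuralSparseDocV3GTE("/path/to/downloaded/model")
model.load()  # required: _encode_documents() raises RuntimeError otherwise

embedding = model.create_embedding(
    EmbeddingInput(
        timdex_record_id="abc:123",
        run_id="run-1",
        run_record_offset=0,
        embedding_strategy="full_record",
        text="cat and dog",
    )
)
print(embedding.embedding_token_weights)  # e.g. {"cat": 1.2, "dog": 1.19, ...}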

embeddings/embedding.py

Lines changed: 12 additions & 0 deletions
@@ -23,6 +23,12 @@ class EmbeddingInput:
     embedding_strategy: str
     text: str
 
+    def __repr__(self) -> str:  # noqa: D105
+        return (
+            f"<EmbeddingInput - record:'{self.timdex_record_id}', "
+            f"strategy:'{self.embedding_strategy}', text length:{len(self.text)}>"
+        )
+
 
 @dataclass
 class Embedding:

@@ -49,6 +55,12 @@ class Embedding:
         default_factory=lambda: datetime.datetime.now(datetime.UTC)
     )
 
+    def __repr__(self) -> str:  # noqa: D105
+        return (
+            f"<Embedding - record:'{self.timdex_record_id}', "
+            f"strategy:'{self.embedding_strategy}'>"
+        )
+
     def to_dict(self) -> dict:
         """Marshal to dictionary."""
         return asdict(self)
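The new __repr__ methods produce output along these lines (a sketch with invented field values):

# Sketch -- field values invented.
record_input = EmbeddingInput(
    timdex_record_id="alma:990001",
    run_id="run-1",
    run_record_offset=0,
    embedding_strategy="full_record",
    text="A short document.",
)
print(repr(record_input))
# <EmbeddingInput - record:'alma:990001', strategy:'full_record', text length:17>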

embeddings/models/os_neural_sparse_doc_v3_gte.py

Lines changed: 218 additions & 15 deletions
@@ -8,6 +8,7 @@
 from pathlib import Path
 from typing import TYPE_CHECKING
 
+import torch
 from huggingface_hub import snapshot_download
 from transformers import AutoModelForMaskedLM, AutoTokenizer
 

@@ -26,6 +27,9 @@
 class OSNeuralSparseDocV3GTE(BaseEmbeddingModel):
     """OpenSearch Neural Sparse Encoding Doc v3 GTE model.
 
+    This model generates sparse embeddings for documents by using a masked language
+    model's logits to identify the most relevant tokens.
+
     HuggingFace URI: opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte
     """
 

@@ -40,8 +44,8 @@ def __init__(self, model_path: str | Path) -> None:
         super().__init__(model_path)
         self._model: PreTrainedModel | None = None
         self._tokenizer: DistilBertTokenizerFast | None = None
-        self._special_token_ids: list | None = None
-        self._id_to_token: list | None = None
+        self._special_token_ids: list[int] | None = None
+        self._device: torch.device = torch.device("cpu")
 
     def download(self) -> Path:
         """Download and prepare model, saving to self.model_path.

@@ -139,29 +143,228 @@ def load(self) -> None:
         if not self.model_path.exists():
             raise FileNotFoundError(f"Model not found at path: {self.model_path}")
 
-        # load local model and tokenizer
-        self._model = AutoModelForMaskedLM.from_pretrained(
+        # setup device (use CUDA if available, otherwise CPU)
+        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        # load tokenizer
+        self._tokenizer = AutoTokenizer.from_pretrained(  # type: ignore[no-untyped-call]
             self.model_path,
-            trust_remote_code=True,
             local_files_only=True,
         )
-        self._tokenizer = AutoTokenizer.from_pretrained(  # type: ignore[no-untyped-call]
+
+        # load model as AutoModelForMaskedLM (required for sparse embeddings)
+        self._model = AutoModelForMaskedLM.from_pretrained(
             self.model_path,
+            trust_remote_code=True,
             local_files_only=True,
         )
+        self._model.to(self._device)  # type: ignore[arg-type]
+        self._model.eval()
 
-        # setup special tokens
+        # set special token IDs (following model card pattern)
+        # these will be zeroed out in the sparse vectors
         self._special_token_ids = [
-            self._tokenizer.vocab[str(token)]
+            self._tokenizer.vocab[token]  # type: ignore[index]
            for token in self._tokenizer.special_tokens_map.values()
         ]
 
-        # setup id_to_token mapping
-        self._id_to_token = ["" for _ in range(self._tokenizer.vocab_size)]
-        for token, token_id in self._tokenizer.vocab.items():
-            self._id_to_token[token_id] = token
-
-        logger.info(f"Model loaded successfully, {time.perf_counter()-start_time}s")
+        logger.info(
+            f"Model loaded successfully on {self._device}, "
+            f"{time.perf_counter() - start_time:.2f}s"
+        )
 
     def create_embedding(self, input_record: EmbeddingInput) -> Embedding:
-        raise NotImplementedError
+        """Create sparse embeddings for the input text (document encoding).
+
+        This method generates sparse document embeddings.
+
+        Process follows the model card exactly:
+        1. Tokenize the document
+        2. Pass through the masked language model to get logits
+        3. Convert logits to sparse vector
+        4. Return both raw sparse vector and decoded token-weight pairs
+
+        Args:
+            input_record: The input containing text to embed
+        """
+        # generate the sparse embeddings
+        sparse_vector, decoded_tokens = self._encode_documents([input_record.text])[0]
+
+        # coerce sparse vector tensor into list[float]
+        sparse_vector_list = sparse_vector.cpu().numpy().tolist()
+
+        return Embedding(
+            timdex_record_id=input_record.timdex_record_id,
+            run_id=input_record.run_id,
+            run_record_offset=input_record.run_record_offset,
+            model_uri=self.model_uri,
+            embedding_strategy=input_record.embedding_strategy,
+            embedding_vector=sparse_vector_list,
+            embedding_token_weights=decoded_tokens,
+        )
+
209+
) -> list[tuple[torch.Tensor, dict[str, float]]]:
210+
"""Encode documents into sparse vectors and decoded token weights.
211+
212+
This follows the pattern outlined on the HuggingFace model card for document
213+
encoding.
214+
215+
This method will accommodate a list of text inputs, and return a list of
216+
embeddings, but the calling base method create_embeddings() is a singular input +
217+
output. This method keeps the ability to handle multiple inputs + outputs, in the
218+
event we want something like a create_multiple_embeddings() method in the future.
219+
220+
The following is a rough approximation of receiving logits back from the model
221+
and converting this to a sparse vector which can then be decoded to token:weights:
222+
223+
----------------------------------------------------------------------------------
224+
Imagine your vocabulary is just 5 words: ["cat", "dog", "bird", "fish", "tree"]
225+
Vocabulary indices: [ 0, 1, 2, 3, 4]
226+
227+
1. MODEL RETURNS LOGITS
228+
Let's say you input the text: "cat and dog"
229+
After tokenization, you have 3 tokens at 3 sequence positions
230+
The model outputs logits - a score for EVERY vocab word at EVERY position:
231+
232+
logits = [
233+
# Position 0 (word "cat"): scores for each vocab word at this position
234+
[9.2, 1.1, 0.3, 0.5, 0.2], # "cat" gets high score (9.2)
235+
236+
# Position 1 (word "and" - not in our toy vocab, but tokenized somehow):
237+
[2.1, 1.8, 0.4, 0.3, 0.9], # moderate scores everywhere
238+
239+
# Position 2 (word "dog"):
240+
[0.8, 8.7, 0.2, 0.4, 0.1], # "dog" gets high score (8.7)
241+
]
242+
Shape: (3 positions, 5 vocab words)
243+
244+
245+
2. PRODUCE SPARSE VECTORS FROM LOGITS
246+
We collapse the sequence positions by taking the MAX score for each vocab word:
247+
248+
sparse_vector = [
249+
max(9.2, 2.1, 0.8), # "cat": take max across all 3 positions = 9.2
250+
max(1.1, 1.8, 8.7), # "dog": take max = 8.7
251+
max(0.3, 0.4, 0.2), # "bird": take max = 0.4
252+
max(0.5, 0.3, 0.4), # "fish": take max = 0.5
253+
max(0.2, 0.9, 0.1), # "tree": take max = 0.9
254+
]
255+
256+
Apply transformations (ReLU, double-log) to make it sparser:
257+
sparse_vector = [5.1, 4.8, 0.0, 0.0, 0.0] # smaller values become 0
258+
259+
Final result:
260+
{"cat": 5.1, "dog": 4.8} # Only the relevant words have non-zero weights
261+
----------------------------------------------------------------------------------
262+
263+
Args:
264+
texts: list of strings to create embeddings for
265+
"""
+        if self._model is None or self._tokenizer is None:
+            raise RuntimeError("Model not loaded. Call load() before create_embedding.")
+
+        # tokenize the input texts
+        features = self._tokenizer(
+            texts,
+            padding=True,
+            truncation=True,
+            return_tensors="pt",  # returns PyTorch tensors instead of Python lists
+            return_token_type_ids=False,
+        )
+
+        # move to CPU or GPU device, depending on what's available
+        features = {k: v.to(self._device) for k, v in features.items()}
+
+        # get model logits output
+        with torch.no_grad():
+            output = self._model(**features)[0]
+
+        # generate sparse vectors from model logits
+        sparse_vectors = self._get_sparse_vectors(features, output)
+
+        # decode to token-weight dictionaries
+        decoded = self._decode_sparse_vectors(sparse_vectors)
+
+        # return list of tuple(vector, decoded token weights) embedding results
+        return [(sparse_vectors[i], decoded[i]) for i in range(len(texts))]
+
+    def _get_sparse_vectors(
+        self, features: dict[str, torch.Tensor], output: torch.Tensor
+    ) -> torch.Tensor:
+        """Convert model logits output to sparse vectors.
+
+        This follows the HuggingFace model card exactly:
+        https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte#usage-huggingface
+
+        This implements the get_sparse_vector function from the model card:
+        1. Max pooling with attention mask
+        2. log(1 + log(1 + relu())) transformation
+        3. Zero out special tokens
+
+        Args:
+            features: Tokenizer output with attention_mask
+            output: Model logits of shape (batch_size, seq_len, vocab_size)
+
+        Returns:
+            Sparse vectors of shape (batch_size, vocab_size)
+        """
+        # max pooling with attention mask
+        values, _ = torch.max(output * features["attention_mask"].unsqueeze(-1), dim=1)
+
+        # apply the v3 model activation
+        values = torch.log(1 + torch.log(1 + torch.relu(values)))
+
+        # zero out special tokens
+        values[:, self._special_token_ids] = 0
+
+        return values
+
+    def _decode_sparse_vectors(
+        self, sparse_vectors: torch.Tensor
+    ) -> list[dict[str, float]]:
+        """Convert sparse vectors to token-weight dictionaries.
+
+        Handles both single vectors and batches, returning a list of dictionaries
+        mapping token strings to their weights.
+
+        Args:
+            sparse_vectors: Tensor of shape (batch_size, vocab_size) or (vocab_size,)
+
+        Returns:
+            List of dictionaries with token-weight pairs
+        """
+        if sparse_vectors.dim() == 1:
+            sparse_vectors = sparse_vectors.unsqueeze(0)
+
+        # move to CPU for processing
+        sparse_vectors_cpu = sparse_vectors.cpu()
+
+        results: list[dict] = []
+        for vector in sparse_vectors_cpu:
+
+            # find non-zero indices and values
+            nonzero_indices = torch.nonzero(vector, as_tuple=False).squeeze(-1)
+
+            if nonzero_indices.numel() == 0:
+                results.append({})
+                continue
+
+            # get weights
+            weights = vector[nonzero_indices].tolist()
+
+            # convert indices to token strings
+            token_ids = nonzero_indices.tolist()
+            tokens = self._tokenizer.convert_ids_to_tokens(token_ids)  # type: ignore[union-attr]
+
+            # create token:weight dictionary
+            token_dict = {
+                token: weight
+                for token, weight in zip(tokens, weights, strict=True)
+                if token is not None
+            }
+            results.append(token_dict)
+
+        return results
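To check the toy walkthrough in the _encode_documents() docstring against the actual math, here is a self-contained sketch of the same pipeline on the invented 5-word vocabulary (standalone torch code, not the class's API):

import torch

# toy vocabulary and invented logits for "cat and dog": (3 positions, 5 vocab words)
vocab = ["cat", "dog", "bird", "fish", "tree"]
logits = torch.tensor(
    [
        [9.2, 1.1, 0.3, 0.5, 0.2],  # position 0: "cat"
        [2.1, 1.8, 0.4, 0.3, 0.9],  # position 1: "and"
        [0.8, 8.7, 0.2, 0.4, 0.1],  # position 2: "dog"
    ]
)
attention_mask = torch.ones(3)  # all positions are real tokens (no padding)

# 1. max pooling across sequence positions, masked by the attention mask
#    (mirrors _get_sparse_vectors(), minus the batch dimension)
values, _ = torch.max(logits * attention_mask.unsqueeze(-1), dim=0)

# 2. ReLU + double-log activation
sparse_vector = torch.log(1 + torch.log(1 + torch.relu(values)))

# 3. decode non-zero entries to token:weight pairs (mirrors _decode_sparse_vectors())
token_weights = {
    vocab[i]: round(sparse_vector[i].item(), 2)
    for i in torch.nonzero(sparse_vector).squeeze(-1).tolist()
}
print(token_weights)
# {'cat': 1.2, 'dog': 1.19, 'bird': 0.29, 'fish': 0.34, 'tree': 0.5}

All five toy weights survive here because every invented logit is positive; with real model logits, most vocabulary scores are negative and the ReLU zeroes them out, which is where the sparsity comes from.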

tests/conftest.py

Lines changed: 1 addition & 79 deletions
@@ -71,35 +71,7 @@ def register_mock_model(monkeypatch):
 
 
 @pytest.fixture
-def neural_sparse_doc_v3_gte_fake_model_directory(tmp_path):
-    """Create a fake downloaded model directory with required files."""
-    model_dir = tmp_path / "fake_model"
-    model_dir.mkdir()
-
-    # create config.json
-    config_json = {
-        "model_type": "distilbert",
-        "vocab_size": 30000,
-        "auto_map": {
-            "AutoConfig": "Alibaba-NLP/new-impl--configuration.NewConfig",
-            "AutoModel": "Alibaba-NLP/new-impl--modeling.NewModel",
-        },
-    }
-    (model_dir / "config.json").write_text(json.dumps(config_json))
-
-    # create modeling.py and configuration.py
-    (model_dir / "modeling.py").write_text("# mock modeling code")
-    (model_dir / "configuration.py").write_text("# mock configuration code")
-
-    # create tokenizer files
-    (model_dir / "tokenizer.json").write_text('{"version": "1.0"}')
-    (model_dir / "vocab.txt").write_text("word1\nword2\n")
-
-    return model_dir
-
-
-@pytest.fixture
-def neural_sparse_doc_v3_gte_mock_huggingface_snapshot(monkeypatch, tmp_path):
+def mock_snapshot_download(monkeypatch, tmp_path):
     """Mock snapshot_download to create fake model files locally."""
 
     def mock_snapshot(repo_id, local_dir, **kwargs):

@@ -132,53 +104,3 @@ def mock_snapshot(repo_id, local_dir, **kwargs):
         "embeddings.models.os_neural_sparse_doc_v3_gte.snapshot_download", mock_snapshot
     )
     return mock_snapshot
-
-
-@pytest.fixture
-def neural_sparse_doc_v3_gte_mock_transformers_models(monkeypatch):
-    """Mock AutoModelForMaskedLM and AutoTokenizer."""
-
-    class MockTokenizer:
-        """Mock tokenizer with necessary attributes."""
-
-        def __init__(self, *args, **kwargs):  # noqa: ARG002
-            self.vocab = {
-                "[CLS]": 0,
-                "[SEP]": 1,
-                "[PAD]": 2,
-                "word1": 3,
-                "word2": 4,
-            }
-            self.vocab_size = len(self.vocab)
-            self.special_tokens_map = {
-                "cls_token": "[CLS]",
-                "sep_token": "[SEP]",
-                "pad_token": "[PAD]",
-            }
-
-    class MockModel:
-        """Mock model with necessary attributes."""
-
-        def __init__(self, *args, **kwargs):  # noqa: ARG002
-            self.config = {"vocab_size": 30000}
-
-    class MockAutoTokenizer:
-        @staticmethod
-        def from_pretrained(*args, **kwargs):  # noqa: ARG004
-            return MockTokenizer()
-
-    class MockAutoModelForMaskedLM:
-        @staticmethod
-        def from_pretrained(*args, **kwargs):  # noqa: ARG004
-            return MockModel()
-
-    monkeypatch.setattr(
-        "embeddings.models.os_neural_sparse_doc_v3_gte.AutoTokenizer",
-        MockAutoTokenizer,
-    )
-    monkeypatch.setattr(
-        "embeddings.models.os_neural_sparse_doc_v3_gte.AutoModelForMaskedLM",
-        MockAutoModelForMaskedLM,
-    )
-
-    return {"tokenizer": MockTokenizer, "model": MockModel}
