Commit a50f256
OSNeuralSparseDocV3GTE model download and load
Why these changes are being introduced:

Each embedding class needs a way to download the model assets (e.g. weights and related files) locally, such that it can be loaded without calls to the HuggingFace API. Some models may require work beyond HF's `snapshot_download()` function, e.g. cloning dependency models or configurations. To test whether a model downloaded and configured correctly, you must then also load it. Ideally this includes creating a test embedding, but even a load without errors is a good step.

How this addresses that need:

The base class is extended to include a `load()` method. Our first embedding class, `OSNeuralSparseDocV3GTE`, has a first pass at `download()` and `load()` methods. The model we are using has some unusual dependency requirements that most commonly rely on additional HuggingFace calls at load time. To avoid this, we include some manual work to clone the dependency model `Alibaba-NLP/new-impl` and copy required files into our local model clone. The `load()` method confirms that the model loads successfully, without making any HuggingFace API calls.

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/USE-113
1 parent 462aef5 commit a50f256
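
For orientation, a minimal sketch of the download-then-load flow this commit enables. The import path is an assumption (the module filename is not captured in this diff), and the output directory is illustrative:

from embeddings.models import OSNeuralSparseDocV3GTE  # assumed import path

model = OSNeuralSparseDocV3GTE()

# one-time: snapshot the HF model, patch in Alibaba-NLP/new-impl files, save locally
local_path = model.download("output/os-neural-sparse-doc-v3-gte")

# later, e.g. in the deployed CLI: load entirely from the local copy, no HF API calls
model.load(local_path)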

File tree

5 files changed: +641, -6 lines changed


embeddings/cli.py

Lines changed: 20 additions & 0 deletions

@@ -1,5 +1,6 @@
 import functools
 import logging
+import os
 import time
 from collections.abc import Callable
 from datetime import timedelta

@@ -90,6 +91,25 @@ def download_model(model_uri: str, output: Path) -> None:
     click.echo(result_path)
 
 
+@main.command()
+def test_model_load() -> None:
+    """Test loading of embedding class and local model based on env vars.
+
+    In a deployed context, the following env vars are expected:
+    - TE_MODEL_URI
+    - TE_MODEL_DOWNLOAD_PATH
+
+    With these set, the embedding class should be registered successfully and
+    initialized, and the model loaded from a local copy.
+    """
+    # load embedding model class
+    model_class = get_model_class(os.environ["TE_MODEL_URI"])
+    model = model_class()
+
+    model.load(os.environ["TE_MODEL_DOWNLOAD_PATH"])
+    click.echo("OK")
+
+
 @main.command()
 @model_required
 def create_embeddings(_model_uri: str) -> None:
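
A quick way to exercise the new command without a deployed environment is click's test runner; a sketch, where the download path value is illustrative and assumes download_model has already populated it:

from click.testing import CliRunner

from embeddings.cli import main

runner = CliRunner()
result = runner.invoke(
    main,
    ["test-model-load"],
    env={
        "TE_MODEL_URI": "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte",
        "TE_MODEL_DOWNLOAD_PATH": "output/model",  # illustrative local model directory
    },
)
assert result.exit_code == 0, result.output
assert "OK" in result.output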

embeddings/models/base.py

Lines changed: 9 additions & 1 deletion

@@ -28,9 +28,17 @@ def model_uri(self) -> str:
         return self.MODEL_URI
 
     @abstractmethod
-    def download(self, output_path: Path) -> Path:
+    def download(self, output_path: str | Path) -> Path:
        """Download and prepare model, saving to output_path.
 
         Args:
             output_path: Path where the model zip should be saved.
         """
+
+    @abstractmethod
+    def load(self, model_path: str | Path) -> None:
+        """Load model from local, downloaded instance.
+
+        Args:
+            model_path: Path of local model directory.
+        """
Lines changed: 141 additions & 4 deletions

@@ -1,10 +1,24 @@
 """OpenSearch Neural Sparse Doc v3 GTE model."""
 
+import json
 import logging
+import shutil
+import tempfile
+import time
 from pathlib import Path
+from typing import TYPE_CHECKING
+
+from huggingface_hub import snapshot_download
+from transformers import AutoModelForMaskedLM, AutoTokenizer
 
 from embeddings.models.base import BaseEmbeddingModel
 
+if TYPE_CHECKING:
+    from transformers import PreTrainedModel
+    from transformers.models.distilbert.tokenization_distilbert_fast import (
+        DistilBertTokenizerFast,
+    )
+
 logger = logging.getLogger(__name__)
 
 

@@ -16,11 +30,134 @@ class OSNeuralSparseDocV3GTE(BaseEmbeddingModel):
 
     MODEL_URI = "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte"
 
-    def download(self, output_path: Path) -> Path:
+    def __init__(self) -> None:
+        """Initialize the model."""
+        super().__init__()
+        self._model: PreTrainedModel | None = None
+        self._tokenizer: DistilBertTokenizerFast | None = None
+        self._special_token_ids: list | None = None
+        self._id_to_token: list | None = None
+
+    def download(self, output_path: str | Path) -> Path:
         """Download and prepare model, saving to output_path.
 
         Args:
-            output_path: Path where the model zip should be saved.
+            output_path: Path where the model should be saved.
         """
-        logger.info(f"Downloading model: { self.model_uri}, saving to: {output_path}.")
-        raise NotImplementedError
+        start_time = time.perf_counter()
+
+        output_path = Path(output_path)
+        logger.info(f"Downloading model: {self.model_uri}, saving to: {output_path}.")
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_path = Path(temp_dir)
+
+            # download snapshot of HuggingFace model
+            snapshot_download(repo_id=self.model_uri, local_dir=temp_path)
+            logger.debug("Model download complete.")
+
+            # patch local model with files from dependency model "Alibaba-NLP/new-impl"
+            self._patch_local_model_with_alibaba_new_impl(temp_path)
+
+            # compress model directory as a zip file
+            if output_path.suffix.lower() == ".zip":
+                logger.debug("Creating zip file of model contents.")
+                shutil.make_archive(str(output_path.with_suffix("")), "zip", temp_path)
+
+            # copy to output directory without zipping
+            else:
+                logger.debug(f"Copying model contents to {output_path}")
+                if output_path.exists():
+                    shutil.rmtree(output_path)
+                shutil.copytree(temp_path, output_path)
+
+        logger.info(f"Model downloaded successfully, {time.perf_counter() - start_time}s")
+        return output_path
+
+    def _patch_local_model_with_alibaba_new_impl(self, model_temp_path: Path) -> None:
+        """Patch downloaded model with required assets from Alibaba-NLP/new-impl.
+
+        Our main model, opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte,
+        has configurations that attempt dynamic downloading of another model for files.
+        This can be seen here: https://huggingface.co/opensearch-project/opensearch-
+        neural-sparse-encoding-doc-v3-gte/blob/main/config.json#L6-L14.
+
+        To avoid our deployed CLI application making requests to the HuggingFace API to
+        retrieve these required files, which is problematic during high concurrency, we
+        manually download these files and patch the model during our local download and
+        save.
+
+        This allows us to load the primary model without any HuggingFace API calls.
+        """
+        logger.info("Downloading custom code from Alibaba-NLP/new-impl")
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_path = Path(temp_dir)
+            snapshot_download(
+                repo_id="Alibaba-NLP/new-impl",
+                local_dir=str(temp_path),
+            )
+
+            logger.info("Copying Alibaba code and updating config.json")
+            shutil.copy(temp_path / "modeling.py", model_temp_path / "modeling.py")
+            shutil.copy(
+                temp_path / "configuration.py",
+                model_temp_path / "configuration.py",
+            )
+
+        with open(model_temp_path / "config.json") as f:
+            config_json = json.load(f)
+        config_json["auto_map"] = {
+            "AutoConfig": "configuration.NewConfig",
+            "AutoModel": "modeling.NewModel",
+            "AutoModelForMaskedLM": "modeling.NewForMaskedLM",
+            "AutoModelForMultipleChoice": "modeling.NewForMultipleChoice",
+            "AutoModelForQuestionAnswering": "modeling.NewForQuestionAnswering",
+            "AutoModelForSequenceClassification": (
+                "modeling.NewForSequenceClassification"
+            ),
+            "AutoModelForTokenClassification": (
+                "modeling.NewForTokenClassification"
+            ),
+        }
+        with open(model_temp_path / "config.json", "w") as f:
+            f.write(json.dumps(config_json))
+
+        logger.debug("Dependency model Alibaba-NLP/new-impl downloaded and used.")
+
+    def load(self, model_path: str | Path) -> None:
+        """Load the model from the specified path.
+
+        Args:
+            model_path: Path to the model directory.
+        """
+        start_time = time.perf_counter()
+        logger.info(f"Loading model from: {model_path}")
+        model_path = Path(model_path)
+
+        # ensure model exists locally
+        if not model_path.exists():
+            raise FileNotFoundError(f"Model not found at path: {model_path}")
+
+        # load local model and tokenizer
+        self._model = AutoModelForMaskedLM.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            local_files_only=True,
+        )
+        self._tokenizer = AutoTokenizer.from_pretrained(  # type: ignore[no-untyped-call]
+            model_path,
+            local_files_only=True,
+        )
+
+        # setup special tokens
+        self._special_token_ids = [
+            self._tokenizer.vocab[str(token)]
+            for token in self._tokenizer.special_tokens_map.values()
+        ]
+
+        # setup id_to_token mapping
+        self._id_to_token = ["" for _ in range(self._tokenizer.vocab_size)]
+        for token, token_id in self._tokenizer.vocab.items():
+            self._id_to_token[token_id] = token
+
+        logger.info(f"Model loaded successfully, {time.perf_counter() - start_time}s")

pyproject.toml

Lines changed: 9 additions & 1 deletion

@@ -12,6 +12,8 @@ dependencies = [
     "huggingface-hub>=0.26.0",
     "sentry-sdk>=2.34.1",
     "timdex-dataset-api",
+    "torch>=2.9.0",
+    "transformers>=4.57.1",
 ]
 
 [dependency-groups]

@@ -32,7 +34,10 @@ line-length = 90
 [tool.mypy]
 disallow_untyped_calls = true
 disallow_untyped_defs = true
-exclude = ["tests/"]
+exclude = [
+    "tests/",
+    "output/"
+]
 
 [tool.pytest.ini_options]
 log_level = "INFO"

@@ -101,3 +106,6 @@ embeddings = "embeddings.cli:main"
 [build-system]
 requires = ["setuptools>=61"]
 build-backend = "setuptools.build_meta"
+
+[tool.setuptools]
+py-modules = ["embeddings"]
