
Commit 8db533a

Merge pull request #14 from MITLibraries/USE-113-download-model
USE 113 - OSNeuralSparseDocV3GTE download and load
2 parents 462aef5 + 8899d51 commit 8db533a

File tree: 9 files changed, +1006 −7 lines

Dockerfile

Lines changed: 10 additions & 1 deletion

```diff
@@ -18,4 +18,13 @@ COPY embeddings ./embeddings
 # Install package into system python, includes "marimo-launcher" script
 RUN uv pip install --system .

-ENTRYPOINT ["embeddings"]
+# Download the model and include in the Docker image
+# NOTE: The env vars "TE_MODEL_URI" and "TE_MODEL_DOWNLOAD_PATH" are set here to support
+# the downloading of the model into this image build, but persist in the container and
+# effectively also set this as the default model.
+ENV HF_HUB_DISABLE_PROGRESS_BARS=true
+ENV TE_MODEL_URI=opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte
+ENV TE_MODEL_DOWNLOAD_PATH=/model
+RUN python -m embeddings.cli --verbose download-model
+
+ENTRYPOINT ["python", "-m", "embeddings.cli"]
```

README.md

Lines changed: 14 additions & 0 deletions

````diff
@@ -28,6 +28,20 @@ TE_MODEL_DOWNLOAD_PATH=# Download location for model
 HF_HUB_DISABLE_PROGRESS_BARS=#boolean to use progress bars for HuggingFace model downloads; defaults to 'true' in deployed contexts
 ```

+## Configuring an Embedding Model
+
+This CLI application is designed to create embeddings for input texts. To do this, a pre-trained model must be identified and configured for use.
+
+To this end, there is a base embedding class `BaseEmbeddingModel` that is designed to be extended and customized for a particular embedding model.
+
+Once an embedding class has been created, the preferred approach is to set env vars `TE_MODEL_URI` and `TE_MODEL_DOWNLOAD_PATH` directly in the `Dockerfile` to a) download a local snapshot of the model during image build, and b) set this model as the default for the CLI.
+
+This allows invoking the CLI without specifying a model URI or local location, allowing this model to serve as the default, e.g.:
+
+```shell
+uv run --env-file .env embeddings test-model-load
+```
+
 ## CLI Commands

 For local development, all CLI commands should be invoked with the following format to pickup environment variables from `.env`:
````
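The README addition above describes the extension pattern without showing it; a minimal sketch of a custom subclass, based on the abstract interface in `embeddings/models/base.py` below, might look like this (the class name `MyModel` and its `MODEL_URI` are illustrative, not part of this commit):

```python
from pathlib import Path

from embeddings.models.base import BaseEmbeddingModel


class MyModel(BaseEmbeddingModel):
    """Hypothetical embedding model; the URI below is illustrative only."""

    MODEL_URI = "example-org/example-model"

    def download(self, output_path: str | Path) -> Path:
        # fetch a local snapshot of the model and save it to output_path
        ...
        return Path(output_path)

    def load(self, model_path: str | Path) -> None:
        # load the model and tokenizer from the local snapshot
        ...
```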

embeddings/cli.py

Lines changed: 20 additions & 0 deletions

```diff
@@ -1,5 +1,6 @@
 import functools
 import logging
+import os
 import time
 from collections.abc import Callable
 from datetime import timedelta
@@ -90,6 +91,25 @@ def download_model(model_uri: str, output: Path) -> None:
     click.echo(result_path)


+@main.command()
+def test_model_load() -> None:
+    """Test loading of embedding class and local model based on env vars.
+
+    In a deployed context, the following env vars are expected:
+      - TE_MODEL_URI
+      - TE_MODEL_DOWNLOAD_PATH
+
+    With these set, the embedding class should be registered successfully and
+    initialized, and the model loaded from a local copy.
+    """
+    # load embedding model class
+    model_class = get_model_class(os.environ["TE_MODEL_URI"])
+    model = model_class()
+
+    model.load(os.environ["TE_MODEL_DOWNLOAD_PATH"])
+    click.echo("OK")
+
+
 @main.command()
 @model_required
 def create_embeddings(_model_uri: str) -> None:
```
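Since `test_model_load` reads both values straight from `os.environ`, local runs need them present in `.env`; a minimal sketch (values mirror the Dockerfile defaults, and the local path is an assumption):

```shell
# hypothetical .env for local testing of `embeddings test-model-load`
TE_MODEL_URI=opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte
# any local directory the model has already been downloaded to
TE_MODEL_DOWNLOAD_PATH=/model
```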

embeddings/models/base.py

Lines changed: 9 additions & 1 deletion

```diff
@@ -28,9 +28,17 @@ def model_uri(self) -> str:
         return self.MODEL_URI

     @abstractmethod
-    def download(self, output_path: Path) -> Path:
+    def download(self, output_path: str | Path) -> Path:
         """Download and prepare model, saving to output_path.

         Args:
             output_path: Path where the model zip should be saved.
         """
+
+    @abstractmethod
+    def load(self, model_path: str | Path) -> None:
+        """Load model from local, downloaded instance.
+
+        Args:
+            model_path: Path of local model directory.
+        """
```
Lines changed: 141 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,24 @@
11
"""OpenSearch Neural Sparse Doc v3 GTE model."""
22

3+
import json
34
import logging
5+
import shutil
6+
import tempfile
7+
import time
48
from pathlib import Path
9+
from typing import TYPE_CHECKING
10+
11+
from huggingface_hub import snapshot_download
12+
from transformers import AutoModelForMaskedLM, AutoTokenizer
513

614
from embeddings.models.base import BaseEmbeddingModel
715

16+
if TYPE_CHECKING:
17+
from transformers import PreTrainedModel
18+
from transformers.models.distilbert.tokenization_distilbert_fast import (
19+
DistilBertTokenizerFast,
20+
)
21+
822
logger = logging.getLogger(__name__)
923

1024

@@ -16,11 +30,134 @@ class OSNeuralSparseDocV3GTE(BaseEmbeddingModel):
1630

1731
MODEL_URI = "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte"
1832

19-
def download(self, output_path: Path) -> Path:
33+
def __init__(self) -> None:
34+
"""Initialize the model."""
35+
super().__init__()
36+
self._model: PreTrainedModel | None = None
37+
self._tokenizer: DistilBertTokenizerFast | None = None
38+
self._special_token_ids: list | None = None
39+
self._id_to_token: list | None = None
40+
41+
def download(self, output_path: str | Path) -> Path:
2042
"""Download and prepare model, saving to output_path.
2143
2244
Args:
23-
output_path: Path where the model zip should be saved.
45+
output_path: Path where the model should be saved.
2446
"""
25-
logger.info(f"Downloading model: { self.model_uri}, saving to: {output_path}.")
26-
raise NotImplementedError
47+
start_time = time.perf_counter()
48+
49+
output_path = Path(output_path)
50+
logger.info(f"Downloading model: {self.model_uri}, saving to: {output_path}.")
51+
52+
with tempfile.TemporaryDirectory() as temp_dir:
53+
temp_path = Path(temp_dir)
54+
55+
# download snapshot of HuggingFace model
56+
snapshot_download(repo_id=self.model_uri, local_dir=temp_path)
57+
logger.debug("Model download complete.")
58+
59+
# patch local model with files from dependency model "Alibaba-NLP/new-impl"
60+
self._patch_local_model_with_alibaba_new_impl(temp_path)
61+
62+
# compress model directory as a zip file
63+
if output_path.suffix.lower() == ".zip":
64+
logger.debug("Creating zip file of model contents.")
65+
shutil.make_archive(str(output_path.with_suffix("")), "zip", temp_path)
66+
67+
# copy to output directory without zipping
68+
else:
69+
logger.debug(f"Copying model contents to {output_path}")
70+
if output_path.exists():
71+
shutil.rmtree(output_path)
72+
shutil.copytree(temp_path, output_path)
73+
74+
logger.info(f"Model downloaded successfully, {time.perf_counter() - start_time}s")
75+
return output_path
76+
77+
def _patch_local_model_with_alibaba_new_impl(self, model_temp_path: Path) -> None:
78+
"""Patch downloaded model with required assets from Alibaba-NLP/new-impl.
79+
80+
Our main model, opensearch-project/opensearch-neural-sparse-encoding-doc-v3-gte,
81+
has configurations that attempt dynamic downloading of another model for files.
82+
This can be seen here: https://huggingface.co/opensearch-project/opensearch-
83+
neural-sparse-encoding-doc-v3-gte/blob/main/config.json#L6-L14.
84+
85+
To avoid our deployed CLI application making requests to the HuggingFace API to
86+
retrieve these required files, which is problematic during high concurrency, we
87+
manually download these files and patch the model during our local download and
88+
save.
89+
90+
This allows us to load the primary model without any HuggingFace API calls.
91+
"""
92+
logger.info("Downloading custom code from Alibaba-NLP/new-impl")
93+
with tempfile.TemporaryDirectory() as temp_dir:
94+
temp_path = Path(temp_dir)
95+
snapshot_download(
96+
repo_id="Alibaba-NLP/new-impl",
97+
local_dir=str(temp_path),
98+
)
99+
100+
logger.info("Copying Alibaba code and updating config.json")
101+
shutil.copy(temp_path / "modeling.py", model_temp_path / "modeling.py")
102+
shutil.copy(
103+
temp_path / "configuration.py",
104+
model_temp_path / "configuration.py",
105+
)
106+
107+
with open(model_temp_path / "config.json") as f:
108+
config_json = json.load(f)
109+
config_json["auto_map"] = {
110+
"AutoConfig": "configuration.NewConfig",
111+
"AutoModel": "modeling.NewModel",
112+
"AutoModelForMaskedLM": "modeling.NewForMaskedLM",
113+
"AutoModelForMultipleChoice": "modeling.NewForMultipleChoice",
114+
"AutoModelForQuestionAnswering": "modeling.NewForQuestionAnswering",
115+
"AutoModelForSequenceClassification": (
116+
"modeling.NewForSequenceClassification"
117+
),
118+
"AutoModelForTokenClassification": (
119+
"modeling.NewForTokenClassification"
120+
),
121+
}
122+
with open(model_temp_path / "config.json", "w") as f:
123+
f.write(json.dumps(config_json))
124+
125+
logger.debug("Dependency model Alibaba-NLP/new-impl downloaded and used.")
126+
127+
def load(self, model_path: str | Path) -> None:
128+
"""Load the model from the specified path.
129+
130+
Args:
131+
model_path: Path to the model directory.
132+
"""
133+
start_time = time.perf_counter()
134+
logger.info(f"Loading model from: {model_path}")
135+
model_path = Path(model_path)
136+
137+
# ensure model exists locally
138+
if not model_path.exists():
139+
raise FileNotFoundError(f"Model not found at path: {model_path}")
140+
141+
# load local model and tokenizer
142+
self._model = AutoModelForMaskedLM.from_pretrained(
143+
model_path,
144+
trust_remote_code=True,
145+
local_files_only=True,
146+
)
147+
self._tokenizer = AutoTokenizer.from_pretrained( # type: ignore[no-untyped-call]
148+
model_path,
149+
local_files_only=True,
150+
)
151+
152+
# setup special tokens
153+
self._special_token_ids = [
154+
self._tokenizer.vocab[str(token)]
155+
for token in self._tokenizer.special_tokens_map.values()
156+
]
157+
158+
# setup id_to_token mapping
159+
self._id_to_token = ["" for _ in range(self._tokenizer.vocab_size)]
160+
for token, token_id in self._tokenizer.vocab.items():
161+
self._id_to_token[token_id] = token
162+
163+
logger.info(f"Model loaded successfully, {time.perf_counter()-start_time}s")

pyproject.toml

Lines changed: 9 additions & 1 deletion

```diff
@@ -12,6 +12,8 @@ dependencies = [
     "huggingface-hub>=0.26.0",
     "sentry-sdk>=2.34.1",
     "timdex-dataset-api",
+    "torch>=2.9.0",
+    "transformers>=4.57.1",
 ]

 [dependency-groups]
@@ -32,7 +34,10 @@ line-length = 90
 [tool.mypy]
 disallow_untyped_calls = true
 disallow_untyped_defs = true
-exclude = ["tests/"]
+exclude = [
+    "tests/",
+    "output/"
+]

 [tool.pytest.ini_options]
 log_level = "INFO"
@@ -101,3 +106,6 @@ embeddings = "embeddings.cli:main"
 [build-system]
 requires = ["setuptools>=61"]
 build-backend = "setuptools.build_meta"
+
+[tool.setuptools]
+py-modules = ["embeddings"]
```
