Introduce strategies framework for preparing EmbeddingInputs

ghukill · ghukill · commit cb03062c510b · 2025-11-03T15:22:08.000-05:00
Why these changes are being introduced: A core requirement of this application is the ability to take a TIMDEX JSON record and "transform" all or parts of it into a single string for which an embedding can be created. We are calling these "embedding strategies" in the context of this app. While our first strategy will likely be a very simple, full record approach, we want to support multiple strategies in the application, and even multiple strategies for a single record in a single invocation. How this addresses that need: * A new 'strategies' module is created * A base 'BaseStrategy' class, with a required 'extract_text()' method for implementations * Our first strategy is represented in class 'FullRecordStrategy', which JSON dumps the entire TIMDEX JSON record. * A registry of strategies, similar to our models, that allows CLI-level validation. Side effects of this change: * None really, but this further solidifies that this application contains the opinionated logic for how text is prepared for the embedding process. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/USE-131 * https://mitlibraries.atlassian.net/browse/USE-132
diff --git a/README.md b/README.md
@@ -114,8 +114,9 @@ Options:
                                default = 0.  [required]
   --record-limit INTEGER       Limit number of records after --run-record-
                                offset, default = None (unlimited).  [required]
-  --strategy TEXT              Pre-embedding record transformation strategy to
-                               use.  Repeatable.  [required]
+  --strategy [full_record]     Pre-embedding record transformation strategy.
+                               Repeatable to apply multiple strategies.
+                               [required]
   --output-jsonl TEXT          Optionally write embeddings to local JSONLines
                                file (primarily for testing).
   --help                       Show this message and exit.
diff --git a/embeddings/cli.py b/embeddings/cli.py
@@ -1,4 +1,5 @@
 import functools
+import json
 import logging
 import time
 from collections.abc import Callable
@@ -12,6 +13,8 @@
 
 from embeddings.config import configure_logger, configure_sentry
 from embeddings.models.registry import get_model_class
+from embeddings.strategies.processor import create_embedding_inputs
+from embeddings.strategies.registry import STRATEGY_REGISTRY
 
 logger = logging.getLogger(__name__)
 
@@ -181,10 +184,13 @@ def test_model_load(ctx: click.Context) -> None:
 )
 @click.option(
     "--strategy",
-    type=str,  # WIP: establish an enum of supported strategies
+    type=click.Choice(list(STRATEGY_REGISTRY.keys())),
     required=True,
     multiple=True,
-    help="Pre-embedding record transformation strategy to use.  Repeatable.",
+    help=(
+        "Pre-embedding record transformation strategy. "
+        "Repeatable to apply multiple strategies."
+    ),
 )
 @click.option(
     "--output-jsonl",
@@ -222,48 +228,11 @@ def create_embeddings(
         action="index",
     )
 
-    # create an iterator of InputTexts applying all requested strategies to all records
-    # WIP NOTE: this will leverage some kind of pre-embedding transformer class(es) that
-    #   create texts based on the requested strategies (e.g. "full record"), which are
-    #   captured in --strategy CLI args
-    # WIP NOTE: the following simulates that...
-    # DEBUG ------------------------------------------------------------------------------
-    import json  # noqa: PLC0415
-
-    from embeddings.embedding import EmbeddingInput  # noqa: PLC0415
-
-    input_records = (
-        EmbeddingInput(
-            timdex_record_id=timdex_record["timdex_record_id"],
-            run_id=timdex_record["run_id"],
-            run_record_offset=timdex_record["run_record_offset"],
-            embedding_strategy=_strategy,
-            text=json.dumps(timdex_record["transformed_record"].decode()),
-        )
-        for timdex_record in timdex_records
-        for _strategy in strategy
-    )
-    # DEBUG ------------------------------------------------------------------------------
-
-    # create an iterator of Embeddings via the embedding model
-    # WIP NOTE: this will use the embedding class .create_embeddings() bulk method
-    # WIP NOTE: the following simulates that...
-    # DEBUG ------------------------------------------------------------------------------
-    from embeddings.embedding import Embedding  # noqa: PLC0415
-
-    embeddings = (
-        Embedding(
-            timdex_record_id=input_record.timdex_record_id,
-            run_id=input_record.run_id,
-            run_record_offset=input_record.run_record_offset,
-            embedding_strategy=input_record.embedding_strategy,
-            model_uri=model.model_uri,
-            embedding_vector=[0.1, 0.2, 0.3],
-            embedding_token_weights={"coffee": 0.9, "seattle": 0.5},
-        )
-        for input_record in input_records
-    )
-    # DEBUG ------------------------------------------------------------------------------
+    # create an iterator of EmbeddingInputs applying all requested strategies
+    input_records = create_embedding_inputs(timdex_records, list(strategy))
+
+    # create embeddings via the embedding model
+    embeddings = model.create_embeddings(input_records)
 
     # if requested, write embeddings to a local JSONLines file
     if output_jsonl:
diff --git a/embeddings/strategies/__init__.py b/embeddings/strategies/__init__.py
@@ -0,0 +1 @@
+"""Strategies for transforming TIMDEX records into EmbeddingInputs."""
diff --git a/embeddings/strategies/base.py b/embeddings/strategies/base.py
@@ -0,0 +1,57 @@
+from abc import ABC, abstractmethod
+
+from embeddings.embedding import EmbeddingInput
+
+
+class BaseStrategy(ABC):
+    """Base class for embedding input strategies.
+
+    All child classes must set class level attribute STRATEGY_NAME.
+    """
+
+    STRATEGY_NAME: str  # type hint to document the requirement
+
+    def __init__(
+        self,
+        timdex_record_id: str,
+        run_id: str,
+        run_record_offset: int,
+        transformed_record: dict,
+    ) -> None:
+        """Initialize strategy with TIMDEX record metadata.
+
+        Args:
+            timdex_record_id: TIMDEX record ID
+            run_id: TIMDEX ETL run ID
+            run_record_offset: record offset within the run
+            transformed_record: parsed TIMDEX record JSON
+        """
+        self.timdex_record_id = timdex_record_id
+        self.run_id = run_id
+        self.run_record_offset = run_record_offset
+        self.transformed_record = transformed_record
+
+    def __init_subclass__(cls, **kwargs: dict) -> None:  # noqa: D105
+        super().__init_subclass__(**kwargs)
+
+        # require class level STRATEGY_NAME to be set
+        if not hasattr(cls, "STRATEGY_NAME"):
+            msg = f"{cls.__name__} must define 'STRATEGY_NAME' class attribute"
+            raise TypeError(msg)
+        if not isinstance(cls.STRATEGY_NAME, str):
+            msg = f"{cls.__name__} must override 'STRATEGY_NAME' with a valid string"
+            raise TypeError(msg)
+
+    @abstractmethod
+    def extract_text(self) -> str:
+        """Extract text to be embedded from transformed_record."""
+
+    def to_embedding_input(self) -> EmbeddingInput:
+        """Create EmbeddingInput instance with strategy-specific extracted text."""
+        return EmbeddingInput(
+            timdex_record_id=self.timdex_record_id,
+            run_id=self.run_id,
+            run_record_offset=self.run_record_offset,
+            embedding_strategy=self.STRATEGY_NAME,
+            text=self.extract_text(),
+        )
diff --git a/embeddings/strategies/full_record.py b/embeddings/strategies/full_record.py
@@ -0,0 +1,13 @@
+import json
+
+from embeddings.strategies.base import BaseStrategy
+
+
+class FullRecordStrategy(BaseStrategy):
+    """Serialize entire TIMDEX record JSON as embedding input."""
+
+    STRATEGY_NAME = "full_record"
+
+    def extract_text(self) -> str:
+        """Serialize the entire transformed_record as JSON."""
+        return json.dumps(self.transformed_record)
diff --git a/embeddings/strategies/processor.py b/embeddings/strategies/processor.py
@@ -0,0 +1,42 @@
+import json
+from collections.abc import Iterator
+
+from embeddings.embedding import EmbeddingInput
+from embeddings.strategies.registry import get_strategy_class
+
+
+def create_embedding_inputs(
+    timdex_records: Iterator[dict],
+    strategies: list[str],
+) -> Iterator[EmbeddingInput]:
+    """Yield EmbeddingInput instances for all records x all strategies.
+
+    Creates a cartesian product: each record is processed by each strategy,
+    yielding one EmbeddingInput per combination.
+
+    Args:
+        timdex_records: Iterator of TIMDEX records.
+            Expected keys: timdex_record_id, run_id, run_record_offset,
+            transformed_record (bytes)
+        strategies: List of strategy names to apply
+
+    Yields:
+        EmbeddingInput instances ready for embedding model
+
+    Example:
+        100 records x 3 strategies = 300 EmbeddingInput instances
+    """
+    for timdex_record in timdex_records:
+        # decode and parse the TIMDEX JSON record
+        transformed_record = json.loads(timdex_record["transformed_record"].decode())
+
+        # apply all strategies to the record and yield
+        for strategy_name in strategies:
+            strategy_class = get_strategy_class(strategy_name)
+            strategy_instance = strategy_class(
+                timdex_record_id=timdex_record["timdex_record_id"],
+                run_id=timdex_record["run_id"],
+                run_record_offset=timdex_record["run_record_offset"],
+                transformed_record=transformed_record,
+            )
+            yield strategy_instance.to_embedding_input()
diff --git a/embeddings/strategies/registry.py b/embeddings/strategies/registry.py
@@ -0,0 +1,29 @@
+import logging
+
+from embeddings.strategies.base import BaseStrategy
+from embeddings.strategies.full_record import FullRecordStrategy
+
+logger = logging.getLogger(__name__)
+
+STRATEGY_CLASSES = [
+    FullRecordStrategy,
+]
+
+STRATEGY_REGISTRY: dict[str, type[BaseStrategy]] = {
+    strategy.STRATEGY_NAME: strategy for strategy in STRATEGY_CLASSES
+}
+
+
+def get_strategy_class(strategy_name: str) -> type[BaseStrategy]:
+    """Get strategy class by name.
+
+    Args:
+        strategy_name: Name of the strategy to retrieve
+    """
+    if strategy_name not in STRATEGY_REGISTRY:
+        available = ", ".join(sorted(STRATEGY_REGISTRY.keys()))
+        msg = f"Unknown strategy: {strategy_name}. Available: {available}"
+        logger.error(msg)
+        raise ValueError(msg)
+
+    return STRATEGY_REGISTRY[strategy_name]

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+"""Strategies for transforming TIMDEX records into EmbeddingInputs."""`