Update DeltaStore to be Generic (#3468)

aliafzal · meta-codesync[bot] · commit 498437748b85 · 2025-10-21T02:18:19.000-07:00
Summary: Pull Request resolved: #3468 Make DeltaStore generic to allow use-case-specific custom implementations. Internal. General Context: We are in the process of transitioning to a unified DeltaTracker, and this is diff 1/n of the changes towards that transition. Specific Context: DeltaTracker utilizes Memstore to preserve and compact lookups extracted during embedding lookups. As part of transitioning to a common DeltaTracker, we are adding a generic DeltaStore. Memstore will extend the generic DeltaStore, allowing both MRS and OSS DeltaTrackers to be easily integrated into training frameworks. Differential Revision: D80614364 fbshipit-source-id: 9ef57943bfa4ea1ff630d14d2ce7805775b6505f
diff --git a/torchrec/distributed/model_tracker/delta_store.py b/torchrec/distributed/model_tracker/delta_store.py
@@ -7,6 +7,7 @@
 
 
 # pyre-strict
+from abc import ABC, abstractmethod
 from bisect import bisect_left
 from typing import Dict, List, Optional
 
@@ -67,34 +68,106 @@ def _compute_unique_rows(
         return DeltaRows(ids=unique_ids, states=unique_states)
 
 
-class DeltaStore:
+class DeltaStore(ABC):
     """
-    DeltaStore is a helper class that stores and manages local delta (row) updates for embeddings/states across
-    various batches during training, designed to be used with TorchRecs ModelDeltaTracker.
-    It maintains a CUDA in-memory representation of requested ids and embeddings/states,
+    DeltaStore is an abstract base class that defines the interface for storing and managing
+    local delta (row) updates for embeddings/states across various batches during training.
+
+    Implementations should maintain a representation of requested ids and embeddings/states,
     providing a way to compact and get delta updates for each embedding table.
 
     The class supports different embedding update modes (NONE, FIRST, LAST) to determine
     how to handle duplicate ids when compacting or retrieving embeddings.
+    """
+
+    @abstractmethod
+    def __init__(self, embdUpdateMode: EmbdUpdateMode = EmbdUpdateMode.NONE) -> None:
+        pass
+
+    @abstractmethod
+    def append(
+        self,
+        batch_idx: int,
+        fqn: str,
+        ids: torch.Tensor,
+        states: Optional[torch.Tensor],
+    ) -> None:
+        """
+        Append a batch of ids and states to the store for a specific table.
+
+        Args:
+            batch_idx: The batch index
+            fqn: The fully qualified name of the table
+            ids: The tensor of ids to append
+            states: Optional tensor of states to append
+        """
+        pass
+
+    @abstractmethod
+    def delete(self, up_to_idx: Optional[int] = None) -> None:
+        """
+        Delete lookups from the store up to batch index `up_to_idx`.
+
+        Args:
+            up_to_idx: Optional index up to which to delete lookups
+        """
+        pass
 
+    @abstractmethod
+    def compact(self, start_idx: int, end_idx: int) -> None:
+        """
+        Compact (ids, embeddings) in batch index range from start_idx to end_idx.
+
+        Args:
+            start_idx: The starting batch index
+            end_idx: The ending batch index
+        """
+        pass
+
+    @abstractmethod
+    def get_delta(self, from_idx: int = 0) -> Dict[str, DeltaRows]:
+        """
+        Return all unique/delta ids per table from the Delta Store.
+
+        Args:
+            from_idx: The batch index from which to get deltas
+
+        Returns:
+            A dictionary mapping table FQNs to their delta rows
+        """
+        pass
+
+
+class DeltaStoreTrec(DeltaStore):
+    """
+    DeltaStoreTrec is a concrete implementation of DeltaStore that stores and manages
+    local delta (row) updates for embeddings/states across various batches during training,
+    designed to be used with TorchRec's ModelDeltaTracker.
+
+    It maintains a CUDA in-memory representation of requested ids and embeddings/states,
+    providing a way to compact and get delta updates for each embedding table.
+
+    The class supports different embedding update modes (NONE, FIRST, LAST) to determine
+    how to handle duplicate ids when compacting or retrieving embeddings.
     """
 
     def __init__(self, embdUpdateMode: EmbdUpdateMode = EmbdUpdateMode.NONE) -> None:
+        super().__init__(embdUpdateMode)
         self.embdUpdateMode = embdUpdateMode
         self.per_fqn_lookups: Dict[str, List[IndexedLookup]] = {}
 
     def append(
         self,
         batch_idx: int,
-        table_fqn: str,
+        fqn: str,
         ids: torch.Tensor,
         states: Optional[torch.Tensor],
     ) -> None:
-        table_fqn_lookup = self.per_fqn_lookups.get(table_fqn, [])
+        table_fqn_lookup = self.per_fqn_lookups.get(fqn, [])
         table_fqn_lookup.append(
             IndexedLookup(batch_idx=batch_idx, ids=ids, states=states)
         )
-        self.per_fqn_lookups[table_fqn] = table_fqn_lookup
+        self.per_fqn_lookups[fqn] = table_fqn_lookup
 
     def delete(self, up_to_idx: Optional[int] = None) -> None:
         """
diff --git a/torchrec/distributed/model_tracker/model_delta_tracker.py b/torchrec/distributed/model_tracker/model_delta_tracker.py
@@ -26,7 +26,7 @@
     GroupedPooledEmbeddingsLookup,
 )
 from torchrec.distributed.embeddingbag import ShardedEmbeddingBagCollection
-from torchrec.distributed.model_tracker.delta_store import DeltaStore
+from torchrec.distributed.model_tracker.delta_store import DeltaStoreTrec
 from torchrec.distributed.model_tracker.types import (
     DeltaRows,
     EmbdUpdateMode,
@@ -122,7 +122,7 @@ def __init__(
         # Validate is the mode is supported for the given module and initialize tracker functions
         self._validate_and_init_tracker_fns()
 
-        self.store: DeltaStore = DeltaStore(UPDATE_MODE_MAP[self._mode])
+        self.store: DeltaStoreTrec = DeltaStoreTrec(UPDATE_MODE_MAP[self._mode])
 
         # Mapping feature name to corresponding FQNs. This is used for retrieving
         # the FQN associated with a given feature name in record_lookup().
@@ -222,7 +222,7 @@ def record_ids(self, kjt: KeyedJaggedTensor) -> None:
         for table_fqn, ids_list in per_table_ids.items():
             self.store.append(
                 batch_idx=self.curr_batch_idx,
-                table_fqn=table_fqn,
+                fqn=table_fqn,
                 ids=torch.cat(ids_list),
                 states=None,
             )
@@ -262,7 +262,7 @@ def record_embeddings(
         for table_fqn, ids_list in per_table_ids.items():
             self.store.append(
                 batch_idx=self.curr_batch_idx,
-                table_fqn=table_fqn,
+                fqn=table_fqn,
                 ids=torch.cat(ids_list),
                 states=torch.cat(per_table_emb[table_fqn]),
             )
@@ -295,7 +295,7 @@ def record_momentum(
             per_key_states = states[offsets[i] : offsets[i + 1]]
             self.store.append(
                 batch_idx=self.curr_batch_idx,
-                table_fqn=fqn,
+                fqn=fqn,
                 ids=kjt[key].values(),
                 states=per_key_states,
             )
@@ -323,7 +323,7 @@ def record_rowwise_optim_state(
             per_key_states = states[offsets[i] : offsets[i + 1]]
             self.store.append(
                 batch_idx=self.curr_batch_idx,
-                table_fqn=fqn,
+                fqn=fqn,
                 ids=kjt[key].values(),
                 states=per_key_states,
             )
diff --git a/torchrec/distributed/model_tracker/tests/test_delta_store.py b/torchrec/distributed/model_tracker/tests/test_delta_store.py
@@ -15,7 +15,7 @@
 from parameterized import parameterized
 from torchrec.distributed.model_tracker.delta_store import (
     _compute_unique_rows,
-    DeltaStore,
+    DeltaStoreTrec,
 )
 from torchrec.distributed.model_tracker.types import (
     DeltaRows,
@@ -24,7 +24,7 @@
 )
 
 
-class DeltaStoreTest(unittest.TestCase):
+class DeltaStoreTrecTest(unittest.TestCase):
     # pyre-fixme[2]: Parameter must be annotated.
     def __init__(self, methodName="runTest") -> None:
         super().__init__(methodName)
@@ -188,12 +188,12 @@ class AppendDeleteTestParams:
     def test_append_and_delete(
         self, _test_name: str, test_params: AppendDeleteTestParams
     ) -> None:
-        delta_store = DeltaStore()
+        delta_store = DeltaStoreTrec()
         for table_fqn, lookup_list in test_params.table_fqn_to_lookups.items():
             for lookup in lookup_list:
                 delta_store.append(
                     batch_idx=lookup.batch_idx,
-                    table_fqn=table_fqn,
+                    fqn=table_fqn,
                     ids=lookup.ids,
                     states=lookup.states,
                 )
@@ -783,15 +783,15 @@ def test_compact(self, _test_name: str, test_params: CompactTestParams) -> None:
         """
         Test the compact method of DeltaStore.
         """
-        # Create a DeltaStore with the specified embdUpdateMode
-        delta_store = DeltaStore(embdUpdateMode=test_params.embdUpdateMode)
+        # Create a DeltaStoreTrec with the specified embdUpdateMode
+        delta_store = DeltaStoreTrec(embdUpdateMode=test_params.embdUpdateMode)
 
         # Populate the DeltaStore with the test lookups
         for table_fqn, lookup_list in test_params.table_fqn_to_lookups.items():
             for lookup in lookup_list:
                 delta_store.append(
                     batch_idx=lookup.batch_idx,
-                    table_fqn=table_fqn,
+                    fqn=table_fqn,
                     ids=lookup.ids,
                     states=lookup.states,
                 )