disable random init in inference operator for embedding cache (#3466)

emlin · meta-codesync[bot] · commit af2507605132 · 2025-10-20T04:16:25.000-07:00
Summary: X-link: pytorch/FBGEMM#5026 Pull Request resolved: #3466 X-link: https://github.com/facebookresearch/FBGEMM/pull/2040 In embedding cache mode, a cache miss should not produce random values. This diff passes the embedding cache mode to the inference operator and uses it to disable the backend's random initialization. Differential Revision: D84367061 fbshipit-source-id: 83687bcb7c097f60b583c00bf80956efcdcd3a9d
diff --git a/torchrec/distributed/quant_embedding_kernel.py b/torchrec/distributed/quant_embedding_kernel.py
@@ -327,20 +327,45 @@ def __init__(
         else:
             shard_offsets_for_kv_zch = None
 
-        self._emb_module: IntNBitTableBatchedEmbeddingBagsCodegen = tbe_clazz(
-            embedding_specs=embedding_specs,
-            device=device,
-            pooling_mode=self._pooling,
-            feature_table_map=self._feature_table_map,
-            row_alignment=self._tbe_row_alignment,
-            uvm_host_mapped=True,  # Use cudaHostAlloc for UVM CACHING to fix imbalance numa memory issue
-            bounds_check_mode=(
+        # Determine embedding cache mode for KV embedding tables
+        embedding_cache_mode = False  # Default: False = randomized initialization
+        if tbe_clazz == KVEmbeddingInference:
+            # For KV embedding tables, set cache mode based on embedding table configuration
+            # If any table uses NoEvictionPolicy, enable cache mode (zero init) for the whole module
+            for table in config.embedding_tables:
+                if (
+                    table.virtual_table_eviction_policy is not None
+                    and type(table.virtual_table_eviction_policy).__name__
+                    == "NoEvictionPolicy"
+                ):
+                    embedding_cache_mode = True  # True = zero initialization
+                    break
+
+        # Build kwargs for module construction
+        module_kwargs: Dict[str, Any] = {
+            "embedding_specs": embedding_specs,
+            "device": device,
+            "pooling_mode": self._pooling,
+            "feature_table_map": self._feature_table_map,
+            "row_alignment": self._tbe_row_alignment,
+            "uvm_host_mapped": True,  # Use cudaHostAlloc for UVM CACHING to fix imbalance numa memory issue
+            "bounds_check_mode": (
                 bounds_check_mode if bounds_check_mode else BoundsCheckMode.WARNING
             ),
-            feature_names_per_table=[
+            "feature_names_per_table": [
                 table.feature_names for table in config.embedding_tables
             ],
-            **(tbe_fused_params(fused_params) or {}),
+        }
+
+        # Add KV-specific parameters
+        if tbe_clazz == KVEmbeddingInference:
+            module_kwargs["embedding_cache_mode"] = embedding_cache_mode
+
+        # Add fused params
+        module_kwargs.update(**(tbe_fused_params(fused_params) or {}))
+
+        self._emb_module: IntNBitTableBatchedEmbeddingBagsCodegen = tbe_clazz(
+            **module_kwargs
         )
         if device is not None:
             self._emb_module.initialize_weights()
@@ -495,6 +520,7 @@ def __init__(
 
         managed: List[EmbeddingLocation] = []
         is_virtual_table = False
+        embedding_cache_mode = False
         for table in config.embedding_tables:
             if device is not None and device.type == "cuda":
                 managed.append(
@@ -504,6 +530,8 @@ def __init__(
                 managed.append(EmbeddingLocation.HOST)
             if table.use_virtual_table:
                 is_virtual_table = True
+            if table.enable_embedding_update:
+                embedding_cache_mode = True
         self._config: GroupedEmbeddingConfig = config
         self._emb_module_registered: bool = is_fused_param_register_tbe(fused_params)
         self._quant_state_dict_split_scale_bias: bool = (
@@ -529,8 +557,9 @@ def __init__(
         else:
             shard_offsets_for_kv_zch = None
 
-        self._emb_module: IntNBitTableBatchedEmbeddingBagsCodegen = embedding_clazz(
-            embedding_specs=[
+        # Build kwargs for module construction
+        module_kwargs: Dict[str, Any] = {
+            "embedding_specs": [
                 (
                     table.name,
                     local_rows,
@@ -549,15 +578,25 @@ def __init__(
                     managed,
                 )
             ],
-            device=device,
-            pooling_mode=PoolingMode.NONE,
-            feature_table_map=self._feature_table_map,
-            row_alignment=self._tbe_row_alignment,
-            uvm_host_mapped=True,  # Use cudaHostAlloc for UVM CACHING to fix imbalance numa memory issue
-            feature_names_per_table=[
+            "device": device,
+            "pooling_mode": PoolingMode.NONE,
+            "feature_table_map": self._feature_table_map,
+            "row_alignment": self._tbe_row_alignment,
+            "uvm_host_mapped": True,  # Use cudaHostAlloc for UVM CACHING to fix imbalance numa memory issue
+            "feature_names_per_table": [
                 table.feature_names for table in config.embedding_tables
             ],
-            **(tbe_fused_params(fused_params) or {}),
+        }
+
+        # Add KV-specific parameters
+        if embedding_clazz == KVEmbeddingInference:
+            module_kwargs["embedding_cache_mode"] = embedding_cache_mode
+
+        # Add fused params
+        module_kwargs.update(**(tbe_fused_params(fused_params) or {}))
+
+        self._emb_module: IntNBitTableBatchedEmbeddingBagsCodegen = embedding_clazz(
+            **module_kwargs
         )
         if device is not None:
             self._emb_module.initialize_weights()
diff --git a/torchrec/quant/embedding_modules.py b/torchrec/quant/embedding_modules.py
@@ -764,9 +764,9 @@ def __init__(  # noqa C901
         self._output_dtype = output_dtype
         self._device = device
         self.row_alignment = row_alignment
-        self._key_to_tables: Dict[Tuple[DataType, bool], List[EmbeddingConfig]] = (
-            defaultdict(list)
-        )
+        self._key_to_tables: Dict[
+            Tuple[DataType, bool, bool], List[EmbeddingConfig]
+        ] = defaultdict(list)
         self._feature_names: List[str] = []
         self._features_order: Optional[List[int]] = None
 
@@ -789,12 +789,24 @@ def __init__(  # noqa C901
                     + f" {self._embedding_dim}"
                 )
             if hasattr(table, "use_virtual_table"):
-                key = (table.data_type, table.use_virtual_table)
+                key = (table.data_type, table.use_virtual_table, False)
+            if hasattr(table, "use_virtual_table") and hasattr(
+                table, "enable_embedding_update"
+            ):
+                key = (
+                    table.data_type,
+                    table.use_virtual_table,
+                    table.enable_embedding_update,
+                )
             else:
-                key = (table.data_type, False)
+                key = (table.data_type, False, False)
             self._key_to_tables[key].append(table)
         self._feature_splits: List[int] = []
-        for (data_type, use_virtual_table), emb_configs in self._key_to_tables.items():
+        for (
+            data_type,
+            use_virtual_table,
+            enable_embedding_update,
+        ), emb_configs in self._key_to_tables.items():
             embedding_specs = []
             weight_lists: Optional[
                 List[Tuple[torch.Tensor, Optional[torch.Tensor]]]
@@ -825,15 +837,20 @@ def __init__(  # noqa C901
                 if use_virtual_table
                 else IntNBitTableBatchedEmbeddingBagsCodegen
             )
-            emb_module = embedding_clazz(
-                embedding_specs=embedding_specs,
-                pooling_mode=PoolingMode.NONE,
-                weight_lists=weight_lists,
-                device=device,
-                output_dtype=data_type_to_sparse_type(dtype_to_data_type(output_dtype)),
-                row_alignment=row_alignment,
-                feature_table_map=feature_table_map,
-            )
+            kwargs: Dict[str, Any] = {
+                "embedding_specs": embedding_specs,
+                "pooling_mode": PoolingMode.NONE,
+                "weight_lists": weight_lists,
+                "device": device,
+                "output_dtype": data_type_to_sparse_type(
+                    dtype_to_data_type(output_dtype)
+                ),
+                "row_alignment": row_alignment,
+                "feature_table_map": feature_table_map,
+            }
+            if embedding_clazz == KVEmbeddingInference:
+                kwargs["embedding_cache_mode"] = enable_embedding_update
+            emb_module = embedding_clazz(**kwargs)
             if weight_lists is None:
                 emb_module.initialize_weights()
             self._emb_modules.append(emb_module)
@@ -869,6 +886,7 @@ def __init__(  # noqa C901
                         "weight_qbias", qbias
                     )
 
+        # pyre-ignore [8]
         self._embedding_names_by_batched_tables: Dict[
             Tuple[DataType, bool], List[str]
         ] = {
@@ -934,6 +952,7 @@ def forward(
             f = kjts_per_key[i]
             lengths = _get_feature_length(f)
             indices, offsets = _fx_trec_unwrap_kjt(f)
+            # pyre-ignore [6]
             embedding_names = self._embedding_names_by_batched_tables[key]
             lookup = (
                 emb_module(indices=indices, offsets=offsets)