add inplace_copy_batch_to_gpu in TrainPipeline (#3526)
Summary:
Pull Request resolved: #3526
This diff adds support for pre-allocated, in-place copies for host-to-device data transfer in TorchRec train pipelines, addressing the CUDA memory overhead identified in production RecSys models.
## Context
As described in the [RFC on Workplace](https://fb.workplace.com/groups/429376538334034/permalink/1497469664858044/), most RecSys model training pipelines carry an extra CUDA memory overhead of 3-6 GB per rank on top of the active memory snapshot. This overhead stems from PyTorch's caching allocator behavior when a side CUDA stream is used for non-blocking host-to-device transfers: the allocator associates the transferred tensors' memory with the side stream, preventing that memory from being reused on the main stream and causing up to 13 GB of extra memory footprint per rank in production models.
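For illustration, here is a minimal sketch (not the TorchRec implementation) of the standard transfer pattern described above. The tensor names and sizes are made up; the point is that the destination tensor is allocated while the side stream is current, so the caching allocator ties its blocks to that stream:

```python
import torch

copy_stream = torch.cuda.Stream()  # side stream for H2D transfers
host_batch = torch.randn(1024, 1024).pin_memory()

with torch.cuda.stream(copy_stream):
    # Allocation and copy both happen with the side stream current, so the
    # caching allocator associates the new blocks with copy_stream and will
    # not hand them back to later allocations made on the main stream.
    device_batch = host_batch.to("cuda", non_blocking=True)

# Make the main stream wait for the copy, and record the tensor on the
# consumer stream so its memory is not reclaimed while still in use there.
torch.cuda.current_stream().wait_stream(copy_stream)
device_batch.record_stream(torch.cuda.current_stream())
```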
The solution proposed in [D86068070](https://www.internalfb.com/diff/D86068070) pre-allocates memory on the main stream and uses an in-place copy to reduce this overhead. In local train pipeline benchmarks with a 1-GB ModelInput (2 KJTs + float features), this approach reduced the memory footprint by ~6 GB per rank. The optimization unblocks memory-constrained use cases across platforms including APS, Pyper, and MVAI.
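A minimal sketch of the pre-allocation idea, using the same made-up names as above: the destination is allocated while the main stream is current and only the copy kernel runs on the side stream, so the blocks remain reusable by main-stream allocations:

```python
import torch

copy_stream = torch.cuda.Stream()
host_batch = torch.randn(1024, 1024).pin_memory()

# Pre-allocate the destination on the main (default) stream; the caching
# allocator now associates these blocks with the main stream.
device_batch = torch.empty_like(host_batch, device="cuda")

# Order the side stream after any pending main-stream work on the buffer,
# then run only the copy kernel on the side stream.
copy_stream.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(copy_stream):
    device_batch.copy_(host_batch, non_blocking=True)

# The main stream waits for the in-place copy before consuming the batch.
torch.cuda.current_stream().wait_stream(copy_stream)
```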
## Key Changes
1. **Added `inplace_copy_batch_to_gpu` parameter**: A new boolean flag threaded through the train pipeline infrastructure that switches between standard batch copying (direct allocation on the side stream) and in-place copying (pre-allocation on the main stream).
2. **New `inplace_copy_batch_to_gpu()` method**: Implemented in the `TrainPipeline` class to handle the new data transfer pattern with proper stream synchronization, using `_to_device()` with the optional `data_copy_stream` parameter.
3. **Extended `Pipelineable.to()` interface**: Added an optional `data_copy_stream` parameter to the abstract method, allowing implementations to specify which stream should execute the data copy (see #3510); a sketch of one possible implementation follows this list.
4. **Updated benchmark configuration** (`sparse_data_dist_base.yml`; resulting values shown after this list):
- Increased `num_batches` from 5 to 10
- Changed `feature_pooling_avg` from 10 to 30
- Reduced `num_benchmarks` from 2 to 1
- Added `num_profiles: 1` for profiling
5. **Enhanced table configuration**: Added `base_row_size` parameter (default: 100,000) to `EmbeddingTablesConfig` for more flexible embedding table sizing.
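As referenced in item 3 above, here is a sketch of what an implementation of the extended `Pipelineable.to()` interface might look like. `MyBatch` is a hypothetical batch type, and the exact TorchRec signature may differ; see the diff for the authoritative version:

```python
from typing import Optional

import torch


class MyBatch:  # hypothetical Pipelineable implementation
    def __init__(self, features: torch.Tensor) -> None:
        self.features = features

    def to(
        self,
        device: torch.device,
        non_blocking: bool = False,
        data_copy_stream: Optional[torch.cuda.Stream] = None,
    ) -> "MyBatch":
        if data_copy_stream is None:
            # Standard path: allocate and copy on whatever stream is current.
            return MyBatch(self.features.to(device, non_blocking=non_blocking))
        # In-place path: pre-allocate on the current (main) stream, then run
        # only the copy kernel on the supplied side stream. Cross-stream
        # synchronization is left to the pipeline, as in the sketches above.
        dst = torch.empty_like(self.features, device=device)
        with torch.cuda.stream(data_copy_stream):
            dst.copy_(self.features, non_blocking=non_blocking)
        return MyBatch(dst)
```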
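The benchmark configuration changes from item 4 correspond to these values (field names taken from the list above; surrounding keys in `sparse_data_dist_base.yml` omitted):

```yaml
num_batches: 10          # was 5
feature_pooling_avg: 30  # was 10
num_benchmarks: 1        # was 2
num_profiles: 1          # newly added, enables profiling
```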
These changes enable performance and memory comparisons between the standard and in-place copy strategies, with proper benchmarking infrastructure to measure and trace the differences.
Reviewed By: aporialiao
Differential Revision: D86208714
fbshipit-source-id: c7bd9d46d1a9f98a68446b9d4be0f63208b626bf