
Commit 8edd904

TroyGarden authored and meta-codesync[bot] committed
Add device parameter to KeyedJaggedTensor.empty_like and copy_ method (meta-pytorch#3510)
Summary:
Pull Request resolved: meta-pytorch#3510

This diff enhances the KeyedJaggedTensor API to support device-aware operations, which is needed for efficient cross-device tensor management in TorchRec.

Reference: [memory snapshot and footprint for non-blocking copy](meta-pytorch#3485)

## Key Changes:

1. **Extended `empty_like` method**: Added an optional `device` parameter to support creating empty KJT structures on a different device. This enables two usage patterns:
   - Original: creates an empty KJT on the same device, preserving stride/stride_per_key_per_rank with empty data.
   - Device-copy: creates an empty KJT structure on a new device, useful for pre-allocating tensors before async copy operations.

2. **New `copy_` method**: Implements an in-place copy operation for KeyedJaggedTensor that:
   - Copies values, weights, lengths, and offsets from the source to the destination KJT.
   - Supports non-blocking (async) copies for better performance.
   - Assumes host-side metadata (keys, stride, etc.) is already configured.
   - Handles optional tensors (weights, lengths, offsets) appropriately.

3. **Refactored implementation**: Split the original `_kjt_empty_like` logic into:
   - `_kjt_empty_like_stride`: preserves the original behavior for a same-device empty KJT.
   - `_kjt_empty_like_device`: new function for cross-device empty KJT creation.

These changes enable more efficient device-to-device transfer patterns in distributed training scenarios (a usage sketch follows after this summary).

{F1983205769}

### Validation:

In a prototyping experiment with the sparse-data-dist pipeline (TrainPipelineSparseDist), the Memcpy HtoD speed (bandwidth) and the CUDA memory timeline profile are similar, but reserved memory is 79.7 GB vs 74.0 GB, a 5~6 GB benefit, while the input KJT per rank is about 1 GB.

* trace with direct copy {F1983200620}
* trace with inplace copy {F1983200591}
* snapshot with direct copy {F1983200644} {F1983200655}
* snapshot with inplace copy {F1983200664} {F1983200670}

Reviewed By: spmex

Differential Revision: D86068070

fbshipit-source-id: 0d1076fd192190b46eed4bda1d4e53b4b245d2a7
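As a quick illustration of the two API additions above, here is a minimal, hedged usage sketch (not part of the commit). It assumes a CUDA device is available and uses made-up feature names and values; a truly asynchronous HtoD copy would additionally require the CPU-side tensors to be in pinned memory.

```python
import torch
from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

# A small CPU-side KJT standing in for one input batch (values are arbitrary).
kjt_cpu = KeyedJaggedTensor.from_lengths_sync(
    keys=["f1", "f2"],
    values=torch.tensor([1, 2, 3, 4, 5]),
    lengths=torch.tensor([2, 0, 1, 1, 1, 0]),
)

device = torch.device("cuda:0")

# 1) Pre-allocate an empty KJT with the same shapes/dtypes on the target device.
kjt_gpu = KeyedJaggedTensor.empty_like(kjt_cpu, device=device)

# 2) Fill it with an in-place, non-blocking copy; host-side metadata (keys,
#    stride, etc.) is taken over from the source, tensor storage is reused.
kjt_gpu.copy_(kjt_cpu, non_blocking=True)
```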
1 parent da8924a commit 8edd904

File tree: 3 files changed (+537, -17 lines)


torchrec/distributed/test_utils/model_input.py

Lines changed: 92 additions & 13 deletions

@@ -34,22 +34,101 @@ class ModelInput(Pipelineable):
     idscore_features: Optional[KeyedJaggedTensor]
     label: torch.Tensor
 
-    def to(self, device: torch.device, non_blocking: bool = False) -> "ModelInput":
-        return ModelInput(
-            float_features=self.float_features.to(
-                device=device, non_blocking=non_blocking
-            ),
-            idlist_features=(
-                self.idlist_features.to(device=device, non_blocking=non_blocking)
+    def to(
+        self,
+        device: torch.device,
+        non_blocking: bool = False,
+        data_copy_stream: Optional[torch.cuda.streams.Stream] = None,
+    ) -> "ModelInput":
+        """
+        Move ModelInput to the specified device.
+
+        Args:
+            device: Target device to move tensors to.
+            non_blocking: Whether to perform asynchronous copies.
+            data_copy_stream: Optional CUDA stream for async data copies. When provided,
+                tensors are pre-allocated on the target device and copied within this stream.
+                This enables pipelined data transfers with computation on other streams.
+
+        Returns:
+            ModelInput on the target device.
+
+        Example:
+            # Standard synchronous transfer
+            batch_gpu = batch_cpu.to(device="cuda")
+
+            # Async transfer with dedicated stream
+            copy_stream = torch.cuda.Stream()
+            batch_gpu = batch_cpu.to(device="cuda", non_blocking=True, data_copy_stream=copy_stream)
+        """
+        if data_copy_stream is None:
+            # Standard .to() method
+            float_features = self.float_features.to(
+                device=device,
+                non_blocking=non_blocking,
+            )
+            idlist_features = (
+                self.idlist_features.to(
+                    device=device,
+                    non_blocking=non_blocking,
+                )
                 if self.idlist_features is not None
                 else None
-            ),
-            idscore_features=(
-                self.idscore_features.to(device=device, non_blocking=non_blocking)
+            )
+            idscore_features = (
+                self.idscore_features.to(
+                    device=device,
+                    non_blocking=non_blocking,
+                )
                 if self.idscore_features is not None
                 else None
-            ),
-            label=self.label.to(device=device, non_blocking=non_blocking),
+            )
+            label = self.label.to(
+                device=device,
+                non_blocking=non_blocking,
+            )
+        else:
+            # Async copy using dedicated stream
+            current_stream = torch.cuda.current_stream(device)
+
+            # Pre-allocate tensors on target device
+            float_features = torch.empty_like(self.float_features, device=device)
+            label = torch.empty_like(self.label, device=device)
+            idlist_features = (
+                None
+                if self.idlist_features is None
+                else KeyedJaggedTensor.empty_like(self.idlist_features, device=device)
+            )
+            idscore_features = (
+                None
+                if self.idscore_features is None
+                else KeyedJaggedTensor.empty_like(self.idscore_features, device=device)
+            )
+
+            # Perform async copy in dedicated stream
+            with data_copy_stream:
+                # Wait for current stream to finish memory allocation
+                data_copy_stream.wait_stream(current_stream)
+
+                float_features.copy_(self.float_features, non_blocking=non_blocking)
+                label.copy_(self.label, non_blocking=non_blocking)
+                if idlist_features is not None:
+                    idlist_features.copy_(
+                        # pyre-ignore[6]: Pyre doesn't understand self.idlist_features is not None here
+                        self.idlist_features,
+                        non_blocking=non_blocking,
+                    )
+                if idscore_features is not None:
+                    idscore_features.copy_(
+                        # pyre-ignore[6]: Pyre doesn't understand self.idscore_features is not None here
+                        self.idscore_features,
+                        non_blocking=non_blocking,
+                    )
+        return ModelInput(
+            float_features=float_features,
+            idlist_features=idlist_features,
+            idscore_features=idscore_features,
+            label=label,
         )
 
     def record_stream(self, stream: torch.Stream) -> None:

@@ -299,7 +378,7 @@ def generate(
             tables=weighted_tables,
             pooling_avg=pooling_avg,
             tables_pooling=tables_pooling,
-            weighted=False,  # weighted
+            weighted=True,  # weighted
            max_feature_lengths=max_feature_lengths,
            use_offsets=use_offsets,
            device=device,
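The validation in the summary was done with the sparse-data-dist train pipeline; the snippet below is only a simplified, hypothetical prefetching loop (the `prefetching_loop` helper, `batches` iterator, and `train_step` callable are assumptions, not part of this commit) showing how the new `data_copy_stream` argument might be used to overlap input HtoD copies with compute:

```python
from typing import Callable, Iterator

import torch

from torchrec.distributed.test_utils.model_input import ModelInput


def prefetching_loop(
    batches: Iterator[ModelInput],              # assumed: yields CPU-side batches
    device: torch.device,
    num_steps: int,
    train_step: Callable[[ModelInput], None],   # assumed: user-defined step
) -> None:
    copy_stream = torch.cuda.Stream()
    # Kick off the copy of the first batch on the dedicated stream.
    next_batch = next(batches).to(device, non_blocking=True, data_copy_stream=copy_stream)
    for _ in range(num_steps):
        batch = next_batch
        # The compute (default) stream must wait for this batch's async copy.
        torch.cuda.current_stream(device).wait_stream(copy_stream)
        # Prefetch the following batch so its HtoD copy overlaps with this step.
        next_batch = next(batches).to(device, non_blocking=True, data_copy_stream=copy_stream)
        train_step(batch)
        # Production pipelines (e.g. TrainPipelineSparseDist) additionally call
        # record_stream(...) on the batch for caching-allocator safety.
```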

torchrec/sparse/jagged_tensor.py

Lines changed: 116 additions & 4 deletions

@@ -1466,8 +1466,9 @@ def _maybe_compute_kjt_to_jt_dict(
 
 
 @torch.fx.wrap
-def _kjt_empty_like(kjt: "KeyedJaggedTensor") -> "KeyedJaggedTensor":
+def _kjt_empty_like_stride(kjt: "KeyedJaggedTensor") -> "KeyedJaggedTensor":
     # empty like function fx wrapped, also avoids device hardcoding
+    # basically the empty KJT only preserves the stride and stride_per_key_per_rank
     stride, stride_per_key_per_rank = (
         (None, kjt._stride_per_key_per_rank)
         if kjt._stride_per_key_per_rank is not None and kjt.variable_stride_per_key()

@@ -1488,6 +1489,51 @@ def _kjt_empty_like(kjt: "KeyedJaggedTensor") -> "KeyedJaggedTensor":
     )
 
 
+@torch.fx.wrap
+def _kjt_empty_like_device(
+    kjt: "KeyedJaggedTensor", device: torch.device
+) -> "KeyedJaggedTensor":
+    # behaves more like torch.empty_like: allocates the memory on the given device
+    stride, stride_per_key_per_rank = (
+        (None, kjt._stride_per_key_per_rank)
+        if kjt._stride_per_key_per_rank is not None and kjt.variable_stride_per_key()
+        else (kjt.stride(), None)
+    )
+    inverse_indices = kjt._inverse_indices
+    return KeyedJaggedTensor(
+        keys=kjt.keys(),
+        values=torch.empty_like(kjt.values(), device=device),
+        weights=(
+            None
+            if kjt.weights_or_none() is None
+            else torch.empty_like(kjt.weights(), device=device)
+        ),
+        lengths=(
+            None
+            if kjt.lengths_or_none() is None
+            else torch.empty_like(kjt.lengths(), device=device)
+        ),
+        offsets=(
+            None
+            if kjt.offsets_or_none() is None
+            else torch.empty_like(kjt.offsets(), device=device)
+        ),
+        stride=stride,
+        inverse_indices=(
+            None
+            if inverse_indices is None
+            else (
+                inverse_indices[0],
+                torch.empty_like(inverse_indices[1], device=device),
+            )
+        ),
+        stride_per_key_per_rank=stride_per_key_per_rank,
+        stride_per_key=kjt._stride_per_key,
+        length_per_key=kjt._length_per_key,
+        offset_per_key=kjt._offset_per_key,
+    )
+
+
 def _sum_by_splits(input_list: List[int], splits: List[int]) -> List[int]:
     return [
         sum(input_list[sum(splits[:i]) : sum(splits[:i]) + n])

@@ -1940,17 +1986,83 @@ def empty(
         )
 
     @staticmethod
-    def empty_like(kjt: "KeyedJaggedTensor") -> "KeyedJaggedTensor":
+    def empty_like(
+        kjt: "KeyedJaggedTensor",
+        device: Optional[torch.device] = None,
+    ) -> "KeyedJaggedTensor":
         """
-        Constructs an empty KeyedJaggedTensor with the same device and dtypes as the input KeyedJaggedTensor.
+        Original usage:
+            Constructs an empty KeyedJaggedTensor with the same device and dtypes as the input KeyedJaggedTensor.
+            This preserves stride/stride_per_key_per_rank but the actual data (values, lengths, etc.) is empty.
+
+        Device-copy usage:
+            Constructs an empty KeyedJaggedTensor with the empty tensors on the new device.
 
         Args:
             kjt (KeyedJaggedTensor): input KeyedJaggedTensor.
+            device (Optional[torch.device]): device on which the KeyedJaggedTensor will be placed.
 
         Returns:
             KeyedJaggedTensor: empty KeyedJaggedTensor.
         """
-        return _kjt_empty_like(kjt)
+        if device is None:
+            return _kjt_empty_like_stride(kjt)
+        else:
+            return _kjt_empty_like_device(kjt, device)
+
+    def copy_(
+        self, kjt: "KeyedJaggedTensor", non_blocking: bool = False
+    ) -> "KeyedJaggedTensor":
+        """
+        Copies the values, weights, lengths, and offsets of the input KeyedJaggedTensor into the current KeyedJaggedTensor.
+        Assumes host-side metadata (keys, stride, stride_per_key, etc.) is already configured.
+
+        Args:
+            kjt (KeyedJaggedTensor): input KeyedJaggedTensor.
+            non_blocking (bool): whether to perform the copy asynchronously.
+
+        Returns:
+            KeyedJaggedTensor: copied KeyedJaggedTensor.
+        """
+        self._stride_per_key_per_rank = (
+            kjt._stride_per_key_per_rank if kjt.variable_stride_per_key() else None
+        )
+        self._length_per_key = kjt._length_per_key
+        self._lengths_offset_per_key = kjt._lengths_offset_per_key
+        self._offset_per_key = kjt._offset_per_key
+        self._index_per_key = kjt._index_per_key
+        self._stride_per_key = kjt._stride_per_key
+        self._jt_dict = kjt._jt_dict
+
+        # tensor in-place copy
+        self._values.copy_(kjt._values, non_blocking=non_blocking)
+
+        weights_self = self._weights
+        weights_kjt = kjt._weights
+        if weights_self is not None and weights_kjt is not None:
+            weights_self.copy_(weights_kjt, non_blocking=non_blocking)
+
+        lengths_self = self._lengths
+        lengths_kjt = kjt._lengths
+        if lengths_self is not None and lengths_kjt is not None:
+            lengths_self.copy_(lengths_kjt, non_blocking=non_blocking)
+
+        offsets_self = self._offsets
+        offsets_kjt = kjt._offsets
+        if offsets_self is not None and offsets_kjt is not None:
+            offsets_self.copy_(offsets_kjt, non_blocking=non_blocking)
+
+        inverse_indices_self = self._inverse_indices
+        inverse_indices_kjt = kjt._inverse_indices
+        if inverse_indices_self is not None and inverse_indices_kjt is not None:
+            self._inverse_indices = (
+                inverse_indices_kjt[0],
+                inverse_indices_self[1].copy_(
+                    inverse_indices_kjt[1], non_blocking=non_blocking
+                ),
+            )
+
+        return self
 
     @staticmethod
     def from_jt_dict(jt_dict: Dict[str, JaggedTensor]) -> "KeyedJaggedTensor":
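For a bare KeyedJaggedTensor, the same pre-allocate-then-copy pattern used by `ModelInput.to` above can be sketched as a standalone helper (a hedged illustration, not code from this commit; `kjt_to_device_async` is a hypothetical name, and truly async behavior assumes the CPU-side KJT is in pinned memory):

```python
import torch

from torchrec.sparse.jagged_tensor import KeyedJaggedTensor


def kjt_to_device_async(
    kjt_cpu: KeyedJaggedTensor,
    device: torch.device,
    copy_stream: torch.cuda.Stream,
) -> KeyedJaggedTensor:
    # Allocate the device-side KJT on the current (compute) stream.
    kjt_dev = KeyedJaggedTensor.empty_like(kjt_cpu, device=device)
    compute_stream = torch.cuda.current_stream(device)
    with torch.cuda.stream(copy_stream):
        # Do not start copying before the allocation is ordered on the compute stream.
        copy_stream.wait_stream(compute_stream)
        kjt_dev.copy_(kjt_cpu, non_blocking=True)
    # Callers must synchronize before consuming kjt_dev on the compute stream, e.g.:
    #     torch.cuda.current_stream(device).wait_stream(copy_stream)
    return kjt_dev
```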
