
Commit a4ca26f

TroyGarden authored and meta-codesync[bot] committed
Device-to-Host LazyAwaitable (#3477)
Summary:
Pull Request resolved: #3477

Workplace post: https://fb.workplace.com/groups/811751593969209/permalink/1285164823294548/

# TL;DR

* A new `DeviceToHostTensorAwaitable` class wraps the device-to-host data transfer and defers the `cudaEventSync` call until the data is actually used on the host (see the usage sketch after this commit message).
* It targets sync-point removal in training optimization, which often suffers from CPU-blocking sync points.

# why awaitable

* As shown in the diagram below, a comms op is often best overlapped with another (irrelevant) compute op to better utilize the device.
* The idea is to **defer** the `wait()` call until the function that uses the result of the comms op actually runs.
* A convenient way to achieve this deferring behavior is the `lazy_awaitable` concept, which is already [implemented in torchrec](https://github.com/meta-pytorch/torchrec/blob/main/torchrec/distributed/types.py#L368).
* diagram of (lazy_)awaitable in torchrec {F1982900178}

# why device-to-host transfer

* There are scenarios where on-device data is needed on the host side, such as metrics logging and data-dependent shape operations.
* Those patterns create a device-to-host sync (data transfer) that often blocks CPU execution, and the correct implementation (with `.to(non_blocking=True)` and a cuda event: [PR 3436](#3436)) usually spans multiple code domains, making it difficult to optimize.
* Here we borrow the `LazyAwaitable` concept used for device-side comms and wrap (1) the non-blocking device-to-host data transfer and (2) the `cuda_event.wait()` inside a `DeviceToHostTensorAwaitable` class for a better user experience.
* diagram of lazy_awaitable for device-to-host data transfer {F1982900233}

# results

* The "comms check" result lives on the device and is needed for validation (a host-side assertion).
* `DeviceToHostTensorAwaitable.wait()` **defers** the cudaEventSync until the very end, where the result is actually needed by the host.
* You can see that the post-comms computes are scheduled before the assertion on the host side. {F1982900468}

NOTE: in this version of the implementation we don't use a separate stream (as shown in the diagram above) for the non-blocking device-to-host data transfer, because the data volume is usually relatively small. {F1982901286}

Reviewed By: spmex

Differential Revision: D85211205

fbshipit-source-id: 41d03230dd9b190085545cfb76192d59375646c4
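A minimal usage sketch of the deferred-sync pattern from the TL;DR (not part of this commit): it assumes a CUDA device, `heavy_compute` and the tensor sizes are hypothetical placeholders, and `DeviceToHostTensorAwaitable` is the class this commit adds to `torchrec/distributed/types.py`.

```python
# Minimal sketch (illustration only): defer the device-to-host sync until the
# host actually needs the value. Assumes a CUDA device is available.
import torch

from torchrec.distributed.types import DeviceToHostTensorAwaitable


def heavy_compute(x: torch.Tensor) -> torch.Tensor:
    # hypothetical stand-in for post-comms compute that should not be blocked
    return (x @ x.T).relu()


x = torch.rand(1024, 1024, device="cuda")

# kicks off the non-blocking device-to-host copy; no cudaEventSync happens here
check = DeviceToHostTensorAwaitable(x.sum())

# the device keeps working while the copy is in flight
y = heavy_compute(x)

# the sync is deferred to the point of use on the host
host_value = check.wait().item()
```

Compared to calling `.item()` on the device tensor directly, the `cudaEventSynchronize` is pushed past `heavy_compute`, which is the overlap behavior shown in the traces above.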
1 parent d26aa0d · commit a4ca26f

File tree

2 files changed: +63 -2 lines changed


torchrec/distributed/benchmark/benchmark_comms.py

Lines changed: 45 additions & 2 deletions
@@ -22,7 +22,7 @@
 """
 
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Dict, List, Optional
 
 import torch
 import torch.distributed as dist
@@ -39,6 +39,7 @@
     MultiProcessContext,
     run_multi_process_func,
 )
+from torchrec.distributed.types import DeviceToHostTensorAwaitable
 
 _cc = cmd_conf()
 
@@ -253,6 +254,46 @@ def a2a_async_twice(
     assert checks1 and checks2
 
 
+# all_to_all_single with async_op, deferring the host-side check via a lazy awaitable
+def lazyawaitable(
+    _batch_inputs: List[Dict[str, Any]],
+    dim: int,
+    num_mul: int,
+    num_concat: int,
+    ctx: MultiProcessContext,
+) -> None:
+    with record_function("## pre-comms compute ##"):
+        pre_comms = _compute(dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx)
+
+    with record_function("## all_to_all_single ##"):
+        # use zeros instead of empty to make sure no previous data is used
+        post_comms = torch.zeros_like(pre_comms)
+        req = dist.all_to_all_single(
+            output=post_comms,
+            input=pre_comms,
+            group=ctx.pg,
+            async_op=True,
+        )
+
+    with record_function("## irrelevant compute ##"):
+        pre_comms = _compute(dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx)
+
+    with record_function("## comms check ##"):
+        # the assertion fails without wait(); this wait() makes the main cuda stream
+        # wait for the comms to finish, so the post-comms compute is blocked until
+        # the comms are done
+        req.wait()
+        check_awaitable = DeviceToHostTensorAwaitable(_validate(post_comms, ctx))
+
+    with record_function("## post-comms compute ##"):
+        post_comms = _compute(
+            dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx, x=post_comms[0]
+        )
+
+    with record_function("## assert ##"):
+        assert check_awaitable.item()
+
+
 # single-rank runner
 def a2a_single_runner(rank: int, world_size: int, arg: AllToAllSingleRunConfig) -> None:
     # Ensure GPUs are available and we have enough of them
@@ -274,8 +315,10 @@ def a2a_single_runner(rank: int, world_size: int, arg: AllToAllSingleRunConfig)
         func = a2a_async_base
     elif arg.name.startswith("a2a_async_twice"):
        func = a2a_async_twice
+    elif arg.name.startswith("lazyawaitable"):
+        func = lazyawaitable
     else:
-        func = a2a_sync_base
+        raise ValueError(f"Unknown benchmark name: {arg.name}")
 
     result = benchmark_func(
         bench_inputs=[],

torchrec/distributed/types.py

Lines changed: 18 additions & 0 deletions
@@ -463,6 +463,24 @@ def _wait_impl(self) -> W:
         return self._obj
 
 
+class DeviceToHostTensorAwaitable(LazyAwaitable[torch.Tensor]):
+    """An awaitable that waits for a tensor to be copied from device to host."""
+
+    def __init__(self, tensor_on_device: torch.Tensor) -> None:
+        super().__init__()
+        # self._tensor has an uninitialized value at this moment
+        self._tensor: torch.Tensor = tensor_on_device.to("cpu", non_blocking=True)
+
+        # cuda event to record the completion of the copy
+        self._event = torch.cuda.Event()
+        self._event.record()
+
+    def _wait_impl(self) -> torch.Tensor:
+        # wait for the copy to complete
+        self._event.synchronize()
+        return self._tensor
+
+
 KT = TypeVar("KT")
 VT_co = TypeVar("VT_co")
 ParentW = TypeVar("ParentW")
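A brief usage note on the class above (a sketch, not part of the diff): because torchrec's `LazyAwaitable` forwards use of the wrapped result to `wait()`, the benchmark calls `check_awaitable.item()` directly; the explicit `wait()` spelling from the summary is equivalent. `validation_result` below is a hypothetical on-device scalar tensor and a CUDA device is assumed.

```python
import torch

from torchrec.distributed.types import DeviceToHostTensorAwaitable

# hypothetical on-device validation result (a boolean scalar tensor)
validation_result = torch.arange(4, device="cuda").sum() == 6

# starts the non-blocking copy to the host and records a cuda event; no host sync yet
check_awaitable = DeviceToHostTensorAwaitable(validation_result)

# ... more device work can be scheduled here without blocking on the copy ...

assert check_awaitable.wait().item()  # explicit: wait(), then read the value on the host
assert check_awaitable.item()         # implicit: the access triggers the wait, as in the benchmark
```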
