 from collections.abc import Sequence
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import List, Optional
 
 import torch
 import torch.nn as nn
@@ -45,8 +44,8 @@ def disable_active_parametrization():
 
 @dataclass(frozen=True)
 class MixedPrecisionPolicy:
-    param_dtype: Optional[torch.dtype] = None
-    reduce_dtype: Optional[torch.dtype] = None
+    param_dtype: torch.dtype | None = None
+    reduce_dtype: torch.dtype | None = None
 
 
 class _ScaledPartial(Partial):
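A minimal construction sketch for the updated policy (not part of the diff; the bf16/fp32 choice is illustrative only):

import torch

# MixedPrecisionPolicy is the frozen dataclass shown in the hunk above.
# Hedged sketch: leaving a field as None presumably keeps the original dtype.
mp_policy = MixedPrecisionPolicy(
    param_dtype=torch.bfloat16,   # dtype parameters are cast to for compute
    reduce_dtype=torch.float32,   # dtype used for gradient reductions
)
# frozen=True makes instances immutable:
# mp_policy.param_dtype = torch.float16  # raises dataclasses.FrozenInstanceError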
@@ -161,8 +160,8 @@ def _distribute_dtensor(
 
 
 def _register_parametrization(
-    module: nn.Module, param_names: List[str], parametrization: nn.Module
-):
+    module: nn.Module, param_names: list[str], parametrization: nn.Module
+) -> None:
     """
     It works with state_dict without incurring parametrization calls because
     state_dict accesses parameters directly from self._parameters, not from getters
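The docstring's point, that state_dict() never triggers the parametrization because it reads tensors straight out of module._parameters rather than going through attribute getters, can be reproduced with a self-contained sketch. This is not the PR's _register_parametrization: the _Scale module, the helper name, and the per-instance-subclass trick are illustrative assumptions.

import torch
import torch.nn as nn

class _Scale(nn.Module):
    # toy parametrization: scales the raw parameter on every attribute access
    def forward(self, w: torch.Tensor) -> torch.Tensor:
        return 2.0 * w

def _register_parametrization_sketch(
    module: nn.Module, param_name: str, parametrization: nn.Module
) -> None:
    # keep the raw tensor in module._parameters and redirect attribute access
    # through a property installed on a per-instance subclass
    def getter(self: nn.Module) -> torch.Tensor:
        return parametrization(self._parameters[param_name])

    cls = module.__class__
    patched = type(
        f"Parametrized{cls.__name__}", (cls,), {param_name: property(getter)}
    )
    module.__class__ = patched

lin = nn.Linear(4, 4)
raw = lin.weight.detach().clone()
_register_parametrization_sketch(lin, "weight", _Scale())

assert torch.allclose(lin.weight, 2.0 * raw)            # getter runs the parametrization
assert torch.allclose(lin.state_dict()["weight"], raw)  # state_dict reads _parameters directly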
@@ -230,16 +229,14 @@ def __init__(
         self.param_dtype = mp_policy.param_dtype
         self.reduce_dtype = mp_policy.reduce_dtype
 
-    def replicate_compute(self, x):
+    def replicate_compute(self, x: DTensor) -> torch.Tensor:
         # data parallel runtime replicate parameters and do local compute
         # the gradients are partial tensors that need to perform reduction
         # (i.e. DDP: allreduce, FSDP: reduce_scatter, HSDP: mix of both)
         # support FSDP/DDP/HSDP + EP + TP (assuming TP shards the inner-most dim)
         non_dp_mesh_dims = x._spec.mesh.ndim - self.device_mesh.ndim
         assert non_dp_mesh_dims <= 2, "Only DP + EP/TP/EP+TP is supported"
         if non_dp_mesh_dims > 0:
-            # TODO: remove tp_mesh as an input arg to data_parallel API and use x._spec.mesh["tp"]
-            # after DeviceMesh supports slicing a non-root mesh
             dp_mesh = self.device_mesh
             # re-wrap 2D DTensor to 1D DTensor on dp_mesh for efficient FSDP all-gather
             sharded_local_tensor = x.to_local()
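The re-wrap step above can be pictured with a standalone sketch (not the PR's code): a parameter sharded over a 2D ("dp", "tp") mesh has its local shard re-wrapped as a 1D DTensor on the DP mesh, so the FSDP all-gather only spans the data-parallel dimension. The mesh shape, dim names, and placements are assumptions; it needs torchrun with 4 GPUs (or a cpu/gloo mesh), and the torch.distributed.tensor import path assumes a recent PyTorch.

import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Replicate, Shard

# 2D mesh: outer data-parallel dim, inner tensor-parallel dim (4 ranks total)
mesh_2d = init_device_mesh("cuda", (2, 2), mesh_dim_names=("dp", "tp"))
dp_mesh = mesh_2d["dp"]

# pretend this is the rank-local shard of a parameter sharded over dp and tp,
# with TP sharding the inner-most dim (dim 1 here)
local_shard = torch.randn(64, 64, device="cuda")
x = DTensor.from_local(local_shard, mesh_2d, (Shard(0), Shard(1)))

# strip the 2D spec and re-wrap the same local data as a 1D DTensor on dp_mesh,
# so collectives only run over the data-parallel dimension
sharded_local_tensor = x.to_local()
x_dp = DTensor.from_local(sharded_local_tensor, dp_mesh, (Shard(0),))

# replicate over dp for local compute; gradients would come back as Partial
replicated = x_dp.redistribute(dp_mesh, (Replicate(),))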
@@ -283,7 +280,7 @@ def replicate_compute(self, x):
 
         return output
 
-    def forward(self, x):
+    def forward(self, x: DTensor) -> torch.Tensor:
         global _active_parametrization
         # This should never be set to true during forward, only outside for model
         # inspection / debugging / initialization
@@ -296,7 +293,10 @@ def forward(self, x):
         if self.regional_ac and self.mode in ("fully_shard", "hybrid_shard"):
             # apply checkpointing to implement reshard_after_forward
             output = checkpoint(
-                self.replicate_compute, x, use_reentrant=False, context_fn=fsdp_policy
+                self.replicate_compute,
+                x,
+                use_reentrant=False,
+                context_fn=fsdp_policy,
             )
         else:
             output = self.replicate_compute(x)
@@ -305,13 +305,13 @@ def forward(self, x):
 
 
 def data_parallel(
-    model,
-    device_mesh,
-    mode="replicate",
+    model: nn.Module,
+    device_mesh: DeviceMesh,
+    mode: str = "replicate",
     ac_mode: str = "none",
-    mp_policy: Optional[MixedPrecisionPolicy] = None,
+    mp_policy: MixedPrecisionPolicy | None = None,
     shard_dim: int = 0,
-    reduction_divide_factor: Optional[float] = None,
+    reduction_divide_factor: float | None = None,
 ):
     if mode == "replicate":
         param_sharding = (Replicate(),)
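For context, a hedged usage sketch of the annotated signature: the mode strings come from this diff, while the mesh layout, dim name, model, and the assumption that the wrapper returns the parametrized model are illustrative; it has to run under torchrun.

import torch
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh

# data_parallel / MixedPrecisionPolicy come from the module patched above
model = nn.Sequential(nn.Linear(1024, 4096), nn.ReLU(), nn.Linear(4096, 1024))

dp_mesh = init_device_mesh("cuda", (8,), mesh_dim_names=("dp",))
model = data_parallel(
    model,
    dp_mesh,
    mode="fully_shard",                 # or "replicate" / "hybrid_shard"
    mp_policy=MixedPrecisionPolicy(
        param_dtype=torch.bfloat16,
        reduce_dtype=torch.float32,
    ),
)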