
Commit 6058747

harikodali authored and Silv3S committed
add device generalization support for distributed tests (pytorch#165067)
## MOTIVATION

To generalize Distributed test cases for non-CUDA devices

## CHANGES

- Replaced hard coded device/backends with torch.accelerator.current_accelerator() and dist.get_default_backend_for_device
- Use DistributedTestBase instead of MultiProcessTestCase to use common utilities
- Remove instantiate_device_tests and make use of torch.accelerator.current_accelerator for test/distributed/test_c10d_object_collectives.py
- Fix deterministic context issue for non-CUDA devices in test/distributed/optim/test_zero_redundancy_optimizer.py
- Use torch.accelerator.device_count() for multi-gpu check in torch/testing/_internal/distributed/_tensor/common_dtensor.py

Pull Request resolved: pytorch#165067
Approved by: https://github.com/guangyey, https://github.com/albanD
1 parent d7c5131 commit 6058747

File tree: 9 files changed, +102 -144 lines changed
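
The common pattern these diffs introduce can be summarized in a small standalone sketch (a minimal illustration assembled from the APIs this commit switches to, not code taken from the commit itself; it assumes a recent PyTorch build with the torch.accelerator API):

import torch
import torch.distributed as dist

# Resolve the active accelerator type ("cuda", "xpu", ...) with a CPU fallback.
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
# Look up the default backend for that device instead of hard-coding "nccl"/"xccl".
curr_backend = dist.get_default_backend_for_device(device_type)
# Device-count checks go through the accelerator API rather than torch.cuda.
num_devices = torch.accelerator.device_count()
print(device_type, curr_backend, num_devices)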


test/distributed/_composable/test_composability/test_2d_composability.py

Lines changed: 14 additions & 13 deletions
@@ -65,6 +65,7 @@


 device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
+curr_backend = dist.get_default_backend_for_device(device_type)


 class SimpleModel(nn.Module):
@@ -422,10 +423,10 @@ class TestFullyShard2DStateDict(DTensorTestBase):
     @property
     def backend(self):
         # need to specify gloo backend for testing cpu offload
-        return "cpu:gloo,xpu:xccl" if TEST_XPU else "cpu:gloo,cuda:nccl"
+        return f"cpu:gloo,{device_type}:{curr_backend}"

-    @with_comms
     @skip_if_lt_x_gpu(4)
+    @with_comms
     def test_fully_shard_tp_2d_set_full_state_dict(self):
         dummy_model = SimpleModel().to(device_type)
         mesh_2d = init_device_mesh(
@@ -514,8 +515,8 @@ def _check_module(self, m1, m2, check_grad=False):
         ).to_local()
         self.assertEqual(param_m2, param_m1)

-    @with_comms
     @skip_if_lt_x_gpu(4)
+    @with_comms
     def test_2d_ddp_integration_functionality(self) -> None:
         model, twod_model, dp_pg = self.init_model(self.device_type)
         optim = torch.optim.Adam(model.parameters(), lr=3e-5)
@@ -566,8 +567,8 @@ def _compare_params(self, m1, m2):
             p2 = p2.redistribute(p2.device_mesh, [Replicate()]).to_local()
             self.assertTrue(torch.allclose(p1, p2), f"{p1} vs {p2}")

-    @with_comms
     @skip_if_lt_x_gpu(4)
+    @with_comms
     def test_2d_fsdp_state_enable_extension(self):
         mesh_2d = init_device_mesh(
             self.device_type, (2, self.world_size // 2), mesh_dim_names=("dp", "tp")
@@ -642,18 +643,18 @@ def _test_2d_e2e_training(
         # Ensure all params are still the same after optimizer update.
         self._compare_params(model, model_2d)

-    @with_comms
     @skip_if_lt_x_gpu(4)
+    @with_comms
     def test_2d_e2e_training_default(self):
         self._test_2d_e2e_training()

-    @with_comms
     @skip_if_lt_x_gpu(4)
+    @with_comms
     def test_2d_e2e_training_use_orig_params(self):
         self._test_2d_e2e_training(use_orig_params=True)

-    @with_comms
     @skip_if_lt_x_gpu(4)
+    @with_comms
     def test_2d_e2e_training_not_use_orig_params(self):
         # TODO: need to revisit input_reshard API about why it failed multi-gpu tests.
         # self._test_2d_e2e_training(recompute_activation=True)
@@ -666,10 +667,10 @@ class TestNew2dParallelStateDict(DTensorTestBase):
     @property
     def backend(self):
         # need to specify gloo backend for testing cpu offload
-        return "cpu:gloo,xpu:xccl" if TEST_XPU else "cpu:gloo,cuda:nccl"
+        return f"cpu:gloo,{device_type}:{curr_backend}"

-    @with_comms
     @skip_if_lt_x_gpu(4)
+    @with_comms
     def test_fsdp_2d_extension(self):
         """
         Test whether _fsdp_extension from FSDPstate has been set correctly.
@@ -700,8 +701,8 @@ def test_fsdp_2d_extension(self):
         model_1d_fsdp_state = _get_module_fsdp_state(model_1d)
         self.assertEqual(model_1d_fsdp_state._fsdp_extension, None)

-    @with_comms
     @skip_if_lt_x_gpu(4)
+    @with_comms
     @parametrize("is_even_sharded_model", [True, False])
     def test_2d_state_dict(self, is_even_sharded_model):
         simple_model = SimpleModel if is_even_sharded_model else SimpleModelUneven
@@ -756,8 +757,8 @@ def test_2d_state_dict(self, is_even_sharded_model):
             torch.allclose(no_wrap_v, all_gather_two_d_v.to_local()), True
         )

-    @with_comms
     @skip_if_lt_x_gpu(4)
+    @with_comms
     @parametrize("is_even_sharded_model", [True, False])
     def test_2d_load_state_dict(self, is_even_sharded_model):
         simple_model = SimpleModel if is_even_sharded_model else SimpleModelUneven
@@ -811,8 +812,8 @@ def test_2d_load_state_dict(self, is_even_sharded_model):
             self.assertEqual(v1.device_mesh, v2.device_mesh)
             self.assertEqual(v1.placements, v2.placements)

-    @with_comms
     @skip_if_lt_x_gpu(4)
+    @with_comms
     @parametrize("is_even_sharded_model", [True, False])
     def test_2d_optim_state_dict(self, is_even_sharded_model):
         simple_model = SimpleModel if is_even_sharded_model else SimpleModelUneven
@@ -899,9 +900,9 @@ def test_2d_optim_state_dict(self, is_even_sharded_model):
         else:
             self.assertEqual(new_state, state)

+    @skip_if_lt_x_gpu(4)
     @with_comms
     @with_temp_dir
-    @skip_if_lt_x_gpu(4)
     def test_fsdp1_tp_2d_set_full_state_dict(self):
         """
         This is a workaround for loading full state dict into a FSDP1+TP 2D model.
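
For reference, the new backend property evaluates to the same strings the deleted hard-coded branch produced, while extending to any accelerator (a sketch assuming the module-level device_type and curr_backend defined in the first hunk above):

# On a CUDA machine: device_type == "cuda", curr_backend == "nccl"
#   -> "cpu:gloo,cuda:nccl"   (the old non-XPU branch)
# On an XPU machine:  device_type == "xpu",  curr_backend == "xccl"
#   -> "cpu:gloo,xpu:xccl"    (the old TEST_XPU branch)
backend = f"cpu:gloo,{device_type}:{curr_backend}"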

test/distributed/_composable/test_composability/test_pp_composability.py

Lines changed: 9 additions & 18 deletions
@@ -29,8 +29,8 @@
     parallelize_module,
     RowwiseParallel,
 )
-from torch.testing._internal.common_cuda import TEST_MULTIGPU
 from torch.testing._internal.common_distributed import (
+    at_least_x_gpu,
     MultiProcessTestCase,
     requires_accelerator_dist_backend,
     skip_if_lt_x_gpu,
@@ -40,7 +40,6 @@
     parametrize,
     run_tests,
     skip_but_pass_in_sandcastle_if,
-    TEST_XPU,
 )
 from torch.testing._internal.distributed.checkpoint_utils import with_temp_dir

@@ -107,11 +106,9 @@ def world_size(self):
     def device(self):
         return self.rank

-    @requires_accelerator_dist_backend(["nccl", "xccl"])
+    @requires_accelerator_dist_backend()
     @skip_if_lt_x_gpu(8)
-    @skip_but_pass_in_sandcastle_if(
-        not TEST_MULTIGPU and not TEST_XPU, "Test requires 4+ GPUs"
-    )
+    @skip_but_pass_in_sandcastle_if(not at_least_x_gpu(8), "Test requires 8+ GPUs")
     def test_pp_and_dcp(self):
         """
         Test that pipeline parallelism and distributed checkpointing can be used together and
@@ -201,11 +198,9 @@ def _dcp_test(self):

         _dcp_test(self)

-    @requires_accelerator_dist_backend(["nccl", "xccl"])
+    @requires_accelerator_dist_backend()
     @skip_if_lt_x_gpu(8)
-    @skip_but_pass_in_sandcastle_if(
-        not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs"
-    )
+    @skip_but_pass_in_sandcastle_if(not at_least_x_gpu(8), "Test requires 8+ GPUs")
     @parametrize(
         "ScheduleClass",
         [
@@ -355,11 +350,9 @@ def apply_tp(

         torch.distributed.destroy_process_group()

-    @requires_accelerator_dist_backend(["nccl", "xccl"])
+    @requires_accelerator_dist_backend()
     @skip_if_lt_x_gpu(8)
-    @skip_but_pass_in_sandcastle_if(
-        not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs"
-    )
+    @skip_but_pass_in_sandcastle_if(not at_least_x_gpu(8), "Test requires 8+ GPUs")
     @parametrize(
         "ScheduleClass",
         [
@@ -550,11 +543,9 @@ def apply_same_precision(partial_model):

         torch.distributed.destroy_process_group()

-    @requires_accelerator_dist_backend(["nccl", "xccl"])
+    @requires_accelerator_dist_backend()
     @skip_if_lt_x_gpu(8)
-    @skip_but_pass_in_sandcastle_if(
-        not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs"
-    )
+    @skip_but_pass_in_sandcastle_if(not at_least_x_gpu(8), "Test requires 8+ GPUs")
     @parametrize(
         "ScheduleClass",
         [
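
The gating pattern now applied to every multi-accelerator test in this file can be written generically as below (a sketch with a hypothetical test class and method; the decorators are the ones imported in the hunks above, and skip_but_pass_in_sandcastle_if is assumed to come from torch.testing._internal.common_utils as in this file's import block):

from torch.testing._internal.common_distributed import (
    at_least_x_gpu,
    MultiProcessTestCase,
    requires_accelerator_dist_backend,
    skip_if_lt_x_gpu,
)
from torch.testing._internal.common_utils import skip_but_pass_in_sandcastle_if


class ExamplePPTest(MultiProcessTestCase):  # hypothetical example class
    @requires_accelerator_dist_backend()  # any accelerator backend, not just nccl/xccl
    @skip_if_lt_x_gpu(8)                  # hard requirement on visible device count
    @skip_but_pass_in_sandcastle_if(not at_least_x_gpu(8), "Test requires 8+ GPUs")
    def test_example(self):
        pass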

test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py

Lines changed: 20 additions & 38 deletions
@@ -1,6 +1,5 @@
 # Owner(s): ["oncall: distributed"]

-import os
 import sys

 import torch
@@ -18,8 +17,8 @@
 )
 from torch.nn.parallel import DistributedDataParallel
 from torch.testing._internal.common_distributed import (
-    MultiProcessTestCase,
-    requires_nccl,
+    DistributedTestBase,
+    requires_accelerator_dist_backend,
     skip_if_lt_x_gpu,
 )
 from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN
@@ -30,9 +29,12 @@
     sys.exit(0)


+device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
+
+
 def gpus_for_rank(world_size):
-    visible_devices = list(range(torch.cuda.device_count()))
-    gpus_per_process = torch.cuda.device_count() // world_size
+    visible_devices = list(range(torch.accelerator.device_count()))
+    gpus_per_process = torch.accelerator.device_count() // world_size
     gpus_for_rank = []
     for rank in range(world_size):
         gpus_for_rank.append(
@@ -60,27 +62,7 @@ def forward(self, x, rank):
         return self.t0(x ** (1 + rank))


-class DistributedDataParallelCommHookTest(MultiProcessTestCase):
-    def setUp(self):
-        super().setUp()
-        self._spawn_processes()
-
-    def tearDown(self):
-        try:
-            os.remove(self.file_name)
-        except OSError:
-            pass
-
-    def _get_process_group_nccl(self):
-        store = dist.FileStore(self.file_name, self.world_size)
-        dist.init_process_group(
-            backend="nccl",
-            world_size=self.world_size,
-            rank=self.rank,
-            store=store,
-        )
-        return dist.distributed_c10d._get_default_group()
-
+class DistributedDataParallelCommHookTest(DistributedTestBase):
     @property
     def world_size(self):
         return 2
@@ -119,14 +101,14 @@ def _run_and_get_grads(self, model):
         param = next(model.parameters())
         return param.grad

-    @requires_nccl()
+    @requires_accelerator_dist_backend()
     @skip_if_lt_x_gpu(2)
     def test_ddp_comm_hook_allreduce_hook(self):
         """
         This unit test verifies the ``allreduce`` hook registered case gives same result
         with no hook registered case.
         """
-        process_group = self._get_process_group_nccl()
+        process_group = self.create_pg(device_type)

         # No hook registered case, get the reference grads.
         reference_grads = self._get_grads(process_group, None)
@@ -135,14 +117,14 @@ def test_ddp_comm_hook_allreduce_hook(self):

         torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=0)

-    @requires_nccl()
+    @requires_accelerator_dist_backend()
     @skip_if_lt_x_gpu(2)
     def test_ddp_comm_hook_fp16compress_hook(self):
         """
         This unit test verifies the ``fp16 compress`` hook registered case
         gives close result with no hook registered case.
         """
-        process_group = self._get_process_group_nccl()
+        process_group = self.create_pg(device_type)

         # No hook registered case, get the reference grads.
         reference_grads = self._get_grads(process_group, None)
@@ -151,14 +133,14 @@ def test_ddp_comm_hook_fp16compress_hook(self):

         torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)

-    @requires_nccl()
+    @requires_accelerator_dist_backend()
     @skip_if_lt_x_gpu(2)
     def test_ddp_comm_hook_quantize_per_tensor_hook(self):
         """
         This unit test verifies the ``quantize per tensor`` hook registered case
         gives close result with no hook registered case.
         """
-        process_group = self._get_process_group_nccl()
+        process_group = self.create_pg(device_type)

         # No hook registered case, get the reference grads.
         reference_grads = self._get_grads(process_group, None)
@@ -167,14 +149,14 @@ def test_ddp_comm_hook_quantize_per_tensor_hook(self):

         torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)

-    @requires_nccl()
+    @requires_accelerator_dist_backend()
     @skip_if_lt_x_gpu(2)
     def test_ddp_comm_hook_quantize_per_channel_hook(self):
         """
         This unit test verifies the ``quantize per channel`` hook registered case
         gives close result with no hook registered case.
         """
-        process_group = self._get_process_group_nccl()
+        process_group = self.create_pg(device_type)

         # No hook registered case, get the reference grads.
         reference_grads = self._get_grads(process_group, None)
@@ -185,14 +167,14 @@ def test_ddp_comm_hook_quantize_per_channel_hook(self):

         torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)

-    @requires_nccl()
+    @requires_accelerator_dist_backend()
     @skip_if_lt_x_gpu(2)
     def test_ddp_comm_hook_noop_hook(self):
         """
         This unit test verifies the ``noop`` hook registered case and a subsequent allreduce
         gives same result with no hook registered case.
         """
-        process_group = self._get_process_group_nccl()
+        process_group = self.create_pg(device_type)

         # No hook registered case, get the reference grads.
         reference_grads = self._get_grads(process_group, None)
@@ -204,10 +186,10 @@ def test_ddp_comm_hook_noop_hook(self):

         torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=0)

-    @requires_nccl()
+    @requires_accelerator_dist_backend()
     @skip_if_lt_x_gpu(2)
     def test_is_last_hook(self):
-        process_group = self._get_process_group_nccl()
+        process_group = self.create_pg(device_type)

         def hook(flags, bucket):
             flags.append(bucket.is_last())
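
The setUp/tearDown/process-group boilerplate removed above is what DistributedTestBase provides. A minimal sketch of the resulting shape of such a test follows (the test class and method are hypothetical, not part of this commit; create_pg is used exactly as in the hunks above, and the all_reduce check is only an illustrative assumption):

import torch
import torch.distributed as dist
from torch.testing._internal.common_distributed import DistributedTestBase

device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"


class ExampleCommHookTest(DistributedTestBase):  # hypothetical example class
    @property
    def world_size(self):
        return 2

    def test_allreduce_smoke(self):
        # Replaces the old FileStore + dist.init_process_group(backend="nccl", ...) setup.
        process_group = self.create_pg(device_type)
        # One tensor per rank on the rank's device; gloo is used on CPU-only machines.
        tensor = torch.ones(2, device=torch.device(device_type, self.rank))
        dist.all_reduce(tensor, group=process_group)
        self.assertEqual(tensor, torch.full_like(tensor, self.world_size))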
