
Commit c67393b

quic-mamta and mamtsing authored

[QEff finetune] : fix qaic device for pp+ddp (#544)

- Fix qaic device for pp+ddp
- This fix is required for SDK version 1.21
- Without this fix, pp+ddp worked fine on 1.20.0.194

Signed-off-by: Mamta Singh <mamtsing@qti.qualcomm.com>
Co-authored-by: Mamta Singh <mamtsing@qti.qualcomm.com>
1 parent f214e43 commit c67393b

File tree

1 file changed: +2 −3 lines changed

QEfficient/cloud/finetune.py

Lines changed: 2 additions & 3 deletions
@@ -80,9 +80,8 @@ def setup_distributed_training(train_config: TrainConfig) -> None:
     assert torch_device.index is None, f"DDP requires only device type, got: {torch_device}"
     dist_backend_map = {"cpu": "gloo", "qaic": "qccl", "cuda": "gloo"}
     dist.init_process_group(backend=dist_backend_map[torch_device.type])
-    if not train_config.enable_pp:
-        # from here onward "qaic/cuda" will automatically map to "qaic:i/cuda:i", where i = process rank
-        getattr(torch, torch_device.type).set_device(dist.get_rank())
+    # from here onward "qaic/cuda" will automatically map to "qaic:i/cuda:i", where i = process rank
+    getattr(torch, torch_device.type).set_device(dist.get_rank() * train_config.num_pp_stages)


 def setup_seeds(seed: int) -> None:
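The device-index arithmetic behind this change can be illustrated with a minimal standalone sketch (the helper names below are hypothetical, not part of the repo): when each DDP replica spans `num_pp_stages` consecutive pipeline-parallel devices, data-parallel rank `r` should start at device `r * num_pp_stages`, which is exactly the value now passed to `set_device`.

```python
def base_device_index(ddp_rank: int, num_pp_stages: int) -> int:
    """First device index owned by a DDP rank when each replica
    spans num_pp_stages consecutive pipeline-parallel devices."""
    return ddp_rank * num_pp_stages


def devices_for_rank(ddp_rank: int, num_pp_stages: int) -> list:
    """All device indices occupied by one DDP rank's pipeline stages."""
    base = base_device_index(ddp_rank, num_pp_stages)
    return list(range(base, base + num_pp_stages))


if __name__ == "__main__":
    # 2 DDP replicas x 2 pipeline stages -> 4 devices, no overlap:
    # rank 0 owns devices [0, 1], rank 1 owns devices [2, 3]
    for rank in range(2):
        print(rank, devices_for_rank(rank, 2))
```

With `num_pp_stages == 1` this reduces to the old behavior (`set_device(rank)`), so the unconditional call covers both the plain-DDP and PP+DDP cases.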
