Commit 3af7b93

intel gpu: enable Intel GPU (XPU) support
1 parent f479b07 commit 3af7b93

File tree

4 files changed: +73 additions, -16 deletions

generate.py

Lines changed: 20 additions & 5 deletions

@@ -16,6 +16,8 @@
 def device_sync(device):
     if "cuda" in device:
         torch.cuda.synchronize(device)
+    elif "xpu" in device:
+        torch.xpu.synchronize(device)
     elif ("cpu" in device) or ("mps" in device):
         pass
     else:
@@ -24,7 +26,8 @@ def device_sync(device):

 torch._inductor.config.coordinate_descent_tuning = True
 torch._inductor.config.triton.unique_kernel_names = True
-torch._inductor.config.fx_graph_cache = True # Experimental feature to reduce compilation times, will be on by default in future
+if hasattr(torch._inductor.config, "fx_graph_cache"):
+    torch._inductor.config.fx_graph_cache = True # Experimental feature to reduce compilation times, will be on by default in future

 default_device = 'cuda' if torch.cuda.is_available() else 'cpu'

@@ -271,7 +274,7 @@ def main(

     global print
     from tp import maybe_init_dist
-    rank = maybe_init_dist()
+    rank = maybe_init_dist(device)
     use_tp = rank is not None
     if use_tp:
         if rank != 0:
@@ -302,7 +305,7 @@ def main(
     torch.manual_seed(1234)
     model_size = sum([p.numel() * p.dtype.itemsize for p in itertools.chain(model.parameters(), model.buffers())])
     if compile:
-        if is_speculative and use_tp: # and ("cuda" in device):
+        if is_speculative and use_tp and ("cuda" in device):
             torch._inductor.config.triton.cudagraph_trees = False # Bug with cudagraph trees in this case

         if is_speculative:
@@ -353,8 +356,15 @@ def callback(x):
         if (i != num_samples - 1 or not profile) or (use_tp and rank != 0):
             prof = contextlib.nullcontext()
         else:
-            torch.profiler._utils._init_for_cuda_graphs()
-            prof = torch.profiler.profile()
+            if "cuda" in device:
+                torch.profiler._utils._init_for_cuda_graphs()
+                prof = torch.profiler.profile()
+            elif "xpu" in device:
+                prof = torch.profiler.profile(
+                    activities=[
+                        torch.profiler.ProfilerActivity.CPU,
+                        torch.profiler.ProfilerActivity.XPU],
+                )
         with prof:
             y, metrics = generate(
                 model,
@@ -418,6 +428,11 @@ def callback(x):
     parser.add_argument('--device', type=str, default=default_device, help='Device to use')

     args = parser.parse_args()
+    if "xpu" in args.device:
+        try:
+            import intel_extension_for_pytorch as ipex
+        except ImportError:
+            raise ModuleNotFoundError("Intel Extension for PyTorch (intel_extension_for_pytorch) is required to run PyTorch code on Intel GPU (XPU). Please check https://github.com/intel/intel-extension-for-pytorch for details.")
     main(
         args.prompt, args.interactive, args.num_samples, args.max_new_tokens, args.top_k,
         args.temperature, args.checkpoint_path, args.compile, args.compile_prefill, args.profile, args.draft_checkpoint_path,
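Taken together, the generate.py changes route all device-specific work through the device string. Below is a minimal, self-contained sketch of that dispatch pattern, not verbatim repo code: make_profiler is a hypothetical helper name, and the XPU branches assume a PyTorch build that exposes torch.xpu and torch.profiler.ProfilerActivity.XPU (e.g. with intel_extension_for_pytorch installed).

import contextlib
import torch

def device_sync(device: str) -> None:
    # Block until all kernels queued on `device` have finished.
    if "cuda" in device:
        torch.cuda.synchronize(device)
    elif "xpu" in device:
        torch.xpu.synchronize(device)
    elif ("cpu" in device) or ("mps" in device):
        pass  # nothing to wait for on these paths
    else:
        print(f"device={device} is not yet supported")

def make_profiler(device: str):
    # CUDA is picked up by torch.profiler automatically; XPU must be
    # listed as an explicit ProfilerActivity, which is why the diff
    # special-cases it.
    if "cuda" in device:
        return torch.profiler.profile()
    if "xpu" in device:
        return torch.profiler.profile(
            activities=[torch.profiler.ProfilerActivity.CPU,
                        torch.profiler.ProfilerActivity.XPU])
    return contextlib.nullcontext()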

mixtral-moe/generate.py

Lines changed: 21 additions & 5 deletions

@@ -16,6 +16,8 @@
 def device_sync(device):
     if "cuda" in device:
         torch.cuda.synchronize(device)
+    elif "xpu" in device:
+        torch.xpu.synchronize(device)
     elif "cpu" in device:
         pass
     else:
@@ -24,7 +26,8 @@ def device_sync(device):

 torch._inductor.config.coordinate_descent_tuning = True
 torch._inductor.config.triton.unique_kernel_names = True
-torch._inductor.config.fx_graph_cache = True # Experimental feature to reduce compilation times, will be on by default in future
+if hasattr(torch._inductor.config, "fx_graph_cache"):
+    torch._inductor.config.fx_graph_cache = True # Experimental feature to reduce compilation times, will be on by default in future


 # support running without installing as a package
@@ -178,7 +181,7 @@ def main(
     assert tokenizer_path.is_file(), tokenizer_path

     global print
-    rank = maybe_init_dist()
+    rank = maybe_init_dist(device)
     use_tp = rank is not None
     if use_tp:
         if rank != 0:
@@ -203,7 +206,8 @@ def main(
     torch.manual_seed(1234)
     model_size = sum([p.numel() * p.dtype.itemsize for p in itertools.chain(model.parameters(), model.buffers())])
     if compile:
-        torch._inductor.config.assert_indirect_indexing = False
+        if hasattr(torch._inductor.config, "assert_indirect_indexing"):
+            torch._inductor.config.assert_indirect_indexing = False

         global decode_one_token, prefill
         decode_one_token = torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True)
@@ -248,8 +252,15 @@ def callback(x):
         if (i != num_samples - 1 or not profile) or (use_tp and rank != 0):
             prof = contextlib.nullcontext()
         else:
-            torch.profiler._utils._init_for_cuda_graphs()
-            prof = torch.profiler.profile()
+            if "cuda" in device:
+                torch.profiler._utils._init_for_cuda_graphs()
+                prof = torch.profiler.profile()
+            elif "xpu" in device:
+                prof = torch.profiler.profile(
+                    activities=[
+                        torch.profiler.ProfilerActivity.CPU,
+                        torch.profiler.ProfilerActivity.XPU],
+                )
         with prof:
             y = generate(
                 model,
@@ -302,6 +313,11 @@ def callback(x):
     parser.add_argument('--device', type=str, default="cuda", help='device to use')

     args = parser.parse_args()
+    if "xpu" in args.device:
+        try:
+            import intel_extension_for_pytorch as ipex
+        except ImportError:
+            raise ModuleNotFoundError("Intel Extension for PyTorch (intel_extension_for_pytorch) is required to run PyTorch code on Intel GPU (XPU). Please check https://github.com/intel/intel-extension-for-pytorch for details.")
     main(
         args.prompt, args.interactive, args.num_samples, args.max_new_tokens, args.top_k,
         args.temperature, args.checkpoint_path, args.compile, args.compile_prefill, args.profile, args.device
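Both generate.py files also guard optional inductor knobs the same way, since older PyTorch releases lack them and a direct assignment would raise AttributeError. A short sketch of that version-tolerant pattern (illustrative only; the knob list mirrors the two settings this commit touches):

import torch

# Probe before assigning: hasattr keeps the script importable on PyTorch
# builds where a given torch._inductor.config option does not exist yet.
for knob, value in [("fx_graph_cache", True),
                    ("assert_indirect_indexing", False)]:
    if hasattr(torch._inductor.config, knob):
        setattr(torch._inductor.config, knob, value)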

mixtral-moe/tp.py

Lines changed: 16 additions & 3 deletions

@@ -28,7 +28,7 @@ def local_break():
 def _get_world_size() -> int:
     return int(os.environ.get("LOCAL_WORLD_SIZE", "1"))

-def maybe_init_dist() -> Optional[int]:
+def maybe_init_dist(device) -> Optional[int]:
     try:
         # provided by torchrun
         rank = _get_rank()
@@ -41,8 +41,21 @@ def maybe_init_dist() -> Optional[int]:
         # not run via torchrun, no-op
         return None

-    torch.cuda.set_device(rank)
-    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
+    if "cuda" in device:
+        torch.cuda.set_device(rank)
+        dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
+    elif "xpu" in device:
+        try:
+            import oneccl_bindings_for_pytorch
+        except ImportError:
+            raise ModuleNotFoundError("OneCCL bindings for PyTorch (oneccl_bindings_for_pytorch) are required to run tensor parallel on Intel GPU (XPU). Please check https://github.com/intel/torch-ccl for details.")
+
+        os.environ['CCL_PROCESS_LAUNCHER'] = 'none'
+        os.environ['CCL_LOCAL_SIZE'] = str(world_size)
+        os.environ['CCL_LOCAL_RANK'] = str(rank)
+
+        torch.xpu.set_device(rank)
+        dist.init_process_group(backend="ccl", rank=rank, world_size=world_size)
     return rank

 rank = _get_rank()
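The XPU branch above does three things before creating the process group: it imports the oneCCL bindings (which registers the "ccl" backend with torch.distributed), tells oneCCL that the processes were launched externally by torchrun rather than by its own launcher, and binds one rank per device. A condensed sketch of that sequence, assuming oneccl_bindings_for_pytorch is installed; init_xpu_process_group is a hypothetical name, not repo code:

import os
import torch
import torch.distributed as dist
import oneccl_bindings_for_pytorch  # noqa: F401  (registers the "ccl" backend)

def init_xpu_process_group(rank: int, world_size: int) -> None:
    os.environ["CCL_PROCESS_LAUNCHER"] = "none"   # torchrun spawned us, not mpiexec
    os.environ["CCL_LOCAL_SIZE"] = str(world_size)
    os.environ["CCL_LOCAL_RANK"] = str(rank)
    torch.xpu.set_device(rank)                    # one rank per XPU device
    dist.init_process_group(backend="ccl", rank=rank, world_size=world_size)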

tp.py

Lines changed: 16 additions & 3 deletions

@@ -33,7 +33,7 @@ def local_break():
 def _get_world_size() -> int:
     return int(os.environ.get("LOCAL_WORLD_SIZE", "1"))

-def maybe_init_dist() -> Optional[int]:
+def maybe_init_dist(device) -> Optional[int]:
     try:
         # provided by torchrun
         rank = _get_rank()
@@ -46,8 +46,21 @@ def maybe_init_dist() -> Optional[int]:
         # not run via torchrun, no-op
         return None

-    torch.cuda.set_device(rank)
-    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
+    if "cuda" in device:
+        torch.cuda.set_device(rank)
+        dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
+    elif "xpu" in device:
+        try:
+            import oneccl_bindings_for_pytorch
+        except ImportError:
+            raise ModuleNotFoundError("OneCCL bindings for PyTorch (oneccl_bindings_for_pytorch) are required to run tensor parallel on Intel GPU (XPU). Please check https://github.com/intel/torch-ccl for details.")
+
+        os.environ['CCL_PROCESS_LAUNCHER'] = 'none'
+        os.environ['CCL_LOCAL_SIZE'] = str(world_size)
+        os.environ['CCL_LOCAL_RANK'] = str(rank)
+
+        torch.xpu.set_device(rank)
+        dist.init_process_group(backend="ccl", rank=rank, world_size=world_size)
     return rank

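The net effect of both tp.py changes is that the collective backend is chosen from the device string instead of being hard-coded to NCCL. As a one-glance summary (a hypothetical helper, not part of the diff):

def dist_backend_for(device: str) -> str:
    # Mirrors the branches added to maybe_init_dist in this commit.
    if "cuda" in device:
        return "nccl"  # NVIDIA collective communications library
    if "xpu" in device:
        return "ccl"   # Intel oneCCL, via oneccl_bindings_for_pytorch
    raise ValueError(f"no distributed backend configured for device {device!r}")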
