@@ -19,6 +19,8 @@
 def device_sync(device):
     if "cuda" in device:
         torch.cuda.synchronize(device)
+    elif "xpu" in device:
+        torch.xpu.synchronize(device)
     elif ("cpu" in device) or ("mps" in device):
         pass
     else:
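For reference, the same per-backend branching can be collapsed by resolving the backend module from the device string. This is only an illustrative sketch, not part of the patch; the helper name is made up, and it assumes torch.cuda and torch.xpu share the synchronize(device) signature (which the patch itself relies on):

```python
import torch

def device_sync_generic(device: str) -> None:
    """Synchronize the accelerator named in `device` ("cuda", "cuda:0", "xpu", ...)."""
    backend = device.split(":")[0]
    if backend in ("cuda", "xpu"):
        # torch.cuda and torch.xpu expose the same synchronize(device) call.
        getattr(torch, backend).synchronize(device)
    # "cpu" and "mps" need no explicit synchronization here.
```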
@@ -288,7 +290,10 @@ def main(
 
     for i in range(start, num_samples):
         if i==0:
-            torch.cuda.reset_peak_memory_stats()
+            if "cuda" in device:
+                torch.cuda.reset_peak_memory_stats()
+            elif "xpu" in device:
+                torch.xpu.reset_peak_memory_stats()
         device_sync(device=device) # MKG
         if i >= 0 and interactive:
             prompt = input("What is your prompt? ")
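The same split recurs for the peak-memory counters. A minimal sketch of the dispatch with a hypothetical helper name; it assumes both torch.cuda and torch.xpu expose reset_peak_memory_stats(), exactly as the patch uses them:

```python
import torch

def reset_peak_memory_stats(device: str) -> None:
    # Reset the peak-memory counter for whichever backend the device string names.
    backend = device.split(":")[0]
    if backend in ("cuda", "xpu"):
        getattr(torch, backend).reset_peak_memory_stats()
```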
@@ -318,8 +323,15 @@ def callback(x):
         if (i != num_samples - 1 or not profile):
             prof = contextlib.nullcontext()
         else:
-            torch.profiler._utils._init_for_cuda_graphs()
-            prof = torch.profiler.profile()
+            if "cuda" in device:
+                torch.profiler._utils._init_for_cuda_graphs()
+                prof = torch.profiler.profile()
+            elif "xpu" in device:
+                prof = torch.profiler.profile(
+                    activities=[
+                        torch.profiler.ProfilerActivity.CPU,
+                        torch.profiler.ProfilerActivity.XPU],
+                )
         with prof:
             y = generate(
                 model,
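The profiler branch is where the two backends diverge in shape: the CUDA path keeps the pre-existing _init_for_cuda_graphs() call with the default activity set, while the XPU path lists its activities explicitly. A hedged sketch of profiling an arbitrary callable with the XPU activity set and exporting a Chrome trace; the function name, workload, and output path are placeholders, not part of the patch:

```python
import torch

def profile_on_xpu(fn, trace_path="xpu_trace.json"):
    # Collect CPU + XPU events for one call of `fn` and export a Chrome trace.
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.XPU,
        ],
    ) as prof:
        fn()
    prof.export_chrome_trace(trace_path)
    return prof
```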
@@ -369,7 +381,8 @@ def callback(x):
 
     tokpersec = torch.mean(torch.tensor(aggregate_metrics['tokens_per_sec'])).item()
     bandwidth = model_size * tokpersec
-    mem = torch.cuda.max_memory_reserved() /1e9
+    max_memory_reserved = torch.cuda.max_memory_reserved() if "cuda" in device else torch.xpu.max_memory_reserved()
+    mem = max_memory_reserved / 1e9
     print(f"Average tokens/sec: {tokpersec:.2f}")
     print(f"Average Bandwidth: {bandwidth:.02f} GB/s")
     print(f"Peak Memory Usage: {mem:.02f} GB")
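Peak memory now comes from whichever backend is active, while the bandwidth figure stays model_size * tokens/sec. A small sketch of the same lookup factored into a helper (the name is hypothetical; it assumes max_memory_reserved() exists on both torch.cuda and torch.xpu, as the patch uses):

```python
import torch

def max_memory_reserved_gb(device: str) -> float:
    # Peak reserved memory in GB for the active backend ("cuda" or "xpu").
    backend = device.split(":")[0]
    return getattr(torch, backend).max_memory_reserved() / 1e9
```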
@@ -431,6 +444,7 @@ def callback(x):
     parser.add_argument('--write_result', type=Path, default=None, help='Path where to write the result')
 
     args = parser.parse_args()
+
     main(
         args.prompt, args.interactive, args.num_samples, args.max_new_tokens, args.top_k,
         args.temperature, args.checkpoint_path, args.quantization, args.kv_cache_quantization, args.cache_size, args.linear_causal_mask, args.save, args.compile, args.compile_prefill, args.profile, args.memory_profile, args.device, args.precision, args.write_result
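Since main() receives args.device unchanged, the CLI itself needs no further edits; running on Intel GPUs should only require passing an XPU device string (e.g. --device xpu, assuming the parser's existing --device flag accepts it). A quick sanity check one might run first, assuming a PyTorch build with XPU support:

```python
import torch

# Confirm the XPU backend is compiled in and a device is visible
# before launching generation with an "xpu" device string.
if torch.xpu.is_available():
    print("XPU device:", torch.xpu.get_device_name(0))
else:
    print("No XPU device available")
```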