From d6db1d255c1200db65226683867f060533621370 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Mon, 17 Nov 2025 14:50:46 +0000 Subject: [PATCH 1/3] Add options to print SW efficiency --- .../benchmark_testing.py | 34 ++++++++++++++++--- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/benchmarks/triton_kernels_benchmark/benchmark_testing.py b/benchmarks/triton_kernels_benchmark/benchmark_testing.py index 3d3f1d154a..0e1a314ee9 100644 --- a/benchmarks/triton_kernels_benchmark/benchmark_testing.py +++ b/benchmarks/triton_kernels_benchmark/benchmark_testing.py @@ -352,6 +352,8 @@ class MarkArgs: reports: str = "" n_runs: int = 1 brief: bool = False + hw_gbps: float = None + hw_tflops: float = None @staticmethod def load_cli_args() -> MarkArgs: @@ -375,8 +377,32 @@ def load_cli_args() -> MarkArgs: action="store_true", help="Print only mean values without min, max, CV.", ) + parser.add_argument( + "--hw_gbps", + type=float, + help="Hardware bandwidth in GB/s to calculate efficiency.", + ) + parser.add_argument( + "--hw_tflops", + type=float, + help="Hardware peak performance in TFLOPS to calculate efficiency.", + ) args = parser.parse_args() - return MarkArgs(args.reports, args.n_runs, args.brief) + return MarkArgs(args.reports, args.n_runs, args.brief, args.hw_gbps, args.hw_tflops) + + +def enhance_df(df, mark_args: MarkArgs): + df = df.copy() + if mark_args.brief: + df = df[[c for c in df.columns if not any(map(c.endswith, ("min", "max", "CV")))]] + + for col in df.columns: + if col.lower().replace("/", "p").endswith("gbps") and mark_args.hw_gbps: + df[col + "-eff"] = (df[col] / mark_args.hw_gbps).apply(lambda x: f"{x:.1%}") + elif col.lower().endswith("tflops") and mark_args.hw_tflops: + df[col + "-eff"] = (df[col] / mark_args.hw_tflops).apply(lambda x: f"{x:.1%}") + + return df class Mark: @@ -462,12 +488,10 @@ def _run(self, bench: Benchmark, save_path: str, show_plots: bool, print_data: b col0, col1 = df.columns.tolist() df["Diff"] = df[col1] - 
df[col0] + df = enhance_df(df, mark_args) if print_data: print(bench.plot_name + ":") - if mark_args.brief: - print(df[[c for c in df.columns if not any(map(c.endswith, ("min", "max", "CV")))]].to_string()) - else: - print(df.to_string()) + print(df.to_string()) if save_path: df.to_csv(os.path.join(save_path, f"{bench.plot_name}.csv"), float_format=f"%.{save_precision}f", index=False) From 89ef2d39a45218a1d30a41e1680108159d8bdfd1 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Mon, 17 Nov 2025 15:01:28 +0000 Subject: [PATCH 2/3] Removed noise --- benchmarks/triton_kernels_benchmark/benchmark_testing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/triton_kernels_benchmark/benchmark_testing.py b/benchmarks/triton_kernels_benchmark/benchmark_testing.py index 0e1a314ee9..19c035578b 100644 --- a/benchmarks/triton_kernels_benchmark/benchmark_testing.py +++ b/benchmarks/triton_kernels_benchmark/benchmark_testing.py @@ -155,7 +155,6 @@ def do_bench_upstream_pytorch_profiler(fn, n_warmup=25, n_repeat=100, grad_to_no fn() synchronize() i += 1 - print(f"Stopped warmup after {i} iterations") else: for _ in range(n_warmup): fn() From 8fbc416281cf87311ae75c55f6b02bc9a88f4707 Mon Sep 17 00:00:00 2001 From: Egor Krivov Date: Wed, 19 Nov 2025 15:22:01 +0000 Subject: [PATCH 3/3] Add json file with HW capability --- benchmarks/gpu_info.json | 19 +++++ .../benchmark_testing.py | 75 ++++++++++++++----- 2 files changed, 77 insertions(+), 17 deletions(-) create mode 100644 benchmarks/gpu_info.json diff --git a/benchmarks/gpu_info.json b/benchmarks/gpu_info.json new file mode 100644 index 0000000000..fc473ae44c --- /dev/null +++ b/benchmarks/gpu_info.json @@ -0,0 +1,19 @@ +{ + "_comment": "GPU -> [BF16/FP16 DPAS TFLOPs , Memory bandwidth GB/s]", + "Intel(R) Data Center GPU Max 1100": [ + 355.53, + 1228.80 + ], + "Intel(R) Data Center GPU Max 1550": [ + 419.43, + 3276.8 + ], + "Intel(R) Arc(TM) B580 Graphics": [ + 116.74, + 456.0 + ], + "Intel(R) Arc(TM) B570 Graphics": [
103.22, + 380.0 + ] +} diff --git a/benchmarks/triton_kernels_benchmark/benchmark_testing.py b/benchmarks/triton_kernels_benchmark/benchmark_testing.py index 19c035578b..9873a91668 100644 --- a/benchmarks/triton_kernels_benchmark/benchmark_testing.py +++ b/benchmarks/triton_kernels_benchmark/benchmark_testing.py @@ -1,6 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod +import re from typing import Callable, ClassVar, Dict, Optional, List, Tuple, Union, Set from collections.abc import Iterable from enum import Enum @@ -13,8 +14,10 @@ import datetime import os import time +from pathlib import Path import scipy.stats +import numpy as np import pandas as pd import matplotlib.pyplot as plt @@ -335,6 +338,30 @@ def filter_providers( return supported_providers +def get_gpu_info(): + device_name = torch.xpu.is_available() and torch.xpu.get_device_name() + if device_name is None: + print("Couldn't read device name.") + return None, None + + # benchmarks/triton_kernels_benchmark/benchmark_testing.py -> benchmarks/gpu_info.json + current_dir = Path(__file__).parent.resolve() + gpu_info_path = current_dir.parent / "gpu_info.json" + + if not gpu_info_path.exists(): + print(f"Warning: '{gpu_info_path}' not found.") + return None, None + + with open(gpu_info_path, "r", encoding="utf-8") as f: + gpu_info = json.load(f) + + if device_name not in gpu_info: + print(f"Warning: Device '{device_name}' not found in {gpu_info_path}") + return None, None + + return gpu_info[device_name] + + def perf_report(benchmarks): """ Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value. 
@@ -351,8 +378,7 @@ class MarkArgs: reports: str = "" n_runs: int = 1 brief: bool = False - hw_gbps: float = None - hw_tflops: float = None + eff: bool = False @staticmethod def load_cli_args() -> MarkArgs: @@ -377,29 +403,44 @@ def load_cli_args() -> MarkArgs: help="Print only mean values without min, max, CV.", ) parser.add_argument( - "--hw_gbps", - type=float, - help="Hardware bandwidth in GB/s to calculate efficiency.", - ) - parser.add_argument( - "--hw_tflops", - type=float, - help="Hardware peak performance in TFLOPS to calculate efficiency.", + "--eff", + "-e", + action="store_true", + help="Print HW utilization, will use internal database from 'gpu_info.json'.", ) args = parser.parse_args() - return MarkArgs(args.reports, args.n_runs, args.brief, args.hw_gbps, args.hw_tflops) + return MarkArgs(args.reports, args.n_runs, args.brief, args.eff) -def enhance_df(df, mark_args: MarkArgs): +def enhance_df(df, bench, mark_args: MarkArgs): + hw_tflops, hw_gbps = None, None + if mark_args.eff: + hw_tflops, hw_gbps = get_gpu_info() + df = df.copy() if mark_args.brief: df = df[[c for c in df.columns if not any(map(c.endswith, ("min", "max", "CV")))]] + # Find and write down HW efficiency columns + tflops_labels = [l for l in bench.ylabel if l.lower().endswith("tflops")] + tflops_pattern = "-(" + "|".join(tflops_labels) + ")(-min|-max)?$" + + gbps_labels = [l for l in bench.ylabel if l.lower().replace("/", "p").endswith("gbps")] + gbps_pattern = "-(" + "|".join(gbps_labels) + ")(-min|-max)?$" + for col in df.columns: - if col.lower().replace("/", "p").endswith("gbps") and mark_args.hw_gbps: - df[col + "-eff"] = (df[col] / mark_args.hw_gbps).apply(lambda x: f"{x:.1%}") - elif col.lower().endswith("tflops") and mark_args.hw_tflops: - df[col + "-eff"] = (df[col] / mark_args.hw_tflops).apply(lambda x: f"{x:.1%}") + if re.search(tflops_pattern, col) and hw_tflops: + df[re.sub(tflops_pattern, "-ceff", col)] = df[col] / hw_tflops + if re.search(gbps_pattern, col) and 
hw_gbps: + df[re.sub(gbps_pattern, "-meff", col)] = df[col] / hw_gbps + # df[re.sub(gbps_pattern, "-meff", col)] = (df[col] / mark_args.hw_gbps).apply(lambda x: f"{x:.1%}") + # We will only keep resulting efficiency column, we are either compute or memory bound. + for provider in bench.line_names: + if f"{provider}-ceff" in df.columns and f"{provider}-meff" in df.columns: + df[f"{provider}-eff"] = np.maximum(df[f"{provider}-ceff"], + df[f"{provider}-meff"]).apply(lambda x: f"{x:.2%}") + del df[f"{provider}-ceff"] + del df[f"{provider}-meff"] return df @@ -487,7 +528,7 @@ def _run(self, bench: Benchmark, save_path: str, show_plots: bool, print_data: b col0, col1 = df.columns.tolist() df["Diff"] = df[col1] - df[col0] - df = enhance_df(df, mark_args) + df = enhance_df(df, bench, mark_args) if print_data: print(bench.plot_name + ":") print(df.to_string())