Commit a7a69e2

Address review comments
1 parent 71dad71 commit a7a69e2

File tree

4 files changed, +89 -53 lines changed


.github/workflows/third-party-benchmarks.yml

Lines changed: 6 additions & 6 deletions
@@ -120,8 +120,8 @@ jobs:
         run: |
           git clone https://github.com/sgl-project/sglang.git
           cd sglang
-          git apply ../benchmarks/third_party/sglang/sglang.patch
-          pip install ./python[dev_xpu]
+          git apply ../benchmarks/third_party/sglang/sglang-fix.patch
+          pip install "./python[dev_xpu]"

       # Reinstallation since SGLang installation will force overrides current PyTorch and Triton
       - name: Reinstall PyTorch
@@ -139,7 +139,7 @@ jobs:
           python prefill_attention_benchmark.py --reports $REPORTS

           source ../../../scripts/capture-hw-details.sh
-          python ../../triton_kernels_benchmark/build_report.py $REPORTS/prefill-attn-performance.csv $REPORTS/attn-prefill-triton-report.csv --benchmark sglang-prefill-attn --compiler triton --param_cols "B,SEQ_LENS,H_Q,H_KV,D,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
+          python ../../triton_kernels_benchmark/build_report.py $REPORTS/sglang-prefill-attn-performance.csv $REPORTS/sglang-prefill-attn-triton-report.csv --benchmark sglang-prefill-attn --compiler triton --param_cols "B,SEQ_LENS,H_Q,H_KV,D,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

       - name: Run SGLANG attention decode stage benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() }}
@@ -149,7 +149,7 @@ jobs:
           python decode_attention_benchmark.py --reports $REPORTS

           source ../../../scripts/capture-hw-details.sh
-          python ../../triton_kernels_benchmark/build_report.py $REPORTS/decode-attn-performance.csv $REPORTS/attn-decode-triton-report.csv --benchmark sglang-decode-attn --compiler triton --param_cols "B,SEQ_LENS,H_Q,H_KV,D" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
+          python ../../triton_kernels_benchmark/build_report.py $REPORTS/sglang-decode-attn-performance.csv $REPORTS/sglang-decode-attn-triton-report.csv --benchmark sglang-decode-attn --compiler triton --param_cols "B,SEQ_LENS,H_Q,H_KV,D" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

       - name: Run SGLANG attention append stage benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() }}
@@ -159,10 +159,10 @@ jobs:
           python extended_attention_benchmark.py --reports $REPORTS

           source ../../../scripts/capture-hw-details.sh
-          python ../../triton_kernels_benchmark/build_report.py $REPORTS/extended-attn-performance.csv $REPORTS/attn-append-triton-report.csv --benchmark sglang-extended-attn --compiler triton --param_cols "B,Q_LEN,PREFIX_LEN,KV_LEN,H_Q,H_KV,D" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
+          python ../../triton_kernels_benchmark/build_report.py $REPORTS/sglang-extended-attn-performance.csv $REPORTS/sglang-append-attn-triton-report.csv --benchmark sglang-extended-attn --compiler triton --param_cols "B,Q_LEN,PREFIX_LEN,KV_LEN,H_Q,H_KV,D" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

       - name: Run SGLANG Block FP8 GEMM benchmark
-        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'block_fp8_gemm_benchmark.py') }}
+        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
         run: |
           export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/third_party/sglang
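
Note: the report-file renames above are assumed to track the plot_name changes in the benchmark scripts below, since each Benchmark declares that its plot name "is used also as a file name for saving the plot". A minimal sketch of that naming assumption (REPORTS is an environment variable set by earlier workflow steps; the directory default here is hypothetical):

import os

REPORTS = os.environ.get('REPORTS', 'reports')  # assumed reports directory
for plot_name in ('sglang-prefill-attn-performance',
                  'sglang-decode-attn-performance',
                  'sglang-extended-attn-performance'):
    # build_report.py is expected to consume a CSV named after the plot_name
    print(os.path.join(REPORTS, f'{plot_name}.csv'))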

benchmarks/third_party/sglang/decode_attention_benchmark.py

Lines changed: 26 additions & 15 deletions
@@ -43,18 +43,29 @@ def gen_args(B, N_CTX, H_Q, H_KV, D, dtype, device):
             sm_scale)


+def get_dtype(dtype_str: str):
+    if dtype_str == 'bfloat16':
+        return torch.bfloat16
+    if dtype_str == 'float16':
+        return torch.float16
+    if dtype_str == 'float32':
+        return torch.float32
+    raise ValueError(f'Unsupported dtype: {dtype_str}')
+
+
+X_VALS = [[bs, *sizes, mode, dtype]
+          for sizes in [(1024 + 64, 32, 8, 128), (1024 + 64, 32, 32, 96), (1024 + 64, 28, 4, 128)]
+          for bs in [1, 16, 32, 64, 128]
+          for mode in ['fwd']
+          for dtype in ['bfloat16']]
+
+
 # pylint: disable=unused-argument
 @benchmark_suit.perf_report(
     benchmark_suit.Benchmark(
         # argument names to use as an x-axis for the plot
-        x_names=['B', 'SEQ_LENS', 'H_Q', 'H_KV', 'D', 'MODE'],
-        x_vals=[  #
-            [bs, 1024 + 64, 32, 8, 128, 'fwd'] for bs in [1, 16, 32, 64, 128]
-        ] + [  #
-            [bs, 1024 + 64, 32, 32, 96, 'fwd'] for bs in [1, 16, 32, 64, 128]
-        ] + [  #
-            [bs, 1024 + 64, 28, 4, 128, 'fwd'] for bs in [1, 16, 32, 64, 128]
-        ],
+        x_names=['B', 'SEQ_LENS', 'H_Q', 'H_KV', 'D', 'MODE', 'DTYPE'],
+        x_vals=X_VALS,
         line_arg='provider',
         # argument name whose value corresponds to a different line in the plot
         # possible values for `line_arg``
@@ -68,19 +79,19 @@ def gen_args(B, N_CTX, H_Q, H_KV, D, dtype, device):
         # line styles
         styles=[('green', '-'), ('green', '--'), ('blue', '-'), ('blue', '--')],
         ylabel=['GB/s', 'TFlops'],  # label name for the y-axis
-        plot_name='decode-attn-performance',
+        plot_name='sglang-decode-attn-performance',
         # name for the plot. Used also as a file name for saving the plot.
         args={},
     ))
-def benchmark(B, SEQ_LENS, H_Q, H_KV, D, MODE, provider):
+def benchmark(B, SEQ_LENS, H_Q, H_KV, D, MODE, DTYPE, provider):
     torch.manual_seed(0)
-    dtype = torch.bfloat16
-    quantiles = [0.5, 0.0, 1.0]
-    N_CTX = SEQ_LENS
+    dtype = get_dtype(DTYPE)

+    N_CTX = SEQ_LENS
     q, k_buffer, v_buffer, o, kv_indptr, kv_indices, attn_logits, attn_lse, num_kv_splits, max_kv_splits, sm_scale = gen_args(
         B, N_CTX, H_Q, H_KV, D, dtype, 'xpu')

+    quantiles = [0.5, 0.0, 1.0]
     if provider == 'triton' and MODE == 'fwd':
         triton_fn = lambda: decode_attention_fwd(q, k_buffer, v_buffer, o, kv_indptr, kv_indices, attn_logits, attn_lse,
                                                  num_kv_splits, max_kv_splits, sm_scale)
@@ -89,8 +100,8 @@ def benchmark(B, SEQ_LENS, H_Q, H_KV, D, MODE, provider):
     else:
         raise NotImplementedError(f'Unsupported provider {provider} and mode {MODE}')

-    tflops = lambda ms: 2 * B * (H_Q + H_KV) * N_CTX * D * (1e-12) / (ms * 1e-3)
-    gbps = lambda ms: 2 * B * (H_Q + H_KV * N_CTX) * D * 2 * (1e-9) / (ms * 1e-3)
+    tflops = lambda ms: B * N_CTX * H_Q * D * H_KV * 2 * 2 * (1e-12) / (ms * 1e-3)
+    gbps = lambda ms: B * (H_Q + 2 * N_CTX * H_KV) * D * 2 * (1e-9) / (ms * 1e-3)

     return (gbps(mean), gbps(max_ms), gbps(min_ms)), (tflops(mean), tflops(max_ms), tflops(min_ms)), cv
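
As a quick sanity check (a standalone sketch, not part of the committed file), the new X_VALS comprehension in this file expands to 15 rows, one per head configuration and batch size, each matching the extended x_names order ['B', 'SEQ_LENS', 'H_Q', 'H_KV', 'D', 'MODE', 'DTYPE']:

# Expanding the decode-benchmark X_VALS comprehension from this commit.
X_VALS = [[bs, *sizes, mode, dtype]
          for sizes in [(1024 + 64, 32, 8, 128), (1024 + 64, 32, 32, 96), (1024 + 64, 28, 4, 128)]
          for bs in [1, 16, 32, 64, 128]
          for mode in ['fwd']
          for dtype in ['bfloat16']]

assert len(X_VALS) == 15  # 3 head configs x 5 batch sizes x 1 mode x 1 dtype
assert X_VALS[0] == [1, 1088, 32, 8, 128, 'fwd', 'bfloat16']  # 1024 + 64 == 1088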

benchmarks/third_party/sglang/extended_attention_benchmark.py

Lines changed: 32 additions & 19 deletions
@@ -5,10 +5,10 @@


 # pylint: disable=unused-argument
-def gen_args(B, Q_LEN, PREFIX_LEN, KV_LEN, H_Q, H_KV, D, dtype, device):
+def gen_args(B, EXTEND_LEN, PREFIX_LEN, H_Q, H_KV, D, dtype, device):

     b_seq_len_prefix = torch.full((B, ), PREFIX_LEN, dtype=torch.int32, device=device)
-    b_seq_len_extend = torch.full((B, ), Q_LEN, dtype=torch.int32, device=device)
+    b_seq_len_extend = torch.full((B, ), EXTEND_LEN, dtype=torch.int32, device=device)
     b_seq_len = b_seq_len_prefix + b_seq_len_extend

     b_start_loc = torch.zeros((B, ), dtype=torch.int32, device=device)
@@ -55,18 +55,31 @@ def gen_args(B, Q_LEN, PREFIX_LEN, KV_LEN, H_Q, H_KV, D, dtype, device):
     return params


+def get_dtype(dtype_str: str):
+    if dtype_str == 'bfloat16':
+        return torch.bfloat16
+    if dtype_str == 'float16':
+        return torch.float16
+    if dtype_str == 'float32':
+        return torch.float32
+    raise ValueError(f'Unsupported dtype: {dtype_str}')
+
+
+X_VALS = [[bs, *sizes, mode, dtype]
+          for sizes in [(512, 1024 + 128, 32, 8, 128),  #
+                        (512, 1024 + 128, 32, 32, 96),  #
+                        (512, 1024 + 128, 28, 4, 128)]
+          for bs in [1, 16, 32, 64, 128]
+          for mode in ['fwd']
+          for dtype in ['bfloat16']]
+
+
 # pylint: disable=unused-argument
 @benchmark_suit.perf_report(
     benchmark_suit.Benchmark(
         # argument names to use as an x-axis for the plot
-        x_names=['B', 'Q_LEN', 'PREFIX_LEN', 'KV_LEN', 'H_Q', 'H_KV', 'D', 'MODE'],
-        x_vals=[  #
-            [bs, 512, 1024 + 128, 512, 32, 8, 128, 'fwd'] for bs in [1, 16, 32, 64, 128]
-        ] + [  #
-            [bs, 512, 1024 + 128, 512, 32, 32, 96, 'fwd'] for bs in [1, 16, 32, 64, 128]
-        ] + [  #
-            [bs, 512, 1024 + 128, 512, 28, 4, 128, 'fwd'] for bs in [1, 16, 32, 64, 128]
-        ],
+        x_names=['B', 'EXTEND_LEN', 'PREFIX_LEN', 'H_Q', 'H_KV', 'D', 'MODE', 'DTYPE'],
+        x_vals=X_VALS,
         line_arg='provider',
         # argument name whose value corresponds to a different line in the plot
         # possible values for `line_arg``
@@ -80,16 +93,15 @@ def gen_args(B, Q_LEN, PREFIX_LEN, KV_LEN, H_Q, H_KV, D, dtype, device):
         # line styles
         styles=[('green', '-'), ('green', '--'), ('blue', '-'), ('blue', '--')],
         ylabel=['GB/s', 'TFlops'],  # label name for the y-axis
-        plot_name='extended-attn-performance',
+        plot_name='sglang-extended-attn-performance',
         # name for the plot. Used also as a file name for saving the plot.
         args={},
     ))
-def benchmark(B, Q_LEN, PREFIX_LEN, KV_LEN, H_Q, H_KV, D, MODE, provider):
+def benchmark(B, EXTEND_LEN, PREFIX_LEN, H_Q, H_KV, D, MODE, DTYPE, provider):
     torch.manual_seed(0)
+    dtype = get_dtype(DTYPE)

-    dtype = torch.bfloat16
-
-    params = gen_args(B, Q_LEN, PREFIX_LEN, KV_LEN, H_Q, H_KV, D, dtype, 'xpu')
+    params = gen_args(B, EXTEND_LEN, PREFIX_LEN, H_Q, H_KV, D, dtype, 'xpu')
     q_extend, k_extend, v_extend, o_extend = params[0]
     k_buffer, v_buffer = params[1]
     qo_indptr, kv_indptr, kv_indices, max_len_extend = params[2]
@@ -105,10 +117,11 @@ def benchmark(B, Q_LEN, PREFIX_LEN, KV_LEN, H_Q, H_KV, D, MODE, provider):
     else:
         raise NotImplementedError(f'Unsupported provider {provider} and mode {MODE}')

-    N_CTX_TOTAL = k_buffer.shape[0]
-    N_CTX_EXTEND = k_extend.shape[0]
-    tflops = lambda ms: (H_Q + H_KV) * (N_CTX_EXTEND + N_CTX_TOTAL) * N_CTX_TOTAL * D * (1e-12) / (ms * 1e-3)
-    gbps = lambda ms: 2 * (N_CTX_EXTEND * (H_Q + H_KV) + N_CTX_TOTAL * H_KV) * D * 2 * (1e-9) / (ms * 1e-3)
+    N_CTX_TOTAL = PREFIX_LEN + EXTEND_LEN
+    N_CTX_EXTEND = EXTEND_LEN
+
+    tflops = lambda ms: B * (N_CTX_EXTEND + N_CTX_TOTAL) * H_Q * D * H_KV * 2 * 2 * (1e-12) / (ms * 1e-3)
+    gbps = lambda ms: B * ((H_Q * N_CTX_EXTEND) + H_KV * (N_CTX_EXTEND + N_CTX_TOTAL) * 2) * D * 2 * (1e-9) / (ms * 1e-3)

     return (gbps(mean), gbps(max_ms), gbps(min_ms)), (tflops(mean), tflops(max_ms), tflops(min_ms)), cv
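
For context, a small standalone sketch of how the new DTYPE column is assumed to flow through this benchmark: each X_VALS row now ends with a dtype string, which the added get_dtype helper maps to a torch dtype before gen_args allocates tensors (the example row below is the first entry the comprehension produces):

import torch

def get_dtype(dtype_str: str):
    # Same helper as added in this commit: map a dtype name to a torch dtype.
    if dtype_str == 'bfloat16':
        return torch.bfloat16
    if dtype_str == 'float16':
        return torch.float16
    if dtype_str == 'float32':
        return torch.float32
    raise ValueError(f'Unsupported dtype: {dtype_str}')

# One X_VALS row: [B, EXTEND_LEN, PREFIX_LEN, H_Q, H_KV, D, MODE, DTYPE]
row = [1, 512, 1024 + 128, 32, 8, 128, 'fwd', 'bfloat16']
*_, dtype_str = row
assert get_dtype(dtype_str) is torch.bfloat16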

benchmarks/third_party/sglang/prefill_attention_benchmark.py

Lines changed: 25 additions & 13 deletions
@@ -22,18 +22,30 @@ def gen_args(B, SEQ_LENS, H_Q, H_KV, D, dtype, device):
     return (q, k, v, o, b_start_loc, b_seq_len, max_seq_len)


+def get_dtype(dtype_str: str):
+    if dtype_str == 'bfloat16':
+        return torch.bfloat16
+    if dtype_str == 'float16':
+        return torch.float16
+    if dtype_str == 'float32':
+        return torch.float32
+    raise ValueError(f'Unsupported dtype: {dtype_str}')
+
+
+X_VALS = [[bs, *sizes, causal, mode, dtype]
+          for bs in [1, 16, 32, 64, 128]
+          for sizes in [(1024, 32, 8, 128), (1024, 32, 32, 96), (1024, 28, 4, 128)]
+          for causal in [True, False]
+          for mode in ['fwd']
+          for dtype in ['bfloat16']]
+
+
 # pylint: disable=unused-argument
 @benchmark_suit.perf_report(
     benchmark_suit.Benchmark(
         # argument names to use as an x-axis for the plot
-        x_names=['B', 'SEQ_LENS', 'H_Q', 'H_KV', 'D', 'CAUSAL', 'MODE'],
-        x_vals=[  #
-            [bs, 1024, 32, 8, 128, causal, 'fwd'] for causal in [True, False] for bs in [1, 16, 32, 64, 128]
-        ] + [  #
-            [bs, 1024, 32, 32, 96, causal, 'fwd'] for causal in [True, False] for bs in [1, 16, 32, 64, 128]
-        ] + [  #
-            [bs, 1024, 28, 4, 128, causal, 'fwd'] for causal in [True, False] for bs in [1, 16, 32, 64, 128]
-        ],
+        x_names=['B', 'SEQ_LENS', 'H_Q', 'H_KV', 'D', 'CAUSAL', 'MODE', 'DTYPE'],
+        x_vals=X_VALS,
         line_arg='provider',
         # argument name whose value corresponds to a different line in the plot
         # possible values for `line_arg``
@@ -47,13 +59,13 @@ def gen_args(B, SEQ_LENS, H_Q, H_KV, D, dtype, device):
         # line styles
         styles=[('green', '-'), ('green', '--'), ('blue', '-'), ('blue', '--')],
         ylabel=['GB/s', 'TFlops'],  # label name for the y-axis
-        plot_name='prefill-attn-performance',
+        plot_name='sglang-prefill-attn-performance',
         # name for the plot. Used also as a file name for saving the plot.
         args={},
     ))
-def benchmark(B, SEQ_LENS, H_Q, H_KV, D, CAUSAL, MODE, provider):
+def benchmark(B, SEQ_LENS, H_Q, H_KV, D, CAUSAL, MODE, DTYPE, provider):
     torch.manual_seed(0)
-    dtype = torch.bfloat16
+    dtype = get_dtype(DTYPE)

     q, k, v, o, b_start_loc, b_seq_len, max_seq_len = gen_args(B, SEQ_LENS, H_Q, H_KV, D, dtype, 'xpu')

@@ -66,8 +78,8 @@ def benchmark(B, SEQ_LENS, H_Q, H_KV, D, CAUSAL, MODE, provider):
         raise NotImplementedError(f'Unsupported provider {provider} and mode {MODE}')

     N_CTX = SEQ_LENS
-    tflops = lambda ms: 2 * B * (H_Q + H_KV) * N_CTX * N_CTX * D * (1e-12) / (ms * 1e-3)
-    gbps = lambda ms: 2 * B * (H_Q + H_KV) * N_CTX * D * 2 * (1e-9) / (ms * 1e-3)
+    tflops = lambda ms: B * N_CTX * H_Q * D * H_KV * 2 * 2 * (1e-12) / (ms * 1e-3)
+    gbps = lambda ms: B * N_CTX * (H_Q + 2 * H_KV) * D * 2 * (1e-9) / (ms * 1e-3)

     return (gbps(mean_ms), gbps(max_ms), gbps(min_ms)), (tflops(mean_ms), tflops(max_ms), tflops(min_ms)), cv
7385
