Update extended attention interface

leonling-ll · leonling-ll · commit 48b96eca001d · 2025-05-23T12:39:14.000Z
Address review comments

Fix CI XPU not found
diff --git a/.github/workflows/third-party-benchmarks.yml b/.github/workflows/third-party-benchmarks.yml
@@ -71,25 +71,34 @@ jobs:
 
       - name: Setup Triton
         uses: ./.github/actions/setup-triton
+        with:
+          command: DEBUG=1 python setup.py bdist_wheel
 
-      - name: Install benchmarks
+      - name: Install Triton
         id: install
         run: |
-          cd benchmarks
-          pip install .
+          pip install dist/*.whl
 
       - name: Install benchmark dependencies
-        id: install_deps
+        id: install
         run: |
           pip install transformers pandas pytest
 
+          cd benchmarks
+          pip install .
+          pip install intel-pti==0.12.2
+          PTI_LIBS_DIR=$(python -c "import sysconfig; print(sysconfig.get_paths()['stdlib']+'/..')")
+          # the output should contain: `libpti.so`, `libpti_metrics.so.0.12.2` and `libpti_view.so.0.12.2`
+          ls $PTI_LIBS_DIR
+          echo "PTI_LIBS_DIR=$PTI_LIBS_DIR" >> $GITHUB_ENV
+
       - name: Create reports dir
         run: |
           mkdir reports
           echo "REPORTS=$PWD/reports" >> $GITHUB_ENV
 
       - name: Run Liger-Kernel benchmarks
-        if: ${{ steps.install_deps.outcome == 'success' && !cancelled() }}
+        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
         run: |
           source ./scripts/capture-hw-details.sh
 
@@ -111,11 +120,22 @@ jobs:
       - name: Install SGLANG
         run: |
           git clone https://github.com/sgl-project/sglang.git
-          pip install sglang/python[dev_xpu]
+          cd sglang
+          git apply ../benchmarks/third_party/sglang/sglang.patch
+          pip install ./python[dev_xpu]
+
+      # Reinstall PyTorch and Triton because the SGLang installation forcibly overrides the current versions
+      - name: Reinstall PyTorch
+        uses: ./.github/actions/setup-pytorch
+
+      - name: Reinstall Triton
+        run: |
+          pip install ./dist/*.whl
 
       - name: Run SGLANG attention prefill stage benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/third_party/sglang
           python prefill_attention_benchmark.py --reports $REPORTS
 
@@ -125,6 +145,7 @@ jobs:
       - name: Run SGLANG attention decode stage benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/third_party/sglang
           python decode_attention_benchmark.py --reports $REPORTS
 
@@ -134,20 +155,22 @@ jobs:
       - name: Run SGLANG attention append stage benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/third_party/sglang
           python extended_attention_benchmark.py --reports $REPORTS
 
           source ../../../scripts/capture-hw-details.sh
-          python ../../triton_kernels_benchmark/build_report.py $REPORTS/extended-attn-performance.csv $REPORTS/attn-append-triton-report.csv --benchmark sglang-extended-attn --compiler triton --param_cols "B,SEQ_LENS,H_Q,H_KV,D" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
+          python ../../triton_kernels_benchmark/build_report.py $REPORTS/extended-attn-performance.csv $REPORTS/attn-append-triton-report.csv --benchmark sglang-extended-attn --compiler triton --param_cols "B,Q_LEN,PREFIX_LEN,KV_LEN,H_Q,H_KV,D" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
 
       - name: Run SGLANG Block FP8 GEMM benchmark
         if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'block_fp8_gemm_benchmark.py') }}
         run: |
+          export LD_LIBRARY_PATH=$PTI_LIBS_DIR:$LD_LIBRARY_PATH
           cd benchmarks/third_party/sglang
           python block_fp8_gemm_benchmark.py --reports $REPORTS
 
           source ../../../scripts/capture-hw-details.sh
-          python ../../../scripts/build_report.py $REPORTS/sglang-fp8-gemm-performance.csv $REPORTS/sglang-fp8-gemm-triton-report.csv --benchmark sglang-block-fp8-gemm --compiler triton --param_cols "M,N,K" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
+          python ../../triton_kernels_benchmark/build_report.py $REPORTS/sglang-fp8-gemm-performance.csv $REPORTS/sglang-fp8-gemm-triton-report.csv --benchmark sglang-block-fp8-gemm --compiler triton --param_cols "M,N,K" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
 
       - name: Upload benchmark reports
         if: ${{ steps.install.outcome == 'success' && !cancelled() }}
diff --git a/benchmarks/third_party/sglang/decode_attention_benchmark.py b/benchmarks/third_party/sglang/decode_attention_benchmark.py
@@ -47,13 +47,13 @@ def gen_args(B, N_CTX, H_Q, H_KV, D, dtype, device):
 @benchmark_suit.perf_report(
     benchmark_suit.Benchmark(
         # argument names to use as an x-axis for the plot
-        x_names=['B', 'SEQ_LENS', 'H_Q', 'H_KV', 'D', 'MODE', 'VALIDATE'],
+        x_names=['B', 'SEQ_LENS', 'H_Q', 'H_KV', 'D', 'MODE'],
         x_vals=[  #
-            [bs, [1024, 64], 32, 8, 128, 'fwd', False] for bs in [1, 16, 32, 64, 128]
+            [bs, 1024 + 64, 32, 8, 128, 'fwd'] for bs in [1, 16, 32, 64, 128]
         ] + [  #
-            [bs, [1024, 64], 32, 32, 96, 'fwd', False] for bs in [1, 16, 32, 64, 128]
+            [bs, 1024 + 64, 32, 32, 96, 'fwd'] for bs in [1, 16, 32, 64, 128]
         ] + [  #
-            [bs, [1024, 64], 28, 4, 128, 'fwd', False] for bs in [1, 16, 32, 64, 128]
+            [bs, 1024 + 64, 28, 4, 128, 'fwd'] for bs in [1, 16, 32, 64, 128]
         ],
         line_arg='provider',
         # argument name whose value corresponds to a different line in the plot
@@ -72,27 +72,22 @@ def gen_args(B, N_CTX, H_Q, H_KV, D, dtype, device):
         # name for the plot. Used also as a file name for saving the plot.
         args={},
     ))
-def benchmark(B, SEQ_LENS, H_Q, H_KV, D, MODE, VALIDATE, provider):
+def benchmark(B, SEQ_LENS, H_Q, H_KV, D, MODE, provider):
     torch.manual_seed(0)
     dtype = torch.bfloat16
     quantiles = [0.5, 0.0, 1.0]
-    N_CTX = sum(SEQ_LENS)
+    N_CTX = SEQ_LENS
 
     q, k_buffer, v_buffer, o, kv_indptr, kv_indices, attn_logits, attn_lse, num_kv_splits, max_kv_splits, sm_scale = gen_args(
         B, N_CTX, H_Q, H_KV, D, dtype, 'xpu')
 
-    if provider == 'triton':
+    if provider == 'triton' and MODE == 'fwd':
         triton_fn = lambda: decode_attention_fwd(q, k_buffer, v_buffer, o, kv_indptr, kv_indices, attn_logits, attn_lse,
                                                  num_kv_splits, max_kv_splits, sm_scale)
-
-        # TODO: decode attention should have the validation function
-        if VALIDATE:
-            raise NotImplementedError('Validation is not implemented for decode stage')
-
         _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, quantiles=quantiles)
 
     else:
-        raise NotImplementedError(f'Unsupported provider {provider}')
+        raise NotImplementedError(f'Unsupported provider {provider} and mode {MODE}')
 
     tflops = lambda ms: 2 * B * (H_Q + H_KV) * N_CTX * D * (1e-12) / (ms * 1e-3)
     gbps = lambda ms: 2 * B * (H_Q + H_KV * N_CTX) * D * 2 * (1e-9) / (ms * 1e-3)
diff --git a/benchmarks/third_party/sglang/extended_attention_benchmark.py b/benchmarks/third_party/sglang/extended_attention_benchmark.py
@@ -1,19 +1,16 @@
 import torch
 from sglang.srt.layers.attention.triton_ops.extend_attention import (
-    extend_attention_fwd,
-    redundant_attention,
-)
+    extend_attention_fwd, )
 import triton_kernels_benchmark as benchmark_suit
 
 
-def gen_args(B, N_CTX, H_Q, H_KV, D, dtype, device):
+# pylint: disable=unused-argument
+def gen_args(B, Q_LEN, PREFIX_LEN, KV_LEN, H_Q, H_KV, D, dtype, device):
 
-    b_seq_len_prefix = torch.randint(1, N_CTX // 2, (B, ), dtype=torch.int32, device=device)
-    b_seq_len_extend = torch.randint(1, N_CTX // 2, (B, ), dtype=torch.int32, device=device)
+    b_seq_len_prefix = torch.full((B, ), PREFIX_LEN, dtype=torch.int32, device=device)
+    b_seq_len_extend = torch.full((B, ), Q_LEN, dtype=torch.int32, device=device)
     b_seq_len = b_seq_len_prefix + b_seq_len_extend
-    max_len_in_batch = torch.max(b_seq_len, 0)[0].item()
 
-    b_req_idx = torch.arange(B, dtype=torch.int32, device=device)
     b_start_loc = torch.zeros((B, ), dtype=torch.int32, device=device)
     b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
     b_start_loc_extend = torch.zeros((B, ), dtype=torch.int32, device=device)
@@ -45,32 +42,30 @@ def gen_args(B, N_CTX, H_Q, H_KV, D, dtype, device):
                                                         device=device).normal_(mean=0.1, std=0.2)
 
     o_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
-    o_redundant = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
 
     b_seq_len_extend = b_seq_len - b_seq_len_prefix
     max_len_extend = torch.max(b_seq_len_extend, 0)[0].item()
     qo_indptr = torch.zeros((B + 1, ), dtype=torch.int32, device=device)
     qo_indptr[1:B + 1] = torch.cumsum(b_seq_len_extend[:B], dim=0)
 
     params = []
-    params.append((q_extend, k_extend, v_extend, o_extend, o_redundant))
+    params.append((q_extend, k_extend, v_extend, o_extend))
     params.append((k_buffer, v_buffer))
     params.append((qo_indptr, kv_indptr, kv_indices, max_len_extend))
-    params.append((b_req_idx, b_start_loc, b_seq_len, b_seq_len_prefix, max_len_in_batch))
     return params
 
 
 # pylint: disable=unused-argument
 @benchmark_suit.perf_report(
     benchmark_suit.Benchmark(
         # argument names to use as an x-axis for the plot
-        x_names=['B', 'SEQ_LENS', 'H_Q', 'H_KV', 'D', 'MODE', 'VALIDATE'],
+        x_names=['B', 'Q_LEN', 'PREFIX_LEN', 'KV_LEN', 'H_Q', 'H_KV', 'D', 'MODE'],
         x_vals=[  #
-            [bs, [1024, 128, 512], 32, 8, 128, 'fwd', True] for bs in [1, 16, 32, 64, 128]
+            [bs, 512, 1024 + 128, 512, 32, 8, 128, 'fwd'] for bs in [1, 16, 32, 64, 128]
         ] + [  #
-            [bs, [1024, 128, 512], 32, 32, 96, 'fwd', True] for bs in [1, 16, 32, 64, 128]
+            [bs, 512, 1024 + 128, 512, 32, 32, 96, 'fwd'] for bs in [1, 16, 32, 64, 128]
         ] + [  #
-            [bs, [1024, 128, 512], 28, 4, 128, 'fwd', True] for bs in [1, 16, 32, 64, 128]
+            [bs, 512, 1024 + 128, 512, 28, 4, 128, 'fwd'] for bs in [1, 16, 32, 64, 128]
         ],
         line_arg='provider',
         # argument name whose value corresponds to a different line in the plot
@@ -89,41 +84,26 @@ def gen_args(B, N_CTX, H_Q, H_KV, D, dtype, device):
         # name for the plot. Used also as a file name for saving the plot.
         args={},
     ))
-def benchmark(B, SEQ_LENS, H_Q, H_KV, D, MODE, VALIDATE, provider):
+def benchmark(B, Q_LEN, PREFIX_LEN, KV_LEN, H_Q, H_KV, D, MODE, provider):
     torch.manual_seed(0)
 
     dtype = torch.bfloat16
-    N_CTX = sum(SEQ_LENS)
 
-    params = gen_args(B, N_CTX, H_Q, H_KV, D, dtype, 'xpu')
-    q_extend, k_extend, v_extend, o_extend, o_redundant = params[0]
+    params = gen_args(B, Q_LEN, PREFIX_LEN, KV_LEN, H_Q, H_KV, D, dtype, 'xpu')
+    q_extend, k_extend, v_extend, o_extend = params[0]
     k_buffer, v_buffer = params[1]
     qo_indptr, kv_indptr, kv_indices, max_len_extend = params[2]
-    b_req_idx, b_start_loc, b_seq_len, b_seq_len_prefix, max_len_in_batch = params[3]
     custom_mask = None
     mask_indptr = None
 
     quantiles = [0.5, 0.0, 1.0]
-    if provider == 'triton':
-
-        def triton_fn():
-            extend_attention_fwd(q_extend, k_extend, v_extend, o_extend, k_buffer, v_buffer, qo_indptr, kv_indptr,
-                                 kv_indices, custom_mask, mask_indptr, max_len_extend)
-            return o_extend
-
-        if VALIDATE:
-
-            def refer_fn():
-                redundant_attention(q_extend, o_redundant, k_buffer, v_buffer, b_req_idx, b_start_loc, b_seq_len,
-                                    b_seq_len_prefix, max_len_in_batch)
-                return o_redundant
-
-            benchmark_suit.assert_close(triton_fn, refer_fn, atol=1e-3, rtol=1e-2, err_msg='extend to refer')
-
+    if provider == 'triton' and MODE == 'fwd':
+        triton_fn = lambda: extend_attention_fwd(q_extend, k_extend, v_extend, o_extend, k_buffer, v_buffer, qo_indptr,
+                                                 kv_indptr, kv_indices, custom_mask, True, mask_indptr, max_len_extend)
         _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, quantiles=quantiles)
 
     else:
-        raise NotImplementedError(f'Unsupported provider {provider}')
+        raise NotImplementedError(f'Unsupported provider {provider} and mode {MODE}')
 
     N_CTX_TOTAL = k_buffer.shape[0]
     N_CTX_EXTEND = k_extend.shape[0]
diff --git a/benchmarks/third_party/sglang/prefill_attention_benchmark.py b/benchmarks/third_party/sglang/prefill_attention_benchmark.py
@@ -6,8 +6,8 @@
 
 
 def gen_args(B, SEQ_LENS, H_Q, H_KV, D, dtype, device):
-    max_seq_len = max(SEQ_LENS)
-    N_CTX = sum(SEQ_LENS)
+    max_seq_len = SEQ_LENS
+    N_CTX = SEQ_LENS
 
     # Create random input tensors
     q = torch.randn((B * N_CTX, H_Q, D), device=device, dtype=dtype)
@@ -16,8 +16,8 @@ def gen_args(B, SEQ_LENS, H_Q, H_KV, D, dtype, device):
     o = torch.zeros((B * N_CTX, H_Q, D), device=device, dtype=dtype)
 
     # Create b_start_loc and b_seq_len tensors
-    b_start_loc = torch.tensor([0, SEQ_LENS[0]], device=device)
-    b_seq_len = torch.tensor(SEQ_LENS, device=device)
+    b_start_loc = torch.tensor([0, SEQ_LENS], device=device)
+    b_seq_len = torch.tensor([SEQ_LENS], device=device)
 
     return (q, k, v, o, b_start_loc, b_seq_len, max_seq_len)
 
@@ -26,13 +26,13 @@ def gen_args(B, SEQ_LENS, H_Q, H_KV, D, dtype, device):
 @benchmark_suit.perf_report(
     benchmark_suit.Benchmark(
         # argument names to use as an x-axis for the plot
-        x_names=['B', 'SEQ_LENS', 'H_Q', 'H_KV', 'D', 'CAUSAL', 'MODE', 'VALIDATE'],
+        x_names=['B', 'SEQ_LENS', 'H_Q', 'H_KV', 'D', 'CAUSAL', 'MODE'],
         x_vals=[  #
-            [bs, [1024], 32, 8, 128, causal, 'fwd', False] for causal in [True, False] for bs in [1, 16, 32, 64, 128]
+            [bs, 1024, 32, 8, 128, causal, 'fwd'] for causal in [True, False] for bs in [1, 16, 32, 64, 128]
         ] + [  #
-            [bs, [1024], 32, 32, 96, causal, 'fwd', False] for causal in [True, False] for bs in [1, 16, 32, 64, 128]
+            [bs, 1024, 32, 32, 96, causal, 'fwd'] for causal in [True, False] for bs in [1, 16, 32, 64, 128]
         ] + [  #
-            [bs, [1024], 28, 4, 128, causal, 'fwd', False] for causal in [True, False] for bs in [1, 16, 32, 64, 128]
+            [bs, 1024, 28, 4, 128, causal, 'fwd'] for causal in [True, False] for bs in [1, 16, 32, 64, 128]
         ],
         line_arg='provider',
         # argument name whose value corresponds to a different line in the plot
@@ -51,43 +51,21 @@ def gen_args(B, SEQ_LENS, H_Q, H_KV, D, dtype, device):
         # name for the plot. Used also as a file name for saving the plot.
         args={},
     ))
-def benchmark(B, SEQ_LENS, H_Q, H_KV, D, CAUSAL, MODE, VALIDATE, provider):
+def benchmark(B, SEQ_LENS, H_Q, H_KV, D, CAUSAL, MODE, provider):
     torch.manual_seed(0)
     dtype = torch.bfloat16
-    N_CTX = sum(SEQ_LENS)
 
     q, k, v, o, b_start_loc, b_seq_len, max_seq_len = gen_args(B, SEQ_LENS, H_Q, H_KV, D, dtype, 'xpu')
 
     quantiles = [0.5, 0.0, 1.0]
-    if provider == 'triton':
-
+    if provider == 'triton' and MODE == 'fwd':
         triton_fn = lambda: context_attention_fwd(q, k, v, o, b_start_loc, b_seq_len, max_seq_len, is_causal=CAUSAL)
-
-        if VALIDATE:
-            # FIXME: torch sdpa does not support different H_Q and H_KV
-            cu_seq_lens = [0] * (len(SEQ_LENS) + 1)
-            for i, seq_len in enumerate(SEQ_LENS):
-                cu_seq_lens[i + 1] = cu_seq_lens[i] + seq_len
-
-            for i in range(len(SEQ_LENS)):
-                start, end = cu_seq_lens[i], cu_seq_lens[i + 1]
-                o_torch = torch.nn.functional.scaled_dot_product_attention(
-                    q[start:end].permute(1, 0, 2),
-                    k[start:end].permute(1, 0, 2),
-                    v[start:end].permute(1, 0, 2),
-                    is_causal=CAUSAL,
-                ).permute(1, 0, 2)
-
-                cos_sim = torch.nn.functional.cosine_similarity(o[start:end].flatten(), o_torch.flatten(), dim=0)
-                assert cos_sim.item() > 1 - (1e-5)
-                assert torch.allclose(o[start:end], o_torch, atol=1e-2)
-
         _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10,
                                                                  quantiles=quantiles)
-
     else:
-        raise NotImplementedError(f'Unsupported provider {provider}')
+        raise NotImplementedError(f'Unsupported provider {provider} and mode {MODE}')
 
+    N_CTX = SEQ_LENS
     tflops = lambda ms: 2 * B * (H_Q + H_KV) * N_CTX * N_CTX * D * (1e-12) / (ms * 1e-3)
     gbps = lambda ms: 2 * B * (H_Q + H_KV) * N_CTX * D * 2 * (1e-9) / (ms * 1e-3)
 
diff --git a/benchmarks/third_party/sglang/sglang.patch b/benchmarks/third_party/sglang/sglang.patch
@@ -0,0 +1,31 @@
+diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
+index 884e715f..580e2364 100644
+--- a/python/sglang/srt/utils.py
++++ b/python/sglang/srt/utils.py
+@@ -77,12 +77,20 @@ from torch.func import functional_call
+ from torch.library import Library
+ from torch.profiler import ProfilerActivity, profile, record_function
+ from torch.utils._contextlib import _DecoratorContextManager
+-from triton.runtime.cache import (
+-    FileCacheManager,
+-    default_cache_dir,
+-    default_dump_dir,
+-    default_override_dir,
+-)
++try:
++    from triton.runtime.cache import (
++        FileCacheManager,
++        default_cache_dir,
++        default_dump_dir,
++       default_override_dir,
++   )
++except ImportError:
++    from triton.runtime.cache import FileCacheManager
++    from triton.knobs import cache as tt_cache
++
++    default_cache_dir = lambda: tt_cache.dir
++    default_dump_dir = lambda: tt_cache.dump_dir
++    default_override_dir = lambda: tt_cache.override_dir
+ 
+ logger = logging.getLogger(__name__)
+ 
diff --git a/benchmarks/triton_kernels_benchmark/build_report.py b/benchmarks/triton_kernels_benchmark/build_report.py
@@ -90,7 +90,7 @@ def build_report(args: PassedArgs, results_df: Optional[pd.DataFrame] = None):
             df[p] = df[p].astype(int)
             df_results["params"] = [json.dumps(j) for j in df[[*param_cols, "MASK"]].to_dict("records")]
     else:
-        df_results["params"] = [json.dumps(j) for j in df[param_cols].astype(str).to_dict("records")]
+        df_results["params"] = [json.dumps(j) for j in df[param_cols].astype(int).to_dict("records")]
     df_results["tflops"] = df[args.tflops_col]
     if hbm_col is not None:
         df_results["hbm_gbs"] = df[hbm_col]