
Commit 5a2076d

Address review comments

Squashed commits:
- Remove sglang from tests
- Fix CI
- Address review comments
- Integrate sglang prefill/decode/extend kernel to benchmarks
- Port prefill attn and decode attn from sglang
- Add validation
- temp add extend attention
- disable debug ir dump
- Update three stage attention benchmark
- Add sglang kernel benchmark to action
- use 1e-3 atol
- remove sglang benchmark from triton-benchmarks
- Fix setup bdist_wheel
- Add sglang to thirdparty test
- Address review comments
- Remove sglang from tests
- Adjust params term
- Adjust tflops computation
1 parent 2092c5d commit 5a2076d

File tree

7 files changed: +55, -290 lines


.github/pins/sglang.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

.github/workflows/third-party-benchmarks.yml

Lines changed: 14 additions & 14 deletions
@@ -110,35 +110,35 @@ jobs:

      - name: Install SGLANG
        run: |
-          SGLANG_PIN="$(<.github/pins/sglang.txt)"
-          pip install sglang==$SGLANG_PIN
+          git clone https://github.com/sgl-project/sglang.git
+          pip install sglang/python[srt_xpu]

      - name: Run SGLANG attention prefill stage benchmark
-        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefill_attention_benchmark.py') }}
+        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        run: |
          cd benchmarks/third_party/sglang
-          python prefill_attention_benchmark --reports $REPORTS
+          python prefill_attention_benchmark.py --reports $REPORTS

-          source ../../scripts/capture-hw-details.sh
-          python ../../scripts/build_report.py $REPORTS/prefill-attn-performance.csv $REPORTS/attn-prefill-triton-report.csv --benchmark attn --compiler triton --param_cols "B,N_CTX,H_Q,H_KV,D,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
+          source ../../../scripts/capture-hw-details.sh
+          python ../../triton_kernels_benchmark/build_report.py $REPORTS/prefill-attn-performance.csv $REPORTS/attn-prefill-triton-report.csv --benchmark sglang-prefill-attn --compiler triton --param_cols "B,SEQ_LENS,H_Q,H_KV,D,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

      - name: Run SGLANG attention decode stage benchmark
-        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'decode_attention_benchmark.py') }}
+        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        run: |
          cd benchmarks/third_party/sglang
-          python decode_attention_benchmark --reports $REPORTS
+          python decode_attention_benchmark.py --reports $REPORTS

-          source ../../scripts/capture-hw-details.sh
-          python ../../scripts/build_report.py $REPORTS/decode-attn-performance.csv $REPORTS/attn-decode-triton-report.csv --benchmark attn --compiler triton --param_cols "B,N_CTX,H_Q,H_KV,D" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
+          source ../../../scripts/capture-hw-details.sh
+          python ../../triton_kernels_benchmark/build_report.py $REPORTS/decode-attn-performance.csv $REPORTS/attn-decode-triton-report.csv --benchmark sglang-decode-attn --compiler triton --param_cols "B,SEQ_LENS,H_Q,H_KV,D" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

      - name: Run SGLANG attention append stage benchmark
-        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'decode_attention_benchmark.py') }}
+        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        run: |
          cd benchmarks/third_party/sglang
-          python extended_attention_benchmark --reports $REPORTS
+          python extended_attention_benchmark.py --reports $REPORTS

-          source ../../scripts/capture-hw-details.sh
-          python ../../scripts/build_report.py $REPORTS/extended-attn-performance.csv $REPORTS/attn-append-triton-report.csv --benchmark attn --compiler triton --param_cols "B,N_CTX,H_Q,H_KV,D" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
+          source ../../../scripts/capture-hw-details.sh
+          python ../../triton_kernels_benchmark/build_report.py $REPORTS/extended-attn-performance.csv $REPORTS/attn-append-triton-report.csv --benchmark sglang-extended-attn --compiler triton --param_cols "B,SEQ_LENS,H_Q,H_KV,D" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG

      - name: Upload benchmark reports
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}

.github/workflows/third-party-tests.yml

Lines changed: 0 additions & 13 deletions
@@ -96,19 +96,6 @@ jobs:

          pytest Liger-Kernel/test/

-      - name: Run SGLANG tests
-        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
-        run: |
-          pip install transformers pandas pytest openai
-
-          SGLANG_PIN="$(<.github/pins/sglang.txt)"
-          pip install datasets decord sglang==$SGLANG_PIN
-          git clone https://github.com/sgl-project/sglang.git --branch $SGLANG_PIN --single-branch
-
-          cd sglang
-          git apply ../benchmarks/third_party/sglang/sglang.patch
-          pytest sglang/test/srt/test_triton_attention_kernels.py
-
      - name: Upload test report
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        uses: actions/upload-artifact@v4

benchmarks/third_party/sglang/decode_attention_benchmark.py

Lines changed: 15 additions & 15 deletions
@@ -5,36 +5,36 @@
 import triton_kernels_benchmark as benchmark_suit


-def gen_args(BATCH, N_CTX, Q_HEAD_NUM, KV_HEAD_NUM, HEAD_DIM, dtype, device):
+def gen_args(B, N_CTX, H_Q, H_KV, D, dtype, device):

-    total_tokens = BATCH * N_CTX
-    sm_scale = 1.0 / (HEAD_DIM**0.5)
+    total_tokens = B * N_CTX
+    sm_scale = 1.0 / (D**0.5)
     max_kv_splits = 8
-    num_kv_splits = torch.full((BATCH, ), 4, dtype=torch.int32, device=device)
+    num_kv_splits = torch.full((B, ), 4, dtype=torch.int32, device=device)

-    # q represents the new token being generated, one per batch
-    q = torch.randn(BATCH, Q_HEAD_NUM, HEAD_DIM, dtype=dtype, device=device)
+    # q represents the new token being generated, one per B
+    q = torch.randn(B, H_Q, D, dtype=dtype, device=device)

     # k_buffer and v_buffer represent all previous tokens
-    k_buffer = torch.randn(total_tokens, KV_HEAD_NUM, HEAD_DIM, dtype=dtype, device=device)
-    v_buffer = torch.randn(total_tokens, KV_HEAD_NUM, HEAD_DIM, dtype=dtype, device=device)
+    k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device=device)
+    v_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device=device)

     # o will have the same shape as q
-    o = torch.zeros(BATCH, Q_HEAD_NUM, HEAD_DIM, dtype=dtype, device=device)
+    o = torch.zeros(B, H_Q, D, dtype=dtype, device=device)

-    b_seq_len = torch.full((BATCH, ), N_CTX, device=device)
+    b_seq_len = torch.full((B, ), N_CTX, device=device)

-    kv_indptr = torch.zeros((BATCH + 1, ), dtype=torch.int32, device=device)
-    kv_indptr[1:BATCH + 1] = torch.cumsum(b_seq_len[:BATCH], dim=0)
+    kv_indptr = torch.zeros((B + 1, ), dtype=torch.int32, device=device)
+    kv_indptr[1:B + 1] = torch.cumsum(b_seq_len[:B], dim=0)
     kv_indices = torch.arange(total_tokens, device=device)

     attn_logits = torch.empty(
-        (BATCH, Q_HEAD_NUM, max_kv_splits, HEAD_DIM),
+        (B, H_Q, max_kv_splits, D),
         dtype=torch.float32,
         device=device,
     )
     attn_lse = torch.empty(
-        (BATCH, Q_HEAD_NUM, max_kv_splits),
+        (B, H_Q, max_kv_splits),
         dtype=torch.float32,
         device=device,
     )
@@ -105,7 +105,7 @@ def benchmark(B, SEQ_LENS, H_Q, H_KV, D, MODE, VALIDATE, provider):
     else:
         raise NotImplementedError(f'Unsupported provider {provider}')

-    tflops = lambda ms: 2 * B * (H_Q + H_KV * N_CTX) * N_CTX * D * (1e-12) / (ms * 1e-3)
+    tflops = lambda ms: 2 * B * (H_Q + H_KV) * N_CTX * D * (1e-12) / (ms * 1e-3)
     gbps = lambda ms: 2 * B * (H_Q + H_KV * N_CTX) * D * 2 * (1e-9) / (ms * 1e-3)

     return (gbps(mean), gbps(max_ms), gbps(min_ms)), (tflops(mean), tflops(max_ms), tflops(min_ms)), cv
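
Note: as a sanity check on the "Adjust tflops computation" change above, here is a minimal standalone sketch of the two decode estimates as plain functions. This is a hedged reading of the diff; the function names, comments, and example sizes are illustrative and not part of the commit.

def decode_tflops(ms, B, H_Q, H_KV, N_CTX, D):
    # Adjusted FLOP estimate: the old lambda multiplied the H_KV head count
    # by an extra factor of N_CTX; the fix applies a single N_CTX factor to
    # both head counts.
    return 2 * B * (H_Q + H_KV) * N_CTX * D * 1e-12 / (ms * 1e-3)

def decode_gbps(ms, B, H_Q, H_KV, N_CTX, D):
    # Unchanged traffic estimate. One plausible reading: q/o rows (H_Q * D)
    # plus the K and V caches (H_KV * N_CTX * D), at 2 bytes per fp16 element.
    return 2 * B * (H_Q + H_KV * N_CTX) * D * 2 * 1e-9 / (ms * 1e-3)

# Example (hypothetical sizes): B=16, H_Q=32, H_KV=8, N_CTX=1024, D=128, 0.5 ms
print(decode_tflops(0.5, 16, 32, 8, 1024, 128))  # ~0.34 TFLOPS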

benchmarks/third_party/sglang/extended_attention_benchmark.py

Lines changed: 25 additions & 25 deletions
@@ -6,53 +6,51 @@
 import triton_kernels_benchmark as benchmark_suit


-def gen_args(BATCH, N_CTX, Q_HEAD_NUM, KV_HEAD_NUM, HEAD_DIM, dtype, device):
+def gen_args(B, N_CTX, H_Q, H_KV, D, dtype, device):

-    b_seq_len_prefix = torch.randint(1, N_CTX // 2, (BATCH, ), dtype=torch.int32, device=device)
-    b_seq_len_extend = torch.randint(1, N_CTX // 2, (BATCH, ), dtype=torch.int32, device=device)
+    b_seq_len_prefix = torch.randint(1, N_CTX // 2, (B, ), dtype=torch.int32, device=device)
+    b_seq_len_extend = torch.randint(1, N_CTX // 2, (B, ), dtype=torch.int32, device=device)
     b_seq_len = b_seq_len_prefix + b_seq_len_extend
     max_len_in_batch = torch.max(b_seq_len, 0)[0].item()

-    b_req_idx = torch.arange(BATCH, dtype=torch.int32, device=device)
-    b_start_loc = torch.zeros((BATCH, ), dtype=torch.int32, device=device)
+    b_req_idx = torch.arange(B, dtype=torch.int32, device=device)
+    b_start_loc = torch.zeros((B, ), dtype=torch.int32, device=device)
     b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
-    b_start_loc_extend = torch.zeros((BATCH, ), dtype=torch.int32, device=device)
+    b_start_loc_extend = torch.zeros((B, ), dtype=torch.int32, device=device)
     b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)

-    kv_indptr = torch.zeros((BATCH + 1, ), dtype=torch.int32, device=device)
-    kv_indptr[1:BATCH + 1] = torch.cumsum(b_seq_len_prefix[:BATCH], dim=0)
+    kv_indptr = torch.zeros((B + 1, ), dtype=torch.int32, device=device)
+    kv_indptr[1:B + 1] = torch.cumsum(b_seq_len_prefix[:B], dim=0)
     kv_indices = torch.zeros((b_seq_len_prefix.sum().item(), ), dtype=torch.int32, device=device)

-    for i in range(BATCH):
+    for i in range(B):
         kv_indices[kv_indptr[i]:kv_indptr[i + 1]] = torch.arange(b_start_loc[i], b_start_loc[i] + b_seq_len_prefix[i])

     total_token_num = torch.sum(b_seq_len).item()
     extend_token_num = torch.sum(b_seq_len_extend).item()
-    k_buffer = torch.empty((total_token_num, KV_HEAD_NUM, HEAD_DIM), dtype=dtype,
-                           device=device).normal_(mean=0.1, std=0.2)
-    v_buffer = torch.empty((total_token_num, KV_HEAD_NUM, HEAD_DIM), dtype=dtype,
-                           device=device).normal_(mean=0.1, std=0.2)
-
-    k_extend = torch.empty((extend_token_num, KV_HEAD_NUM, HEAD_DIM), dtype=dtype, device=device)
-    v_extend = torch.empty((extend_token_num, KV_HEAD_NUM, HEAD_DIM), dtype=dtype, device=device)
-    q_extend = torch.empty((extend_token_num, Q_HEAD_NUM, HEAD_DIM), dtype=dtype, device=device)
-    for i in range(BATCH):
+    k_buffer = torch.empty((total_token_num, H_KV, D), dtype=dtype, device=device).normal_(mean=0.1, std=0.2)
+    v_buffer = torch.empty((total_token_num, H_KV, D), dtype=dtype, device=device).normal_(mean=0.1, std=0.2)
+
+    k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
+    v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device=device)
+    q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
+    for i in range(B):
         extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i]
         extend_end_in_buffer = b_start_loc[i] + b_seq_len[i]
         extend_start = b_start_loc_extend[i]
         extend_end = b_start_loc_extend[i] + b_seq_len_extend[i]
         k_extend[extend_start:extend_end] = k_buffer[extend_start_in_buffer:extend_end_in_buffer]
         v_extend[extend_start:extend_end] = v_buffer[extend_start_in_buffer:extend_end_in_buffer]
-        q_extend[extend_start:extend_end] = torch.empty((b_seq_len_extend[i], Q_HEAD_NUM, HEAD_DIM), dtype=dtype,
+        q_extend[extend_start:extend_end] = torch.empty((b_seq_len_extend[i], H_Q, D), dtype=dtype,
                                                         device=device).normal_(mean=0.1, std=0.2)

-    o_extend = torch.empty((extend_token_num, Q_HEAD_NUM, HEAD_DIM), dtype=dtype, device=device)
-    o_redundant = torch.empty((extend_token_num, Q_HEAD_NUM, HEAD_DIM), dtype=dtype, device=device)
+    o_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)
+    o_redundant = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device=device)

     b_seq_len_extend = b_seq_len - b_seq_len_prefix
     max_len_extend = torch.max(b_seq_len_extend, 0)[0].item()
-    qo_indptr = torch.zeros((BATCH + 1, ), dtype=torch.int32, device=device)
-    qo_indptr[1:BATCH + 1] = torch.cumsum(b_seq_len_extend[:BATCH], dim=0)
+    qo_indptr = torch.zeros((B + 1, ), dtype=torch.int32, device=device)
+    qo_indptr[1:B + 1] = torch.cumsum(b_seq_len_extend[:B], dim=0)

     params = []
     params.append((q_extend, k_extend, v_extend, o_extend, o_redundant))
@@ -127,8 +125,10 @@ def refer_fn():
     else:
         raise NotImplementedError(f'Unsupported provider {provider}')

-    tflops = lambda ms: 2 * B * (H_Q + H_KV * N_CTX) * N_CTX * D * (1e-12) / (ms * 1e-3)
-    gbps = lambda ms: 2 * B * (H_Q + H_KV * N_CTX) * D * 2 * (1e-9) / (ms * 1e-3)
+    N_CTX_TOTAL = k_buffer.shape[0]
+    N_CTX_EXTEND = k_extend.shape[0]
+    tflops = lambda ms: (H_Q + H_KV) * (N_CTX_EXTEND + N_CTX_TOTAL) * N_CTX_TOTAL * D * (1e-12) / (ms * 1e-3)
+    gbps = lambda ms: 2 * (N_CTX_EXTEND * (H_Q + H_KV) + N_CTX_TOTAL * H_KV) * D * 2 * (1e-9) / (ms * 1e-3)

     return (gbps(mean), gbps(max_ms), gbps(min_ms)), (tflops(mean), tflops(max_ms), tflops(min_ms)), cv
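
Note: the extended-stage estimates now derive token counts from the generated tensors rather than from B * N_CTX. A minimal standalone sketch of the new lambdas as plain functions follows; this is a hedged reading of the diff, and the function names and comments are illustrative, not from the commit.

def extend_tflops(ms, H_Q, H_KV, N_CTX_EXTEND, N_CTX_TOTAL, D):
    # N_CTX_TOTAL mirrors k_buffer.shape[0] (prefix + extend tokens) and
    # N_CTX_EXTEND mirrors k_extend.shape[0] (extend tokens only), so the
    # estimate tracks the randomly drawn sequence lengths.
    return (H_Q + H_KV) * (N_CTX_EXTEND + N_CTX_TOTAL) * N_CTX_TOTAL * D * 1e-12 / (ms * 1e-3)

def extend_gbps(ms, H_Q, H_KV, N_CTX_EXTEND, N_CTX_TOTAL, D):
    # One plausible reading: q/k/v/o rows for the extend tokens plus the full
    # K/V buffers, at 2 bytes per fp16 element (the trailing factor of 2).
    return 2 * (N_CTX_EXTEND * (H_Q + H_KV) + N_CTX_TOTAL * H_KV) * D * 2 * 1e-9 / (ms * 1e-3)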
