@@ -125,7 +125,7 @@ def get_w8a8_block_fp8_configs(N: int, K: int, block_n: int, block_k: int) -> Op

     config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name)
     if os.path.exists(config_file_path):
-        with open(config_file_path) as f:
+        with open(config_file_path, "r", encoding="utf-8") as f:
             logger.info(
                 "Using configuration from %s for W8A8 Block FP8 kernel.",
                 config_file_path,
@@ -332,56 +332,56 @@ def is_enough_memory(x_val):
 @benchmark_suit.perf_report(
     benchmark_suit.Benchmark(
         # argument names to use as an x-axis for the plot
-        x_names=['B', 'M', 'N', 'K'],
+        x_names=["B", "M", "N", "K"],
         # different possible values for `x_name`
         x_vals=X_VALS,
-        line_arg='provider',
+        line_arg="provider",
         # argument name whose value corresponds to a different line in the plot
-        # possible values for `line_arg``
-        line_vals=['triton'],
+        line_vals=["triton"],
         # label name for the lines
-        line_names=['Triton'],
+        line_names=["Triton"],
         # line styles
-        ylabel=['GB/s', 'TFlops'],  # label name for the y-axis
-        plot_name='matmul-performance',
+        ylabel=["GB/s", "TFlops"],  # label name for the y-axis
+        plot_name="matmul-performance",
         # name for the plot. Used also as a file name for saving the plot.
         args={},
     ))
 def benchmark(B, M, N, K, provider):
-    block_size = [[128, 128]]
+    assert provider == "triton"
+
+    block_size = [128, 128]

     torch.manual_seed(0)
     factor_for_scale = 1e-2
     fp8_info = torch.finfo(torch.float8_e4m3fn)
     fp8_max, fp8_min = fp8_info.max, fp8_info.min

-    A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+    A_fp32 = (torch.rand(M, K, dtype=torch.float32, device="xpu") - 0.5) * 2 * fp8_max
     A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

-    B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+    B_fp32 = (torch.rand(N, K, dtype=torch.float32, device="xpu") - 0.5) * 2 * fp8_max
     B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

     block_n, block_k = block_size[0], block_size[1]
     n_tiles = (N + block_n - 1) // block_n
     k_tiles = (K + block_k - 1) // block_k

-    As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale
-    Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale
+    As = torch.rand(M, k_tiles, dtype=torch.float32, device="xpu") * factor_for_scale
+    Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32, device="xpu") * factor_for_scale

     quantiles = [0.5, 0.0, 1.0]

-    c = torch.zeros((B, M, N), device='xpu', dtype=torch.float32)
-    triton_fn = lambda: w8a8_block_fp8_matmul(A_fp8, B_fp8, c, As, Bs, block_size)
-    torch_fn = lambda: native_w8a8_block_fp8_matmul(A_fp8, B_fp8, c, As, Bs, block_size)
-    rtol = 1e-2 if c.dtype == torch.bfloat16 else 1e-3
-    benchmark_suit.assert_close(triton_fn, torch_fn, atol=1e-4, rtol=rtol, err_msg='triton to torch')
+    triton_fn = lambda: w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size)
+    torch_fn = lambda: native_w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size)
+    rtol = 1e-3
+    benchmark_suit.assert_close(triton_fn, torch_fn, atol=1e-4, rtol=rtol, err_msg="triton to torch")
     _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, quantiles=quantiles)

     tflops = lambda ms: 2 * B * M * N * K * (1e-12) / (ms * 1e-3)
-    gbps = lambda ms: B * (2 * (M * K + K * N) + 4.0 * (M * N)) * (1e-9) / (ms * 1e-3)
+    gbps = lambda ms: B * ((M * K + K * N) + 2.0 * (M * N)) * (1e-9) / (ms * 1e-3)

     return (gbps(mean_ms), gbps(max_ms), gbps(min_ms)), (tflops(mean_ms), tflops(max_ms), tflops(min_ms)), cv


-if __name__ == '__main__':
+if __name__ == "__main__":
     benchmark.run(show_plots=False, print_data=True)
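Note on the `gbps` change (commentary, not part of the commit): the old formula counted 2 bytes per input element and 4 bytes per output element, i.e. 16-bit inputs and an fp32 output; the updated formula counts 1 byte per element for the fp8 inputs and 2 bytes per element for the output, consistent with dropping the preallocated fp32 `c`. A minimal sketch of that byte-count model, assuming a 16-bit output dtype (e.g. bfloat16), with illustrative names:

# Illustrative only -- not part of the benchmark file. Assumes fp8 inputs
# (1 byte/element) and a 16-bit output (2 bytes/element), which is what the
# coefficients in the updated `gbps` lambda imply.
def bytes_moved(B, M, N, K):
    input_bytes = M * K + K * N   # A_fp8 and B_fp8: 1 byte per element
    output_bytes = 2.0 * M * N    # 16-bit result: 2 bytes per element
    return B * (input_bytes + output_bytes)

# GB/s then follows as bytes_moved(B, M, N, K) * 1e-9 / (ms * 1e-3),
# matching the updated lambda term for term.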