Skip to content

Commit 91ef068

Browse files
[FA] Port remaining performance features from advanced path (#3848)
`set_fast_math` didn't work on the default path, because the lowering pass generates LLVM operations. This PR changes the setting of the fastmath flag to be applied on LLVM IR instead. The setting of the fastmath flag is guarded behind an env var (`TRITON_INTEL_FAST_MATH`) to avoid accuracy failures. ![Screenshot 2025-04-06 220833](https://github.com/user-attachments/assets/6d53900a-28f2-4294-8e9a-722284b4ac4b) Observations: 1. the performance of the advanced path without the fastmath flag is the same as that of the default path; 2. the performance of the default path with the fastmath flag set is better than that of the advanced path. Since the default path (with the env var set) achieves performance no worse than the advanced path, this PR stops running FA with the advanced path. Closes #3286 --------- Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
1 parent 25f5666 commit 91ef068

File tree

4 files changed

+19
-23
lines changed

4 files changed

+19
-23
lines changed

.github/workflows/triton-benchmarks.yml

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -258,18 +258,6 @@ jobs:
258258
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
259259
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG
260260
261-
- name: Run Triton FA fwd kernel benchmark - advanced path
262-
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_benchmark.py_advanced')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py_advanced') }}
263-
run: |
264-
cd benchmarks/triton_kernels_benchmark
265-
TRITON_INTEL_ADVANCED_PATH=1 \
266-
IGC_VISAOptions=" -enableBCR" \
267-
python flash_attention_benchmark.py --reports $REPORTS --n_runs $N_RUNS
268-
269-
TAG="${TAG}-adv"
270-
source ../../scripts/capture-hw-details.sh
271-
python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG
272-
273261
- name: Run Triton FA bwd kernel benchmark
274262
if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'flash_attention_bwd_benchmark.py')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_bwd_benchmark.py') }}
275263
run: |

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
5050
"TRITON_INTEL_DO_NOT_SINK_INSTR_ACROSS_RGN",
5151
"TRITON_INTEL_ENABLE_FIRST_LOAD_TO_SLM",
5252
"TRITON_INTEL_ENABLE_INSTR_SCHED",
53+
"TRITON_INTEL_FAST_MATH",
5354
"TRITON_INTEL_RAISE_BLOCK_POINTER",
5455
"TRITON_INTEL_REDUCE_TRANSPOSE",
5556
// clang-format on

third_party/intel/backend/compiler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,6 @@ def make_llir(src, metadata, options):
346346
passes.ttgpuir.add_allocate_shared_memory(pm)
347347
intel.passes.ttgpuir.add_to_llvmir(pm, options.advanced_path, options.one_matrix_per_load_for_bt)
348348
intel.passes.ttgpuir.add_rewrite_stack_ptr(pm)
349-
intel.set_fast_math(mod)
350349
passes.convert.add_arith_to_llvmir(pm)
351350
passes.common.add_canonicalizer(pm)
352351
passes.common.add_cse(pm)
@@ -359,6 +358,8 @@ def make_llir(src, metadata, options):
359358
context = llvm.context()
360359
llvm_mod = llvm.to_module(mod, context)
361360
intel.set_spv_target_triple(llvm_mod)
361+
if os.getenv("TRITON_INTEL_FAST_MATH", "0") == "1":
362+
intel.set_fast_math(llvm_mod)
362363
if options.extern_libs:
363364
paths = [path for (name, path) in options.extern_libs]
364365
llvm.link_extern_libs(llvm_mod, paths)

third_party/intel/triton_xpu.cc

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "mlir/Pass/PassManager.h"
33
#include "passes.h"
44

5+
#include "llvm/IR/InstIterator.h"
56
#include "llvm/IRReader/IRReader.h"
67
#include "llvm/Passes/PassBuilder.h"
78
#include "llvm/Passes/PassPlugin.h"
@@ -256,16 +257,21 @@ void init_triton_intel(py::module &&m) {
256257
return py::int_(ret);
257258
});
258259

259-
// May do this after llvm ir according to user fmath flag.
260-
m.def("set_fast_math", [](mlir::ModuleOp mod) {
261-
using namespace mlir;
262-
MLIRContext *ctx = mod.getContext();
263-
mod.walk([&](Operation *op) {
264-
if (auto fmIf = dyn_cast<arith::ArithFastMathInterface>(op))
265-
op->setAttr(
266-
fmIf.getFastMathAttrName(),
267-
arith::FastMathFlagsAttr::get(ctx, arith::FastMathFlags::fast));
268-
});
260+
// FIXME: This is for internal experimentation. In the end we will need a
261+
// producer flag (e.g. PyTorch flag) to allow the Triton compiler to use the
262+
// fast math semantics on all arithmetic operations.
263+
// https://github.com/intel/intel-xpu-backend-for-triton/issues/3862
264+
m.def("set_fast_math", [](llvm::Module *mod) {
265+
using namespace llvm;
266+
for (Function &func : *mod) {
267+
for (Instruction &inst : instructions(func)) {
268+
if (auto *op = dyn_cast<FPMathOperator>(&inst)) {
269+
FastMathFlags FMF;
270+
FMF.setFast(true);
271+
inst.setFastMathFlags(FMF);
272+
}
273+
}
274+
}
269275
});
270276

271277
m.def("set_spv_target_triple", [](llvm::Module *mod) {

0 commit comments

Comments (0)