@@ -1310,42 +1310,42 @@ function triton_optimization_passes()
13101310 " convert-nvvm-to-llvm" ,
13111311 # common passes
13121312 " canonicalize" ,
1313- # # ttir passes
1314- # "triton-combine",
1315- # "triton-reorder-broadcast",
1316- # "triton-rewrite-tensor-pointer",
1317- # "triton-rewrite-tensor-descriptor-to-pointer",
1318- # "triton-loop-unroll",
1319- # "triton-licm",
1320- # "triton-loop-aware-cse",
1321- # # TODO : should num-warps and num-ctas be set for each kernel?
1322- # "convert-triton-to-tritongpu{target=cuda:$(cubinChip[][4:end]) num-warps=1 threads-per-warp=$(cuWarpSize[]) num-ctas=1}",
1323- # # ttgir passes
1324- # "tritongpu-coalesce",
1325- # "tritongpu-optimize-thread-locality",
1326- # "tritongpu-hoist-tmem-alloc",
1327- # "tritongpu-assign-latencies",
1328- # "tritongpu-pipeline",
1329- # "tritongpu-schedule-loops",
1330- # "tritongpu-automatic-warp-specialization",
1331- # "tritongpu-prefetch",
1332- # "tritongpu-accelerate-matmul",
1333- # "tritongpu-reorder-instructions",
1334- # "tritongpu-F32DotTC",
1335- # "tritongpu-optimize-dot-operands",
1336- # "tritongpu-remove-layout-conversions",
1337- # "tritongpu-reduce-data-duplication",
1338- # "tritongpu-hoist-tmem-alloc",
1339- # "tritongpu-fuse-nested-loops",
1340- # "tritongpu-rewrite-partition-dependencies",
1341- # "tritongpu-partition-loops",
1342- # "tritongpu-combine-tensor-select-and-if",
1343- # # ttgir to llvm passes
1344- # "tritongpu-allocate-warp-groups",
1345- # "allocate-shared-memory",
1346- # "tritongpu-global-scratch-memory-allocation",
1347- # "tritongpu-optimize-accumulator-init",
1348- # "tritongpu-coalesce-async-copy",
1313+ # ttir passes
1314+ " triton-combine" ,
1315+ " triton-reorder-broadcast" ,
1316+ " triton-rewrite-tensor-pointer" ,
1317+ " triton-rewrite-tensor-descriptor-to-pointer" ,
1318+ " triton-loop-unroll" ,
1319+ " triton-licm" ,
1320+ " triton-loop-aware-cse" ,
1321+ # TODO : should num-warps and num-ctas be set for each kernel?
1322+ " convert-triton-to-tritongpu{target=cuda:$(cubinChip[][4 : end ]) num-warps=1 threads-per-warp=$(cuWarpSize[]) num-ctas=1}" ,
1323+ # ttgir passes
1324+ " tritongpu-coalesce" ,
1325+ " tritongpu-optimize-thread-locality" ,
1326+ " tritongpu-hoist-tmem-alloc" ,
1327+ " tritongpu-assign-latencies" ,
1328+ " tritongpu-pipeline" ,
1329+ " tritongpu-schedule-loops" ,
1330+ " tritongpu-automatic-warp-specialization" ,
1331+ " tritongpu-prefetch" ,
1332+ " tritongpu-accelerate-matmul" ,
1333+ " tritongpu-reorder-instructions" ,
1334+ " tritongpu-F32DotTC" ,
1335+ " tritongpu-optimize-dot-operands" ,
1336+ " tritongpu-remove-layout-conversions" ,
1337+ " tritongpu-reduce-data-duplication" ,
1338+ " tritongpu-hoist-tmem-alloc" ,
1339+ " tritongpu-fuse-nested-loops" ,
1340+ " tritongpu-rewrite-partition-dependencies" ,
1341+ " tritongpu-partition-loops" ,
1342+ " tritongpu-combine-tensor-select-and-if" ,
1343+ # ttgir to llvm passes
1344+ " tritongpu-allocate-warp-groups" ,
1345+ " allocate-shared-memory" ,
1346+ " tritongpu-global-scratch-memory-allocation" ,
1347+ " tritongpu-optimize-accumulator-init" ,
1348+ " tritongpu-coalesce-async-copy" ,
13491349 ],
13501350 " ," ,
13511351 )
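
(For context: the trailing `join` arguments show that this array is collapsed into a single comma-separated pipeline string. A minimal sketch of how such a string might then be applied, reusing the `run_pass_pipeline!` call shape visible in the second hunk below; the helper name `apply_triton_passes!` is hypothetical and not part of this commit:)

    # Hypothetical helper, for illustration only: joins the pass names
    # returned by triton_optimization_passes() and runs them as one
    # MLIR pass pipeline over the module.
    function apply_triton_passes!(mod)
        # e.g. "...,convert-nvvm-to-llvm,canonicalize,triton-combine,..."
        pipeline = triton_optimization_passes()
        run_pass_pipeline!(mod, pipeline, "triton_optimization_passes")
    end
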
@@ -2303,8 +2303,7 @@ function compile_mlir!(
         end
     end
 
-    # XXX: re-enable this pass
-    # run_pass_pipeline!(mod, "mark-func-memory-effects", "mark-func-memory-effects")
+    run_pass_pipeline!(mod, "mark-func-memory-effects", "mark-func-memory-effects")
 
     func_op = MLIR.API.mlirSymbolTableLookup(
         MLIR.IR.SymbolTable(MLIR.IR.Operation(mod)), fnname
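
(Side note: `mlirSymbolTableLookup` mirrors the MLIR C API and returns a null `MlirOperation` when no symbol with that name exists. A minimal guard sketch, assuming the auto-generated `MLIR.API.mlirOperationIsNull` wrapper from the C API; this check is illustrative, not part of this commit:)

    # Hypothetical guard, for illustration only: fail early if the
    # function was not found in the module's symbol table.
    if MLIR.API.mlirOperationIsNull(func_op)
        error("function $fnname not found in module")
    end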