@@ -28,6 +28,37 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32}
 
 // -----
 
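+// Checks that tensor-pointer loads tagged with triton_intel_gpu.block_io lower to the
+// Intel 2D block-read builtins, that tt.dot lowers to sub-group DPAS (matrix mad) calls,
+// and that the final tensor add stays elementwise.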
+// CHECK-DAG: llvm.func spir_funccc @_Z38intel_sub_group_f16_f16_matrix_mad_k16Dv8_sDv8_iDv8_f(vector<8xi16>, vector<8xi32>, vector<8xf32>) -> vector<8xf32> attributes {convergent, memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, no_unwind, will_return}
+// CHECK-DAG: llvm.func spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {no_unwind, will_return}
+// CHECK-DAG: llvm.func spir_funccc @_Z52intel_sub_group_2d_block_read_transform_16b_32r16x1cPU3AS1viiiDv2_iPj(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {no_unwind, will_return}
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 4], order = [1, 0]}>
+#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 2], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
+#dot0 = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>
+#dot1 = #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>
+module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} {
+  tt.func public @matmul_no_scf_with_add_kernel(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f32>, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg8: i64) {
+    %C = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #dpas>
+    %c0_i32 = arith.constant 0 : i32
+    %c1_i64 = arith.constant 1 : i64
+    %ptrA = tt.make_tensor_ptr %arg0, [%arg3, %arg5], [%arg6, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<64x32xf16, #dot0>>
+    %ptrB = tt.make_tensor_ptr %arg1, [%arg5, %arg4], [%arg8, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<32x64xf16, #dot1>>
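+    // Per warp: the 64x32 A tile takes two 8r16x2c block reads and the 32x64 B tile two
+    // transform (32r16x1c) reads; the 64x64 accumulator split over warpsPerCTA = [4, 2]
+    // gives 2x2 M/N reps, times two k = 16 steps for K = 32, hence 8 DPAS calls.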
+    // CHECK-COUNT-2: llvm.call spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x2cPU3AS1viiiDv2_iPt({{.*}}) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
+    // CHECK-COUNT-2: llvm.call spir_funccc @_Z52intel_sub_group_2d_block_read_transform_16b_32r16x1cPU3AS1viiiDv2_iPj({{.*}}) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
+    // CHECK-COUNT-8: llvm.call spir_funccc @_Z38intel_sub_group_f16_f16_matrix_mad_k16Dv8_sDv8_iDv8_f({{.*}}) {{.*}} : (vector<8xi16>, vector<8xi32>, vector<8xf32>) -> vector<8xf32>
+    %A = tt.load %ptrA {boundaryCheck = array<i32: 1>, padding = 1 : i32, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<64x32xf16, #dot0>>
+    %B = tt.load %ptrB {boundaryCheck = array<i32: 0>, padding = 1 : i32, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<32x64xf16, #dot1>>
+    %D = tt.dot %A, %B, %C, inputPrecision = tf32 : tensor<64x32xf16, #dot0> * tensor<32x64xf16, #dot1> -> tensor<64x64xf32, #dpas>
+    %ptrX = tt.make_tensor_ptr %arg2, [%arg3, %arg4], [%arg8, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<64x64xf32, #dpas>>
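+    // Reading the 64x64 f32 tile back takes 2x2 reps of 8r16x1c 32-bit block reads per warp.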
+    // CHECK-COUNT-4: llvm.call spir_funccc @_Z41intel_sub_group_2d_block_read_32b_8r16x1cPU3AS1viiiDv2_iPj({{.*}}) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
+    %X = tt.load %ptrX {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<64x64xf32, #dpas>>
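+    // 4096 f32 elements over 8 warps x 16 lanes = 32 scalar adds per work-item.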
+    // CHECK-COUNT-32: llvm.fadd {{.*}}, {{.*}}
+    %0 = arith.addf %D, %X : tensor<64x64xf32, #dpas>
+    tt.return
+  }
+}
+
+// -----
+
 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [1, 1], A = [8, 8], B = [8, 16], C = [8, 16]}>
 #dot0 = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>
 #dot1 = #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth = 1}>