@@ -1,27 +1,29 @@
-// RUN: mlir-opt %s --split-input-file -convert-xegpu-to-xevm | FileCheck %s
+// RUN: mlir-opt %s --split-input-file -convert-xegpu-to-xevm -canonicalize | FileCheck %s

 gpu.module @test {
 // CHECK-LABEL: @load_gather_i64_src_value_offset
-// CHECK-SAME: %[[ARG0:.*]]: i64, %[[ARG1:.*]]: vector<1xindex>
-gpu.func @load_gather_i64_src_value_offset(%src: i64, %offset: vector<1xindex>) {
+// CHECK-SAME: %[[ARG0:.*]]: i64, %[[ARG1:.*]]: vector<1xindex>, %[[ARG2:.*]]: memref<1xf16>
+// CHECK-SAME: %[[ARG3:.*]]: vector<1xi1>
+gpu.func @load_gather_i64_src_value_offset(%src: i64, %offset: vector<1xindex>, %dst: memref<1xf16>, %mask: vector<1xi1>) {
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[CST_0:.*]] = arith.constant 0.000000e+00 : f16
+// CHECK: %[[C2_I64:.*]] = arith.constant 2 : i64
+// CHECK: %[[VAR2:.*]] = vector.extract %[[ARG3]][0] : i1 from vector<1xi1>
 // CHECK: %[[VAR0:.*]] = vector.extract %[[ARG1]][0] : index from vector<1xindex>
 // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] : index to i64
-// CHECK: %[[CST:.*]] = arith.constant dense<true> : vector<1xi1>
-// CHECK: %[[VAR2:.*]] = vector.extract %[[CST]][0] : i1 from vector<1xi1>
-%1 = arith.constant dense<1>: vector<1xi1>
-// CHECK: %[[C2_I64:.*]] = arith.constant 2 : i64
 // CHECK: %[[VAR3:.*]] = arith.muli %[[VAR1]], %[[C2_I64]] : i64
 // CHECK: %[[VAR4:.*]] = arith.addi %[[ARG0]], %[[VAR3]] : i64
 // CHECK: %[[VAR5:.*]] = llvm.inttoptr %[[VAR4]] : i64 to !llvm.ptr<1>
 // CHECK: %[[VAR6:.*]] = scf.if %[[VAR2]] -> (f16) {
 // CHECK: %[[VAR7:.*]] = llvm.load %[[VAR5]] {cache_control = #xevm.load_cache_control<L1c_L2uc_L3uc>} : !llvm.ptr<1> -> f16
 // CHECK: scf.yield %[[VAR7]] : f16
 // CHECK: } else {
-// CHECK: %[[CST_0:.*]] = arith.constant 0.000000e+00 : f16
 // CHECK: scf.yield %[[CST_0]] : f16
 // CHECK: }
-%3 = xegpu.load %src[%offset], %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+%0 = xegpu.load %src[%offset], %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
 : i64, vector<1xindex>, vector<1xi1> -> vector<1xf16>
+%c0 = arith.constant 0 : index
+vector.store %0, %dst[%c0] : memref<1xf16>, vector<1xf16>
 gpu.return
 }
 }
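
For orientation: adding -canonicalize to the RUN line hoists and CSEs the constants the conversion creates, which is why the arith.constant CHECK lines now sit at the top of the function, and the dense<true> mask constant disappears in favor of a function argument. A rough sketch of the lowered form these CHECK lines pin down, with illustrative SSA names rather than the FileCheck captures:

    %m    = vector.extract %mask[0] : i1 from vector<1xi1>
    %idx  = vector.extract %offset[0] : index from vector<1xindex>
    %off  = arith.index_castui %idx : index to i64
    %byte = arith.muli %off, %c2_i64 : i64      // scale by sizeof(f16)
    %addr = arith.addi %src, %byte : i64
    %ptr  = llvm.inttoptr %addr : i64 to !llvm.ptr<1>
    %val  = scf.if %m -> (f16) {                // load only where the mask bit is set
      %ld = llvm.load %ptr : !llvm.ptr<1> -> f16
      scf.yield %ld : f16
    } else {
      scf.yield %pad : f16                      // %pad: the hoisted f16 zero constant
    }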
@@ -30,16 +32,16 @@ gpu.func @load_gather_i64_src_value_offset(%src: i64, %offset: vector<1xindex>)
 gpu.module @test {
 // CHECK-LABEL: @source_materialize_single_elem_vec
 // CHECK-SAME: %[[ARG0:.*]]: i64, %[[ARG1:.*]]: vector<1xindex>, %[[ARG2:.*]]: memref<1xf16>
-gpu.func @source_materialize_single_elem_vec(%src: i64, %offset: vector<1xindex>, %dst: memref<1xf16>) {
-%1 = arith.constant dense<1> : vector<1xi1>
-%3 = xegpu.load %src[%offset], %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+// CHECK-SAME: %[[ARG3:.*]]: vector<1xi1>
+gpu.func @source_materialize_single_elem_vec(%src: i64, %offset: vector<1xindex>, %dst: memref<1xf16>, %mask: vector<1xi1>) {
+%0 = xegpu.load %src[%offset], %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
 : i64, vector<1xindex>, vector<1xi1> -> vector<1xf16>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
 // CHECK: %[[VAR_IF:.*]] = scf.if
 // CHECK: %[[VAR_RET:.*]] = vector.broadcast %[[VAR_IF]] : f16 to vector<1xf16>
-// CHECK: %[[C0:.*]] = arith.constant 0 : index
 // CHECK: vector.store %[[VAR_RET]], %[[ARG2]][%[[C0]]] : memref<1xf16>, vector<1xf16>
 %c0 = arith.constant 0 : index
-vector.store %3, %dst[%c0] : memref<1xf16>, vector<1xf16>
+vector.store %0, %dst[%c0] : memref<1xf16>, vector<1xf16>
 gpu.return
 }
 }
@@ -48,24 +50,21 @@ gpu.func @source_materialize_single_elem_vec(%src: i64, %offset: vector<1xindex>

 gpu.module @test {
 // CHECK-LABEL: @store_scatter_i64_src_value_offset
-// CHECK-SAME: %[[ARG0:.*]]: i64, %[[ARG1:.*]]: vector<1xindex>
-gpu.func @store_scatter_i64_src_value_offset(%src: i64, %offset: vector<1xindex>) {
+// CHECK-SAME: %[[ARG0:.*]]: i64, %[[ARG1:.*]]: vector<1xindex>, %[[ARG2:.*]]: vector<1xi1>
+gpu.func @store_scatter_i64_src_value_offset(%src: i64, %offset: vector<1xindex>, %mask: vector<1xi1>) {
+// CHECK: %[[CST_0:.*]] = arith.constant 2.900000e+00 : f32
+// CHECK: %[[C4_I64:.*]] = arith.constant 4 : i64
+// CHECK: %[[VAR2:.*]] = vector.extract %[[ARG2]][0] : i1 from vector<1xi1>
 // CHECK: %[[VAR0:.*]] = vector.extract %[[ARG1]][0] : index from vector<1xindex>
 // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] : index to i64
-// CHECK: %[[CST:.*]] = arith.constant dense<true> : vector<1xi1>
-// CHECK: %[[VAR2:.*]] = vector.extract %[[CST]][0] : i1 from vector<1xi1>
-%1 = arith.constant dense<1>: vector<1xi1>
-// CHECK: %[[CST_0:.*]] = arith.constant dense<2.900000e+00> : vector<1xf32>
-// CHECK: %[[VAR3:.*]] = vector.extract %[[CST_0]][0] : f32 from vector<1xf32>
-%2 = arith.constant dense<2.9>: vector<1xf32>
-// CHECK: %[[C4_I64:.*]] = arith.constant 4 : i64
+%0 = arith.constant dense<2.9>: vector<1xf32>
 // CHECK: %[[VAR4:.*]] = arith.muli %[[VAR1]], %[[C4_I64]] : i64
 // CHECK: %[[VAR5:.*]] = arith.addi %[[ARG0]], %[[VAR4]] : i64
 // CHECK: %[[VAR6:.*]] = llvm.inttoptr %[[VAR5]] : i64 to !llvm.ptr<1>
 // CHECK: scf.if %[[VAR2]] {
-// CHECK: llvm.store %[[VAR3]], %[[VAR6]] {cache_control = #xevm.store_cache_control<L1wb_L2uc_L3uc>} : f32, !llvm.ptr<1>
+// CHECK: llvm.store %[[CST_0]], %[[VAR6]] {cache_control = #xevm.store_cache_control<L1wb_L2uc_L3uc>} : f32, !llvm.ptr<1>
 // CHECK: }
-xegpu.store %2, %src[%offset], %1 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>
+xegpu.store %0, %src[%offset], %mask <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>
 : vector<1xf32>, i64, vector<1xindex>, vector<1xi1>
 gpu.return
 }
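
The scatter-store case is the mirror image: canonicalization folds the vector.extract of the splat dense<2.9> constant into the scalar f32 constant checked at the top, and the mask guards a store instead of yielding a value. A sketch, again with illustrative names:

    %m = vector.extract %mask[0] : i1 from vector<1xi1>
    scf.if %m {                                 // store only where the mask bit is set
      llvm.store %cst, %ptr : f32, !llvm.ptr<1>
    }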
@@ -76,9 +75,9 @@ gpu.module @test {
 // CHECK-LABEL: @prefetch_i64_src_value_offset
 // CHECK-SAME: %[[ARG0:.*]]: i64, %[[ARG1:.*]]: vector<1xindex>
 gpu.func @prefetch_i64_src_value_offset(%src: i64, %offset: vector<1xindex>) {
+// CHECK: %[[C4_I64:.*]] = arith.constant 4 : i64
 // CHECK: %[[VAR0:.*]] = vector.extract %[[ARG1]][0] : index from vector<1xindex>
 // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] : index to i64
-// CHECK: %[[C4_I64:.*]] = arith.constant 4 : i64
 // CHECK: %[[VAR2:.*]] = arith.muli %[[VAR1]], %[[C4_I64]] : i64
 // CHECK: %[[VAR3:.*]] = arith.addi %[[ARG0]], %[[VAR2]] : i64
 // CHECK: %[[VAR4:.*]] = llvm.inttoptr %[[VAR3]] : i64 to !llvm.ptr<1>
@@ -94,11 +93,11 @@ gpu.module @test {
 // CHECK-LABEL: @prefetch_memref_src_value_offset
 // CHECK-SAME: %[[ARG0:.*]]: memref<256xf32>, %[[ARG1:.*]]: vector<1xindex>
 gpu.func @prefetch_memref_src_value_offset(%src: memref<256xf32>, %offset: vector<1xindex>) {
+// CHECK: %[[C4_I64:.*]] = arith.constant 4 : i64
 // CHECK: %[[VAR0:.*]] = vector.extract %[[ARG1]][0] : index from vector<1xindex>
 // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] : index to i64
 // CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<256xf32> -> index
 // CHECK: %[[VAR2:.*]] = arith.index_castui %[[INTPTR]] : index to i64
-// CHECK: %[[C4_I64:.*]] = arith.constant 4 : i64
 // CHECK: %[[VAR3:.*]] = arith.muli %[[VAR1]], %[[C4_I64]] : i64
 // CHECK: %[[VAR4:.*]] = arith.addi %[[VAR2]], %[[VAR3]] : i64
 // CHECK: %[[VAR5:.*]] = llvm.inttoptr %[[VAR4]] : i64 to !llvm.ptr<1>