Skip to content

Commit 973b173

Browse files
[AMDGPU] Stop optimising readfirstlane in pass AMDGPUUniformIntrinsicCombine (#166955)
1 parent 873b8d5 commit 973b173

File tree

7 files changed

+184 -158 lines changed

7 files changed

+184 -158 lines changed

llvm/lib/Target/AMDGPU/AMDGPUUniformIntrinsicCombine.cpp

Lines changed: 4 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -57,10 +57,11 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
5757
const UniformityInfo &UI,
5858
ValueMap<const Value *, bool> &Tracker) {
5959
llvm::Intrinsic::ID IID = II.getIntrinsicID();
60-
60+
/// We deliberately do not simplify readfirstlane with a uniform argument, so
61+
/// that frontends can use it to force a copy to SGPR and thereby prevent the
62+
/// backend from generating unwanted waterfall loops.
6163
switch (IID) {
6264
case Intrinsic::amdgcn_permlane64:
63-
case Intrinsic::amdgcn_readfirstlane:
6465
case Intrinsic::amdgcn_readlane: {
6566
Value *Src = II.getArgOperand(0);
6667
if (isDivergentUseWithNew(II.getOperandUse(0), UI, Tracker))
@@ -107,7 +108,7 @@ static bool optimizeUniformIntrinsic(IntrinsicInst &II,
107108
return Changed;
108109
}
109110
default:
110-
llvm_unreachable("Unexpected intrinsic ID in optimizeUniformIntrinsic");
111+
return false;
111112
}
112113
return false;
113114
}
@@ -121,16 +122,6 @@ static bool runUniformIntrinsicCombine(Function &F, const UniformityInfo &UI) {
121122
auto *II = dyn_cast<IntrinsicInst>(&I);
122123
if (!II)
123124
continue;
124-
125-
switch (II->getIntrinsicID()) {
126-
case Intrinsic::amdgcn_permlane64:
127-
case Intrinsic::amdgcn_readfirstlane:
128-
case Intrinsic::amdgcn_readlane:
129-
case Intrinsic::amdgcn_ballot:
130-
break;
131-
default:
132-
continue;
133-
}
134125
IsChanged |= optimizeUniformIntrinsic(*II, UI, Tracker);
135126
}
136127
return IsChanged;

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -239,7 +239,8 @@ define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1)
239239
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
240240
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
241241
; PASS-CHECK: [[IF]]:
242-
; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0
242+
; PASS-CHECK-NEXT: [[FIRST_ACTIVE_ID:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 0)
243+
; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, [[FIRST_ACTIVE_ID]]
243244
; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]]
244245
; PASS-CHECK: [[WORK]]:
245246
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -308,7 +309,8 @@ define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i3
308309
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
309310
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
310311
; PASS-CHECK: [[IF]]:
311-
; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]]
312+
; PASS-CHECK-NEXT: [[FIRST_ACTIVE_ID:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[MYMASK]])
313+
; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[FIRST_ACTIVE_ID]]
312314
; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]]
313315
; PASS-CHECK: [[WORK]]:
314316
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4

llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll

Lines changed: 22 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -248,12 +248,14 @@ define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
248248
;
249249
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
250250
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
251-
; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
251+
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 7)
252+
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
252253
; PASS-CHECK-NEXT: ret void
253254
;
254255
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
255256
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
256-
; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
257+
; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 7)
258+
; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
257259
; DCE-CHECK-NEXT: ret void
258260
;
259261
%v = call i32 @llvm.amdgcn.readfirstlane(i32 7)
@@ -269,12 +271,14 @@ define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i3
269271
;
270272
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
271273
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
272-
; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
274+
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
275+
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
273276
; PASS-CHECK-NEXT: ret void
274277
;
275278
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
276279
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
277-
; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
280+
; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
281+
; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
278282
; DCE-CHECK-NEXT: ret void
279283
;
280284
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
@@ -360,12 +364,16 @@ define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %ou
360364
;
361365
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
362366
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
363-
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
367+
; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 5)
368+
; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
369+
; PASS-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4
364370
; PASS-CHECK-NEXT: ret void
365371
;
366372
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
367373
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
368-
; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
374+
; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 5)
375+
; DCE-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
376+
; DCE-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4
369377
; DCE-CHECK-NEXT: ret void
370378
;
371379
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
@@ -388,15 +396,17 @@ define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
388396
; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
389397
; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
390398
; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
391-
; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
399+
; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
400+
; PASS-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4
392401
; PASS-CHECK-NEXT: ret void
393402
;
394403
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
395404
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
396405
; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
397406
; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
398407
; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
399-
; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
408+
; DCE-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
409+
; DCE-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4
400410
; DCE-CHECK-NEXT: ret void
401411
;
402412
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -537,13 +547,15 @@ define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
537547
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
538548
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
539549
; PASS-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456
540-
; PASS-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4
550+
; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[RANDOM]])
551+
; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
541552
; PASS-CHECK-NEXT: ret void
542553
;
543554
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
544555
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
545556
; DCE-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456
546-
; DCE-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4
557+
; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[RANDOM]])
558+
; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
547559
; DCE-CHECK-NEXT: ret void
548560
;
549561
%random = xor i32 123, 456

llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -4,14 +4,24 @@
44
define amdgpu_gs i32 @main() {
55
; CHECK-LABEL: main:
66
; CHECK: ; %bb.0: ; %bb
7+
; CHECK-NEXT: s_bitcmp1_b32 0, 0
78
; CHECK-NEXT: s_mov_b32 s0, 0
8-
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
9+
; CHECK-NEXT: s_cselect_b32 s1, -1, 0
10+
; CHECK-NEXT: s_or_saveexec_b32 s2, -1
11+
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
12+
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
13+
; CHECK-NEXT: v_readfirstlane_b32 s1, v0
14+
; CHECK-NEXT: s_mov_b32 exec_lo, s2
15+
; CHECK-NEXT: s_or_b32 s0, s0, s1
16+
; CHECK-NEXT: s_wait_alu 0xfffe
917
; CHECK-NEXT: s_bitcmp1_b32 s0, 0
1018
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
19+
; CHECK-NEXT: s_wait_alu 0xfffe
1120
; CHECK-NEXT: s_xor_b32 s0, s0, -1
12-
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13-
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
14-
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
21+
; CHECK-NEXT: s_wait_alu 0xfffe
22+
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
23+
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
24+
; CHECK-NEXT: v_readfirstlane_b32 s0, v1
1525
; CHECK-NEXT: s_wait_alu 0xf1ff
1626
; CHECK-NEXT: ; return to shader part epilog
1727
bb:

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll

Lines changed: 26 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -396,7 +396,8 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) {
396396
;
397397
; CHECK-GISEL-LABEL: test_readfirstlane_imm_f64:
398398
; CHECK-GISEL: ; %bb.0:
399-
; CHECK-GISEL-NEXT: s_mov_b64 s[0:1], 0x4040000000000000
399+
; CHECK-GISEL-NEXT: s_mov_b32 s0, 0
400+
; CHECK-GISEL-NEXT: s_mov_b32 s1, 0x40400000
400401
; CHECK-GISEL-NEXT: ;;#ASMSTART
401402
; CHECK-GISEL-NEXT: ; use s[0:1]
402403
; CHECK-GISEL-NEXT: ;;#ASMEND
@@ -455,13 +456,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
455456
; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64:
456457
; CHECK-GISEL: ; %bb.0:
457458
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
459+
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
458460
; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
459-
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 32
461+
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
460462
; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
461-
; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
462463
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
463464
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
464-
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0
465+
; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
466+
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
465467
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
466468
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
467469
; CHECK-GISEL-NEXT: s_endpgm
@@ -488,13 +490,15 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
488490
; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64:
489491
; CHECK-GISEL: ; %bb.0:
490492
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
493+
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
491494
; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
492-
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 0
493-
; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
494-
; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
495+
; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
496+
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
495497
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
496498
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
497-
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000
499+
; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
500+
; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
501+
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
498502
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
499503
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
500504
; CHECK-GISEL-NEXT: s_endpgm
@@ -584,17 +588,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
584588
; CHECK-SDAG: ; %bb.0:
585589
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
586590
; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
591+
; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
592+
; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
587593
; CHECK-SDAG-NEXT: ;;#ASMSTART
588594
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
589595
; CHECK-SDAG-NEXT: ;;#ASMEND
590-
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
591-
; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
592596
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
593-
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
594-
; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
595-
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
596-
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
597-
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
597+
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
598+
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
599+
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
600+
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
601+
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
598602
; CHECK-SDAG-NEXT: s_endpgm
599603
;
600604
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
@@ -624,17 +628,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
624628
; CHECK-SDAG: ; %bb.0:
625629
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
626630
; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
631+
; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
632+
; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
627633
; CHECK-SDAG-NEXT: ;;#ASMSTART
628634
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
629635
; CHECK-SDAG-NEXT: ;;#ASMEND
630-
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
631-
; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
632636
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
633-
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
634-
; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
635-
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
636-
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
637-
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
637+
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
638+
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
639+
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
640+
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
641+
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
638642
; CHECK-SDAG-NEXT: s_endpgm
639643
;
640644
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:

llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll

Lines changed: 14 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -20,33 +20,38 @@ define void @test() {
2020
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
2121
; CHECK-NEXT: .LBB0_3: ; %bb.3
2222
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
23+
; CHECK-NEXT: ; implicit-def: $sgpr4
24+
; CHECK-NEXT: v_mov_b32_e32 v0, s4
25+
; CHECK-NEXT: v_readfirstlane_b32 s6, v0
2326
; CHECK-NEXT: s_mov_b64 s[4:5], -1
27+
; CHECK-NEXT: s_mov_b32 s7, 0
28+
; CHECK-NEXT: s_cmp_eq_u32 s6, s7
2429
; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
2530
; CHECK-NEXT: v_writelane_b32 v1, s4, 0
2631
; CHECK-NEXT: v_writelane_b32 v1, s5, 1
27-
; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
28-
; CHECK-NEXT: s_nop 0
32+
; CHECK-NEXT: s_mov_b64 s[10:11], exec
33+
; CHECK-NEXT: s_mov_b64 exec, -1
2934
; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse
30-
; CHECK-NEXT: s_mov_b64 exec, s[8:9]
35+
; CHECK-NEXT: s_mov_b64 exec, s[10:11]
3136
; CHECK-NEXT: s_cbranch_scc1 .LBB0_5
3237
; CHECK-NEXT: ; %bb.4: ; %bb.4
3338
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
34-
; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
39+
; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
3540
; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse
36-
; CHECK-NEXT: s_mov_b64 exec, s[8:9]
41+
; CHECK-NEXT: s_mov_b64 exec, s[10:11]
3742
; CHECK-NEXT: s_mov_b64 s[4:5], 0
3843
; CHECK-NEXT: v_writelane_b32 v1, s4, 0
3944
; CHECK-NEXT: v_writelane_b32 v1, s5, 1
40-
; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
45+
; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
4146
; CHECK-NEXT: s_nop 0
4247
; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse
43-
; CHECK-NEXT: s_mov_b64 exec, s[8:9]
48+
; CHECK-NEXT: s_mov_b64 exec, s[10:11]
4449
; CHECK-NEXT: .LBB0_5: ; %Flow
4550
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
46-
; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
51+
; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
4752
; CHECK-NEXT: s_nop 0
4853
; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse
49-
; CHECK-NEXT: s_mov_b64 exec, s[8:9]
54+
; CHECK-NEXT: s_mov_b64 exec, s[10:11]
5055
; CHECK-NEXT: v_readlane_b32 s4, v1, 0
5156
; CHECK-NEXT: v_readlane_b32 s5, v1, 1
5257
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]

0 commit comments

Comments
 (0)