@@ -557,11 +557,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
557557; GFX908-NEXT: s_mul_hi_u32 s9, s0, s5
558558; GFX908-NEXT: s_mul_i32 s0, s0, s5
559559; GFX908-NEXT: s_add_i32 s1, s9, s1
560- ; GFX908-NEXT: s_lshl_b64 s[0:1 ], s[0:1], 5
560+ ; GFX908-NEXT: s_lshl_b64 s[14:15 ], s[0:1], 5
561561; GFX908-NEXT: s_branch .LBB3_2
562562; GFX908-NEXT: .LBB3_1: ; %Flow20
563563; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
564- ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15 ]
564+ ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[0:1 ]
565565; GFX908-NEXT: s_cbranch_vccz .LBB3_12
566566; GFX908-NEXT: .LBB3_2: ; %bb9
567567; GFX908-NEXT: ; =>This Loop Header: Depth=1
@@ -571,15 +571,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
571571; GFX908-NEXT: ; %bb.3: ; %bb14
572572; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
573573; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
574+ ; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1
574575; GFX908-NEXT: s_mov_b32 s9, s8
576+ ; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
575577; GFX908-NEXT: v_mov_b32_e32 v4, s8
578+ ; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6
576579; GFX908-NEXT: v_mov_b32_e32 v8, s8
577580; GFX908-NEXT: v_mov_b32_e32 v6, s8
578581; GFX908-NEXT: v_mov_b32_e32 v5, s9
579582; GFX908-NEXT: v_mov_b32_e32 v9, s9
580583; GFX908-NEXT: v_mov_b32_e32 v7, s9
581- ; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0
582- ; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1
584+ ; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0
583585; GFX908-NEXT: v_mov_b32_e32 v11, v5
584586; GFX908-NEXT: s_mov_b64 s[20:21], s[10:11]
585587; GFX908-NEXT: v_mov_b32_e32 v10, v4
@@ -599,9 +601,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
599601; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
600602; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
601603; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
602- ; GFX908-NEXT: s_add_u32 s20, s20, s0
604+ ; GFX908-NEXT: s_add_u32 s20, s20, s14
603605; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3]
604- ; GFX908-NEXT: s_addc_u32 s21, s21, s1
606+ ; GFX908-NEXT: s_addc_u32 s21, s21, s15
605607; GFX908-NEXT: s_mov_b64 s[22:23], 0
606608; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25]
607609; GFX908-NEXT: s_cbranch_vccz .LBB3_9
@@ -620,7 +622,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
620622; GFX908-NEXT: s_waitcnt vmcnt(0)
621623; GFX908-NEXT: ds_read_b64 v[12:13], v19
622624; GFX908-NEXT: ds_read_b64 v[14:15], v0
623- ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17 ]
625+ ; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1 ]
624626; GFX908-NEXT: s_waitcnt lgkmcnt(0)
625627; GFX908-NEXT: s_cbranch_vccnz .LBB3_7
626628; GFX908-NEXT: ; %bb.6: ; %bb51
@@ -648,7 +650,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
648650; GFX908-NEXT: s_mov_b64 s[22:23], -1
649651; GFX908-NEXT: s_branch .LBB3_4
650652; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
651- ; GFX908-NEXT: s_mov_b64 s[22:23], s[14:15 ]
653+ ; GFX908-NEXT: s_mov_b64 s[22:23], s[16:17 ]
652654; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23]
653655; GFX908-NEXT: s_cbranch_vccz .LBB3_4
654656; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
@@ -663,7 +665,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
663665; GFX908-NEXT: s_xor_b64 s[16:17], s[22:23], -1
664666; GFX908-NEXT: .LBB3_10: ; %Flow19
665667; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
666- ; GFX908-NEXT: s_mov_b64 s[14:15 ], -1
668+ ; GFX908-NEXT: s_mov_b64 s[0:1 ], -1
667669; GFX908-NEXT: s_and_b64 vcc, exec, s[16:17]
668670; GFX908-NEXT: s_cbranch_vccz .LBB3_1
669671; GFX908-NEXT: ; %bb.11: ; %bb12
@@ -672,7 +674,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
672674; GFX908-NEXT: s_addc_u32 s7, s7, 0
673675; GFX908-NEXT: s_add_u32 s10, s10, s12
674676; GFX908-NEXT: s_addc_u32 s11, s11, s13
675- ; GFX908-NEXT: s_mov_b64 s[14:15 ], 0
677+ ; GFX908-NEXT: s_mov_b64 s[0:1 ], 0
676678; GFX908-NEXT: s_branch .LBB3_1
677679; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock
678680; GFX908-NEXT: s_endpgm
@@ -722,11 +724,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
722724; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s5
723725; GFX90A-NEXT: s_mul_i32 s0, s0, s5
724726; GFX90A-NEXT: s_add_i32 s1, s9, s1
725- ; GFX90A-NEXT: s_lshl_b64 s[0:1 ], s[0:1], 5
727+ ; GFX90A-NEXT: s_lshl_b64 s[14:15 ], s[0:1], 5
726728; GFX90A-NEXT: s_branch .LBB3_2
727729; GFX90A-NEXT: .LBB3_1: ; %Flow20
728730; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
729- ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[14:15 ]
731+ ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[0:1 ]
730732; GFX90A-NEXT: s_cbranch_vccz .LBB3_12
731733; GFX90A-NEXT: .LBB3_2: ; %bb9
732734; GFX90A-NEXT: ; =>This Loop Header: Depth=1
@@ -736,12 +738,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
736738; GFX90A-NEXT: ; %bb.3: ; %bb14
737739; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
738740; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
741+ ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1
739742; GFX90A-NEXT: s_mov_b32 s9, s8
743+ ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1]
740744; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
745+ ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8
741746; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
742747; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
743- ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0
744- ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1
748+ ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0
745749; GFX90A-NEXT: s_mov_b64 s[20:21], s[10:11]
746750; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
747751; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -760,8 +764,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
760764; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
761765; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
762766; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
763- ; GFX90A-NEXT: s_add_u32 s20, s20, s0
764- ; GFX90A-NEXT: s_addc_u32 s21, s21, s1
767+ ; GFX90A-NEXT: s_add_u32 s20, s20, s14
768+ ; GFX90A-NEXT: s_addc_u32 s21, s21, s15
765769; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5]
766770; GFX90A-NEXT: s_mov_b64 s[22:23], 0
767771; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25]
@@ -781,7 +785,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
781785; GFX90A-NEXT: s_waitcnt vmcnt(0)
782786; GFX90A-NEXT: ds_read_b64 v[14:15], v19
783787; GFX90A-NEXT: ds_read_b64 v[16:17], v0
784- ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17 ]
788+ ; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1 ]
785789; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23
786790; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
787791; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7
@@ -802,7 +806,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
802806; GFX90A-NEXT: s_mov_b64 s[22:23], -1
803807; GFX90A-NEXT: s_branch .LBB3_4
804808; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
805- ; GFX90A-NEXT: s_mov_b64 s[22:23], s[14:15 ]
809+ ; GFX90A-NEXT: s_mov_b64 s[22:23], s[16:17 ]
806810; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23]
807811; GFX90A-NEXT: s_cbranch_vccz .LBB3_4
808812; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
@@ -817,7 +821,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
817821; GFX90A-NEXT: s_xor_b64 s[16:17], s[22:23], -1
818822; GFX90A-NEXT: .LBB3_10: ; %Flow19
819823; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
820- ; GFX90A-NEXT: s_mov_b64 s[14:15 ], -1
824+ ; GFX90A-NEXT: s_mov_b64 s[0:1 ], -1
821825; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17]
822826; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
823827; GFX90A-NEXT: ; %bb.11: ; %bb12
@@ -826,7 +830,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
826830; GFX90A-NEXT: s_addc_u32 s7, s7, 0
827831; GFX90A-NEXT: s_add_u32 s10, s10, s12
828832; GFX90A-NEXT: s_addc_u32 s11, s11, s13
829- ; GFX90A-NEXT: s_mov_b64 s[14:15 ], 0
833+ ; GFX90A-NEXT: s_mov_b64 s[0:1 ], 0
830834; GFX90A-NEXT: s_branch .LBB3_1
831835; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock
832836; GFX90A-NEXT: s_endpgm
0 commit comments