diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 70db7b4918515..39b001b835ed2 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2291,10 +2291,13 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(LDS_ACCESS, Inst); } - // This is a flat memory operation that access both VMEM and LDS, so note it - // - it will require that both the VM and LGKM be flushed to zero if it is - // pending when a VM or LGKM dependency occurs. - if (FlatASCount > 1) + // If this is a truly flat memory operation, then it accesses both VMEM and + // LDS, so note it - it will require that both the VM and LGKM be flushed to + // zero if it is pending when a VM or LGKM dependency occurs. + // + // For example, LDS DMA operations have FLAT set in their TSFlags for + // unspecified reasons, but they are not flat operations. + if (!SIInstrInfo::isLDSDMA(Inst) && FlatASCount > 1) ScoreBrackets->setPendingFlat(); } else if (SIInstrInfo::isVMEM(Inst) && !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) { diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll index e4e40159e185d..37ba1f42413c9 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll @@ -1,5 +1,6 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN,GFX9 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixes=GCN,GFX10 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixes=GFX10 @lds.0 = internal addrspace(3) global [64 x float] poison, align 16 @lds.1 = internal addrspace(3) global [64 x float] poison, align 16 @@ -15,13 
+16,60 @@ declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux) declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux) -; GCN-LABEL: {{^}}buffer_load_lds_dword_2_arrays: -; GCN-COUNT-4: buffer_load_dword -; GCN: s_waitcnt vmcnt(2) -; GCN: ds_read_b32 -; GCN: s_waitcnt vmcnt(0) -; GCN: ds_read_b32 define amdgpu_kernel void @buffer_load_lds_dword_2_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, ptr addrspace(1) %out) { +; GFX9-LABEL: buffer_load_lds_dword_2_arrays: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 4 +; GFX9-NEXT: s_mov_b32 m0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 8 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword off, s[8:11], 0 lds +; GFX9-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX9-NEXT: s_movk_i32 m0, 0x100 +; GFX9-NEXT: v_mov_b32_e32 v0, 12 +; GFX9-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX9-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX9-NEXT: s_lshl_b32 s0, s12, 2 +; GFX9-NEXT: s_lshl_b32 s1, s13, 2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: ; wave barrier +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ds_read_b32 v1, v1 offset:256 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: buffer_load_lds_dword_2_arrays: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 4 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_mov_b32_e32 v2, 12 +; GFX10-NEXT: s_mov_b32 m0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX10-NEXT: 
buffer_load_dword v0, s[0:3], 0 offen lds +; GFX10-NEXT: s_movk_i32 m0, 0x100 +; GFX10-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds +; GFX10-NEXT: buffer_load_dword v2, s[0:3], 0 offen lds +; GFX10-NEXT: s_lshl_b32 s0, s4, 2 +; GFX10-NEXT: s_lshl_b32 s1, s5, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: ds_read_b32 v0, v0 +; GFX10-NEXT: ; wave barrier +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ds_read_b32 v1, v1 offset:256 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX10-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0, i32 0, i32 0) @@ -41,15 +89,57 @@ main_body: ; On gfx9 if there is a pending FLAT operation, and this is a VMem or LGKM ; waitcnt and the target can report early completion, then we need to force a waitcnt 0. 
-; GCN-LABEL: {{^}}global_load_lds_dword_2_arrays: -; GCN-COUNT-4: global_load_dword -; GFX9: s_waitcnt vmcnt(0) -; GFX9-COUNT-2: ds_read_b32 -; GFX10: s_waitcnt vmcnt(2) -; GFX10: ds_read_b32 -; GFX10: s_waitcnt vmcnt(0) -; GFX10: ds_read_b32 define amdgpu_kernel void @global_load_lds_dword_2_arrays(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) { +; GFX9-LABEL: global_load_lds_dword_2_arrays: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_mov_b32 m0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v2, s[0:1] lds +; GFX9-NEXT: global_load_dword v2, s[0:1] offset:4 lds +; GFX9-NEXT: s_movk_i32 m0, 0x100 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v2, s[0:1] offset:8 lds +; GFX9-NEXT: global_load_dword v2, s[0:1] offset:12 lds +; GFX9-NEXT: s_lshl_b32 s0, s2, 2 +; GFX9-NEXT: s_lshl_b32 s1, s3, 2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: ; wave barrier +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ds_read_b32 v1, v1 offset:256 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_load_lds_dword_2_arrays: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_mov_b32 m0, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v2, s[0:1] lds +; GFX10-NEXT: global_load_dword v2, s[0:1] offset:4 lds +; GFX10-NEXT: s_movk_i32 m0, 0x100 +; GFX10-NEXT: global_load_dword v2, s[0:1] offset:8 lds +; GFX10-NEXT: global_load_dword v2, s[0:1] offset:12 lds +; GFX10-NEXT: s_lshl_b32 s0, s2, 2 +; GFX10-NEXT: s_lshl_b32 s1, s3, 2 +; GFX10-NEXT: v_mov_b32_e32 
v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: ds_read_b32 v0, v0 +; GFX10-NEXT: ; wave barrier +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ds_read_b32 v1, v1 offset:256 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: s_endpgm main_body: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0) call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0) @@ -68,25 +158,144 @@ main_body: ; There are 8 pseudo registers defined to track LDS DMA dependencies. -; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays: -; GCN-COUNT-10: buffer_load_dword -; GCN: s_waitcnt vmcnt(8) -; GCN: ds_read_b32 -; GCN: s_waitcnt vmcnt(7) -; GCN: ds_read_b32 -; GCN: s_waitcnt vmcnt(6) -; GCN: ds_read_b32 -; GCN: s_waitcnt vmcnt(5) -; GCN: ds_read_b32 -; GCN: s_waitcnt vmcnt(4) -; GCN: ds_read_b32 -; GCN: s_waitcnt vmcnt(3) -; GCN: ds_read_b32 -; GCN: s_waitcnt vmcnt(2) -; GCN-NOT: s_waitcnt vmcnt -; GCN: ds_read_b32 -; GCN: ds_read_b32 define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) { +; GFX9-LABEL: buffer_load_lds_dword_10_arrays: +; GFX9: ; %bb.0: ; %main_body +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_mov_b32 m0, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX9-NEXT: s_movk_i32 m0, 0x100 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX9-NEXT: s_movk_i32 m0, 0x200 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX9-NEXT: s_movk_i32 m0, 0x300 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX9-NEXT: s_movk_i32 m0, 
0x400 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX9-NEXT: s_movk_i32 m0, 0x500 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX9-NEXT: s_movk_i32 m0, 0x600 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX9-NEXT: s_movk_i32 m0, 0x700 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX9-NEXT: s_movk_i32 m0, 0x800 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX9-NEXT: s_movk_i32 m0, 0x900 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX9-NEXT: s_lshl_b32 s2, s6, 2 +; GFX9-NEXT: s_lshl_b32 s3, s7, 2 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v9, s3 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x5c +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: ; wave barrier +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: ds_read_b32 v1, v9 offset:256 +; GFX9-NEXT: ; wave barrier +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: ds_read_b32 v2, v9 offset:512 +; GFX9-NEXT: ; wave barrier +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: ds_read_b32 v3, v9 offset:768 +; GFX9-NEXT: ; wave barrier +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: ds_read_b32 v4, v9 offset:1024 +; GFX9-NEXT: ; wave barrier +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: ds_read_b32 v5, v9 offset:1280 +; GFX9-NEXT: ; wave barrier +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: ds_read_b32 v6, v9 offset:1536 +; GFX9-NEXT: ; wave barrier +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: ds_read_b32 v7, v9 offset:1792 +; GFX9-NEXT: ; wave barrier +; GFX9-NEXT: ds_read_b32 v8, v9 offset:2048 +; GFX9-NEXT: ; wave barrier +; GFX9-NEXT: ds_read_b32 v9, v9 offset:2304 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX9-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:32 +; 
GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: buffer_load_lds_dword_10_arrays: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_mov_b32 m0, 0 +; GFX10-NEXT: v_mov_b32_e32 v10, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX10-NEXT: s_movk_i32 m0, 0x100 +; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX10-NEXT: s_movk_i32 m0, 0x200 +; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX10-NEXT: s_movk_i32 m0, 0x300 +; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX10-NEXT: s_movk_i32 m0, 0x400 +; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX10-NEXT: s_movk_i32 m0, 0x500 +; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX10-NEXT: s_movk_i32 m0, 0x600 +; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX10-NEXT: s_movk_i32 m0, 0x700 +; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX10-NEXT: s_movk_i32 m0, 0x800 +; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX10-NEXT: s_movk_i32 m0, 0x900 +; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX10-NEXT: s_lshl_b32 s0, s6, 2 +; GFX10-NEXT: s_lshl_b32 s1, s7, 2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v9, s1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x5c +; GFX10-NEXT: s_waitcnt vmcnt(9) +; GFX10-NEXT: ds_read_b32 v0, v0 +; GFX10-NEXT: ; wave barrier +; GFX10-NEXT: s_waitcnt vmcnt(8) +; GFX10-NEXT: ds_read_b32 v1, v9 offset:256 +; GFX10-NEXT: ; wave barrier +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: ds_read_b32 v2, v9 offset:512 +; GFX10-NEXT: ; wave barrier +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: ds_read_b32 v3, v9 offset:768 +; GFX10-NEXT: ; wave barrier +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: ds_read_b32 v4, v9 offset:1024 +; GFX10-NEXT: ; wave barrier +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: ds_read_b32 v5, v9 offset:1280 +; 
GFX10-NEXT: ; wave barrier +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: ds_read_b32 v6, v9 offset:1536 +; GFX10-NEXT: ; wave barrier +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: ds_read_b32 v7, v9 offset:1792 +; GFX10-NEXT: ; wave barrier +; GFX10-NEXT: ds_read_b32 v8, v9 offset:2048 +; GFX10-NEXT: ; wave barrier +; GFX10-NEXT: ds_read_b32 v9, v9 offset:2304 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:32 +; GFX10-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 0, i32 0, i32 0, i32 0) @@ -151,14 +360,49 @@ main_body: define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) { ; GFX9-LABEL: global_load_lds_no_alias_ds_read: -; GFX9: global_load_dword -; GFX9: global_load_dword -; GFX9: s_waitcnt vmcnt(1) -; GFX9-NOT: s_waitcnt vmcnt(0) -; GFX9: ds_read_b32 -; GFX9: s_waitcnt vmcnt(0) -; GFX9: ds_read_b32 -; GFX9: s_endpgm +; GFX9: ; %bb.0: ; %body +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_mov_b32 m0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v2, s[0:1] lds +; GFX9-NEXT: s_movk_i32 m0, 0x100 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dword v2, s[0:1] offset:4 lds +; GFX9-NEXT: s_lshl_b32 s0, s2, 2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_lshl_b32 s0, s3, 2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: ds_read_b32 v0, v0 offset:512 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ds_read_b32 v1, v1 offset:768 +; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_load_lds_no_alias_ds_read: +; GFX10: ; %bb.0: ; %body +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_mov_b32 m0, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v2, s[0:1] lds +; GFX10-NEXT: s_movk_i32 m0, 0x100 +; GFX10-NEXT: global_load_dword v2, s[0:1] offset:4 lds +; GFX10-NEXT: s_lshl_b32 s0, s2, 2 +; GFX10-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_lshl_b32 s0, s3, 2 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: ds_read_b32 v0, v0 offset:512 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) +; GFX10-NEXT: ds_read_b32 v1, v1 offset:768 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX10-NEXT: s_endpgm body: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0) call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 4, i32 0)