Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2291,10 +2291,13 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
}

// This is a flat memory operation that access both VMEM and LDS, so note it
// - it will require that both the VM and LGKM be flushed to zero if it is
// pending when a VM or LGKM dependency occurs.
if (FlatASCount > 1)
// If this is a truly flat memory operation, then it accesses both VMEM and
// LDS, so note it - it will require that both the VM and LGKM be flushed to
// zero if it is pending when a VM or LGKM dependency occurs.
//
// For example, LDS DMA operations have FLAT set in their TSFlags for
// unspecified reasons, but they are not flat operations.
if (!SIInstrInfo::isLDSDMA(Inst) && FlatASCount > 1)
ScoreBrackets->setPendingFlat();
} else if (SIInstrInfo::isVMEM(Inst) &&
!llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
Expand Down
328 changes: 286 additions & 42 deletions llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN,GFX9
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixes=GCN,GFX10
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GFX9
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixes=GFX10

@lds.0 = internal addrspace(3) global [64 x float] poison, align 16
@lds.1 = internal addrspace(3) global [64 x float] poison, align 16
Expand All @@ -15,13 +16,60 @@
declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)

; GCN-LABEL: {{^}}buffer_load_lds_dword_2_arrays:
; GCN-COUNT-4: buffer_load_dword
; GCN: s_waitcnt vmcnt(2)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(0)
; GCN: ds_read_b32
define amdgpu_kernel void @buffer_load_lds_dword_2_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
; GFX9-LABEL: buffer_load_lds_dword_2_arrays:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 4
; GFX9-NEXT: s_mov_b32 m0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 8
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword off, s[8:11], 0 lds
; GFX9-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
; GFX9-NEXT: s_movk_i32 m0, 0x100
; GFX9-NEXT: v_mov_b32_e32 v0, 12
; GFX9-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds
; GFX9-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds
; GFX9-NEXT: s_lshl_b32 s0, s12, 2
; GFX9-NEXT: s_lshl_b32 s1, s13, 2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: ds_read_b32 v0, v0
; GFX9-NEXT: ; wave barrier
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_read_b32 v1, v1 offset:256
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: buffer_load_lds_dword_2_arrays:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 4
; GFX10-NEXT: v_mov_b32_e32 v1, 8
; GFX10-NEXT: v_mov_b32_e32 v2, 12
; GFX10-NEXT: s_mov_b32 m0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds
; GFX10-NEXT: s_movk_i32 m0, 0x100
; GFX10-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds
; GFX10-NEXT: buffer_load_dword v2, s[0:3], 0 offen lds
; GFX10-NEXT: s_lshl_b32 s0, s4, 2
; GFX10-NEXT: s_lshl_b32 s1, s5, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: ds_read_b32 v0, v0
; GFX10-NEXT: ; wave barrier
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ds_read_b32 v1, v1 offset:256
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0, i32 0, i32 0)
Expand All @@ -41,15 +89,57 @@ main_body:
; On gfx9 if there is a pending FLAT operation, and this is a VMem or LGKM
; waitcnt and the target can report early completion, then we need to force a waitcnt 0.

; GCN-LABEL: {{^}}global_load_lds_dword_2_arrays:
; GCN-COUNT-4: global_load_dword
; GFX9: s_waitcnt vmcnt(0)
; GFX9-COUNT-2: ds_read_b32
; GFX10: s_waitcnt vmcnt(2)
; GFX10: ds_read_b32
; GFX10: s_waitcnt vmcnt(0)
; GFX10: ds_read_b32
define amdgpu_kernel void @global_load_lds_dword_2_arrays(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
; GFX9-LABEL: global_load_lds_dword_2_arrays:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_mov_b32 m0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v2, s[0:1] lds
; GFX9-NEXT: global_load_dword v2, s[0:1] offset:4 lds
; GFX9-NEXT: s_movk_i32 m0, 0x100
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_dword v2, s[0:1] offset:8 lds
; GFX9-NEXT: global_load_dword v2, s[0:1] offset:12 lds
; GFX9-NEXT: s_lshl_b32 s0, s2, 2
; GFX9-NEXT: s_lshl_b32 s1, s3, 2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: ds_read_b32 v0, v0
; GFX9-NEXT: ; wave barrier
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_read_b32 v1, v1 offset:256
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_load_lds_dword_2_arrays:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_mov_b32 m0, 0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v2, s[0:1] lds
; GFX10-NEXT: global_load_dword v2, s[0:1] offset:4 lds
; GFX10-NEXT: s_movk_i32 m0, 0x100
; GFX10-NEXT: global_load_dword v2, s[0:1] offset:8 lds
; GFX10-NEXT: global_load_dword v2, s[0:1] offset:12 lds
; GFX10-NEXT: s_lshl_b32 s0, s2, 2
; GFX10-NEXT: s_lshl_b32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: ds_read_b32 v0, v0
; GFX10-NEXT: ; wave barrier
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ds_read_b32 v1, v1 offset:256
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0)
Expand All @@ -68,25 +158,144 @@ main_body:

; There are 8 pseudo registers defined to track LDS DMA dependencies.

; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays:
; GCN-COUNT-10: buffer_load_dword
; GCN: s_waitcnt vmcnt(8)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(7)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(6)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(5)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(4)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(3)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(2)
; GCN-NOT: s_waitcnt vmcnt
; GCN: ds_read_b32
; GCN: ds_read_b32
define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) {
; GFX9-LABEL: buffer_load_lds_dword_10_arrays:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_mov_b32 m0, 0
; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX9-NEXT: s_movk_i32 m0, 0x100
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX9-NEXT: s_movk_i32 m0, 0x200
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX9-NEXT: s_movk_i32 m0, 0x300
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX9-NEXT: s_movk_i32 m0, 0x400
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX9-NEXT: s_movk_i32 m0, 0x500
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX9-NEXT: s_movk_i32 m0, 0x600
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX9-NEXT: s_movk_i32 m0, 0x700
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX9-NEXT: s_movk_i32 m0, 0x800
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX9-NEXT: s_movk_i32 m0, 0x900
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX9-NEXT: s_lshl_b32 s2, s6, 2
; GFX9-NEXT: s_lshl_b32 s3, s7, 2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v9, s3
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x5c
; GFX9-NEXT: s_waitcnt vmcnt(9)
; GFX9-NEXT: ds_read_b32 v0, v0
; GFX9-NEXT: ; wave barrier
; GFX9-NEXT: s_waitcnt vmcnt(8)
; GFX9-NEXT: ds_read_b32 v1, v9 offset:256
; GFX9-NEXT: ; wave barrier
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: ds_read_b32 v2, v9 offset:512
; GFX9-NEXT: ; wave barrier
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: ds_read_b32 v3, v9 offset:768
; GFX9-NEXT: ; wave barrier
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: ds_read_b32 v4, v9 offset:1024
; GFX9-NEXT: ; wave barrier
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: ds_read_b32 v5, v9 offset:1280
; GFX9-NEXT: ; wave barrier
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: ds_read_b32 v6, v9 offset:1536
; GFX9-NEXT: ; wave barrier
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: ds_read_b32 v7, v9 offset:1792
; GFX9-NEXT: ; wave barrier
; GFX9-NEXT: ds_read_b32 v8, v9 offset:2048
; GFX9-NEXT: ; wave barrier
; GFX9-NEXT: ds_read_b32 v9, v9 offset:2304
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
; GFX9-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
; GFX9-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: buffer_load_lds_dword_10_arrays:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: s_mov_b32 m0, 0
; GFX10-NEXT: v_mov_b32_e32 v10, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX10-NEXT: s_movk_i32 m0, 0x100
; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX10-NEXT: s_movk_i32 m0, 0x200
; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX10-NEXT: s_movk_i32 m0, 0x300
; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX10-NEXT: s_movk_i32 m0, 0x400
; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX10-NEXT: s_movk_i32 m0, 0x500
; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX10-NEXT: s_movk_i32 m0, 0x600
; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX10-NEXT: s_movk_i32 m0, 0x700
; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX10-NEXT: s_movk_i32 m0, 0x800
; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX10-NEXT: s_movk_i32 m0, 0x900
; GFX10-NEXT: buffer_load_dword off, s[0:3], 0 lds
; GFX10-NEXT: s_lshl_b32 s0, s6, 2
; GFX10-NEXT: s_lshl_b32 s1, s7, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v9, s1
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x5c
; GFX10-NEXT: s_waitcnt vmcnt(9)
; GFX10-NEXT: ds_read_b32 v0, v0
; GFX10-NEXT: ; wave barrier
; GFX10-NEXT: s_waitcnt vmcnt(8)
; GFX10-NEXT: ds_read_b32 v1, v9 offset:256
; GFX10-NEXT: ; wave barrier
; GFX10-NEXT: s_waitcnt vmcnt(7)
; GFX10-NEXT: ds_read_b32 v2, v9 offset:512
; GFX10-NEXT: ; wave barrier
; GFX10-NEXT: s_waitcnt vmcnt(6)
; GFX10-NEXT: ds_read_b32 v3, v9 offset:768
; GFX10-NEXT: ; wave barrier
; GFX10-NEXT: s_waitcnt vmcnt(5)
; GFX10-NEXT: ds_read_b32 v4, v9 offset:1024
; GFX10-NEXT: ; wave barrier
; GFX10-NEXT: s_waitcnt vmcnt(4)
; GFX10-NEXT: ds_read_b32 v5, v9 offset:1280
; GFX10-NEXT: ; wave barrier
; GFX10-NEXT: s_waitcnt vmcnt(3)
; GFX10-NEXT: ds_read_b32 v6, v9 offset:1536
; GFX10-NEXT: ; wave barrier
; GFX10-NEXT: s_waitcnt vmcnt(2)
; GFX10-NEXT: ds_read_b32 v7, v9 offset:1792
; GFX10-NEXT: ; wave barrier
; GFX10-NEXT: ds_read_b32 v8, v9 offset:2048
; GFX10-NEXT: ; wave barrier
; GFX10-NEXT: ds_read_b32 v9, v9 offset:2304
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
; GFX10-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:32
; GFX10-NEXT: s_endpgm
main_body:
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 0, i32 0, i32 0, i32 0)
Expand Down Expand Up @@ -151,14 +360,49 @@ main_body:

define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
; GFX9-LABEL: global_load_lds_no_alias_ds_read:
; GFX9: global_load_dword
; GFX9: global_load_dword
; GFX9: s_waitcnt vmcnt(1)
; GFX9-NOT: s_waitcnt vmcnt(0)
; GFX9: ds_read_b32
; GFX9: s_waitcnt vmcnt(0)
; GFX9: ds_read_b32
; GFX9: s_endpgm
; GFX9: ; %bb.0: ; %body
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_mov_b32 m0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v2, s[0:1] lds
; GFX9-NEXT: s_movk_i32 m0, 0x100
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_dword v2, s[0:1] offset:4 lds
; GFX9-NEXT: s_lshl_b32 s0, s2, 2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_lshl_b32 s0, s3, 2
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_read_b32 v0, v0 offset:512
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_read_b32 v1, v1 offset:768
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_load_lds_no_alias_ds_read:
; GFX10: ; %bb.0: ; %body
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_mov_b32 m0, 0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v2, s[0:1] lds
; GFX10-NEXT: s_movk_i32 m0, 0x100
; GFX10-NEXT: global_load_dword v2, s[0:1] offset:4 lds
; GFX10-NEXT: s_lshl_b32 s0, s2, 2
; GFX10-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15)
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_lshl_b32 s0, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: ds_read_b32 v0, v0 offset:512
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15)
; GFX10-NEXT: ds_read_b32 v1, v1 offset:768
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
body:
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 4, i32 0)
Expand Down