@@ -52,26 +52,18 @@ declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_
5252declare x86_amx @llvm.x86.tdpbf16ps.internal (i16 , i16 , i16 , x86_amx, x86_amx, x86_amx)
5353declare void @llvm.x86.tilestored64.internal (i16 , i16 , ptr , i64 , x86_amx)
5454
55- define void @PR90954 (ptr %0 , ptr %1 , i32 %2 ) {
55+ define void @PR90954 (ptr %0 , ptr %1 , i32 %2 ) nounwind {
5656; CHECK-LABEL: PR90954:
5757; CHECK: # %bb.0:
5858; CHECK-NEXT: pushq %rbp
59- ; CHECK-NEXT: .cfi_def_cfa_offset 16
60- ; CHECK-NEXT: .cfi_offset %rbp, -16
6159; CHECK-NEXT: movq %rsp, %rbp
62- ; CHECK-NEXT: .cfi_def_cfa_register %rbp
6360; CHECK-NEXT: pushq %r15
6461; CHECK-NEXT: pushq %r14
6562; CHECK-NEXT: pushq %r13
6663; CHECK-NEXT: pushq %r12
6764; CHECK-NEXT: pushq %rbx
6865; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
6966; CHECK-NEXT: subq $5120, %rsp # imm = 0x1400
70- ; CHECK-NEXT: .cfi_offset %rbx, -56
71- ; CHECK-NEXT: .cfi_offset %r12, -48
72- ; CHECK-NEXT: .cfi_offset %r13, -40
73- ; CHECK-NEXT: .cfi_offset %r14, -32
74- ; CHECK-NEXT: .cfi_offset %r15, -24
7567; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
7668; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
7769; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
@@ -202,5 +194,37 @@ define void @PR90954(ptr %0, ptr %1, i32 %2) {
202194 br label %6
203195}
204196
; multi_use: a single tile value (%1, produced by tilezero.internal with
; i16 16 / i16 64) is passed as all three x86_amx operands of two separate
; tdpbf16ps.internal calls.
; The expected output below duplicates that tile through a stack slot:
; tilestored from %tmm0 ("Folded Spill", with 64 materialized in %rbp and used
; as the index register of the store address), then tileloadd into %tmm1
; ("Folded Reload"). The first tdpbf16ps then has %tmm1 as its last operand,
; while the second tdpbf16ps uses %tmm0 for all three operands.
; NOTE(review): nounwind presumably keeps .cfi_* directives out of the expected
; output (none appear in the CHECK lines of this function) - confirm.
; NOTE(review): the CHECK lines look like generated llc output; if so, regenerate
; them with the project's update script instead of hand-editing - confirm.
197+ define void @multi_use () nounwind {
198+ ; CHECK-LABEL: multi_use:
199+ ; CHECK: # %bb.0:
200+ ; CHECK-NEXT: pushq %rbp
201+ ; CHECK-NEXT: subq $2928, %rsp # imm = 0xB70
202+ ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
203+ ; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
204+ ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
205+ ; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp)
206+ ; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp)
207+ ; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp)
208+ ; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp)
209+ ; CHECK-NEXT: movw $64, %ax
210+ ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
211+ ; CHECK-NEXT: movw $16, %cx
212+ ; CHECK-NEXT: tilezero %tmm0
213+ ; CHECK-NEXT: movabsq $64, %rbp
214+ ; CHECK-NEXT: tilestored %tmm0, 896(%rsp,%rbp) # 1024-byte Folded Spill
215+ ; CHECK-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm1 # 1024-byte Folded Reload
216+ ; CHECK-NEXT: tdpbf16ps %tmm0, %tmm0, %tmm1
217+ ; CHECK-NEXT: tdpbf16ps %tmm0, %tmm0, %tmm0
218+ ; CHECK-NEXT: addq $2928, %rsp # imm = 0xB70
219+ ; CHECK-NEXT: popq %rbp
220+ ; CHECK-NEXT: tilerelease
221+ ; CHECK-NEXT: vzeroupper
222+ ; CHECK-NEXT: retq
; Test input: %1 is the only tile defined here. The results %2 and %3 are not
; used by any later instruction; the function simply returns void.
223+ %1 = call x86_amx @llvm.x86.tilezero.internal (i16 16 , i16 64 )
224+ %2 = call x86_amx @llvm.x86.tdpbf16ps.internal (i16 16 , i16 64 , i16 64 , x86_amx %1 , x86_amx %1 , x86_amx %1 )
225+ %3 = call x86_amx @llvm.x86.tdpbf16ps.internal (i16 16 , i16 64 , i16 64 , x86_amx %1 , x86_amx %1 , x86_amx %1 )
226+ ret void
227+ }
228+
205229declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32 (<256 x i32 >)
206230declare <256 x i32 > @llvm.x86.cast.tile.to.vector.v256i32 (x86_amx)
0 commit comments