Skip to content

Commit 2622766

Browse files
author
git apple-llvm automerger
committed
Merge commit '661c387fc2f1' from llvm.org/release/21.x into stable/21.x
2 parents bfa60e6 + 661c387 commit 2622766

File tree

3 files changed: +158 -2 lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3252,9 +3252,10 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
32523252
return;
32533253

32543254
// Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
3255-
auto NarrowOp = [](VPValue *V) -> VPValue * {
3255+
SmallPtrSet<VPValue *, 4> NarrowedOps;
3256+
auto NarrowOp = [&NarrowedOps](VPValue *V) -> VPValue * {
32563257
auto *R = V->getDefiningRecipe();
3257-
if (!R)
3258+
if (!R || NarrowedOps.contains(V))
32583259
return V;
32593260
if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
32603261
// Narrow interleave group to wide load, as transformed VPlan will only
@@ -3264,13 +3265,15 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
32643265
LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
32653266
/*Reverse=*/false, {}, LoadGroup->getDebugLoc());
32663267
L->insertBefore(LoadGroup);
3268+
NarrowedOps.insert(L);
32673269
return L;
32683270
}
32693271

32703272
if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
32713273
assert(RepR->isSingleScalar() &&
32723274
isa<LoadInst>(RepR->getUnderlyingInstr()) &&
32733275
"must be a single scalar load");
3276+
NarrowedOps.insert(RepR);
32743277
return RepR;
32753278
}
32763279
auto *WideLoad = cast<VPWidenLoadRecipe>(R);
@@ -3281,6 +3284,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
32813284
WideLoad->operands(), /*IsUniform*/ true,
32823285
/*Mask*/ nullptr, *WideLoad);
32833286
N->insertBefore(WideLoad);
3287+
NarrowedOps.insert(N);
32843288
return N;
32853289
};
32863290

llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1203,3 +1203,82 @@ loop:
12031203
exit:
12041204
ret void
12051205
}
1206+
1207+
; Make sure multiple uses of a narrowed op are handled correctly,
1208+
; https://github.com/llvm/llvm-project/issues/156190.
1209+
define void @multiple_store_groups_storing_same_wide_bin_op(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
1210+
; VF2-LABEL: define void @multiple_store_groups_storing_same_wide_bin_op(
1211+
; VF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
1212+
; VF2-NEXT: [[ENTRY:.*:]]
1213+
; VF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
1214+
; VF2: [[VECTOR_PH]]:
1215+
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
1216+
; VF2: [[VECTOR_BODY]]:
1217+
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
1218+
; VF2-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
1219+
; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
1220+
; VF2-NEXT: [[TMP2:%.*]] = fadd contract <2 x double> [[BROADCAST_SPLAT]], splat (double 2.000000e+01)
1221+
; VF2-NEXT: [[TMP3:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
1222+
; VF2-NEXT: store <2 x double> [[TMP2]], ptr [[TMP3]], align 8
1223+
; VF2-NEXT: [[TMP4:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
1224+
; VF2-NEXT: store <2 x double> [[TMP2]], ptr [[TMP4]], align 8
1225+
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
1226+
; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
1227+
; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
1228+
; VF2: [[MIDDLE_BLOCK]]:
1229+
; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
1230+
; VF2: [[SCALAR_PH]]:
1231+
;
1232+
; VF4-LABEL: define void @multiple_store_groups_storing_same_wide_bin_op(
1233+
; VF4-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
1234+
; VF4-NEXT: [[ENTRY:.*:]]
1235+
; VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
1236+
; VF4: [[VECTOR_PH]]:
1237+
; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
1238+
; VF4: [[VECTOR_BODY]]:
1239+
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
1240+
; VF4-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
1241+
; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x double>, ptr [[TMP0]], align 8
1242+
; VF4-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1243+
; VF4-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1244+
; VF4-NEXT: [[TMP1:%.*]] = fadd contract <4 x double> [[STRIDED_VEC]], splat (double 2.000000e+01)
1245+
; VF4-NEXT: [[TMP2:%.*]] = fadd contract <4 x double> [[STRIDED_VEC1]], splat (double 2.000000e+01)
1246+
; VF4-NEXT: [[TMP3:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
1247+
; VF4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1248+
; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1249+
; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
1250+
; VF4-NEXT: [[TMP5:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
1251+
; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
1252+
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1253+
; VF4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
1254+
; VF4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
1255+
; VF4: [[MIDDLE_BLOCK]]:
1256+
; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
1257+
; VF4: [[SCALAR_PH]]:
1258+
;
1259+
entry:
1260+
br label %loop
1261+
1262+
loop:
1263+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
1264+
%gep.A = getelementptr { double, double }, ptr %A, i64 %iv
1265+
%l.A.0 = load double, ptr %gep.A, align 8
1266+
%gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A, i64 8
1267+
%l.A.1 = load double, ptr %gep.A.1, align 8
1268+
%add.0 = fadd contract double %l.A.0, 20.0
1269+
%add.1 = fadd contract double %l.A.1, 20.0
1270+
%gep.B = getelementptr { double, double }, ptr %B, i64 %iv
1271+
store double %add.0, ptr %gep.B, align 8
1272+
%gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B, i64 8
1273+
store double %add.1, ptr %gep.B.1, align 8
1274+
%gep.C = getelementptr { double, double }, ptr %C, i64 %iv
1275+
%gep.C.1 = getelementptr inbounds nuw i8, ptr %gep.C, i64 8
1276+
store double %add.0, ptr %gep.C, align 8
1277+
store double %add.1, ptr %gep.C.1, align 8
1278+
%iv.next = add nuw nsw i64 %iv, 1
1279+
%.not = icmp eq i64 %iv.next, 1000
1280+
br i1 %.not, label %exit, label %loop
1281+
1282+
exit:
1283+
ret void
1284+
}

llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -587,3 +587,76 @@ loop:
587587
exit:
588588
ret void
589589
}
590+
591+
define void @multiple_store_groups_storing_same_load_group(ptr noalias %A, ptr noalias %B, ptr noalias %C) {
592+
; VF2-LABEL: define void @multiple_store_groups_storing_same_load_group(
593+
; VF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
594+
; VF2-NEXT: [[ENTRY:.*:]]
595+
; VF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
596+
; VF2: [[VECTOR_PH]]:
597+
; VF2-NEXT: br label %[[VECTOR_BODY:.*]]
598+
; VF2: [[VECTOR_BODY]]:
599+
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
600+
; VF2-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
601+
; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
602+
; VF2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
603+
; VF2-NEXT: [[TMP1:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
604+
; VF2-NEXT: store <2 x double> [[WIDE_LOAD]], ptr [[TMP1]], align 8
605+
; VF2-NEXT: [[TMP2:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
606+
; VF2-NEXT: store <2 x double> [[WIDE_LOAD1]], ptr [[TMP2]], align 8
607+
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
608+
; VF2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
609+
; VF2-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
610+
; VF2: [[MIDDLE_BLOCK]]:
611+
; VF2-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
612+
; VF2: [[SCALAR_PH]]:
613+
;
614+
; VF4-LABEL: define void @multiple_store_groups_storing_same_load_group(
615+
; VF4-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
616+
; VF4-NEXT: [[ENTRY:.*:]]
617+
; VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
618+
; VF4: [[VECTOR_PH]]:
619+
; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
620+
; VF4: [[VECTOR_BODY]]:
621+
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
622+
; VF4-NEXT: [[TMP0:%.*]] = getelementptr { double, double }, ptr [[A]], i64 [[INDEX]]
623+
; VF4-NEXT: [[WIDE_VEC:%.*]] = load <8 x double>, ptr [[TMP0]], align 8
624+
; VF4-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
625+
; VF4-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x double> [[WIDE_VEC]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
626+
; VF4-NEXT: [[TMP1:%.*]] = getelementptr { double, double }, ptr [[B]], i64 [[INDEX]]
627+
; VF4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[STRIDED_VEC]], <4 x double> [[STRIDED_VEC1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
628+
; VF4-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
629+
; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
630+
; VF4-NEXT: [[TMP3:%.*]] = getelementptr { double, double }, ptr [[C]], i64 [[INDEX]]
631+
; VF4-NEXT: store <8 x double> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
632+
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
633+
; VF4-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
634+
; VF4-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
635+
; VF4: [[MIDDLE_BLOCK]]:
636+
; VF4-NEXT: br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
637+
; VF4: [[SCALAR_PH]]:
638+
;
639+
entry:
640+
br label %loop
641+
642+
loop:
643+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
644+
%gep.A = getelementptr { double, double }, ptr %A, i64 %iv
645+
%gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A, i64 8
646+
%l.A.0 = load double, ptr %gep.A, align 8
647+
%l.A.1 = load double, ptr %gep.A.1, align 8
648+
%gep.B = getelementptr { double, double }, ptr %B, i64 %iv
649+
%gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B, i64 8
650+
store double %l.A.0, ptr %gep.B, align 8
651+
store double %l.A.1, ptr %gep.B.1, align 8
652+
%gep.C = getelementptr { double, double }, ptr %C, i64 %iv
653+
%gep.C.1 = getelementptr inbounds nuw i8, ptr %gep.C, i64 8
654+
store double %l.A.0, ptr %gep.C, align 8
655+
store double %l.A.1, ptr %gep.C.1, align 8
656+
%iv.next = add nuw nsw i64 %iv, 1
657+
%.not = icmp eq i64 %iv.next, 1000
658+
br i1 %.not, label %exit, label %loop
659+
660+
exit:
661+
ret void
662+
}

0 commit comments

Comments
 (0)