[VPlan] Consolidate logic for narrow to single scalars #151506
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms

Author: Ramkumar Ramachandra (artagnon)

Changes

The logic for narrowing to single scalar recipes is in two different places: narrowToSingleScalarRecipes and legalizeAndOptimizeInductions. Consolidate them, with minor test changes.

-- 8< --

Full diff: https://github.com/llvm/llvm-project/pull/151506.diff

7 Files Affected:
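For orientation before the per-file diff: a minimal, hand-written sketch (not taken from the patch) of the VF=2 vector-body code that narrowing to a single scalar aims to produce, modelled on the ld_div2-style tests touched below. The address, the load, and the add are uniform across the vector iteration, so the add can run once as a scalar and only its result is broadcast; without narrowing, the loaded value would be broadcast first and the add performed as a <2 x i64> operation.

; Sketch only: uniform load + add narrowed to a single scalar, then broadcast.
define void @narrowed_sketch(ptr noalias %A, ptr noalias %B, i64 %index) {
entry:
  ; %div is the same for both lanes of the VF=2 iteration.
  %div = udiv i64 %index, 2
  %gep.A = getelementptr inbounds i64, ptr %A, i64 %div
  %ld = load i64, ptr %gep.A, align 8
  ; The add runs once on a scalar; only the result is broadcast.
  %add = add nsw i64 %ld, 42
  %ins = insertelement <2 x i64> poison, i64 %add, i64 0
  %splat = shufflevector <2 x i64> %ins, <2 x i64> poison, <2 x i32> zeroinitializer
  %gep.B = getelementptr inbounds i64, ptr %B, i64 %index
  store <2 x i64> %splat, ptr %gep.B, align 8
  ret void
}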
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a1d12a3a01e5e..3e7061584f2eb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -631,30 +631,6 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
if (!PhiR)
continue;
- // Try to narrow wide and replicating recipes to uniform recipes, based on
- // VPlan analysis.
- // TODO: Apply to all recipes in the future, to replace legacy uniformity
- // analysis.
- auto Users = collectUsersRecursively(PhiR);
- for (VPUser *U : reverse(Users)) {
- auto *Def = dyn_cast<VPSingleDefRecipe>(U);
- auto *RepR = dyn_cast<VPReplicateRecipe>(U);
- // Skip recipes that shouldn't be narrowed.
- if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
- Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
- (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
- continue;
-
- // Skip recipes that may have other lanes than their first used.
- if (!vputils::isSingleScalar(Def) && !vputils::onlyFirstLaneUsed(Def))
- continue;
-
- auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
- Def->operands(), /*IsUniform*/ true);
- Clone->insertAfter(Def);
- Def->replaceAllUsesWith(Clone);
- }
-
// Replace wide pointer inductions which have only their scalars used by
// PtrAdd(IndStart, ScalarIVSteps (0, Step)).
if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
@@ -1238,21 +1214,19 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
continue;
- auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
- // Skip recipes that aren't single scalars or don't have only their
- // scalar results used. In the latter case, we would introduce extra
- // broadcasts.
- if (!vputils::isSingleScalar(RepOrWidenR) ||
- any_of(RepOrWidenR->users(), [RepOrWidenR](VPUser *U) {
- return !U->usesScalars(RepOrWidenR);
- }))
+ auto *Def = cast<VPSingleDefRecipe>(&R);
+ // Skip recipes that aren't single scalars or that don't have users or
+ // that don't have only their scalar results used (would introduce extra
+ // broadcasts in this case).
+ if (!vputils::isSingleScalar(Def) || Def->getNumUsers() == 0 ||
+ any_of(Def->users(),
+ [Def](VPUser *U) { return !U->usesScalars(Def); }))
continue;
- auto *Clone = new VPReplicateRecipe(RepOrWidenR->getUnderlyingInstr(),
- RepOrWidenR->operands(),
- true /*IsSingleScalar*/);
- Clone->insertBefore(RepOrWidenR);
- RepOrWidenR->replaceAllUsesWith(Clone);
+ auto *Clone = new VPReplicateRecipe(
+ Def->getUnderlyingInstr(), Def->operands(), true /*IsSingleScalar*/);
+ Clone->insertBefore(Def);
+ Def->replaceAllUsesWith(Clone);
}
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 8dcd57f1b3598..5f80214973158 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -40,6 +40,9 @@ const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE);
/// Returns true if \p VPV is a single scalar, either because it produces the
/// same value for all lanes or only has its first lane used.
inline bool isSingleScalar(const VPValue *VPV) {
+ if (onlyFirstLaneUsed(VPV))
+ return true;
+
auto PreservesUniformity = [](unsigned Opcode) -> bool {
if (Instruction::isBinaryOp(Opcode) || Instruction::isCast(Opcode))
return true;
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
index 82f2fdd431238..cc6d70331e69d 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
@@ -57,11 +57,11 @@ define void @ld_div2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP2]], 42
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
+; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
index af2b238105925..0775ef9b8de75 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
@@ -15,6 +15,13 @@ define void @ld_and_neg1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = and i64 [[INDEX]], -1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[TMP3]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
@@ -50,10 +57,14 @@ define void @ld_and_neg2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP0:%.*]] = and i64 [[INDEX]], -2
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP2]], 42
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP4]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
@@ -97,6 +108,11 @@ define void @ld_and_neg3_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i64> [[TMP8]], splat (i64 42)
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP10]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
@@ -351,6 +367,11 @@ define void @ld_and_neg2_step1_start1_ind1(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i64> [[TMP8]], splat (i64 42)
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP10]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
index 61f511c16e88b..ab31abd970814 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
@@ -240,11 +240,11 @@ define void @ld_div8_urem3(ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: [[TMP1:%.*]] = urem i64 [[TMP0]], 3
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP3]], 42
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TMP3]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <8 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: store <8 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
+; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[TMP5]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
index e412d130e115f..93e7581a8ba3d 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
@@ -79,11 +79,11 @@ define void @ld_lshr1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP0:%.*]] = lshr i64 [[INDEX]], 1
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
; VF2-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
-; VF2-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP2]], 42
-; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; VF2-NEXT: [[TMP3:%.*]] = add nsw <2 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
+; VF2-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP4]], align 8
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; VF2-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -159,11 +159,11 @@ define void @ld_lshr2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP0:%.*]] = lshr i64 [[INDEX]], 2
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
; VF2-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
-; VF2-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP2]], 42
-; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; VF2-NEXT: [[TMP3:%.*]] = add nsw <2 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
+; VF2-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP4]], align 8
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; VF2-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -182,11 +182,11 @@ define void @ld_lshr2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
; VF4-NEXT: [[TMP0:%.*]] = lshr i64 [[INDEX]], 2
; VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
; VF4-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
-; VF4-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP2]], 42
-; VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP3]], i64 0
+; VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i64 0
; VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF4-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
+; VF4-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP4]], align 8
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; VF4-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
index ef6ce08da5230..1d140ffc14d14 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
@@ -115,11 +115,11 @@ define void @ld_div2_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
; VF2-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], [[TMP1]]
; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
; VF2-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8
-; VF2-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], 42
-; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0
; VF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; VF2-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
-; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
+; VF2-NEXT: store <2 x i64> [[TMP5]], ptr [[TMP6]], align 8
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VF2-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; VF2-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
Force-pushed from 61ec5b3 to b6a897a.
Gentle ping.

Gentle ping.
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
Is this a regression? It looks like we're no longer narrowing this add of the load
I tried my best to squash the regression in a principled manner, but the best I was able to do was to put in a hacky exception: the hack has resulted in some unrelated improvements.
I investigated further this morning, and I think the Right Thing to do in this case is to unfortunately live with the regression. The hack can never be more than a hack.
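To make the regression under discussion concrete, here is a hand-written side-by-side (not code from the patch) of the two shapes visible in the ld_div2_step1_start0_ind1 checks above. Before this change the add was narrowed to a scalar and only its result broadcast; with the change, the loaded value is broadcast first and the add executes on the whole <2 x i64> vector (the splat (i64 42) operand in the new checks).

; Old shape: scalar add, then broadcast the result.
define <2 x i64> @add_then_broadcast(i64 %ld) {
  %add = add nsw i64 %ld, 42
  %ins = insertelement <2 x i64> poison, i64 %add, i64 0
  %splat = shufflevector <2 x i64> %ins, <2 x i64> poison, <2 x i32> zeroinitializer
  ret <2 x i64> %splat
}

; New shape: broadcast the loaded value, then do a wide add.
; <i64 42, i64 42> is the same constant the checks print as splat (i64 42).
define <2 x i64> @broadcast_then_add(i64 %ld) {
  %ins = insertelement <2 x i64> poison, i64 %ld, i64 0
  %splat = shufflevector <2 x i64> %ins, <2 x i64> poison, <2 x i32> zeroinitializer
  %add = add nsw <2 x i64> %splat, <i64 42, i64 42>
  ret <2 x i64> %add
}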
Force-pushed from d23bd61 to 00640e3.
Gentle ping.
The logic for narrowing to single scalar recipes is in two different places: narrowToSingleScalarRecipes and legalizeAndOptimizeInductions. Consolidate them, with minor test changes.
Force-pushed from 00640e3 to 0573a53.