
Commit c58e22e

[AggressiveInstCombine] Refactor foldLoadsRecursive to use m_ShlOrSelf (#155176)
This patch was originally part of #154375. It makes two functional changes:
1. Match other commuted forms of the load-combine pattern.
2. Allow combining loads even when a load has multiple uses, which is beneficial in practice.
1 parent 0f4db1a commit c58e22e
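The key to the refactor is that an m_ShlOrSelf-style matcher treats a bare operand as a shift by zero, so one pattern covers both the shifted and unshifted arms that previously required two separate match() calls. Below is a minimal standalone sketch of that idea, assuming a toy MiniExpr type; the names are illustrative, not LLVM's PatternMatch implementation:

#include <cstdint>

// Toy stand-in for an IR value: either a leaf or (shl operand, shamt).
struct MiniExpr {
  enum Kind { Leaf, Shl } kind = Leaf;
  const MiniExpr *operand = nullptr; // inner value when kind == Shl
  uint64_t shamt = 0;                // constant shift amount when kind == Shl
};

// Matches either (shl X, C), binding ShAmt = C, or a bare X, binding
// ShAmt = 0, and returns X in both cases.
const MiniExpr *matchShlOrSelf(const MiniExpr &E, uint64_t &ShAmt) {
  if (E.kind == MiniExpr::Shl) {
    ShAmt = E.shamt; // shifted arm: capture the constant
    return E.operand;
  }
  ShAmt = 0;         // unshifted arm: treat as shift by zero
  return &E;
}

Because the no-shift case folds into the same pattern, the shift amount can also become a plain uint64_t (defaulting to zero) instead of a nullable const APInt *, which is exactly the LoadOps change in the diff below.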

2 files changed: +116 −27 lines changed

llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp

Lines changed: 15 additions & 27 deletions
@@ -83,8 +83,8 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
   // == (ShVal0 << ShAmt) | (ShVal1 >> (Width - ShAmt))
   if (match(V, m_OneUse(m_c_Or(
                    m_Shl(m_Value(ShVal0), m_Value(ShAmt)),
-                   m_LShr(m_Value(ShVal1),
-                          m_Sub(m_SpecificInt(Width), m_Deferred(ShAmt))))))) {
+                   m_LShr(m_Value(ShVal1), m_Sub(m_SpecificInt(Width),
+                                                 m_Deferred(ShAmt))))))) {
     return Intrinsic::fshl;
   }

@@ -617,7 +617,7 @@ struct LoadOps {
   LoadInst *RootInsert = nullptr;
   bool FoundRoot = false;
   uint64_t LoadSize = 0;
-  const APInt *Shift = nullptr;
+  uint64_t Shift = 0;
   Type *ZextType;
   AAMDNodes AATags;
 };
@@ -627,17 +627,15 @@ struct LoadOps {
 // (ZExt(L1) << shift1) | ZExt(L2) -> ZExt(L3)
 static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
                                AliasAnalysis &AA) {
-  const APInt *ShAmt2 = nullptr;
+  uint64_t ShAmt2;
   Value *X;
   Instruction *L1, *L2;
 
   // Go to the last node with loads.
-  if (match(V, m_OneUse(m_c_Or(
-                   m_Value(X),
-                   m_OneUse(m_Shl(m_OneUse(m_ZExt(m_OneUse(m_Instruction(L2)))),
-                                  m_APInt(ShAmt2)))))) ||
-      match(V, m_OneUse(m_Or(m_Value(X),
-                             m_OneUse(m_ZExt(m_OneUse(m_Instruction(L2)))))))) {
+  if (match(V,
+            m_OneUse(m_c_Or(m_Value(X), m_OneUse(m_ShlOrSelf(
+                                            m_OneUse(m_ZExt(m_Instruction(L2))),
+                                            ShAmt2)))))) {
     if (!foldLoadsRecursive(X, LOps, DL, AA) && LOps.FoundRoot)
       // Avoid Partial chain merge.
       return false;
@@ -646,11 +644,10 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
 
   // Check if the pattern has loads
   LoadInst *LI1 = LOps.Root;
-  const APInt *ShAmt1 = LOps.Shift;
+  uint64_t ShAmt1 = LOps.Shift;
   if (LOps.FoundRoot == false &&
-      (match(X, m_OneUse(m_ZExt(m_Instruction(L1)))) ||
-       match(X, m_OneUse(m_Shl(m_OneUse(m_ZExt(m_OneUse(m_Instruction(L1)))),
-                               m_APInt(ShAmt1)))))) {
+      match(X, m_OneUse(
+                   m_ShlOrSelf(m_OneUse(m_ZExt(m_Instruction(L1))), ShAmt1)))) {
     LI1 = dyn_cast<LoadInst>(L1);
   }
   LoadInst *LI2 = dyn_cast<LoadInst>(L2);
@@ -726,13 +723,6 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
   if (IsBigEndian)
     std::swap(ShAmt1, ShAmt2);
 
-  // Find Shifts values.
-  uint64_t Shift1 = 0, Shift2 = 0;
-  if (ShAmt1)
-    Shift1 = ShAmt1->getZExtValue();
-  if (ShAmt2)
-    Shift2 = ShAmt2->getZExtValue();
-
   // First load is always LI1. This is where we put the new load.
   // Use the merged load size available from LI1 for forward loads.
   if (LOps.FoundRoot) {
@@ -747,7 +737,7 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
   uint64_t ShiftDiff = IsBigEndian ? LoadSize2 : LoadSize1;
   uint64_t PrevSize =
       DL.getTypeStoreSize(IntegerType::get(LI1->getContext(), LoadSize1));
-  if ((Shift2 - Shift1) != ShiftDiff || (Offset2 - Offset1) != PrevSize)
+  if ((ShAmt2 - ShAmt1) != ShiftDiff || (Offset2 - Offset1) != PrevSize)
     return false;
 
   // Update LOps
@@ -824,7 +814,7 @@ static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
   // Check if shift needed. We need to shift with the amount of load1
   // shift if not zero.
   if (LOps.Shift)
-    NewOp = Builder.CreateShl(NewOp, ConstantInt::get(I.getContext(), *LOps.Shift));
+    NewOp = Builder.CreateShl(NewOp, LOps.Shift);
   I.replaceAllUsesWith(NewOp);
 
   return true;
@@ -860,11 +850,9 @@ static std::optional<PartStore> matchPartStore(Instruction &I,
     return std::nullopt;
 
   uint64_t ValWidth = StoredTy->getPrimitiveSizeInBits();
-  uint64_t ValOffset = 0;
+  uint64_t ValOffset;
   Value *Val;
-  if (!match(StoredVal, m_CombineOr(m_Trunc(m_LShr(m_Value(Val),
-                                                   m_ConstantInt(ValOffset))),
-                                    m_Trunc(m_Value(Val)))))
+  if (!match(StoredVal, m_Trunc(m_LShrOrSelf(m_Value(Val), ValOffset))))
     return std::nullopt;
 
   Value *Ptr = Store->getPointerOperand();
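For intuition about what the load-combining side buys, the chain the new tests exercise is the classic byte-assembly idiom. A plain C++ illustration of the before/after shapes, valid on a little-endian target (illustrative only, not code from the patch):

#include <cstdint>
#include <cstring>

// The zext/shl/or chain the pass matches, assembled one byte at a time.
uint32_t combine_bytes(const unsigned char *p) {
  return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
         ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

// What the pass rewrites it into: one unaligned 4-byte load.
uint32_t combined_load(const unsigned char *p) {
  uint32_t v;
  std::memcpy(&v, p, sizeof v); // becomes a single i32 load, align 1, in IR
  return v;                     // equals combine_bytes(p) on little-endian targets
}

This is also why the BE check lines in the tests below show no fold for these inputs: with the opposite byte order the shift amounts no longer line up with the load offsets.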

llvm/test/Transforms/AggressiveInstCombine/X86/or-load.ll

Lines changed: 101 additions & 0 deletions
@@ -101,6 +101,107 @@ define i32 @loadCombine_4consecutive(ptr %p) {
   ret i32 %o3
 }
 
+define i32 @loadCombine_4consecutive_commuted(ptr %p) {
+; LE-LABEL: @loadCombine_4consecutive_commuted(
+; LE-NEXT:    [[O3:%.*]] = load i32, ptr [[P:%.*]], align 1
+; LE-NEXT:    ret i32 [[O3]]
+;
+; BE-LABEL: @loadCombine_4consecutive_commuted(
+; BE-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 1
+; BE-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i32 2
+; BE-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i32 3
+; BE-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
+; BE-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
+; BE-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
+; BE-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
+; BE-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
+; BE-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
+; BE-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
+; BE-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
+; BE-NEXT:    [[S2:%.*]] = shl i32 [[E2]], 8
+; BE-NEXT:    [[S3:%.*]] = shl i32 [[E3]], 16
+; BE-NEXT:    [[S4:%.*]] = shl i32 [[E4]], 24
+; BE-NEXT:    [[O1:%.*]] = or i32 [[S2]], [[S3]]
+; BE-NEXT:    [[O2:%.*]] = or i32 [[S4]], [[O1]]
+; BE-NEXT:    [[O3:%.*]] = or i32 [[E1]], [[O2]]
+; BE-NEXT:    ret i32 [[O3]]
+;
+  %p1 = getelementptr i8, ptr %p, i32 1
+  %p2 = getelementptr i8, ptr %p, i32 2
+  %p3 = getelementptr i8, ptr %p, i32 3
+  %l1 = load i8, ptr %p
+  %l2 = load i8, ptr %p1
+  %l3 = load i8, ptr %p2
+  %l4 = load i8, ptr %p3
+
+  %e1 = zext i8 %l1 to i32
+  %e2 = zext i8 %l2 to i32
+  %e3 = zext i8 %l3 to i32
+  %e4 = zext i8 %l4 to i32
+
+  %s2 = shl i32 %e2, 8
+  %s3 = shl i32 %e3, 16
+  %s4 = shl i32 %e4, 24
+
+  %o1 = or i32 %s2, %s3
+  %o2 = or i32 %s4, %o1
+  %o3 = or i32 %e1, %o2
+  ret i32 %o3
+}
+
+define i32 @loadCombine_4consecutive_multiuse(ptr %p) {
+; LE-LABEL: @loadCombine_4consecutive_multiuse(
+; LE-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 3
+; LE-NEXT:    [[O3:%.*]] = load i32, ptr [[P]], align 1
+; LE-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
+; LE-NEXT:    call void @use(i8 [[L4]])
+; LE-NEXT:    ret i32 [[O3]]
+;
+; BE-LABEL: @loadCombine_4consecutive_multiuse(
+; BE-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 1
+; BE-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P]], i32 2
+; BE-NEXT:    [[P3:%.*]] = getelementptr i8, ptr [[P]], i32 3
+; BE-NEXT:    [[L1:%.*]] = load i8, ptr [[P]], align 1
+; BE-NEXT:    [[L2:%.*]] = load i8, ptr [[P1]], align 1
+; BE-NEXT:    [[L3:%.*]] = load i8, ptr [[P2]], align 1
+; BE-NEXT:    [[L4:%.*]] = load i8, ptr [[P3]], align 1
+; BE-NEXT:    call void @use(i8 [[L4]])
+; BE-NEXT:    [[E1:%.*]] = zext i8 [[L1]] to i32
+; BE-NEXT:    [[E2:%.*]] = zext i8 [[L2]] to i32
+; BE-NEXT:    [[E3:%.*]] = zext i8 [[L3]] to i32
+; BE-NEXT:    [[E4:%.*]] = zext i8 [[L4]] to i32
+; BE-NEXT:    [[S2:%.*]] = shl i32 [[E2]], 8
+; BE-NEXT:    [[S3:%.*]] = shl i32 [[E3]], 16
+; BE-NEXT:    [[S4:%.*]] = shl i32 [[E4]], 24
+; BE-NEXT:    [[O1:%.*]] = or i32 [[E1]], [[S2]]
+; BE-NEXT:    [[O2:%.*]] = or i32 [[O1]], [[S3]]
+; BE-NEXT:    [[O3:%.*]] = or i32 [[O2]], [[S4]]
+; BE-NEXT:    ret i32 [[O3]]
+;
+  %p1 = getelementptr i8, ptr %p, i32 1
+  %p2 = getelementptr i8, ptr %p, i32 2
+  %p3 = getelementptr i8, ptr %p, i32 3
+  %l1 = load i8, ptr %p
+  %l2 = load i8, ptr %p1
+  %l3 = load i8, ptr %p2
+  %l4 = load i8, ptr %p3
+  call void @use(i8 %l4)
+
+  %e1 = zext i8 %l1 to i32
+  %e2 = zext i8 %l2 to i32
+  %e3 = zext i8 %l3 to i32
+  %e4 = zext i8 %l4 to i32
+
+  %s2 = shl i32 %e2, 8
+  %s3 = shl i32 %e3, 16
+  %s4 = shl i32 %e4, 24
+
+  %o1 = or i32 %e1, %s2
+  %o2 = or i32 %o1, %s3
+  %o3 = or i32 %o2, %s4
+  ret i32 %o3
+}
+
 define i32 @loadCombine_4consecutive_BE(ptr %p) {
 ; LE-LABEL: @loadCombine_4consecutive_BE(
 ; LE-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 1
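To rerun the new checks locally, an invocation along these lines should work; the authoritative flags (triple, data layout, LE/BE check prefixes) are the RUN lines at the top of or-load.ll, which this hunk does not show:

opt -passes=aggressive-instcombine -S \
    llvm/test/Transforms/AggressiveInstCombine/X86/or-load.ll

with the output piped to FileCheck using the appropriate check prefix, and possibly an explicit -mtriple as in the real RUN lines.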
