Skip to content

Commit 26466ce

Browse files
committed
[SLP]Improve/fix extracts calculations for non-power-of-2 elements.
Change-Id: I6ea9a21eba83034bb01bb1ab9aabb2b97b0d40c2
1 parent 68f6b4e commit 26466ce

File tree

10 files changed

+593
-303
lines changed

10 files changed

+593
-303
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 86 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,21 @@ static std::string shortBundleName(ArrayRef<Value *> VL) {
239239
}
240240
#endif
241241

242+
/// Returns power-of-2 number of elements in a single register (part), given the
243+
/// total number of elements \p Size and number of registers (parts) \p
244+
/// NumParts.
245+
static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
246+
return PowerOf2Ceil(divideCeil(Size, NumParts));
247+
}
248+
249+
/// Returns the number of elements that belong to register (part) \p Part,
/// given the total number of elements \p Size and the (power-of-2) number of
/// elements in a single register \p PartNumElems.
///
/// Every part holds \p PartNumElems elements except the trailing one, which
/// holds whatever is left. If the part starts at or beyond \p Size (possible
/// when the per-part count was rounded up), the result is 0.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  const unsigned Offset = Part * PartNumElems;
  // Guard the subtraction below: with unsigned arithmetic, Size - Offset would
  // wrap around to a huge value when Offset > Size and the std::min would then
  // wrongly report a full register.
  if (Offset >= Size)
    return 0;
  return std::min<unsigned>(PartNumElems, Size - Offset);
}
256+
242257
/// \returns true if all of the instructions in \p VL are in the same block or
243258
/// false otherwise.
244259
static bool allSameBlock(ArrayRef<Value *> VL) {
@@ -7139,14 +7154,17 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
71397154
if (NumSrcRegs == 0)
71407155
NumSrcRegs = 1;
71417156
// FIXME: this must be moved to TTI for better estimation.
7142-
unsigned EltsPerVector = PowerOf2Ceil(std::max(
7143-
divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
7157+
unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
71447158
auto CheckPerRegistersShuffle =
7145-
[&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
7159+
[&](MutableArrayRef<int> Mask,
7160+
SmallVector<int> Indices) -> std::optional<TTI::ShuffleKind> {
7161+
if (NumElts <= EltsPerVector)
7162+
return std::nullopt;
71467163
DenseSet<int> RegIndices;
71477164
// Check that if trying to permute same single/2 input vectors.
71487165
TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
71497166
int FirstRegId = -1;
7167+
Indices.assign(1, -1);
71507168
for (int &I : Mask) {
71517169
if (I == PoisonMaskElem)
71527170
continue;
@@ -7156,8 +7174,15 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
71567174
RegIndices.insert(RegId);
71577175
if (RegIndices.size() > 2)
71587176
return std::nullopt;
7159-
if (RegIndices.size() == 2)
7177+
if (RegIndices.size() == 2) {
71607178
ShuffleKind = TTI::SK_PermuteTwoSrc;
7179+
if (Indices.size() == 1)
7180+
Indices.push_back(-1);
7181+
}
7182+
if (RegId == FirstRegId)
7183+
Indices.front() = I % NumElts;
7184+
else
7185+
Indices.back() = I % NumElts;
71617186
I = (I % NumElts) % EltsPerVector +
71627187
(RegId == FirstRegId ? 0 : EltsPerVector);
71637188
}
@@ -7168,22 +7193,23 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
71687193
// Process extracts in blocks of EltsPerVector to check if the source vector
71697194
// operand can be re-used directly. If not, add the cost of creating a
71707195
// shuffle to extract the values into a vector register.
7171-
for (unsigned Part = 0; Part < NumParts; ++Part) {
7196+
for (unsigned Part : seq<unsigned>(NumParts)) {
71727197
if (!ShuffleKinds[Part])
71737198
continue;
7174-
ArrayRef<int> MaskSlice =
7175-
Mask.slice(Part * EltsPerVector,
7176-
(Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
7177-
? Mask.size() % EltsPerVector
7178-
: EltsPerVector);
7199+
ArrayRef<int> MaskSlice = Mask.slice(
7200+
Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
71797201
SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
71807202
copy(MaskSlice, SubMask.begin());
7203+
SmallVector<int> Indices;
71817204
std::optional<TTI::ShuffleKind> RegShuffleKind =
7182-
CheckPerRegistersShuffle(SubMask);
7205+
CheckPerRegistersShuffle(SubMask, Indices);
71837206
if (!RegShuffleKind) {
7184-
Cost += ::getShuffleCost(
7185-
TTI, *ShuffleKinds[Part],
7186-
FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
7207+
if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
7208+
!ShuffleVectorInst::isIdentityMask(
7209+
MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
7210+
Cost += ::getShuffleCost(
7211+
TTI, *ShuffleKinds[Part],
7212+
FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
71877213
continue;
71887214
}
71897215
if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
@@ -7193,6 +7219,13 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
71937219
FixedVectorType::get(VL.front()->getType(), EltsPerVector),
71947220
SubMask);
71957221
}
7222+
for (int Idx : Indices) {
7223+
Cost += ::getShuffleCost(
7224+
TTI, TTI::SK_ExtractSubvector,
7225+
FixedVectorType::get(VL.front()->getType(), NumElts), std::nullopt,
7226+
CostKind, Idx,
7227+
FixedVectorType::get(VL.front()->getType(), EltsPerVector));
7228+
}
71967229
}
71977230
return Cost;
71987231
}
@@ -7220,11 +7253,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
72207253
InVectors.front().get<const TreeEntry *>() == &E1 &&
72217254
InVectors.back().get<const TreeEntry *>() == E2) ||
72227255
(!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
7223-
assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
7256+
unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
7257+
assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
72247258
[](int Idx) { return Idx == PoisonMaskElem; }) &&
72257259
"Expected all poisoned elements.");
7226-
ArrayRef<int> SubMask =
7227-
ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
7260+
ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
72287261
copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
72297262
return;
72307263
}
@@ -7465,10 +7498,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
74657498
});
74667499
});
74677500
SmallPtrSet<Value *, 4> UniqueBases;
7468-
unsigned SliceSize = VL.size() / NumParts;
7469-
for (unsigned Part = 0; Part < NumParts; ++Part) {
7470-
ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
7471-
for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
7501+
unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
7502+
for (unsigned Part : seq<unsigned>(NumParts)) {
7503+
unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
7504+
ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
7505+
for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
74727506
// Ignore non-extractelement scalars.
74737507
if (isa<UndefValue>(V) ||
74747508
(!SubMask.empty() && SubMask[I] == PoisonMaskElem))
@@ -7561,7 +7595,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
75617595
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
75627596
if (NumParts == 0 || NumParts >= Mask.size())
75637597
NumParts = 1;
7564-
unsigned SliceSize = Mask.size() / NumParts;
7598+
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
75657599
const auto *It =
75667600
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
75677601
unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
@@ -7579,7 +7613,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
75797613
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
75807614
if (NumParts == 0 || NumParts >= Mask.size())
75817615
NumParts = 1;
7582-
unsigned SliceSize = Mask.size() / NumParts;
7616+
unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
75837617
const auto *It =
75847618
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
75857619
unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
@@ -9339,12 +9373,12 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
93399373
assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
93409374
SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
93419375
Mask.assign(VL.size(), PoisonMaskElem);
9342-
unsigned SliceSize = VL.size() / NumParts;
9343-
for (unsigned Part = 0; Part < NumParts; ++Part) {
9376+
unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
9377+
for (unsigned Part : seq<unsigned>(NumParts)) {
93449378
// Scan list of gathered scalars for extractelements that can be represented
93459379
// as shuffles.
9346-
MutableArrayRef<Value *> SubVL =
9347-
MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
9380+
MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
9381+
Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
93489382
SmallVector<int> SubMask;
93499383
std::optional<TTI::ShuffleKind> Res =
93509384
tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
@@ -9730,10 +9764,11 @@ BoUpSLP::isGatherShuffledEntry(
97309764
"Expected only single user of the gather node.");
97319765
assert(VL.size() % NumParts == 0 &&
97329766
"Number of scalars must be divisible by NumParts.");
9733-
unsigned SliceSize = VL.size() / NumParts;
9767+
unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
97349768
SmallVector<std::optional<TTI::ShuffleKind>> Res;
9735-
for (unsigned Part = 0; Part < NumParts; ++Part) {
9736-
ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
9769+
for (unsigned Part : seq<unsigned>(NumParts)) {
9770+
ArrayRef<Value *> SubVL =
9771+
VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
97379772
SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
97389773
std::optional<TTI::ShuffleKind> SubRes =
97399774
isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part);
@@ -10250,11 +10285,12 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
1025010285
// into a long virtual vector register, forming the original vector.
1025110286
Value *Vec = nullptr;
1025210287
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
10253-
unsigned SliceSize = E->Scalars.size() / NumParts;
10254-
for (unsigned Part = 0; Part < NumParts; ++Part) {
10288+
unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
10289+
for (unsigned Part : seq<unsigned>(NumParts)) {
10290+
unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
1025510291
ArrayRef<Value *> VL =
10256-
ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
10257-
MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
10292+
ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
10293+
MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
1025810294
constexpr int MaxBases = 2;
1025910295
SmallVector<Value *, MaxBases> Bases(MaxBases);
1026010296
#ifndef NDEBUG
@@ -10290,7 +10326,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
1029010326
assert((Part == 0 || all_of(seq<unsigned>(0, Part),
1029110327
[&](unsigned P) {
1029210328
ArrayRef<int> SubMask =
10293-
Mask.slice(P * SliceSize, SliceSize);
10329+
Mask.slice(P * SliceSize,
10330+
getNumElems(Mask.size(),
10331+
SliceSize, P));
1029410332
return all_of(SubMask, [](int Idx) {
1029510333
return Idx == PoisonMaskElem;
1029610334
});
@@ -10663,13 +10701,19 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
1066310701
Idx == 0) ||
1066410702
(Mask.size() == InputVF &&
1066510703
ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
10666-
std::iota(std::next(Mask.begin(), I * SliceSize),
10667-
std::next(Mask.begin(), (I + 1) * SliceSize), 0);
10704+
std::iota(
10705+
std::next(Mask.begin(), I * SliceSize),
10706+
std::next(Mask.begin(),
10707+
I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
10708+
0);
1066810709
} else {
1066910710
unsigned IVal =
1067010711
*find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
10671-
std::fill(std::next(Mask.begin(), I * SliceSize),
10672-
std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
10712+
std::fill(
10713+
std::next(Mask.begin(), I * SliceSize),
10714+
std::next(Mask.begin(),
10715+
I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
10716+
IVal);
1067310717
}
1067410718
return true;
1067510719
};
@@ -10930,7 +10974,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
1093010974
}
1093110975
}
1093210976
if (!GatherShuffles.empty()) {
10933-
unsigned SliceSize = E->Scalars.size() / NumParts;
10977+
unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
1093410978
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
1093510979
for (const auto [I, TEs] : enumerate(Entries)) {
1093610980
if (TEs.empty()) {
@@ -10940,7 +10984,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
1094010984
}
1094110985
assert((TEs.size() == 1 || TEs.size() == 2) &&
1094210986
"Expected shuffle of 1 or 2 entries.");
10943-
auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
10987+
unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
10988+
auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
1094410989
VecMask.assign(VecMask.size(), PoisonMaskElem);
1094510990
copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
1094610991
if (TEs.size() == 1) {

0 commit comments

Comments
 (0)