@@ -239,6 +239,21 @@ static std::string shortBundleName(ArrayRef<Value *> VL) {
239
239
}
240
240
#endif
241
241
242
+ /// Returns power-of-2 number of elements in a single register (part), given the
243
+ /// total number of elements \p Size and number of registers (parts) \p
244
+ /// NumParts.
245
+ static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
246
+ return PowerOf2Ceil(divideCeil(Size, NumParts));
247
+ }
248
+
249
/// Returns the number of elements that belong to register (part) \p Part,
/// given the total number of elements \p Size and the (power-of-2) number of
/// elements in a single register \p PartNumElems.
///
/// A part that lies completely inside \p Size yields \p PartNumElems; the part
/// that straddles the end yields only the elements that are left. A part that
/// starts at or beyond \p Size yields 0: the subtraction is unsigned, so
/// without the range check below it would wrap around and a full
/// \p PartNumElems would be reported for a part that has no elements at all.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  // A zero-sized register holds nothing; this also protects the division
  // below.
  if (PartNumElems == 0)
    return 0;
  // Part > Size / PartNumElems implies Part * PartNumElems > Size. Testing it
  // with a division keeps the check itself free of multiplication overflow.
  if (Part > Size / PartNumElems)
    return 0;
  // From here on Part * PartNumElems <= Size, so neither the product nor the
  // difference can wrap.
  const unsigned Remaining = Size - Part * PartNumElems;
  return std::min<unsigned>(PartNumElems, Remaining);
}
256
+
242
257
/// \returns true if all of the instructions in \p VL are in the same block or
243
258
/// false otherwise.
244
259
static bool allSameBlock(ArrayRef<Value *> VL) {
@@ -7139,14 +7154,17 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7139
7154
if (NumSrcRegs == 0)
7140
7155
NumSrcRegs = 1;
7141
7156
// FIXME: this must be moved to TTI for better estimation.
7142
- unsigned EltsPerVector = PowerOf2Ceil(std::max(
7143
- divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
7157
+ unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
7144
7158
auto CheckPerRegistersShuffle =
7145
- [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
7159
+ [&](MutableArrayRef<int> Mask,
7160
+ SmallVector<int> Indices) -> std::optional<TTI::ShuffleKind> {
7161
+ if (NumElts <= EltsPerVector)
7162
+ return std::nullopt;
7146
7163
DenseSet<int> RegIndices;
7147
7164
// Check whether the mask permutes elements of the same single input vector or
// of at most two input vectors.
7148
7165
TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
7149
7166
int FirstRegId = -1;
7167
+ Indices.assign(1, -1);
7150
7168
for (int &I : Mask) {
7151
7169
if (I == PoisonMaskElem)
7152
7170
continue;
@@ -7156,8 +7174,15 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7156
7174
RegIndices.insert(RegId);
7157
7175
if (RegIndices.size() > 2)
7158
7176
return std::nullopt;
7159
- if (RegIndices.size() == 2)
7177
+ if (RegIndices.size() == 2) {
7160
7178
ShuffleKind = TTI::SK_PermuteTwoSrc;
7179
+ if (Indices.size() == 1)
7180
+ Indices.push_back(-1);
7181
+ }
7182
+ if (RegId == FirstRegId)
7183
+ Indices.front() = I % NumElts;
7184
+ else
7185
+ Indices.back() = I % NumElts;
7161
7186
I = (I % NumElts) % EltsPerVector +
7162
7187
(RegId == FirstRegId ? 0 : EltsPerVector);
7163
7188
}
@@ -7168,22 +7193,23 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7168
7193
// Process extracts in blocks of EltsPerVector to check if the source vector
7169
7194
// operand can be re-used directly. If not, add the cost of creating a
7170
7195
// shuffle to extract the values into a vector register.
7171
- for (unsigned Part = 0; Part < NumParts; ++Part ) {
7196
+ for (unsigned Part : seq<unsigned>( NumParts) ) {
7172
7197
if (!ShuffleKinds[Part])
7173
7198
continue;
7174
- ArrayRef<int> MaskSlice =
7175
- Mask.slice(Part * EltsPerVector,
7176
- (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
7177
- ? Mask.size() % EltsPerVector
7178
- : EltsPerVector);
7199
+ ArrayRef<int> MaskSlice = Mask.slice(
7200
+ Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
7179
7201
SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
7180
7202
copy(MaskSlice, SubMask.begin());
7203
+ SmallVector<int> Indices;
7181
7204
std::optional<TTI::ShuffleKind> RegShuffleKind =
7182
- CheckPerRegistersShuffle(SubMask);
7205
+ CheckPerRegistersShuffle(SubMask, Indices );
7183
7206
if (!RegShuffleKind) {
7184
- Cost += ::getShuffleCost(
7185
- TTI, *ShuffleKinds[Part],
7186
- FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
7207
+ if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
7208
+ !ShuffleVectorInst::isIdentityMask(
7209
+ MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
7210
+ Cost += ::getShuffleCost(
7211
+ TTI, *ShuffleKinds[Part],
7212
+ FixedVectorType::get(VL.front()->getType(), NumElts), MaskSlice);
7187
7213
continue;
7188
7214
}
7189
7215
if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
@@ -7193,6 +7219,13 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7193
7219
FixedVectorType::get(VL.front()->getType(), EltsPerVector),
7194
7220
SubMask);
7195
7221
}
7222
+ for (int Idx : Indices) {
7223
+ Cost += ::getShuffleCost(
7224
+ TTI, TTI::SK_ExtractSubvector,
7225
+ FixedVectorType::get(VL.front()->getType(), NumElts), std::nullopt,
7226
+ CostKind, Idx,
7227
+ FixedVectorType::get(VL.front()->getType(), EltsPerVector));
7228
+ }
7196
7229
}
7197
7230
return Cost;
7198
7231
}
@@ -7220,11 +7253,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7220
7253
InVectors.front().get<const TreeEntry *>() == &E1 &&
7221
7254
InVectors.back().get<const TreeEntry *>() == E2) ||
7222
7255
(!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
7223
- assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
7256
+ unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
7257
+ assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
7224
7258
[](int Idx) { return Idx == PoisonMaskElem; }) &&
7225
7259
"Expected all poisoned elements.");
7226
- ArrayRef<int> SubMask =
7227
- ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
7260
+ ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
7228
7261
copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
7229
7262
return;
7230
7263
}
@@ -7465,10 +7498,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7465
7498
});
7466
7499
});
7467
7500
SmallPtrSet<Value *, 4> UniqueBases;
7468
- unsigned SliceSize = VL.size() / NumParts;
7469
- for (unsigned Part = 0; Part < NumParts; ++Part) {
7470
- ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
7471
- for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
7501
+ unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
7502
+ for (unsigned Part : seq<unsigned>(NumParts)) {
7503
+ unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
7504
+ ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
7505
+ for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
7472
7506
// Ignore non-extractelement scalars.
7473
7507
if (isa<UndefValue>(V) ||
7474
7508
(!SubMask.empty() && SubMask[I] == PoisonMaskElem))
@@ -7561,7 +7595,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7561
7595
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
7562
7596
if (NumParts == 0 || NumParts >= Mask.size())
7563
7597
NumParts = 1;
7564
- unsigned SliceSize = Mask.size() / NumParts;
7598
+ unsigned SliceSize = getPartNumElems( Mask.size(), NumParts) ;
7565
7599
const auto *It =
7566
7600
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
7567
7601
unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
@@ -7579,7 +7613,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7579
7613
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
7580
7614
if (NumParts == 0 || NumParts >= Mask.size())
7581
7615
NumParts = 1;
7582
- unsigned SliceSize = Mask.size() / NumParts;
7616
+ unsigned SliceSize = getPartNumElems( Mask.size(), NumParts) ;
7583
7617
const auto *It =
7584
7618
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
7585
7619
unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
@@ -9339,12 +9373,12 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
9339
9373
assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
9340
9374
SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
9341
9375
Mask.assign(VL.size(), PoisonMaskElem);
9342
- unsigned SliceSize = VL.size() / NumParts;
9343
- for (unsigned Part = 0; Part < NumParts; ++Part ) {
9376
+ unsigned SliceSize = getPartNumElems( VL.size(), NumParts) ;
9377
+ for (unsigned Part : seq<unsigned>( NumParts) ) {
9344
9378
// Scan list of gathered scalars for extractelements that can be represented
9345
9379
// as shuffles.
9346
- MutableArrayRef<Value *> SubVL =
9347
- MutableArrayRef(VL).slice( Part * SliceSize, SliceSize);
9380
+ MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
9381
+ Part * SliceSize, getNumElems(VL.size(), SliceSize, Part) );
9348
9382
SmallVector<int> SubMask;
9349
9383
std::optional<TTI::ShuffleKind> Res =
9350
9384
tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
@@ -9730,10 +9764,11 @@ BoUpSLP::isGatherShuffledEntry(
9730
9764
"Expected only single user of the gather node.");
9731
9765
assert(VL.size() % NumParts == 0 &&
9732
9766
"Number of scalars must be divisible by NumParts.");
9733
- unsigned SliceSize = VL.size() / NumParts;
9767
+ unsigned SliceSize = getPartNumElems( VL.size(), NumParts) ;
9734
9768
SmallVector<std::optional<TTI::ShuffleKind>> Res;
9735
- for (unsigned Part = 0; Part < NumParts; ++Part) {
9736
- ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
9769
+ for (unsigned Part : seq<unsigned>(NumParts)) {
9770
+ ArrayRef<Value *> SubVL =
9771
+ VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
9737
9772
SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
9738
9773
std::optional<TTI::ShuffleKind> SubRes =
9739
9774
isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part);
@@ -10250,11 +10285,12 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
10250
10285
// into a long virtual vector register, forming the original vector.
10251
10286
Value *Vec = nullptr;
10252
10287
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
10253
- unsigned SliceSize = E->Scalars.size() / NumParts;
10254
- for (unsigned Part = 0; Part < NumParts; ++Part) {
10288
+ unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
10289
+ for (unsigned Part : seq<unsigned>(NumParts)) {
10290
+ unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
10255
10291
ArrayRef<Value *> VL =
10256
- ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize );
10257
- MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize );
10292
+ ArrayRef(E->Scalars).slice(Part * SliceSize, Limit );
10293
+ MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit );
10258
10294
constexpr int MaxBases = 2;
10259
10295
SmallVector<Value *, MaxBases> Bases(MaxBases);
10260
10296
#ifndef NDEBUG
@@ -10290,7 +10326,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
10290
10326
assert((Part == 0 || all_of(seq<unsigned>(0, Part),
10291
10327
[&](unsigned P) {
10292
10328
ArrayRef<int> SubMask =
10293
- Mask.slice(P * SliceSize, SliceSize);
10329
+ Mask.slice(P * SliceSize,
10330
+ getNumElems(Mask.size(),
10331
+ SliceSize, P));
10294
10332
return all_of(SubMask, [](int Idx) {
10295
10333
return Idx == PoisonMaskElem;
10296
10334
});
@@ -10663,13 +10701,19 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
10663
10701
Idx == 0) ||
10664
10702
(Mask.size() == InputVF &&
10665
10703
ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
10666
- std::iota(std::next(Mask.begin(), I * SliceSize),
10667
- std::next(Mask.begin(), (I + 1) * SliceSize), 0);
10704
+ std::iota(
10705
+ std::next(Mask.begin(), I * SliceSize),
10706
+ std::next(Mask.begin(),
10707
+ I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
10708
+ 0);
10668
10709
} else {
10669
10710
unsigned IVal =
10670
10711
*find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
10671
- std::fill(std::next(Mask.begin(), I * SliceSize),
10672
- std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
10712
+ std::fill(
10713
+ std::next(Mask.begin(), I * SliceSize),
10714
+ std::next(Mask.begin(),
10715
+ I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
10716
+ IVal);
10673
10717
}
10674
10718
return true;
10675
10719
};
@@ -10930,7 +10974,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
10930
10974
}
10931
10975
}
10932
10976
if (!GatherShuffles.empty()) {
10933
- unsigned SliceSize = E->Scalars.size() / NumParts;
10977
+ unsigned SliceSize = getPartNumElems( E->Scalars.size(), NumParts) ;
10934
10978
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
10935
10979
for (const auto [I, TEs] : enumerate(Entries)) {
10936
10980
if (TEs.empty()) {
@@ -10940,7 +10984,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
10940
10984
}
10941
10985
assert((TEs.size() == 1 || TEs.size() == 2) &&
10942
10986
"Expected shuffle of 1 or 2 entries.");
10943
- auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
10987
+ unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
10988
+ auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
10944
10989
VecMask.assign(VecMask.size(), PoisonMaskElem);
10945
10990
copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
10946
10991
if (TEs.size() == 1) {
0 commit comments