//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Optional.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/VectorUtils.h"
+#include "llvm/Analysis/VectorUtils.h"
#include <algorithm>
#include <map>
#include <memory>
cl::desc(
"Attempt to vectorize horizontal reductions feeding into a store"));
+static cl::opt<int>
+MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
+ cl::desc("Attempt to vectorize for this register size in bits"));
+
namespace {
+// FIXME: Set this via cl::opt to allow overriding.
static const unsigned MinVecRegSize = 128;
static const unsigned RecursionMaxDepth = 12;
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;
+/// \brief Predicate for the element types that the SLP vectorizer supports.
+///
+/// The most important things to filter here are types which are invalid in
+/// LLVM vectors. We also filter target-specific types which have absolutely no
+/// meaningful vectorization path, such as x86_fp80 and ppc_fp128. This just
+/// avoids spending time checking the cost model and realizing that they will
+/// be inevitably scalarized.
+static bool isValidElementType(Type *Ty) {
+ return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
+ !Ty->isPPC_FP128Ty();
+}
+
/// \returns the parent basic block if all of the instructions in \p VL
/// are in the same block or null otherwise.
static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
MD = MDNode::getMostGenericTBAA(MD, IMD);
break;
case LLVMContext::MD_alias_scope:
+ MD = MDNode::getMostGenericAliasScope(MD, IMD);
+ break;
case LLVMContext::MD_noalias:
MD = MDNode::intersect(MD, IMD);
break;
}
/// \returns the memory location that is being accessed by the instruction.
-static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) {
+static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return AA->getLocation(SI);
+ return MemoryLocation::get(SI);
if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return AA->getLocation(LI);
- return AliasAnalysis::Location();
+ return MemoryLocation::get(LI);
+ return MemoryLocation();
}
/// \returns True if the instruction is not a volatile or atomic load/store.
typedef SmallPtrSet<Value *, 16> ValueSet;
typedef SmallVector<StoreInst *, 8> StoreList;
- BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl,
- TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa,
- LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC)
+ BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
+ TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
+ DominatorTree *Dt, AssumptionCache *AC)
: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
- SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
+ SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
}
}
/// \returns true if the memory operations A and B are consecutive.
- bool isConsecutiveAccess(Value *A, Value *B);
+ bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL);
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
/// Create a new VectorizableTree entry.
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
- VectorizableTree.push_back(TreeEntry());
+ VectorizableTree.emplace_back();
int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
ExternalUser (Value *S, llvm::User *U, int L) :
- Scalar(S), User(U), Lane(L){};
+ Scalar(S), User(U), Lane(L) {}
// Which scalar in our function.
Value *Scalar;
// Which user uses the scalar.
///
/// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
/// is invariant in the calling loop.
- bool isAliased(const AliasAnalysis::Location &Loc1, Instruction *Inst1,
+ bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
Instruction *Inst2) {
// First check if the result is already in the cache.
if (result.hasValue()) {
return result.getValue();
}
- AliasAnalysis::Location Loc2 = getLocation(Inst2, AA);
+ MemoryLocation Loc2 = getLocation(Inst2, AA);
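+ // Conservatively assume the two instructions alias unless both memory
+ // locations are known and both accesses are simple loads/stores.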
bool aliased = true;
if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
// Do the alias check.
// Analysis and block reference.
Function *F;
ScalarEvolution *SE;
- const DataLayout *DL;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
AliasAnalysis *AA;
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
- if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
- if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) {
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
+ if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
++NumLoadsWantToChangeOrder;
}
BS.cancelScheduling(VL);
Type *SrcTy = VL0->getOperand(0)->getType();
for (unsigned i = 0; i < VL.size(); ++i) {
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
- if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) {
+ if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
case Instruction::ICmp:
case Instruction::FCmp: {
// Check that all of the compares have the same predicate.
- CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
for (unsigned i = 1, e = VL.size(); i < e; ++i) {
CmpInst *Cmp = cast<CmpInst>(VL[i]);
return;
}
case Instruction::Store: {
+ const DataLayout &DL = F->getParent()->getDataLayout();
// Check if the stores are consecutive or if we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
- if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
+ if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
if (VectorizableTree.size() != 2)
return false;
- // Handle splat stores.
- if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars))
+ // Handle splat and all-constants stores.
+ if (!VectorizableTree[0].NeedToGather &&
+ (allConstant(VectorizableTree[1].Scalars) ||
+ isSplat(VectorizableTree[1].Scalars)))
return true;
// Gathering cost would be too much for tiny trees.
return -1;
}
-bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
+bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) {
Value *PtrA = getPointerOperand(A);
Value *PtrB = getPointerOperand(B);
unsigned ASA = getAddressSpaceOperand(A);
if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
return false;
- unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA);
+ unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
- APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty));
+ APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
- PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA);
- PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB);
+ PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+ PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
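+ // The accesses are consecutive when B's accumulated offset exceeds A's by
+ // exactly the element store size (Size) computed above.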
APInt OffsetDelta = OffsetB - OffsetA;
void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
+ const DataLayout &DL = F->getParent()->getDataLayout();
// Push left and right operands of binary operation into Left and Right
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
Instruction *VL1 = cast<Instruction>(VL[j]);
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
- if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
+ if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
std::swap(Left[j], Right[j]);
continue;
- } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
+ } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
Instruction *VL1 = cast<Instruction>(VL[j]);
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
- if (isConsecutiveAccess(L, L1) && VL1->isCommutative()) {
+ if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
std::swap(Left[j], Right[j]);
continue;
- } else if (isConsecutiveAccess(L, L1) && VL2->isCommutative()) {
+ } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
Right = OrigRight;
}
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
// Finally check if we can get a longer vectorizable chain by reordering
// without breaking the good operand order detected above.
// E.g. If we have something like-
for (unsigned j = 0; j < VL.size() - 1; ++j) {
if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
- if (isConsecutiveAccess(L, L1)) {
+ if (isConsecutiveAccess(L, L1, DL)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
}
if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
- if (isConsecutiveAccess(L, L1)) {
+ if (isConsecutiveAccess(L, L1, DL)) {
std::swap(Left[j + 1], Right[j + 1]);
continue;
}
return Gather(E->Scalars, VecTy);
}
+ const DataLayout &DL = F->getParent()->getDataLayout();
unsigned Opcode = getSameOpcode(E->Scalars);
switch (Opcode) {
}
// Prepare the operand vector.
- for (unsigned j = 0; j < E->Scalars.size(); ++j)
- Operands.push_back(cast<PHINode>(E->Scalars[j])->
- getIncomingValueForBlock(IBB));
+ for (Value *V : E->Scalars)
+ Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
case Instruction::FPTrunc:
case Instruction::BitCast: {
ValueList INVL;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+ for (Value *V : E->Scalars)
+ INVL.push_back(cast<Instruction>(V)->getOperand(0));
setInsertPointAfterBundle(E->Scalars);
case Instruction::FCmp:
case Instruction::ICmp: {
ValueList LHSV, RHSV;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ for (Value *V : E->Scalars) {
+ LHSV.push_back(cast<Instruction>(V)->getOperand(0));
+ RHSV.push_back(cast<Instruction>(V)->getOperand(1));
}
setInsertPointAfterBundle(E->Scalars);
if (Value *V = alreadyVectorized(E->Scalars))
return V;
- CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V;
if (Opcode == Instruction::FCmp)
V = Builder.CreateFCmp(P0, L, R);
}
case Instruction::Select: {
ValueList TrueVec, FalseVec, CondVec;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
- FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
+ for (Value *V : E->Scalars) {
+ CondVec.push_back(cast<Instruction>(V)->getOperand(0));
+ TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
+ FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
}
setInsertPointAfterBundle(E->Scalars);
if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
else
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ for (Value *V : E->Scalars) {
+ LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
+ RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
}
setInsertPointAfterBundle(E->Scalars);
unsigned Alignment = LI->getAlignment();
LI = Builder.CreateLoad(VecPtr);
- if (!Alignment)
- Alignment = DL->getABITypeAlignment(ScalarLoadTy);
+ if (!Alignment) {
+ Alignment = DL.getABITypeAlignment(ScalarLoadTy);
+ }
LI->setAlignment(Alignment);
E->VectorizedValue = LI;
++NumVectorInstructions;
unsigned AS = SI->getPointerAddressSpace();
ValueList ValueOp;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
+ for (Value *V : E->Scalars)
+ ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
setInsertPointAfterBundle(E->Scalars);
ExternalUses.push_back(
ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
- if (!Alignment)
- Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
+ if (!Alignment) {
+ Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType());
+ }
S->setAlignment(Alignment);
E->VectorizedValue = S;
++NumVectorInstructions;
setInsertPointAfterBundle(E->Scalars);
ValueList Op0VL;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0));
+ for (Value *V : E->Scalars)
+ Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
Value *Op0 = vectorizeTree(Op0VL);
for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
++j) {
ValueList OpVL;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j));
+ for (Value *V : E->Scalars)
+ OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
Value *OpVec = vectorizeTree(OpVL);
OpVecs.push_back(OpVec);
}
- Value *V = Builder.CreateGEP(Op0, OpVecs);
+ Value *V = Builder.CreateGEP(
+ cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
E->VectorizedValue = V;
++NumVectorInstructions;
Intrinsic::ID IID = Intrinsic::not_intrinsic;
Value *ScalarArg = nullptr;
if (CI && (FI = CI->getCalledFunction())) {
- IID = (Intrinsic::ID) FI->getIntrinsicID();
+ IID = FI->getIntrinsicID();
}
std::vector<Value *> OpVecs;
for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
OpVecs.push_back(CEI->getArgOperand(j));
continue;
}
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- CallInst *CEI = cast<CallInst>(E->Scalars[i]);
+ for (Value *V : E->Scalars) {
+ CallInst *CEI = cast<CallInst>(V);
OpVL.push_back(CEI->getArgOperand(j));
}
ScheduleData *DepDest = BundleMember->NextLoadStore;
if (DepDest) {
Instruction *SrcInst = BundleMember->Inst;
- AliasAnalysis::Location SrcLoc = getLocation(SrcInst, SLP->AA);
+ MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
unsigned numAliased = 0;
unsigned DistToSrc = 1;
}
ScalarEvolution *SE;
- const DataLayout *DL;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
AliasAnalysis *AA;
return false;
SE = &getAnalysis<ScalarEvolution>();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
TLI = TLIP ? &TLIP->getTLI() : nullptr;
if (!TTI->getNumberOfRegisters(true))
return false;
- // Must have DataLayout. We can't require it because some tests run w/o
- // triple.
- if (!DL)
- return false;
+ // Use the vector register size specified by the target unless overridden
+ // by a command-line option.
+ // TODO: It would be better to limit the vectorization factor based on
+ // data type rather than just register size. For example, x86 AVX has
+ // 256-bit registers, but it does not support integer operations
+ // at that width (that requires AVX2).
+ if (MaxVectorRegSizeOption.getNumOccurrences())
+ MaxVecRegSize = MaxVectorRegSizeOption;
+ else
+ MaxVecRegSize = TTI->getRegisterBitWidth(true);
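+ // E.g. on an x86 AVX target this selects a 256-bit register size unless
+ // -slp-max-reg-size overrides it.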
// Don't vectorize when the attribute NoImplicitFloat is used.
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
- BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AC);
+ BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
// delete instructions.
// Scan the blocks in the function in post order.
- for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
- e = po_end(&F.getEntryBlock()); it != e; ++it) {
- BasicBlock *BB = *it;
+ for (auto BB : post_order(&F.getEntryBlock())) {
// Vectorize trees that end at stores.
if (unsigned count = collectStores(BB, R)) {
(void)count;
bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
- BoUpSLP &R);
+ BoUpSLP &R, unsigned VecRegSize);
bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
BoUpSLP &R);
private:
StoreListMap StoreRefs;
+ unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
};
/// \brief Check that the values in the slice of the VL array are still present
/// in the WeakVH array.
/// Vectorization of part of the VL array may cause later values in the VL array
/// to become invalid. We track when this has happened in the WeakVH array.
-static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL,
- SmallVectorImpl<WeakVH> &VH,
- unsigned SliceBegin,
- unsigned SliceSize) {
- for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i)
- if (VH[i] != VL[i])
- return true;
-
- return false;
+static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
+ unsigned SliceBegin, unsigned SliceSize) {
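+ // WeakVH entries follow RAUW and become null on deletion, so any position
+ // where VH no longer matches the original VL value means the scalar was
+ // replaced.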
+ VL = VL.slice(SliceBegin, SliceSize);
+ VH = VH.slice(SliceBegin, SliceSize);
+ return !std::equal(VL.begin(), VL.end(), VH.begin());
}
bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
- int CostThreshold, BoUpSLP &R) {
+ int CostThreshold, BoUpSLP &R,
+ unsigned VecRegSize) {
unsigned ChainLen = Chain.size();
DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
<< "\n");
Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
- unsigned Sz = DL->getTypeSizeInBits(StoreTy);
- unsigned VF = MinVecRegSize / Sz;
+ auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
+ unsigned Sz = DL.getTypeSizeInBits(StoreTy);
+ unsigned VF = VecRegSize / Sz;
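+ // E.g. a 128-bit vector register and 32-bit store elements give VF = 4.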
if (!isPowerOf2_32(Sz) || VF < 2)
return false;
bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
int costThreshold, BoUpSLP &R) {
- SetVector<Value *> Heads, Tails;
- SmallDenseMap<Value *, Value *> ConsecutiveChain;
+ SetVector<StoreInst *> Heads, Tails;
+ SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
// We may run into multiple chains that merge into a single chain. We mark the
// stores that we vectorized so that we don't visit the same store twice.
// Do a quadratic search on all of the given stores and find
// all of the pairs of stores that follow each other.
+ SmallVector<unsigned, 16> IndexQueue;
for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
- for (unsigned j = 0; j < e; ++j) {
- if (i == j)
- continue;
-
- if (R.isConsecutiveAccess(Stores[i], Stores[j])) {
- Tails.insert(Stores[j]);
+ const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
+ IndexQueue.clear();
+ // If a store has multiple consecutive store candidates, search the Stores
+ // array according to the sequence: from i+1 to e, then from i-1 to 0.
+ // Pairing with the immediately succeeding or preceding candidate usually
+ // creates the best chance of finding an SLP vectorization opportunity.
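+ // E.g. for i = 2 and e = 5, IndexQueue is {3, 4, 1, 0}.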
+ unsigned j = 0;
+ for (j = i + 1; j < e; ++j)
+ IndexQueue.push_back(j);
+ for (j = i; j > 0; --j)
+ IndexQueue.push_back(j - 1);
+
+ for (auto &k : IndexQueue) {
+ if (R.isConsecutiveAccess(Stores[i], Stores[k], DL)) {
+ Tails.insert(Stores[k]);
Heads.insert(Stores[i]);
- ConsecutiveChain[Stores[i]] = Stores[j];
+ ConsecutiveChain[Stores[i]] = Stores[k];
+ break;
}
}
}
// For stores that start but don't end a link in the chain:
- for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
+ for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
it != e; ++it) {
if (Tails.count(*it))
continue;
// We found a store instr that starts a chain. Now follow the chain and try
// to vectorize it.
BoUpSLP::ValueList Operands;
- Value *I = *it;
+ StoreInst *I = *it;
// Collect the chain into a list.
while (Tails.count(I) || Heads.count(I)) {
if (VectorizedStores.count(I))
I = ConsecutiveChain[I];
}
- bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
-
- // Mark the vectorized stores so that we don't vectorize them again.
- if (Vectorized)
- VectorizedStores.insert(Operands.begin(), Operands.end());
- Changed |= Vectorized;
+ // FIXME: Is division-by-2 the correct step? Should we assert that the
+ // register size is a power-of-2?
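+ // E.g. with MaxVecRegSize = 256 this tries a 256-bit and then a 128-bit
+ // vectorization factor, stopping at the first size that succeeds.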
+ for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+ if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
+ // Mark the vectorized stores so that we don't vectorize them again.
+ VectorizedStores.insert(Operands.begin(), Operands.end());
+ Changed = true;
+ break;
+ }
+ }
}
return Changed;
unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
unsigned count = 0;
StoreRefs.clear();
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- StoreInst *SI = dyn_cast<StoreInst>(it);
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ for (Instruction &I : *BB) {
+ StoreInst *SI = dyn_cast<StoreInst>(&I);
if (!SI)
continue;
// Check that the pointer points to scalars.
Type *Ty = SI->getValueOperand()->getType();
- if (Ty->isAggregateType() || Ty->isVectorTy())
+ if (!isValidElementType(Ty))
continue;
// Find the base pointer.
return false;
unsigned Opcode0 = I0->getOpcode();
+ const DataLayout &DL = I0->getModule()->getDataLayout();
Type *Ty0 = I0->getType();
- unsigned Sz = DL->getTypeSizeInBits(Ty0);
+ unsigned Sz = DL.getTypeSizeInBits(Ty0);
+ // FIXME: Register size should be a parameter to this function, so we can
+ // try different vectorization factors.
unsigned VF = MinVecRegSize / Sz;
- for (int i = 0, e = VL.size(); i < e; ++i) {
- Type *Ty = VL[i]->getType();
- if (Ty->isAggregateType() || Ty->isVectorTy())
+ for (Value *V : VL) {
+ Type *Ty = V->getType();
+ if (!isValidElementType(Ty))
return false;
- Instruction *Inst = dyn_cast<Instruction>(VL[i]);
+ Instruction *Inst = dyn_cast<Instruction>(V);
if (!Inst || Inst->getOpcode() != Opcode0)
return false;
}
ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
/// \brief Try to find a reduction tree.
- bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
- const DataLayout *DL) {
+ bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
assert((!Phi ||
std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
"Thi phi needs to use the binary operator");
return false;
Type *Ty = B->getType();
- if (Ty->isVectorTy())
+ if (!isValidElementType(Ty))
return false;
+ const DataLayout &DL = B->getModule()->getDataLayout();
ReductionOpcode = B->getOpcode();
ReducedValueOpcode = 0;
- ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
+ // FIXME: Register size should be a parameter to this function, so we can
+ // try different vectorization factors.
+ ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
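+ // E.g. a 128-bit minimum register and 32-bit elements give ReduxWidth = 4.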
ReductionRoot = B;
ReductionPHI = Phi;
// Try to match and vectorize a horizontal reduction.
HorizontalReduction HorRdx;
- if (ShouldVectorizeHor &&
- HorRdx.matchAssociativeReduction(P, BI, DL) &&
+ if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) &&
HorRdx.tryToReduce(R, TTI)) {
Changed = true;
it = BB->begin();
if (BinaryOperator *BinOp =
dyn_cast<BinaryOperator>(SI->getValueOperand())) {
HorizontalReduction HorRdx;
- if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) &&
+ if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) &&
HorRdx.tryToReduce(R, TTI)) ||
tryToVectorize(BinOp, R))) {
Changed = true;
<< it->second.size() << ".\n");
// Process the stores in chunks of 16.
+ // TODO: The limit of 16 inhibits greater vectorization factors.
+ // For example, AVX2 supports v32i8. Increasing this limit, however,
+ // may cause a significant compile-time increase.
for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
unsigned Len = std::min<unsigned>(CE - CI, 16);
Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),