X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTransforms%2FVectorize%2FSLPVectorizer.cpp;h=57b09caa5a4436036b260e51608080e77078d022;hp=c8b8ff22de0cd26c44c5cf1f02030b66648d7324;hb=1ac8af11c65c1ccad01f2bf1e0c40aca39516c60;hpb=df970d6297162c2d57f6dd9443630787f20b3d08 diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index c8b8ff22de0..57b09caa5a4 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -17,6 +17,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" @@ -42,7 +43,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/VectorUtils.h" +#include "llvm/Analysis/VectorUtils.h" #include #include #include @@ -68,12 +69,50 @@ static cl::opt ShouldStartVectorizeHorAtStore( cl::desc( "Attempt to vectorize horizontal reductions feeding into a store")); +static cl::opt +MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, + cl::desc("Attempt to vectorize for this register size in bits")); + +/// Limits the size of scheduling regions in a block. +/// It avoid long compile times for _very_ large blocks where vector +/// instructions are spread over a wide range. +/// This limit is way higher than needed by real-world functions. +static cl::opt +ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, + cl::desc("Limit the size of the SLP scheduling region per block")); + namespace { +// FIXME: Set this via cl::opt to allow overriding. static const unsigned MinVecRegSize = 128; static const unsigned RecursionMaxDepth = 12; +// Limit the number of alias checks. The limit is chosen so that +// it has no negative effect on the llvm benchmarks. +static const unsigned AliasedCheckLimit = 10; + +// Another limit for the alias checks: The maximum distance between load/store +// instructions where alias checks are done. +// This limit is useful for very large basic blocks. +static const unsigned MaxMemDepDistance = 160; + +/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling +/// regions to be handled. +static const int MinScheduleRegionSize = 16; + +/// \brief Predicate for the element types that the SLP vectorizer supports. +/// +/// The most important thing to filter here are types which are invalid in LLVM +/// vectors. We also filter target specific types which have absolutely no +/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just +/// avoids spending time checking the cost model and realizing that they will +/// be inevitably scalarized. +static bool isValidElementType(Type *Ty) { + return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() && + !Ty->isPPC_FP128Ty(); +} + /// \returns the parent basic block if all of the instructions in \p VL /// are in the same block or null otherwise. static BasicBlock *getSameBlock(ArrayRef VL) { @@ -135,7 +174,7 @@ static bool canCombineAsAltInst(unsigned Op) { return false; } -/// \returns ShuffleVector instruction if intructions in \p VL have +/// \returns ShuffleVector instruction if instructions in \p VL have /// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence. /// (i.e. e.g. 
opcodes of fadd,fsub,fadd,fsub...) static unsigned isAltInst(ArrayRef VL) { @@ -207,12 +246,17 @@ static Instruction *propagateMetadata(Instruction *I, ArrayRef VL) { MD = MDNode::getMostGenericTBAA(MD, IMD); break; case LLVMContext::MD_alias_scope: + MD = MDNode::getMostGenericAliasScope(MD, IMD); + break; case LLVMContext::MD_noalias: MD = MDNode::intersect(MD, IMD); break; case LLVMContext::MD_fpmath: MD = MDNode::getMostGenericFPMath(MD, IMD); break; + case LLVMContext::MD_nontemporal: + MD = MDNode::intersect(MD, IMD); + break; } } I->setMetadata(Kind, MD); @@ -263,104 +307,6 @@ static bool CanReuseExtract(ArrayRef VL) { return true; } -static void reorderInputsAccordingToOpcode(ArrayRef VL, - SmallVectorImpl &Left, - SmallVectorImpl &Right) { - - SmallVector OrigLeft, OrigRight; - - bool AllSameOpcodeLeft = true; - bool AllSameOpcodeRight = true; - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - Instruction *I = cast(VL[i]); - Value *V0 = I->getOperand(0); - Value *V1 = I->getOperand(1); - - OrigLeft.push_back(V0); - OrigRight.push_back(V1); - - Instruction *I0 = dyn_cast(V0); - Instruction *I1 = dyn_cast(V1); - - // Check whether all operands on one side have the same opcode. In this case - // we want to preserve the original order and not make things worse by - // reordering. - AllSameOpcodeLeft = I0; - AllSameOpcodeRight = I1; - - if (i && AllSameOpcodeLeft) { - if(Instruction *P0 = dyn_cast(OrigLeft[i-1])) { - if(P0->getOpcode() != I0->getOpcode()) - AllSameOpcodeLeft = false; - } else - AllSameOpcodeLeft = false; - } - if (i && AllSameOpcodeRight) { - if(Instruction *P1 = dyn_cast(OrigRight[i-1])) { - if(P1->getOpcode() != I1->getOpcode()) - AllSameOpcodeRight = false; - } else - AllSameOpcodeRight = false; - } - - // Sort two opcodes. In the code below we try to preserve the ability to use - // broadcast of values instead of individual inserts. - // vl1 = load - // vl2 = phi - // vr1 = load - // vr2 = vr2 - // = vl1 x vr1 - // = vl2 x vr2 - // If we just sorted according to opcode we would leave the first line in - // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load). - // = vl1 x vr1 - // = vr2 x vl2 - // Because vr2 and vr1 are from the same load we loose the opportunity of a - // broadcast for the packed right side in the backend: we have [vr1, vl2] - // instead of [vr1, vr2=vr1]. - if (I0 && I1) { - if(!i && I0->getOpcode() > I1->getOpcode()) { - Left.push_back(I1); - Right.push_back(I0); - } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) { - // Try not to destroy a broad cast for no apparent benefit. - Left.push_back(I1); - Right.push_back(I0); - } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] == I0) { - // Try preserve broadcasts. - Left.push_back(I1); - Right.push_back(I0); - } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) { - // Try preserve broadcasts. - Left.push_back(I1); - Right.push_back(I0); - } else { - Left.push_back(I0); - Right.push_back(I1); - } - continue; - } - // One opcode, put the instruction on the right. - if (I0) { - Left.push_back(V1); - Right.push_back(I0); - continue; - } - Left.push_back(V0); - Right.push_back(V1); - } - - bool LeftBroadcast = isSplat(Left); - bool RightBroadcast = isSplat(Right); - - // Don't reorder if the operands where good to begin with. - if (!(LeftBroadcast || RightBroadcast) && - (AllSameOpcodeRight || AllSameOpcodeLeft)) { - Left = OrigLeft; - Right = OrigRight; - } -} - /// \returns True if in-tree use also needs extract. 
This refers to /// possible scalar operand in vectorized instruction. static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, @@ -388,6 +334,26 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, } } +/// \returns the AA location that is being access by the instruction. +static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) { + if (StoreInst *SI = dyn_cast(I)) + return MemoryLocation::get(SI); + if (LoadInst *LI = dyn_cast(I)) + return MemoryLocation::get(LI); + return MemoryLocation(); +} + +/// \returns True if the instruction is not a volatile or atomic load/store. +static bool isSimple(Instruction *I) { + if (LoadInst *LI = dyn_cast(I)) + return LI->isSimple(); + if (StoreInst *SI = dyn_cast(I)) + return SI->isSimple(); + if (MemIntrinsic *MI = dyn_cast(I)) + return !MI->isVolatile(); + return true; +} + /// Bottom Up SLP Vectorizer. class BoUpSLP { public: @@ -396,11 +362,11 @@ public: typedef SmallPtrSet ValueSet; typedef SmallVector StoreList; - BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl, - TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, - LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC) + BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, + TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, + DominatorTree *Dt, AssumptionCache *AC) : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func), - SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), + SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), Builder(Se->getContext()) { CodeMetrics::collectEphemeralValues(F, AC, EphValues); } @@ -437,12 +403,12 @@ public: } /// \returns true if the memory operations A and B are consecutive. - bool isConsecutiveAccess(Value *A, Value *B); + bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL); /// \brief Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); - /// \returns true if it is benefitial to reverse the vector order. + /// \returns true if it is beneficial to reverse the vector order. bool shouldReorder() const { return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder; } @@ -490,10 +456,20 @@ private: /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef VL, VectorType *Ty); - /// \returns whether the VectorizableTree is fully vectoriable and will + /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(); + /// \reorder commutative operands in alt shuffle if they result in + /// vectorized code. + void reorderAltShuffleOperands(ArrayRef VL, + SmallVectorImpl &Left, + SmallVectorImpl &Right); + /// \reorder commutative operands to get better probability of + /// generating vectorized code. + void reorderInputsAccordingToOpcode(ArrayRef VL, + SmallVectorImpl &Left, + SmallVectorImpl &Right); struct TreeEntry { TreeEntry() : Scalars(), VectorizedValue(nullptr), NeedToGather(0) {} @@ -516,7 +492,7 @@ private: /// Create a new VectorizableTree entry. TreeEntry *newTreeEntry(ArrayRef VL, bool Vectorized) { - VectorizableTree.push_back(TreeEntry()); + VectorizableTree.emplace_back(); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); @@ -545,7 +521,7 @@ private: /// This POD struct describes one external user in the vectorized tree. 
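/// For example (illustrative IR, not part of this patch): if lane 0 of a
/// vectorized tree value is also consumed by a call outside the tree, an
/// extractelement must be emitted for that external user:
/// \code
///   %vec = fadd <2 x float> %a, %b          ; vectorized tree value
///   %s0  = extractelement <2 x float> %vec, i32 0
///   call void @use(float %s0)               ; external user of lane 0
/// \endcode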
struct ExternalUser { ExternalUser (Value *S, llvm::User *U, int L) : - Scalar(S), User(U), Lane(L){}; + Scalar(S), User(U), Lane(L){} // Which scalar in our function. Value *Scalar; // Which user that uses the scalar. @@ -555,6 +531,52 @@ private: }; typedef SmallVector UserList; + /// Checks if two instructions may access the same memory. + /// + /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it + /// is invariant in the calling loop. + bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1, + Instruction *Inst2) { + + // First check if the result is already in the cache. + AliasCacheKey key = std::make_pair(Inst1, Inst2); + Optional &result = AliasCache[key]; + if (result.hasValue()) { + return result.getValue(); + } + MemoryLocation Loc2 = getLocation(Inst2, AA); + bool aliased = true; + if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) { + // Do the alias check. + aliased = AA->alias(Loc1, Loc2); + } + // Store the result in the cache. + result = aliased; + return aliased; + } + + typedef std::pair AliasCacheKey; + + /// Cache for alias results. + /// TODO: consider moving this to the AliasAnalysis itself. + DenseMap> AliasCache; + + /// Removes an instruction from its block and eventually deletes it. + /// It's like Instruction::eraseFromParent() except that the actual deletion + /// is delayed until BoUpSLP is destructed. + /// This is required to ensure that there are no incorrect collisions in the + /// AliasCache, which can happen if a new instruction is allocated at the + /// same address as a previously deleted instruction. + void eraseInstruction(Instruction *I) { + I->removeFromParent(); + I->dropAllReferences(); + DeletedInstructions.push_back(std::unique_ptr(I)); + } + + /// Temporary store for deleted instructions. Instructions will be deleted + /// eventually when the BoUpSLP is destructed. + SmallVector, 8> DeletedInstructions; + /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). UserList ExternalUses; @@ -710,6 +732,8 @@ private: : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize), ScheduleStart(nullptr), ScheduleEnd(nullptr), FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr), + ScheduleRegionSize(0), + ScheduleRegionSizeLimit(ScheduleRegionSizeBudget), // Make sure that the initial SchedulingRegionID is greater than the // initial SchedulingRegionID in ScheduleData (which is 0). SchedulingRegionID(1) {} @@ -721,6 +745,13 @@ private: FirstLoadStoreInRegion = nullptr; LastLoadStoreInRegion = nullptr; + // Reduce the maximum schedule region size by the size of the + // previous scheduling run. + ScheduleRegionSizeLimit -= ScheduleRegionSize; + if (ScheduleRegionSizeLimit < MinScheduleRegionSize) + ScheduleRegionSizeLimit = MinScheduleRegionSize; + ScheduleRegionSize = 0; + // Make a new scheduling region, i.e. all existing ScheduleData is not // in the new region yet. ++SchedulingRegionID; @@ -791,13 +822,14 @@ private: /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. - bool tryScheduleBundle(ArrayRef VL, AliasAnalysis *AA); + bool tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP); /// Un-bundles a group of instructions. void cancelScheduling(ArrayRef VL); /// Extends the scheduling region so that V is inside the region. - void extendSchedulingRegion(Value *V); + /// \returns true if the region size is within the limit. 
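/// A minimal sketch of the budget check performed while searching for V,
/// using the counters introduced in this patch:
/// \code
///   if (++ScheduleRegionSize > ScheduleRegionSizeLimit)
///     return false; // would grow the region past the per-block budget
/// \endcode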
+ bool extendSchedulingRegion(Value *V); /// Initialize the ScheduleData structures for new instructions in the /// scheduling region. @@ -808,7 +840,7 @@ private: /// Updates the dependency information of a bundle and of all instructions/ /// bundles which depend on the original bundle. void calculateDependencies(ScheduleData *SD, bool InsertInReadyList, - AliasAnalysis *AA); + BoUpSLP *SLP); /// Sets all instruction in the scheduling region to un-scheduled. void resetSchedule(); @@ -851,13 +883,19 @@ private: /// (can be null). ScheduleData *LastLoadStoreInRegion; + /// The current size of the scheduling region. + int ScheduleRegionSize; + + /// The maximum size allowed for the scheduling region. + int ScheduleRegionSizeLimit; + /// The ID of the scheduling region. For a new vectorization iteration this /// is incremented which "removes" all ScheduleData from the region. int SchedulingRegionID; }; /// Attaches the BlockScheduling structures to basic blocks. - DenseMap> BlocksSchedules; + MapVector> BlocksSchedules; /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. @@ -875,7 +913,6 @@ private: // Analysis and block reference. Function *F; ScalarEvolution *SE; - const DataLayout *DL; TargetTransformInfo *TTI; TargetLibraryInfo *TLI; AliasAnalysis *AA; @@ -1031,11 +1068,11 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } } - // If any of the scalars appears in the table OR it is marked as a value that - // needs to stat scalar then we need to gather the scalars. + // If any of the scalars is marked as a value that needs to stay scalar then + // we need to gather the scalars. for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) { - DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n"); + if (MustGather.count(VL[i])) { + DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); newTreeEntry(VL, false); return; } @@ -1069,9 +1106,11 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, AA)) { + if (!BS.tryScheduleBundle(VL, this)) { DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); - BS.cancelScheduling(VL); + assert((!BS.getScheduleData(VL[0]) || + !BS.getScheduleData(VL[0])->isPartOfBundle()) && + "tryScheduleBundle should cancelScheduling on failure"); newTreeEntry(VL, false); return; } @@ -1119,6 +1158,23 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { return; } case Instruction::Load: { + // Check that a vectorized load would load the same memory as a scalar + // load. + // For example we don't want vectorize loads that are smaller than 8 bit. + // Even though we have a packed struct {} LLVM treats + // loading/storing it as an i8 struct. If we vectorize loads/stores from + // such a struct we read/write packed bits disagreeing with the + // unvectorized version. + const DataLayout &DL = F->getParent()->getDataLayout(); + Type *ScalarTy = VL[0]->getType(); + + if (DL.getTypeSizeInBits(ScalarTy) != + DL.getTypeAllocSizeInBits(ScalarTy)) { + BS.cancelScheduling(VL); + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); + return; + } // Check if the loads are consecutive or of we need to swizzle them. 
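// Illustrative example (not from this patch): for
//   %l0 = load i32, i32* %p
//   %l1 = load i32, i32* %q   ; %q = getelementptr i32, i32* %p, i64 1
// the pair (%l0, %l1) is consecutive and keeps its order, while the
// reversed pair (%l1, %l0) only counts toward NumLoadsWantToChangeOrder.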
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { LoadInst *L = cast(VL[i]); @@ -1128,8 +1184,9 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } - if (!isConsecutiveAccess(VL[i], VL[i + 1])) { - if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) { + + if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { + if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) { ++NumLoadsWantToChangeOrder; } BS.cancelScheduling(VL); @@ -1158,7 +1215,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { Type *SrcTy = VL0->getOperand(0)->getType(); for (unsigned i = 0; i < VL.size(); ++i) { Type *Ty = cast(VL[i])->getOperand(0)->getType(); - if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) { + if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); @@ -1181,7 +1238,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { case Instruction::ICmp: case Instruction::FCmp: { // Check that all of the compares have the same predicate. - CmpInst::Predicate P0 = dyn_cast(VL0)->getPredicate(); + CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Type *ComparedTy = cast(VL[0])->getOperand(0)->getType(); for (unsigned i = 1, e = VL.size(); i < e; ++i) { CmpInst *Cmp = cast(VL[i]); @@ -1298,9 +1355,10 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { return; } case Instruction::Store: { + const DataLayout &DL = F->getParent()->getDataLayout(); // Check if the stores are consecutive or of we need to swizzle them. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1])) { + if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { BS.cancelScheduling(VL); newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); @@ -1381,6 +1439,16 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); + + // Reorder operands if reordering would enable vectorization. + if (isa(VL0)) { + ValueList Left, Right; + reorderAltShuffleOperands(VL, Left, Right); + buildTree_rec(Left, Depth + 1); + buildTree_rec(Right, Depth + 1); + return; + } + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1624,8 +1692,10 @@ bool BoUpSLP::isFullyVectorizableTinyTree() { if (VectorizableTree.size() != 2) return false; - // Handle splat stores. - if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars)) + // Handle splat and all-constants stores. + if (!VectorizableTree[0].NeedToGather && + (allConstant(VectorizableTree[1].Scalars) || + isSplat(VectorizableTree[1].Scalars))) return true; // Gathering cost would be too much for tiny trees. @@ -1672,7 +1742,8 @@ int BoUpSLP::getSpillCost() { } // Now find the sequence of instructions between PrevInst and Inst. - BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst); + BasicBlock::reverse_iterator InstIt(Inst->getIterator()), + PrevInstIt(PrevInst->getIterator()); --PrevInstIt; while (InstIt != PrevInstIt) { if (PrevInstIt == PrevInst->getParent()->rend()) { @@ -1704,7 +1775,7 @@ int BoUpSLP::getTreeCost() { // We only vectorize tiny trees if it is fully vectorizable. 
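// For instance (assumed example, not from this patch), the two-entry tree
// for
//   a[0] = x; a[1] = x;
// has a splat second entry and becomes one broadcast plus one vector
// store; with this change the same holds for an all-constant second entry
// such as a[0] = 1; a[1] = 2;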
if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) { - if (!VectorizableTree.size()) { + if (VectorizableTree.empty()) { assert(!ExternalUses.size() && "We should not have any external users"); } return INT_MAX; @@ -1777,7 +1848,7 @@ unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { return -1; } -bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { +bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) { Value *PtrA = getPointerOperand(A); Value *PtrB = getPointerOperand(B); unsigned ASA = getAddressSpaceOperand(A); @@ -1791,13 +1862,13 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) return false; - unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA); + unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); Type *Ty = cast(PtrA->getType())->getElementType(); - APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty)); + APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty)); APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); - PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA); - PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB); + PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); + PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); APInt OffsetDelta = OffsetB - OffsetA; @@ -1818,9 +1889,231 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { return X == PtrSCEVB; } +// Reorder commutative operations in alternate shuffle if the resulting vectors +// are consecutive loads. This would allow us to vectorize the tree. +// If we have something like- +// load a[0] - load b[0] +// load b[1] + load a[1] +// load a[2] - load b[2] +// load a[3] + load b[3] +// Reordering the second load b[1] load a[1] would allow us to vectorize this +// code. +void BoUpSLP::reorderAltShuffleOperands(ArrayRef VL, + SmallVectorImpl &Left, + SmallVectorImpl &Right) { + const DataLayout &DL = F->getParent()->getDataLayout(); + + // Push left and right operands of binary operation into Left and Right + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Left.push_back(cast(VL[i])->getOperand(0)); + Right.push_back(cast(VL[i])->getOperand(1)); + } + + // Reorder if we have a commutative operation and consecutive access + // are on either side of the alternate instructions. + for (unsigned j = 0; j < VL.size() - 1; ++j) { + if (LoadInst *L = dyn_cast(Left[j])) { + if (LoadInst *L1 = dyn_cast(Right[j + 1])) { + Instruction *VL1 = cast(VL[j]); + Instruction *VL2 = cast(VL[j + 1]); + if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) { + std::swap(Left[j], Right[j]); + continue; + } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + // else unchanged + } + } + if (LoadInst *L = dyn_cast(Right[j])) { + if (LoadInst *L1 = dyn_cast(Left[j + 1])) { + Instruction *VL1 = cast(VL[j]); + Instruction *VL2 = cast(VL[j + 1]); + if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) { + std::swap(Left[j], Right[j]); + continue; + } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + // else unchanged + } + } + } +} + + +// Return true if I should be commuted before adding it's left and right +// operands to the arrays Left and Right. 
+// +// The vectorizer is trying to either have all elements one side being +// instruction with the same opcode to enable further vectorization, or having +// a splat to lower the vectorizing cost. +static bool shouldReorderOperands(int i, Instruction &I, + SmallVectorImpl &Left, + SmallVectorImpl &Right) { + Value *VLeft = I.getOperand(0); + Value *VRight = I.getOperand(1); + Instruction *ILeft = dyn_cast(VLeft); + Instruction *IRight = dyn_cast(VRight); + + // Sort two opcodes. In the code below we try to preserve the ability to use + // broadcast of values instead of individual inserts. + // vl1 = load + // vl2 = phi + // vr1 = load + // vr2 = vr2 + // = vl1 x vr1 + // = vl2 x vr2 + // If we just sorted according to opcode we would leave the first line in + // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load). + // = vl1 x vr1 + // = vr2 x vl2 + // Because vr2 and vr1 are from the same load we loose the opportunity of a + // broadcast for the packed right side in the backend: we have [vr1, vl2] + // instead of [vr1, vr2=vr1]. + if (ILeft && IRight) { + if (ILeft->getOpcode() > IRight->getOpcode() && + Right[i - 1] != IRight) { + // Try not to destroy a broad cast for no apparent benefit. + return true; + } else if (ILeft->getOpcode() == IRight->getOpcode() && + Right[i - 1] == ILeft) { + // Try preserve broadcasts. + return true; + } else if (ILeft->getOpcode() == IRight->getOpcode() && + Left[i - 1] == IRight) { + // Try preserve broadcasts. + return true; + } + return false; + } + // One opcode, put the instruction on the right. + if (ILeft) { + return true; + } + return false; +} + +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL, + SmallVectorImpl &Left, + SmallVectorImpl &Right) { + + SmallVector OrigLeft, OrigRight; + + if (VL.size()) { + // Peel the first iteration out of the loop since there's nothing + // interesting to do anyway and it simplifies the checks + auto VLeft = cast(VL[0])->getOperand(0); + auto VRight = cast(VL[0])->getOperand(1); + OrigLeft.push_back(VLeft); + OrigRight.push_back(VRight); + if (!isa(VRight) && isa(VLeft)) + // Favor having instruction to the right. FIXME: why? + std::swap(VLeft, VRight); + Left.push_back(VLeft); + Right.push_back(VRight); + } + + // Keep track if we have instructions with all the same opcode on one side. + bool AllSameOpcodeLeft = isa(Left[0]); + bool AllSameOpcodeRight = isa(Right[0]); + + for (unsigned i = 1, e = VL.size(); i != e; ++i) { + Instruction *I = cast(VL[i]); + + Value *VLeft = I->getOperand(0); + Value *VRight = I->getOperand(1); + OrigLeft.push_back(VLeft); + OrigRight.push_back(VRight); + Instruction *ILeft = dyn_cast(VLeft); + Instruction *IRight = dyn_cast(VRight); + + // Check whether all operands on one side have the same opcode. In this case + // we want to preserve the original order and not make things worse by + // reordering. + if (AllSameOpcodeLeft && ILeft) { + if (Instruction *PLeft = dyn_cast(OrigLeft[i - 1])) { + if (PLeft->getOpcode() != ILeft->getOpcode()) + AllSameOpcodeLeft = false; + } else + AllSameOpcodeLeft = false; + } + if (AllSameOpcodeRight && IRight) { + if (Instruction *PRight = dyn_cast(OrigRight[i - 1])) { + if (PRight->getOpcode() != IRight->getOpcode()) + AllSameOpcodeRight = false; + } else + AllSameOpcodeRight = false; + } + + + // Commute to favor either a splat or maximizing having the same opcodes on + // one side. 
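// Concrete two-lane illustration (assumed, not from this patch): for
//   t0 = a + b
//   t1 = b + c
// commuting t1 gives Left = {a, c} and Right = {b, b}, so the right-hand
// vector becomes a single broadcast of b instead of two separate inserts.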
+ if (shouldReorderOperands(i, *I, Left, Right)) { + Left.push_back(I->getOperand(1)); + Right.push_back(I->getOperand(0)); + } else { + Left.push_back(I->getOperand(0)); + Right.push_back(I->getOperand(1)); + } + } + + bool LeftBroadcast = isSplat(Left); + bool RightBroadcast = isSplat(Right); + + // If operands end up being broadcast return this operand order. + if (LeftBroadcast || RightBroadcast) + return; + + // Don't reorder if the operands where good to begin. + if (AllSameOpcodeRight || AllSameOpcodeLeft) { + Left = OrigLeft; + Right = OrigRight; + } + + const DataLayout &DL = F->getParent()->getDataLayout(); + + // Finally check if we can get longer vectorizable chain by reordering + // without breaking the good operand order detected above. + // E.g. If we have something like- + // load a[0] load b[0] + // load b[1] load a[1] + // load a[2] load b[2] + // load a[3] load b[3] + // Reordering the second load b[1] load a[1] would allow us to vectorize + // this code and we still retain AllSameOpcode property. + // FIXME: This load reordering might break AllSameOpcode in some rare cases + // such as- + // add a[0],c[0] load b[0] + // add a[1],c[2] load b[1] + // b[2] load b[2] + // add a[3],c[3] load b[3] + for (unsigned j = 0; j < VL.size() - 1; ++j) { + if (LoadInst *L = dyn_cast(Left[j])) { + if (LoadInst *L1 = dyn_cast(Right[j + 1])) { + if (isConsecutiveAccess(L, L1, DL)) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + } + } + if (LoadInst *L = dyn_cast(Right[j])) { + if (LoadInst *L1 = dyn_cast(Left[j + 1])) { + if (isConsecutiveAccess(L, L1, DL)) { + std::swap(Left[j + 1], Right[j + 1]); + continue; + } + } + } + // else unchanged + } +} + void BoUpSLP::setInsertPointAfterBundle(ArrayRef VL) { Instruction *VL0 = cast(VL[0]); - BasicBlock::iterator NextInst = VL0; + BasicBlock::iterator NextInst(VL0); ++NextInst; Builder.SetInsertPoint(VL0->getParent(), NextInst); Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); @@ -1904,6 +2197,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return Gather(E->Scalars, VecTy); } + const DataLayout &DL = F->getParent()->getDataLayout(); unsigned Opcode = getSameOpcode(E->Scalars); switch (Opcode) { @@ -1928,9 +2222,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } // Prepare the operand vector. 
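// Illustration (assumed, not from this patch): for the bundle
//   p0 = phi [a0, BB1], [b0, BB2]
//   p1 = phi [a1, BB1], [b1, BB2]
// the operand vector gathered for BB1 is {a0, a1} and for BB2 is
// {b0, b1}; each one is vectorized at the terminator of its incoming
// block.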
- for (unsigned j = 0; j < E->Scalars.size(); ++j) - Operands.push_back(cast(E->Scalars[j])-> - getIncomingValueForBlock(IBB)); + for (Value *V : E->Scalars) + Operands.push_back(cast(V)->getIncomingValueForBlock(IBB)); Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); @@ -1964,8 +2257,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::FPTrunc: case Instruction::BitCast: { ValueList INVL; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) - INVL.push_back(cast(E->Scalars[i])->getOperand(0)); + for (Value *V : E->Scalars) + INVL.push_back(cast(V)->getOperand(0)); setInsertPointAfterBundle(E->Scalars); @@ -1983,9 +2276,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::FCmp: case Instruction::ICmp: { ValueList LHSV, RHSV; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) { - LHSV.push_back(cast(E->Scalars[i])->getOperand(0)); - RHSV.push_back(cast(E->Scalars[i])->getOperand(1)); + for (Value *V : E->Scalars) { + LHSV.push_back(cast(V)->getOperand(0)); + RHSV.push_back(cast(V)->getOperand(1)); } setInsertPointAfterBundle(E->Scalars); @@ -1996,7 +2289,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (Value *V = alreadyVectorized(E->Scalars)) return V; - CmpInst::Predicate P0 = dyn_cast(VL0)->getPredicate(); + CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V; if (Opcode == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); @@ -2009,10 +2302,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::Select: { ValueList TrueVec, FalseVec, CondVec; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) { - CondVec.push_back(cast(E->Scalars[i])->getOperand(0)); - TrueVec.push_back(cast(E->Scalars[i])->getOperand(1)); - FalseVec.push_back(cast(E->Scalars[i])->getOperand(2)); + for (Value *V : E->Scalars) { + CondVec.push_back(cast(V)->getOperand(0)); + TrueVec.push_back(cast(V)->getOperand(1)); + FalseVec.push_back(cast(V)->getOperand(2)); } setInsertPointAfterBundle(E->Scalars); @@ -2051,9 +2344,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (isa(VL0) && VL0->isCommutative()) reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); else - for (int i = 0, e = E->Scalars.size(); i < e; ++i) { - LHSVL.push_back(cast(E->Scalars[i])->getOperand(0)); - RHSVL.push_back(cast(E->Scalars[i])->getOperand(1)); + for (Value *V : E->Scalars) { + LHSVL.push_back(cast(V)->getOperand(0)); + RHSVL.push_back(cast(V)->getOperand(1)); } setInsertPointAfterBundle(E->Scalars); @@ -2100,8 +2393,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { unsigned Alignment = LI->getAlignment(); LI = Builder.CreateLoad(VecPtr); - if (!Alignment) - Alignment = DL->getABITypeAlignment(ScalarLoadTy); + if (!Alignment) { + Alignment = DL.getABITypeAlignment(ScalarLoadTy); + } LI->setAlignment(Alignment); E->VectorizedValue = LI; ++NumVectorInstructions; @@ -2113,8 +2407,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { unsigned AS = SI->getPointerAddressSpace(); ValueList ValueOp; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) - ValueOp.push_back(cast(E->Scalars[i])->getValueOperand()); + for (Value *V : E->Scalars) + ValueOp.push_back(cast(V)->getValueOperand()); setInsertPointAfterBundle(E->Scalars); @@ -2130,8 +2424,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back( ExternalUser(SI->getPointerOperand(), cast(VecPtr), 0)); - if (!Alignment) - Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); + if (!Alignment) { + Alignment = 
DL.getABITypeAlignment(SI->getValueOperand()->getType()); + } S->setAlignment(Alignment); E->VectorizedValue = S; ++NumVectorInstructions; @@ -2141,8 +2436,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { setInsertPointAfterBundle(E->Scalars); ValueList Op0VL; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) - Op0VL.push_back(cast(E->Scalars[i])->getOperand(0)); + for (Value *V : E->Scalars) + Op0VL.push_back(cast(V)->getOperand(0)); Value *Op0 = vectorizeTree(Op0VL); @@ -2150,14 +2445,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { for (int j = 1, e = cast(VL0)->getNumOperands(); j < e; ++j) { ValueList OpVL; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) - OpVL.push_back(cast(E->Scalars[i])->getOperand(j)); + for (Value *V : E->Scalars) + OpVL.push_back(cast(V)->getOperand(j)); Value *OpVec = vectorizeTree(OpVL); OpVecs.push_back(OpVec); } - Value *V = Builder.CreateGEP(Op0, OpVecs); + Value *V = Builder.CreateGEP( + cast(VL0)->getSourceElementType(), Op0, OpVecs); E->VectorizedValue = V; ++NumVectorInstructions; @@ -2173,7 +2469,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Intrinsic::ID IID = Intrinsic::not_intrinsic; Value *ScalarArg = nullptr; if (CI && (FI = CI->getCalledFunction())) { - IID = (Intrinsic::ID) FI->getIntrinsicID(); + IID = FI->getIntrinsicID(); } std::vector OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { @@ -2186,8 +2482,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { OpVecs.push_back(CEI->getArgOperand(j)); continue; } - for (int i = 0, e = E->Scalars.size(); i < e; ++i) { - CallInst *CEI = cast(E->Scalars[i]); + for (Value *V : E->Scalars) { + CallInst *CEI = cast(V); OpVL.push_back(CEI->getArgOperand(j)); } @@ -2214,10 +2510,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - for (int i = 0, e = E->Scalars.size(); i < e; ++i) { - LHSVL.push_back(cast(E->Scalars[i])->getOperand(0)); - RHSVL.push_back(cast(E->Scalars[i])->getOperand(1)); - } + assert(isa(VL0) && "Invalid Shuffle Vector Operand"); + reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); setInsertPointAfterBundle(E->Scalars); Value *LHS = vectorizeTree(LHSVL); @@ -2276,7 +2570,7 @@ Value *BoUpSLP::vectorizeTree() { scheduleBlock(BSIter.second.get()); } - Builder.SetInsertPoint(F->getEntryBlock().begin()); + Builder.SetInsertPoint(&F->getEntryBlock().front()); vectorizeTree(&VectorizableTree[0]); DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); @@ -2321,7 +2615,7 @@ Value *BoUpSLP::vectorizeTree() { User->replaceUsesOfWith(Scalar, Ex); } } else { - Builder.SetInsertPoint(F->getEntryBlock().begin()); + Builder.SetInsertPoint(&F->getEntryBlock().front()); Value *Ex = Builder.CreateExtractElement(Vec, Lane); CSEBlocks.insert(&F->getEntryBlock()); User->replaceUsesOfWith(Scalar, Ex); @@ -2360,7 +2654,7 @@ Value *BoUpSLP::vectorizeTree() { Scalar->replaceAllUsesWith(Undef); } DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); - cast(Scalar)->eraseFromParent(); + eraseInstruction(cast(Scalar)); } } @@ -2430,7 +2724,7 @@ void BoUpSLP::optimizeGatherSequence() { BasicBlock *BB = (*I)->getBlock(); // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { - Instruction *In = it++; + Instruction *In = &*it++; if (!isa(In) && !isa(In)) continue; @@ -2442,7 +2736,7 @@ void BoUpSLP::optimizeGatherSequence() { if (In->isIdenticalTo(*v) && DT->dominates((*v)->getParent(), In->getParent())) { 
In->replaceAllUsesWith(*v); - In->eraseFromParent(); + eraseInstruction(In); In = nullptr; break; } @@ -2460,7 +2754,7 @@ void BoUpSLP::optimizeGatherSequence() { // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, - AliasAnalysis *AA) { + BoUpSLP *SLP) { if (isa(VL[0])) return true; @@ -2470,8 +2764,15 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, ScheduleData *Bundle = nullptr; bool ReSchedule = false; DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n"); + + // Make sure that the scheduling region contains all + // instructions of the bundle. + for (Value *V : VL) { + if (!extendSchedulingRegion(V)) + return false; + } + for (Value *V : VL) { - extendSchedulingRegion(V); ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); @@ -2517,7 +2818,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " << BB->getName() << "\n"); - calculateDependencies(Bundle, true, AA); + calculateDependencies(Bundle, true, SLP); // Now try to schedule the new bundle. As soon as the bundle is "ready" it // means that there are no cyclic dependencies and we can schedule it. @@ -2532,7 +2833,11 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, schedule(pickedSD, ReadyInsts); } } - return Bundle->isReady(); + if (!Bundle->isReady()) { + cancelScheduling(VL); + return false; + } + return true; } void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL) { @@ -2561,9 +2866,9 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL) { } } -void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { +bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { if (getScheduleData(V)) - return; + return true; Instruction *I = dyn_cast(V); assert(I && "bundle member must be an instruction"); assert(!isa(I) && "phi nodes don't need to be scheduled"); @@ -2574,21 +2879,26 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { ScheduleEnd = I->getNextNode(); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); - return; + return true; } // Search up and down at the same time, because we don't know if the new // instruction is above or below the existing scheduling region. 
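// Rough shape of the walk (simplified): one reverse cursor starts at
// ScheduleStart, one forward cursor at ScheduleEnd, and both advance in
// lock-step until one of them reaches I; every step is charged against
// ScheduleRegionSizeLimit (see the check added below).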
- BasicBlock::reverse_iterator UpIter(ScheduleStart); + BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator()); BasicBlock::reverse_iterator UpperEnd = BB->rend(); BasicBlock::iterator DownIter(ScheduleEnd); BasicBlock::iterator LowerEnd = BB->end(); for (;;) { + if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { + DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n"); + return false; + } + if (UpIter != UpperEnd) { if (&*UpIter == I) { initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); ScheduleStart = I; DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); - return; + return true; } UpIter++; } @@ -2599,13 +2909,14 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { ScheduleEnd = I->getNextNode(); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); - return; + return true; } DownIter++; } assert((UpIter != UpperEnd || DownIter != LowerEnd) && "instruction not found in block"); } + return true; } void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, @@ -2648,18 +2959,9 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, } } -/// \returns the AA location that is being access by the instruction. -static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) { - if (StoreInst *SI = dyn_cast(I)) - return AA->getLocation(SI); - if (LoadInst *LI = dyn_cast(I)) - return AA->getLocation(LI); - return AliasAnalysis::Location(); -} - void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, bool InsertInReadyList, - AliasAnalysis *AA) { + BoUpSLP *SLP) { assert(SD->isSchedulingEntity()); SmallVector WorkList; @@ -2694,8 +2996,8 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, } } else { // I'm not sure if this can ever happen. But we need to be safe. - // This lets the instruction/bundle never be scheduled and eventally - // disable vectorization. + // This lets the instruction/bundle never be scheduled and + // eventually disable vectorization. BundleMember->Dependencies++; BundleMember->incrementUnscheduledDeps(1); } @@ -2704,26 +3006,60 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, // Handle the memory dependencies. ScheduleData *DepDest = BundleMember->NextLoadStore; if (DepDest) { - AliasAnalysis::Location SrcLoc = getLocation(BundleMember->Inst, AA); + Instruction *SrcInst = BundleMember->Inst; + MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA); bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); + unsigned numAliased = 0; + unsigned DistToSrc = 1; while (DepDest) { assert(isInSchedulingRegion(DepDest)); - if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) { - AliasAnalysis::Location DstLoc = getLocation(DepDest->Inst, AA); - if (!SrcLoc.Ptr || !DstLoc.Ptr || AA->alias(SrcLoc, DstLoc)) { - DepDest->MemoryDependencies.push_back(BundleMember); - BundleMember->Dependencies++; - ScheduleData *DestBundle = DepDest->FirstInBundle; - if (!DestBundle->IsScheduled) { - BundleMember->incrementUnscheduledDeps(1); - } - if (!DestBundle->hasValidDependencies()) { - WorkList.push_back(DestBundle); - } + + // We have two limits to reduce the complexity: + // 1) AliasedCheckLimit: It's a small limit to reduce calls to + // SLP->isAliased (which is the expensive part in this loop). + // 2) MaxMemDepDistance: It's for very large blocks and it aborts + // the whole loop (even if the loop is fast, it's quadratic). 
+ // It's important for the loop break condition (see below) to + // check this limit even between two read-only instructions. + if (DistToSrc >= MaxMemDepDistance || + ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) && + (numAliased >= AliasedCheckLimit || + SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) { + + // We increment the counter only if the locations are aliased + // (instead of counting all alias checks). This gives a better + // balance between reduced runtime and accurate dependencies. + numAliased++; + + DepDest->MemoryDependencies.push_back(BundleMember); + BundleMember->Dependencies++; + ScheduleData *DestBundle = DepDest->FirstInBundle; + if (!DestBundle->IsScheduled) { + BundleMember->incrementUnscheduledDeps(1); + } + if (!DestBundle->hasValidDependencies()) { + WorkList.push_back(DestBundle); } } DepDest = DepDest->NextLoadStore; + + // Example, explaining the loop break condition: Let's assume our + // starting instruction is i0 and MaxMemDepDistance = 3. + // + // +--------v--v--v + // i0,i1,i2,i3,i4,i5,i6,i7,i8 + // +--------^--^--^ + // + // MaxMemDepDistance let us stop alias-checking at i3 and we add + // dependencies from i0 to i3,i4,.. (even if they are not aliased). + // Previously we already added dependencies from i3 to i6,i7,i8 + // (because of MaxMemDepDistance). As we added a dependency from + // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 + // and we can abort this loop at i6. + if (DistToSrc >= 2 * MaxMemDepDistance) + break; + DistToSrc++; } } } @@ -2767,7 +3103,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { }; std::set ReadyInsts; - // Ensure that all depencency data is updated and fill the ready-list with + // Ensure that all dependency data is updated and fill the ready-list with // initial instructions. int Idx = 0; int NumToSchedule = 0; @@ -2779,7 +3115,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { "scheduler and vectorizer have different opinion on what is a bundle"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { - BS->calculateDependencies(SD, false, AA); + BS->calculateDependencies(SD, false, this); NumToSchedule++; } } @@ -2799,7 +3135,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { Instruction *pickedInst = BundleMember->Inst; if (LastScheduledInst->getNextNode() != pickedInst) { BS->BB->getInstList().remove(pickedInst); - BS->BB->getInstList().insert(LastScheduledInst, pickedInst); + BS->BB->getInstList().insert(LastScheduledInst->getIterator(), + pickedInst); } LastScheduledInst = pickedInst; BundleMember = BundleMember->NextInBundle; @@ -2827,7 +3164,6 @@ struct SLPVectorizer : public FunctionPass { } ScalarEvolution *SE; - const DataLayout *DL; TargetTransformInfo *TTI; TargetLibraryInfo *TLI; AliasAnalysis *AA; @@ -2839,13 +3175,12 @@ struct SLPVectorizer : public FunctionPass { if (skipOptnoneFunction(F)) return false; - SE = &getAnalysis(); - DataLayoutPass *DLP = getAnalysisIfAvailable(); - DL = DLP ? &DLP->getDataLayout() : nullptr; - TTI = &getAnalysis(); - TLI = getAnalysisIfAvailable(); - AA = &getAnalysis(); - LI = &getAnalysis(); + SE = &getAnalysis().getSE(); + TTI = &getAnalysis().getTTI(F); + auto *TLIP = getAnalysisIfAvailable(); + TLI = TLIP ? 
&TLIP->getTLI() : nullptr; + AA = &getAnalysis().getAAResults(); + LI = &getAnalysis().getLoopInfo(); DT = &getAnalysis().getDomTree(); AC = &getAnalysis().getAssumptionCache(F); @@ -2857,10 +3192,16 @@ struct SLPVectorizer : public FunctionPass { if (!TTI->getNumberOfRegisters(true)) return false; - // Must have DataLayout. We can't require it because some tests run w/o - // triple. - if (!DL) - return false; + // Use the vector register size specified by the target unless overridden + // by a command-line option. + // TODO: It would be better to limit the vectorization factor based on + // data type rather than just register size. For example, x86 AVX has + // 256-bit registers, but it does not support integer operations + // at that width (that requires AVX2). + if (MaxVectorRegSizeOption.getNumOccurrences()) + MaxVecRegSize = MaxVectorRegSizeOption; + else + MaxVecRegSize = TTI->getRegisterBitWidth(true); // Don't vectorize when the attribute NoImplicitFloat is used. if (F.hasFnAttribute(Attribute::NoImplicitFloat)) @@ -2870,12 +3211,13 @@ struct SLPVectorizer : public FunctionPass { // Use the bottom up slp vectorizer to construct chains that start with // store instructions. - BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AC); + BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC); + + // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to + // delete instructions. // Scan the blocks in the function in post order. - for (po_iterator it = po_begin(&F.getEntryBlock()), - e = po_end(&F.getEntryBlock()); it != e; ++it) { - BasicBlock *BB = *it; + for (auto BB : post_order(&F.getEntryBlock())) { // Vectorize trees that end at stores. if (unsigned count = collectStores(BB, R)) { (void)count; @@ -2898,12 +3240,12 @@ struct SLPVectorizer : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { FunctionPass::getAnalysisUsage(AU); AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); AU.addRequired(); - AU.addPreserved(); + AU.addPreserved(); AU.addPreserved(); AU.setPreservesCFG(); } @@ -2938,37 +3280,36 @@ private: bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); bool vectorizeStoreChain(ArrayRef Chain, int CostThreshold, - BoUpSLP &R); + BoUpSLP &R, unsigned VecRegSize); bool vectorizeStores(ArrayRef Stores, int costThreshold, BoUpSLP &R); private: StoreListMap StoreRefs; + unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. }; /// \brief Check that the Values in the slice in VL array are still existent in /// the WeakVH array. /// Vectorization of part of the VL array may cause later values in the VL array /// to become invalid. We track when this has happened in the WeakVH array. 
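/// Illustrative sketch of the calling pattern (assumed, based on how the
/// tracking vector is built by the callers):
/// \code
///   SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
///   // ...vectorize part of VL, possibly erasing some instructions...
///   if (hasValueBeenRAUWed(VL, TrackValues, SliceBegin, SliceSize))
///     continue; // slice holds a deleted value; its WeakVH went null
/// \endcode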
-static bool hasValueBeenRAUWed(ArrayRef &VL, - SmallVectorImpl &VH, - unsigned SliceBegin, - unsigned SliceSize) { - for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i) - if (VH[i] != VL[i]) - return true; - - return false; +static bool hasValueBeenRAUWed(ArrayRef VL, ArrayRef VH, + unsigned SliceBegin, unsigned SliceSize) { + VL = VL.slice(SliceBegin, SliceSize); + VH = VH.slice(SliceBegin, SliceSize); + return !std::equal(VL.begin(), VL.end(), VH.begin()); } bool SLPVectorizer::vectorizeStoreChain(ArrayRef Chain, - int CostThreshold, BoUpSLP &R) { + int CostThreshold, BoUpSLP &R, + unsigned VecRegSize) { unsigned ChainLen = Chain.size(); DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen << "\n"); Type *StoreTy = cast(Chain[0])->getValueOperand()->getType(); - unsigned Sz = DL->getTypeSizeInBits(StoreTy); - unsigned VF = MinVecRegSize / Sz; + auto &DL = cast(Chain[0])->getModule()->getDataLayout(); + unsigned Sz = DL.getTypeSizeInBits(StoreTy); + unsigned VF = VecRegSize / Sz; if (!isPowerOf2_32(Sz) || VF < 2) return false; @@ -3010,8 +3351,8 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef Chain, bool SLPVectorizer::vectorizeStores(ArrayRef Stores, int costThreshold, BoUpSLP &R) { - SetVector Heads, Tails; - SmallDenseMap ConsecutiveChain; + SetVector Heads, Tails; + SmallDenseMap ConsecutiveChain; // We may run into multiple chains that merge into a single chain. We mark the // stores that we vectorized so that we don't visit the same store twice. @@ -3020,21 +3361,32 @@ bool SLPVectorizer::vectorizeStores(ArrayRef Stores, // Do a quadratic search on all of the given stores and find // all of the pairs of stores that follow each other. + SmallVector IndexQueue; for (unsigned i = 0, e = Stores.size(); i < e; ++i) { - for (unsigned j = 0; j < e; ++j) { - if (i == j) - continue; - - if (R.isConsecutiveAccess(Stores[i], Stores[j])) { - Tails.insert(Stores[j]); + const DataLayout &DL = Stores[i]->getModule()->getDataLayout(); + IndexQueue.clear(); + // If a store has multiple consecutive store candidates, search Stores + // array according to the sequence: from i+1 to e, then from i-1 to 0. + // This is because usually pairing with immediate succeeding or preceding + // candidate create the best chance to find slp vectorization opportunity. + unsigned j = 0; + for (j = i + 1; j < e; ++j) + IndexQueue.push_back(j); + for (j = i; j > 0; --j) + IndexQueue.push_back(j - 1); + + for (auto &k : IndexQueue) { + if (R.isConsecutiveAccess(Stores[i], Stores[k], DL)) { + Tails.insert(Stores[k]); Heads.insert(Stores[i]); - ConsecutiveChain[Stores[i]] = Stores[j]; + ConsecutiveChain[Stores[i]] = Stores[k]; + break; } } } // For stores that start but don't end a link in the chain: - for (SetVector::iterator it = Heads.begin(), e = Heads.end(); + for (SetVector::iterator it = Heads.begin(), e = Heads.end(); it != e; ++it) { if (Tails.count(*it)) continue; @@ -3042,7 +3394,7 @@ bool SLPVectorizer::vectorizeStores(ArrayRef Stores, // We found a store instr that starts a chain. Now follow the chain and try // to vectorize it. BoUpSLP::ValueList Operands; - Value *I = *it; + StoreInst *I = *it; // Collect the chain into a list. while (Tails.count(I) || Heads.count(I)) { if (VectorizedStores.count(I)) @@ -3052,12 +3404,16 @@ bool SLPVectorizer::vectorizeStores(ArrayRef Stores, I = ConsecutiveChain[I]; } - bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R); - - // Mark the vectorized stores so that we don't vectorize them again. 
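// Sketch of the width search that replaces this below (sizes assumed):
// with MaxVecRegSize = 256 and MinVecRegSize = 128, the chain is first
// tried at 256 bits, then at 128, stopping at the first width that
// vectorizes:
//   for (unsigned Size = 256; Size >= 128; Size /= 2)
//     if (vectorizeStoreChain(Operands, costThreshold, R, Size)) break;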
- if (Vectorized) - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed |= Vectorized; + // FIXME: Is division-by-2 the correct step? Should we assert that the + // register size is a power-of-2? + for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) { + if (vectorizeStoreChain(Operands, costThreshold, R, Size)) { + // Mark the vectorized stores so that we don't vectorize them again. + VectorizedStores.insert(Operands.begin(), Operands.end()); + Changed = true; + break; + } + } } return Changed; @@ -3067,8 +3423,9 @@ bool SLPVectorizer::vectorizeStores(ArrayRef Stores, unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { unsigned count = 0; StoreRefs.clear(); - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - StoreInst *SI = dyn_cast(it); + const DataLayout &DL = BB->getModule()->getDataLayout(); + for (Instruction &I : *BB) { + StoreInst *SI = dyn_cast(&I); if (!SI) continue; @@ -3078,7 +3435,7 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { // Check that the pointer points to scalars. Type *Ty = SI->getValueOperand()->getType(); - if (Ty->isAggregateType() || Ty->isVectorTy()) + if (!isValidElementType(Ty)) continue; // Find the base pointer. @@ -3112,16 +3469,19 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, return false; unsigned Opcode0 = I0->getOpcode(); + const DataLayout &DL = I0->getModule()->getDataLayout(); Type *Ty0 = I0->getType(); - unsigned Sz = DL->getTypeSizeInBits(Ty0); + unsigned Sz = DL.getTypeSizeInBits(Ty0); + // FIXME: Register size should be a parameter to this function, so we can + // try different vectorization factors. unsigned VF = MinVecRegSize / Sz; - for (int i = 0, e = VL.size(); i < e; ++i) { - Type *Ty = VL[i]->getType(); - if (Ty->isAggregateType() || Ty->isVectorTy()) + for (Value *V : VL) { + Type *Ty = V->getType(); + if (!isValidElementType(Ty)) return false; - Instruction *Inst = dyn_cast(VL[i]); + Instruction *Inst = dyn_cast(V); if (!Inst || Inst->getOpcode() != Opcode0) return false; } @@ -3180,7 +3540,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, unsigned VecIdx = 0; for (auto &V : BuildVectorSlice) { IRBuilder Builder( - ++BasicBlock::iterator(InsertAfter)); + InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter)); InsertElementInst *IE = cast(V); Instruction *Extract = cast(Builder.CreateExtractElement( VectorizedRoot, Builder.getInt32(VecIdx++))); @@ -3316,8 +3676,7 @@ public: ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {} /// \brief Try to find a reduction tree. - bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B, - const DataLayout *DL) { + bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) { assert((!Phi || std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) && "Thi phi needs to use the binary operator"); @@ -3339,12 +3698,15 @@ public: return false; Type *Ty = B->getType(); - if (Ty->isVectorTy()) + if (!isValidElementType(Ty)) return false; + const DataLayout &DL = B->getModule()->getDataLayout(); ReductionOpcode = B->getOpcode(); ReducedValueOpcode = 0; - ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty); + // FIXME: Register size should be a parameter to this function, so we can + // try different vectorization factors. + ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty); ReductionRoot = B; ReductionPHI = Phi; @@ -3469,7 +3831,7 @@ public: private: - /// \brief Calcuate the cost of a reduction. 
+ /// \brief Calculate the cost of a reduction. int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) { Type *ScalarTy = FirstReducedVal->getType(); Type *VecTy = VectorType::get(ScalarTy, ReduxWidth); @@ -3631,7 +3993,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) { // We may go through BB multiple times so skip the one we have checked. - if (!VisitedInstrs.insert(it).second) + if (!VisitedInstrs.insert(&*it).second) continue; if (isa(it)) @@ -3654,8 +4016,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Try to match and vectorize a horizontal reduction. HorizontalReduction HorRdx; - if (ShouldVectorizeHor && - HorRdx.matchAssociativeReduction(P, BI, DL) && + if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) && HorRdx.tryToReduce(R, TTI)) { Changed = true; it = BB->begin(); @@ -3685,7 +4046,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (BinaryOperator *BinOp = dyn_cast(SI->getValueOperand())) { HorizontalReduction HorRdx; - if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) && + if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) && HorRdx.tryToReduce(R, TTI)) || tryToVectorize(BinOp, R))) { Changed = true; @@ -3729,6 +4090,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // and the iterator may become invalid value. it = BB->begin(); e = BB->end(); + break; } } } @@ -3770,6 +4132,9 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { << it->second.size() << ".\n"); // Process the stores in chunks of 16. + // TODO: The limit of 16 inhibits greater vectorization factors. + // For example, AVX2 supports v32i8. Increasing this limit, however, + // may cause a significant compile-time increase. for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) { unsigned Len = std::min(CE - CI, 16); Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), @@ -3784,10 +4149,10 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { char SLPVectorizer::ID = 0; static const char lv_name[] = "SLP Vectorizer"; INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
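// Usage sketch for the knobs introduced in this patch (flag names come
// from the cl::opt declarations above; the exact opt invocation is an
// assumption):
//
//   opt -slp-vectorizer -slp-max-reg-size=256 -slp-schedule-budget=1000 \
//       -S in.ll -o out.ll
//
// -slp-max-reg-size overrides TTI->getRegisterBitWidth(true) as the
// vectorization width target, and -slp-schedule-budget caps the size of
// each block's scheduling region, decaying to MinScheduleRegionSize once
// the budget has been consumed.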