X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTransforms%2FVectorize%2FSLPVectorizer.cpp;h=f69a4e52c7e1e86441bfd91351cd338374202892;hb=8bb7acb4c17874dd85a9acd009295c5d5cb0fbf9;hp=becd51a5727504d9a107dde9ec1ade7bc86610f8;hpb=9cd73adba00afbd76eb686252ff4d6a4c69ebce2;p=oota-llvm.git diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index becd51a5727..f69a4e52c7e 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopInfo.h" @@ -61,7 +62,7 @@ static cl::opt "number ")); static cl::opt -ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden, +ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions")); static cl::opt ShouldStartVectorizeHorAtStore( @@ -73,6 +74,14 @@ static cl::opt MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); +/// Limits the size of scheduling regions in a block. +/// It avoid long compile times for _very_ large blocks where vector +/// instructions are spread over a wide range. +/// This limit is way higher than needed by real-world functions. +static cl::opt +ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, + cl::desc("Limit the size of the SLP scheduling region per block")); + namespace { // FIXME: Set this via cl::opt to allow overriding. @@ -89,6 +98,10 @@ static const unsigned AliasedCheckLimit = 10; // This limit is useful for very large basic blocks. static const unsigned MaxMemDepDistance = 160; +/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling +/// regions to be handled. +static const int MinScheduleRegionSize = 16; + /// \brief Predicate for the element types that the SLP vectorizer supports. /// /// The most important thing to filter here are types which are invalid in LLVM @@ -156,10 +169,8 @@ static unsigned getAltOpcode(unsigned Op) { /// of an alternate sequence which can later be merged as /// a ShuffleVector instruction. static bool canCombineAsAltInst(unsigned Op) { - if (Op == Instruction::FAdd || Op == Instruction::FSub || - Op == Instruction::Sub || Op == Instruction::Add) - return true; - return false; + return Op == Instruction::FAdd || Op == Instruction::FSub || + Op == Instruction::Sub || Op == Instruction::Add; } /// \returns ShuffleVector instruction if instructions in \p VL have @@ -211,7 +222,7 @@ static void propagateIRFlags(Value *I, ArrayRef VL) { } } } - + /// \returns \p I after propagating metadata from \p VL. static Instruction *propagateMetadata(Instruction *I, ArrayRef VL) { Instruction *I0 = cast(VL[0]); @@ -495,7 +506,7 @@ private: } return Last; } - + /// -- Vectorization State -- /// Holds all of the tree entries. std::vector VectorizableTree; @@ -720,6 +731,8 @@ private: : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize), ScheduleStart(nullptr), ScheduleEnd(nullptr), FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr), + ScheduleRegionSize(0), + ScheduleRegionSizeLimit(ScheduleRegionSizeBudget), // Make sure that the initial SchedulingRegionID is greater than the // initial SchedulingRegionID in ScheduleData (which is 0). SchedulingRegionID(1) {} @@ -731,6 +744,13 @@ private: FirstLoadStoreInRegion = nullptr; LastLoadStoreInRegion = nullptr; + // Reduce the maximum schedule region size by the size of the + // previous scheduling run. + ScheduleRegionSizeLimit -= ScheduleRegionSize; + if (ScheduleRegionSizeLimit < MinScheduleRegionSize) + ScheduleRegionSizeLimit = MinScheduleRegionSize; + ScheduleRegionSize = 0; + // Make a new scheduling region, i.e. all existing ScheduleData is not // in the new region yet. ++SchedulingRegionID; @@ -807,7 +827,8 @@ private: void cancelScheduling(ArrayRef VL); /// Extends the scheduling region so that V is inside the region. - void extendSchedulingRegion(Value *V); + /// \returns true if the region size is within the limit. + bool extendSchedulingRegion(Value *V); /// Initialize the ScheduleData structures for new instructions in the /// scheduling region. @@ -861,6 +882,12 @@ private: /// (can be null). ScheduleData *LastLoadStoreInRegion; + /// The current size of the scheduling region. + int ScheduleRegionSize; + + /// The maximum size allowed for the scheduling region. + int ScheduleRegionSizeLimit; + /// The ID of the scheduling region. For a new vectorization iteration this /// is incremented which "removes" all ScheduleData from the region. int SchedulingRegionID; @@ -1062,7 +1089,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { newTreeEntry(VL, false); return; } - + // Check that every instructions appears once in this bundle. for (unsigned i = 0, e = VL.size(); i < e; ++i) for (unsigned j = i+1; j < e; ++j) @@ -1080,7 +1107,9 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (!BS.tryScheduleBundle(VL, this)) { DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); - BS.cancelScheduling(VL); + assert((!BS.getScheduleData(VL[0]) || + !BS.getScheduleData(VL[0])->isPartOfBundle()) && + "tryScheduleBundle should cancelScheduling on failure"); newTreeEntry(VL, false); return; } @@ -1128,6 +1157,23 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { return; } case Instruction::Load: { + // Check that a vectorized load would load the same memory as a scalar + // load. + // For example we don't want vectorize loads that are smaller than 8 bit. + // Even though we have a packed struct {} LLVM treats + // loading/storing it as an i8 struct. If we vectorize loads/stores from + // such a struct we read/write packed bits disagreeing with the + // unvectorized version. + const DataLayout &DL = F->getParent()->getDataLayout(); + Type *ScalarTy = VL[0]->getType(); + + if (DL.getTypeSizeInBits(ScalarTy) != + DL.getTypeAllocSizeInBits(ScalarTy)) { + BS.cancelScheduling(VL); + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); + return; + } // Check if the loads are consecutive or of we need to swizzle them. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { LoadInst *L = cast(VL[i]); @@ -1137,7 +1183,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } - const DataLayout &DL = F->getParent()->getDataLayout(); + if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) { if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) { ++NumLoadsWantToChangeOrder; @@ -1667,7 +1713,7 @@ int BoUpSLP::getSpillCost() { int Cost = 0; SmallPtrSet LiveValues; - Instruction *PrevInst = nullptr; + Instruction *PrevInst = nullptr; for (unsigned N = 0; N < VectorizableTree.size(); ++N) { Instruction *Inst = dyn_cast(VectorizableTree[N].Scalars[0]); @@ -1692,10 +1738,11 @@ int BoUpSLP::getSpillCost() { for (auto &J : PrevInst->operands()) { if (isa(&*J) && ScalarToTreeEntry.count(&*J)) LiveValues.insert(cast(&*J)); - } + } // Now find the sequence of instructions between PrevInst and Inst. - BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst); + BasicBlock::reverse_iterator InstIt(Inst->getIterator()), + PrevInstIt(PrevInst->getIterator()); --PrevInstIt; while (InstIt != PrevInstIt) { if (PrevInstIt == PrevInst->getParent()->rend()) { @@ -1735,30 +1782,29 @@ int BoUpSLP::getTreeCost() { unsigned BundleWidth = VectorizableTree[0].Scalars.size(); - for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) { - int C = getEntryCost(&VectorizableTree[i]); + for (TreeEntry &TE : VectorizableTree) { + int C = getEntryCost(&TE); DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " - << *VectorizableTree[i].Scalars[0] << " .\n"); + << TE.Scalars[0] << " .\n"); Cost += C; } SmallSet ExtractCostCalculated; int ExtractCost = 0; - for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end(); - I != E; ++I) { + for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. - if (!ExtractCostCalculated.insert(I->Scalar).second) + if (!ExtractCostCalculated.insert(EU.Scalar).second) continue; // Uses by ephemeral values are free (because the ephemeral value will be // removed prior to code generation, and so the extraction will be // removed as well). - if (EphValues.count(I->User)) + if (EphValues.count(EU.User)) continue; - VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth); + VectorType *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, - I->Lane); + EU.Lane); } Cost += getSpillCost(); @@ -1895,106 +1941,126 @@ void BoUpSLP::reorderAltShuffleOperands(ArrayRef VL, } } +// Return true if I should be commuted before adding it's left and right +// operands to the arrays Left and Right. +// +// The vectorizer is trying to either have all elements one side being +// instruction with the same opcode to enable further vectorization, or having +// a splat to lower the vectorizing cost. +static bool shouldReorderOperands(int i, Instruction &I, + SmallVectorImpl &Left, + SmallVectorImpl &Right, + bool AllSameOpcodeLeft, + bool AllSameOpcodeRight, bool SplatLeft, + bool SplatRight) { + Value *VLeft = I.getOperand(0); + Value *VRight = I.getOperand(1); + // If we have "SplatRight", try to see if commuting is needed to preserve it. + if (SplatRight) { + if (VRight == Right[i - 1]) + // Preserve SplatRight + return false; + if (VLeft == Right[i - 1]) { + // Commuting would preserve SplatRight, but we don't want to break + // SplatLeft either, i.e. preserve the original order if possible. + // (FIXME: why do we care?) + if (SplatLeft && VLeft == Left[i - 1]) + return false; + return true; + } + } + // Symmetrically handle Right side. + if (SplatLeft) { + if (VLeft == Left[i - 1]) + // Preserve SplatLeft + return false; + if (VRight == Left[i - 1]) + return true; + } + + Instruction *ILeft = dyn_cast(VLeft); + Instruction *IRight = dyn_cast(VRight); + + // If we have "AllSameOpcodeRight", try to see if the left operands preserves + // it and not the right, in this case we want to commute. + if (AllSameOpcodeRight) { + unsigned RightPrevOpcode = cast(Right[i - 1])->getOpcode(); + if (IRight && RightPrevOpcode == IRight->getOpcode()) + // Do not commute, a match on the right preserves AllSameOpcodeRight + return false; + if (ILeft && RightPrevOpcode == ILeft->getOpcode()) { + // We have a match and may want to commute, but first check if there is + // not also a match on the existing operands on the Left to preserve + // AllSameOpcodeLeft, i.e. preserve the original order if possible. + // (FIXME: why do we care?) + if (AllSameOpcodeLeft && ILeft && + cast(Left[i - 1])->getOpcode() == ILeft->getOpcode()) + return false; + return true; + } + } + // Symmetrically handle Left side. + if (AllSameOpcodeLeft) { + unsigned LeftPrevOpcode = cast(Left[i - 1])->getOpcode(); + if (ILeft && LeftPrevOpcode == ILeft->getOpcode()) + return false; + if (IRight && LeftPrevOpcode == IRight->getOpcode()) + return true; + } + return false; +} + void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right) { - SmallVector OrigLeft, OrigRight; - - bool AllSameOpcodeLeft = true; - bool AllSameOpcodeRight = true; - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - Instruction *I = cast(VL[i]); - Value *VLeft = I->getOperand(0); - Value *VRight = I->getOperand(1); - - OrigLeft.push_back(VLeft); - OrigRight.push_back(VRight); - - Instruction *ILeft = dyn_cast(VLeft); - Instruction *IRight = dyn_cast(VRight); - - // Check whether all operands on one side have the same opcode. In this case - // we want to preserve the original order and not make things worse by - // reordering. - if (i && AllSameOpcodeLeft && ILeft) { - if (Instruction *PLeft = dyn_cast(OrigLeft[i - 1])) { - if (PLeft->getOpcode() != ILeft->getOpcode()) - AllSameOpcodeLeft = false; - } else - AllSameOpcodeLeft = false; - } - if (i && AllSameOpcodeRight && IRight) { - if (Instruction *PRight = dyn_cast(OrigRight[i - 1])) { - if (PRight->getOpcode() != IRight->getOpcode()) - AllSameOpcodeRight = false; - } else - AllSameOpcodeRight = false; - } - - // Sort two opcodes. In the code below we try to preserve the ability to use - // broadcast of values instead of individual inserts. - // vl1 = load - // vl2 = phi - // vr1 = load - // vr2 = vr2 - // = vl1 x vr1 - // = vl2 x vr2 - // If we just sorted according to opcode we would leave the first line in - // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load). - // = vl1 x vr1 - // = vr2 x vl2 - // Because vr2 and vr1 are from the same load we loose the opportunity of a - // broadcast for the packed right side in the backend: we have [vr1, vl2] - // instead of [vr1, vr2=vr1]. - if (ILeft && IRight) { - if (!i && ILeft->getOpcode() > IRight->getOpcode()) { - Left.push_back(IRight); - Right.push_back(ILeft); - } else if (i && ILeft->getOpcode() > IRight->getOpcode() && - Right[i - 1] != IRight) { - // Try not to destroy a broad cast for no apparent benefit. - Left.push_back(IRight); - Right.push_back(ILeft); - } else if (i && ILeft->getOpcode() == IRight->getOpcode() && - Right[i - 1] == ILeft) { - // Try preserve broadcasts. - Left.push_back(IRight); - Right.push_back(ILeft); - } else if (i && ILeft->getOpcode() == IRight->getOpcode() && - Left[i - 1] == IRight) { - // Try preserve broadcasts. - Left.push_back(IRight); - Right.push_back(ILeft); - } else { - Left.push_back(ILeft); - Right.push_back(IRight); - } - continue; - } - // One opcode, put the instruction on the right. - if (ILeft) { - Left.push_back(VRight); - Right.push_back(ILeft); - continue; - } + if (VL.size()) { + // Peel the first iteration out of the loop since there's nothing + // interesting to do anyway and it simplifies the checks in the loop. + auto VLeft = cast(VL[0])->getOperand(0); + auto VRight = cast(VL[0])->getOperand(1); + if (!isa(VRight) && isa(VLeft)) + // Favor having instruction to the right. FIXME: why? + std::swap(VLeft, VRight); Left.push_back(VLeft); Right.push_back(VRight); } - bool LeftBroadcast = isSplat(Left); - bool RightBroadcast = isSplat(Right); - - // If operands end up being broadcast return this operand order. - if (LeftBroadcast || RightBroadcast) - return; + // Keep track if we have instructions with all the same opcode on one side. + bool AllSameOpcodeLeft = isa(Left[0]); + bool AllSameOpcodeRight = isa(Right[0]); + // Keep track if we have one side with all the same value (broadcast). + bool SplatLeft = true; + bool SplatRight = true; - // Don't reorder if the operands where good to begin. - if (AllSameOpcodeRight || AllSameOpcodeLeft) { - Left = OrigLeft; - Right = OrigRight; + for (unsigned i = 1, e = VL.size(); i != e; ++i) { + Instruction *I = cast(VL[i]); + assert(I->isCommutative() && "Can only process commutative instruction"); + // Commute to favor either a splat or maximizing having the same opcodes on + // one side. + if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft, + AllSameOpcodeRight, SplatLeft, SplatRight)) { + Left.push_back(I->getOperand(1)); + Right.push_back(I->getOperand(0)); + } else { + Left.push_back(I->getOperand(0)); + Right.push_back(I->getOperand(1)); + } + // Update Splat* and AllSameOpcode* after the insertion. + SplatRight = SplatRight && (Right[i - 1] == Right[i]); + SplatLeft = SplatLeft && (Left[i - 1] == Left[i]); + AllSameOpcodeLeft = AllSameOpcodeLeft && isa(Left[i]) && + (cast(Left[i - 1])->getOpcode() == + cast(Left[i])->getOpcode()); + AllSameOpcodeRight = AllSameOpcodeRight && isa(Right[i]) && + (cast(Right[i - 1])->getOpcode() == + cast(Right[i])->getOpcode()); } + // If one operand end up being broadcast, return this operand order. + if (SplatRight || SplatLeft) + return; + const DataLayout &DL = F->getParent()->getDataLayout(); // Finally check if we can get longer vectorizable chain by reordering @@ -2035,7 +2101,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL, void BoUpSLP::setInsertPointAfterBundle(ArrayRef VL) { Instruction *VL0 = cast(VL[0]); - BasicBlock::iterator NextInst = VL0; + BasicBlock::iterator NextInst(VL0); ++NextInst; Builder.SetInsertPoint(VL0->getParent(), NextInst); Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); @@ -2486,13 +2552,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *BoUpSLP::vectorizeTree() { - + // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { scheduleBlock(BSIter.second.get()); } - Builder.SetInsertPoint(F->getEntryBlock().begin()); + Builder.SetInsertPoint(&F->getEntryBlock().front()); vectorizeTree(&VectorizableTree[0]); DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); @@ -2537,7 +2603,7 @@ Value *BoUpSLP::vectorizeTree() { User->replaceUsesOfWith(Scalar, Ex); } } else { - Builder.SetInsertPoint(F->getEntryBlock().begin()); + Builder.SetInsertPoint(&F->getEntryBlock().front()); Value *Ex = Builder.CreateExtractElement(Vec, Lane); CSEBlocks.insert(&F->getEntryBlock()); User->replaceUsesOfWith(Scalar, Ex); @@ -2646,7 +2712,7 @@ void BoUpSLP::optimizeGatherSequence() { BasicBlock *BB = (*I)->getBlock(); // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { - Instruction *In = it++; + Instruction *In = &*it++; if (!isa(In) && !isa(In)) continue; @@ -2686,8 +2752,15 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, ScheduleData *Bundle = nullptr; bool ReSchedule = false; DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n"); + + // Make sure that the scheduling region contains all + // instructions of the bundle. + for (Value *V : VL) { + if (!extendSchedulingRegion(V)) + return false; + } + for (Value *V : VL) { - extendSchedulingRegion(V); ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); @@ -2748,7 +2821,11 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, schedule(pickedSD, ReadyInsts); } } - return Bundle->isReady(); + if (!Bundle->isReady()) { + cancelScheduling(VL); + return false; + } + return true; } void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL) { @@ -2777,9 +2854,9 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL) { } } -void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { +bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { if (getScheduleData(V)) - return; + return true; Instruction *I = dyn_cast(V); assert(I && "bundle member must be an instruction"); assert(!isa(I) && "phi nodes don't need to be scheduled"); @@ -2790,21 +2867,26 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { ScheduleEnd = I->getNextNode(); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); - return; + return true; } // Search up and down at the same time, because we don't know if the new // instruction is above or below the existing scheduling region. - BasicBlock::reverse_iterator UpIter(ScheduleStart); + BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator()); BasicBlock::reverse_iterator UpperEnd = BB->rend(); BasicBlock::iterator DownIter(ScheduleEnd); BasicBlock::iterator LowerEnd = BB->end(); for (;;) { + if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { + DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n"); + return false; + } + if (UpIter != UpperEnd) { if (&*UpIter == I) { initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); ScheduleStart = I; DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); - return; + return true; } UpIter++; } @@ -2815,13 +2897,14 @@ void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { ScheduleEnd = I->getNextNode(); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); - return; + return true; } DownIter++; } assert((UpIter != UpperEnd || DownIter != LowerEnd) && "instruction not found in block"); } + return true; } void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, @@ -2990,10 +3073,10 @@ void BoUpSLP::BlockScheduling::resetSchedule() { } void BoUpSLP::scheduleBlock(BlockScheduling *BS) { - + if (!BS->ScheduleStart) return; - + DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); BS->resetSchedule(); @@ -3040,7 +3123,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { Instruction *pickedInst = BundleMember->Inst; if (LastScheduledInst->getNextNode() != pickedInst) { BS->BB->getInstList().remove(pickedInst); - BS->BB->getInstList().insert(LastScheduledInst, pickedInst); + BS->BB->getInstList().insert(LastScheduledInst->getIterator(), + pickedInst); } LastScheduledInst = pickedInst; BundleMember = BundleMember->NextInBundle; @@ -3083,7 +3167,7 @@ struct SLPVectorizer : public FunctionPass { TTI = &getAnalysis().getTTI(F); auto *TLIP = getAnalysisIfAvailable(); TLI = TLIP ? &TLIP->getTLI() : nullptr; - AA = &getAnalysis(); + AA = &getAnalysis().getAAResults(); LI = &getAnalysis().getLoopInfo(); DT = &getAnalysis().getDomTree(); AC = &getAnalysis().getAssumptionCache(F); @@ -3145,12 +3229,14 @@ struct SLPVectorizer : public FunctionPass { FunctionPass::getAnalysisUsage(AU); AU.addRequired(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); AU.setPreservesCFG(); } @@ -3444,7 +3530,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, unsigned VecIdx = 0; for (auto &V : BuildVectorSlice) { IRBuilder Builder( - ++BasicBlock::iterator(InsertAfter)); + InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter)); InsertElementInst *IE = cast(V); Instruction *Extract = cast(Builder.CreateExtractElement( VectorizedRoot, Builder.getInt32(VecIdx++))); @@ -3505,7 +3591,7 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { /// \param NumEltsToRdx The number of elements that should be reduced in the /// vector. /// \param IsPairwise Whether the reduction is a pairwise or splitting -/// reduction. A pairwise reduction will generate a mask of +/// reduction. A pairwise reduction will generate a mask of /// <0,2,...> or <1,3,..> while a splitting reduction will generate /// <2,3, undef,undef> for a vector of 4 and NumElts = 2. /// \param IsLeft True will generate a mask of even elements, odd otherwise. @@ -3568,16 +3654,17 @@ class HorizontalReduction { unsigned ReductionOpcode; /// The opcode of the values we perform a reduction on. unsigned ReducedValueOpcode; - /// The width of one full horizontal reduction operation. - unsigned ReduxWidth; /// Should we model this reduction as a pairwise reduction tree or a tree that /// splits the vector in halves and adds those halves. bool IsPairwiseReduction; public: + /// The width of one full horizontal reduction operation. + unsigned ReduxWidth; + HorizontalReduction() : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0), - ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {} + ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0) {} /// \brief Try to find a reduction tree. bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) { @@ -3623,11 +3710,11 @@ public: return false; // Post order traverse the reduction tree starting at B. We only handle true - // trees containing only binary operators. - SmallVector, 32> Stack; + // trees containing only binary operators or selects. + SmallVector, 32> Stack; Stack.push_back(std::make_pair(B, 0)); while (!Stack.empty()) { - BinaryOperator *TreeN = Stack.back().first; + Instruction *TreeN = Stack.back().first; unsigned EdgeToVist = Stack.back().second++; bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode; @@ -3663,9 +3750,10 @@ public: // Visit left or right. Value *NextV = TreeN->getOperand(EdgeToVist); - BinaryOperator *Next = dyn_cast(NextV); - if (Next) - Stack.push_back(std::make_pair(Next, 0)); + // We currently only allow BinaryOperator's and SelectInst's as reduction + // values in our tree. + if (isa(NextV) || isa(NextV)) + Stack.push_back(std::make_pair(cast(NextV), 0)); else if (NextV != Phi) return false; } @@ -3686,7 +3774,7 @@ public: IRBuilder<> Builder(ReductionRoot); FastMathFlags Unsafe; Unsafe.setUnsafeAlgebra(); - Builder.SetFastMathFlags(Unsafe); + Builder.setFastMathFlags(Unsafe); unsigned i = 0; for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { @@ -3733,8 +3821,11 @@ public: return VectorizedTree != nullptr; } -private: + unsigned numReductionValues() const { + return ReducedVals.size(); + } +private: /// \brief Calculate the cost of a reduction. int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) { Type *ScalarTy = FirstReducedVal->getType(); @@ -3841,6 +3932,82 @@ static bool PhiTypeSorterFunc(Value *V, Value *V2) { return V->getType() < V2->getType(); } +/// \brief Try and get a reduction value from a phi node. +/// +/// Given a phi node \p P in a block \p ParentBB, consider possible reductions +/// if they come from either \p ParentBB or a containing loop latch. +/// +/// \returns A candidate reduction value if possible, or \code nullptr \endcode +/// if not possible. +static Value *getReductionValue(const DominatorTree *DT, PHINode *P, + BasicBlock *ParentBB, LoopInfo *LI) { + // There are situations where the reduction value is not dominated by the + // reduction phi. Vectorizing such cases has been reported to cause + // miscompiles. See PR25787. + auto DominatedReduxValue = [&](Value *R) { + return ( + dyn_cast(R) && + DT->dominates(P->getParent(), dyn_cast(R)->getParent())); + }; + + Value *Rdx = nullptr; + + // Return the incoming value if it comes from the same BB as the phi node. + if (P->getIncomingBlock(0) == ParentBB) { + Rdx = P->getIncomingValue(0); + } else if (P->getIncomingBlock(1) == ParentBB) { + Rdx = P->getIncomingValue(1); + } + + if (Rdx && DominatedReduxValue(Rdx)) + return Rdx; + + // Otherwise, check whether we have a loop latch to look at. + Loop *BBL = LI->getLoopFor(ParentBB); + if (!BBL) + return nullptr; + BasicBlock *BBLatch = BBL->getLoopLatch(); + if (!BBLatch) + return nullptr; + + // There is a loop latch, return the incoming value if it comes from + // that. This reduction pattern occassionaly turns up. + if (P->getIncomingBlock(0) == BBLatch) { + Rdx = P->getIncomingValue(0); + } else if (P->getIncomingBlock(1) == BBLatch) { + Rdx = P->getIncomingValue(1); + } + + if (Rdx && DominatedReduxValue(Rdx)) + return Rdx; + + return nullptr; +} + +/// \brief Attempt to reduce a horizontal reduction. +/// If it is legal to match a horizontal reduction feeding +/// the phi node P with reduction operators BI, then check if it +/// can be done. +/// \returns true if a horizontal reduction was matched and reduced. +/// \returns false if a horizontal reduction was not matched. +static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI, + BoUpSLP &R, TargetTransformInfo *TTI) { + if (!ShouldVectorizeHor) + return false; + + HorizontalReduction HorRdx; + if (!HorRdx.matchAssociativeReduction(P, BI)) + return false; + + // If there is a sufficient number of reduction values, reduce + // to a nearby power-of-2. Can safely generate oversized + // vectors and rely on the backend to split them to legal sizes. + HorRdx.ReduxWidth = + std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues())); + + return HorRdx.tryToReduce(R, TTI); +} + bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; SmallVector Incoming; @@ -3852,9 +4019,8 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Collect the incoming values from the PHIs. Incoming.clear(); - for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie; - ++instr) { - PHINode *P = dyn_cast(instr); + for (Instruction &I : *BB) { + PHINode *P = dyn_cast(&I); if (!P) break; @@ -3897,7 +4063,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) { // We may go through BB multiple times so skip the one we have checked. - if (!VisitedInstrs.insert(it).second) + if (!VisitedInstrs.insert(&*it).second) continue; if (isa(it)) @@ -3908,20 +4074,16 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // Check that the PHI is a reduction PHI. if (P->getNumIncomingValues() != 2) return Changed; - Value *Rdx = - (P->getIncomingBlock(0) == BB - ? (P->getIncomingValue(0)) - : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) - : nullptr)); + + Value *Rdx = getReductionValue(DT, P, BB, LI); + // Check if this is a Binary Operator. BinaryOperator *BI = dyn_cast_or_null(Rdx); if (!BI) continue; // Try to match and vectorize a horizontal reduction. - HorizontalReduction HorRdx; - if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) && - HorRdx.tryToReduce(R, TTI)) { + if (canMatchHorizontalReduction(P, BI, R, TTI)) { Changed = true; it = BB->begin(); e = BB->end(); @@ -3944,15 +4106,12 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { continue; } - // Try to vectorize horizontal reductions feeding into a store. if (ShouldStartVectorizeHorAtStore) if (StoreInst *SI = dyn_cast(it)) if (BinaryOperator *BinOp = dyn_cast(SI->getValueOperand())) { - HorizontalReduction HorRdx; - if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) && - HorRdx.tryToReduce(R, TTI)) || - tryToVectorize(BinOp, R))) { + if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI) || + tryToVectorize(BinOp, R)) { Changed = true; it = BB->begin(); e = BB->end(); @@ -4053,7 +4212,7 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { char SLPVectorizer::ID = 0; static const char lv_name[] = "SLP Vectorizer"; INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)