X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTransforms%2FVectorize%2FLoopVectorize.cpp;h=77633a562a6759549ea9c6fb77b40aa5ba0f7926;hb=9d84b4d70cbd86c462d70a23836ec42323bba591;hp=c05288bd07030a9c90975ba36672accfa59a2b78;hpb=e96fec2e436003714a6bf3739d594d77701da1e5;p=oota-llvm.git

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index c05288bd070..77633a562a6 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -56,6 +56,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -74,14 +75,15 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/PatternMatch.h"
-#include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Scalar.h"
@@ -139,11 +141,60 @@ static const unsigned RuntimeMemoryCheckThreshold = 8;
 /// Maximum simd width.
 static const unsigned MaxVectorWidth = 64;
 
+static cl::opt<unsigned> ForceTargetNumScalarRegs(
+    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's number of scalar registers."));
+
+static cl::opt<unsigned> ForceTargetNumVectorRegs(
+    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's number of vector registers."));
+
 /// Maximum vectorization unroll count.
 static const unsigned MaxUnrollFactor = 16;
 
-/// The cost of a loop that is considered 'small' by the unroller.
-static const unsigned SmallLoopCost = 20;
+static cl::opt<unsigned> ForceTargetMaxScalarUnrollFactor(
+    "force-target-max-scalar-unroll", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's max unroll factor for scalar "
+             "loops."));
+
+static cl::opt<unsigned> ForceTargetMaxVectorUnrollFactor(
+    "force-target-max-vector-unroll", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's max unroll factor for "
+             "vectorized loops."));
+
+static cl::opt<unsigned> ForceTargetInstructionCost(
+    "force-target-instruction-cost", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's expected cost for "
+             "an instruction to a single constant value. Mostly "
+             "useful for getting consistent testing."));
+
+static cl::opt<unsigned> SmallLoopCost(
+    "small-loop-cost", cl::init(20), cl::Hidden,
+    cl::desc("The cost of a loop that is considered 'small' by the unroller."));
+
+static cl::opt<bool> LoopVectorizeWithBlockFrequency(
+    "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
+    cl::desc("Enable the use of the block frequency analysis to access PGO "
+             "heuristics minimizing code growth in cold regions and being more "
+             "aggressive in hot regions."));
+
+// Runtime unroll loops for load/store throughput.
+static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
+    "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden,
+    cl::desc("Enable runtime unrolling until load/store ports are saturated"));
+
+/// The number of stores in a loop that are allowed to need predication.
+static cl::opt<unsigned> NumberOfStoresToPredicate(
+    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
+    cl::desc("Max number of stores to be predicated behind an if."));
+
+static cl::opt<bool> EnableIndVarRegisterHeur(
+    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
+    cl::desc("Count the induction variable only once when unrolling"));
+
+static cl::opt<bool> EnableCondStoresVectorization(
+    "enable-cond-stores-vec", cl::init(false), cl::Hidden,
+    cl::desc("Enable if predication of stores during vectorization."));
 
 namespace {
 
@@ -168,7 +219,7 @@ class LoopVectorizationCostModel;
 class InnerLoopVectorizer {
 public:
   InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
-                      DominatorTree *DT, DataLayout *DL,
+                      DominatorTree *DT, const DataLayout *DL,
                       const TargetLibraryInfo *TLI, unsigned VecWidth,
                       unsigned UnrollFactor)
       : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
@@ -248,8 +299,11 @@ protected:
   void updateAnalysis();
 
   /// This instruction is un-vectorizable. Implement it as a sequence
-  /// of scalars.
-  virtual void scalarizeInstruction(Instruction *Instr);
+  /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
+  /// scalarized instruction behind an if block predicated on the control
+  /// dependence of the instruction.
+  virtual void scalarizeInstruction(Instruction *Instr,
+                                    bool IfPredicateStore=false);
 
   /// Vectorize Load and Store instructions.
   virtual void vectorizeMemoryInstruction(Instruction *Instr);
@@ -325,7 +379,7 @@ protected:
   /// Dominator Tree.
   DominatorTree *DT;
   /// Data Layout.
-  DataLayout *DL;
+  const DataLayout *DL;
   /// Target Library Info.
   const TargetLibraryInfo *TLI;
 
@@ -352,7 +406,7 @@ protected:
   ///The ExitBlock of the scalar loop.
   BasicBlock *LoopExitBlock;
   ///The vector loop body.
-  BasicBlock *LoopVectorBody;
+  SmallVector<BasicBlock *, 4> LoopVectorBody;
   ///The scalar loop body.
   BasicBlock *LoopScalarBody;
   /// A list of all bypass blocks. The first block is the entry of the loop.
@@ -374,16 +428,17 @@ protected:
 class InnerLoopUnroller : public InnerLoopVectorizer {
 public:
   InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
-                    DominatorTree *DT, DataLayout *DL,
+                    DominatorTree *DT, const DataLayout *DL,
                     const TargetLibraryInfo *TLI, unsigned UnrollFactor) :
     InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }
 
 private:
-  virtual void scalarizeInstruction(Instruction *Instr);
-  virtual void vectorizeMemoryInstruction(Instruction *Instr);
-  virtual Value *getBroadcastInstrs(Value *V);
-  virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
-  virtual Value *reverseVector(Value *Vec);
+  void scalarizeInstruction(Instruction *Instr,
+                            bool IfPredicateStore = false) override;
+  void vectorizeMemoryInstruction(Instruction *Instr) override;
+  Value *getBroadcastInstrs(Value *V) override;
+  Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate) override;
+  Value *reverseVector(Value *Vec) override;
 };
 
 /// \brief Look for a meaningful debug location on the instruction or its
@@ -429,10 +484,14 @@ static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
 /// induction variable and the different reduction variables.
 class LoopVectorizationLegality {
 public:
-  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
+  unsigned NumLoads;
+  unsigned NumStores;
+  unsigned NumPredStores;
+
+  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
                             DominatorTree *DT, TargetLibraryInfo *TLI)
-      : TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI),
-        Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
+      : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
+        DT(DT), TLI(TLI), Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
         MaxSafeDepDistBytes(-1U) {}
 
   /// This enum represents the kinds of reductions that we support.
@@ -667,7 +726,7 @@ private:
   /// Scev analysis.
   ScalarEvolution *SE;
   /// DataLayout analysis.
-  DataLayout *DL;
+  const DataLayout *DL;
   /// Dominators.
   DominatorTree *DT;
   /// Target Library Info.
@@ -717,7 +776,7 @@ public:
   LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
                              LoopVectorizationLegality *Legal,
                              const TargetTransformInfo &TTI,
-                             DataLayout *DL, const TargetLibraryInfo *TLI)
+                             const DataLayout *DL, const TargetLibraryInfo *TLI)
       : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}
 
   /// Information about vectorization costs
@@ -790,7 +849,7 @@ private:
   /// Vector target information.
   const TargetTransformInfo &TTI;
   /// Target data layout information.
-  DataLayout *DL;
+  const DataLayout *DL;
   /// Target Library Info.
   const TargetLibraryInfo *TLI;
 };
@@ -930,39 +989,53 @@ private:
   }
 };
 
+static void addInnerLoop(Loop *L, SmallVectorImpl<Loop *> &V) {
+  if (L->empty())
+    return V.push_back(L);
+
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    addInnerLoop(*I, V);
+}
+
 /// The LoopVectorize Pass.
-struct LoopVectorize : public LoopPass {
+struct LoopVectorize : public FunctionPass {
   /// Pass identification, replacement for typeid
   static char ID;
 
   explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
-    : LoopPass(ID),
+    : FunctionPass(ID),
       DisableUnrolling(NoUnrolling),
       AlwaysVectorize(AlwaysVectorize) {
     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
   }
 
   ScalarEvolution *SE;
-  DataLayout *DL;
+  const DataLayout *DL;
   LoopInfo *LI;
   TargetTransformInfo *TTI;
   DominatorTree *DT;
+  BlockFrequencyInfo *BFI;
   TargetLibraryInfo *TLI;
   bool DisableUnrolling;
   bool AlwaysVectorize;
 
-  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
-    // We only vectorize innermost loops.
-    if (!L->empty())
-      return false;
+  BlockFrequency ColdEntryFreq;
 
+  bool runOnFunction(Function &F) override {
     SE = &getAnalysis<ScalarEvolution>();
-    DL = getAnalysisIfAvailable<DataLayout>();
+    DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+    DL = DLP ? &DLP->getDataLayout() : 0;
     LI = &getAnalysis<LoopInfo>();
     TTI = &getAnalysis<TargetTransformInfo>();
     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    BFI = &getAnalysis<BlockFrequencyInfo>();
     TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
 
+    // Compute some weights outside of the loop over the loops. Compute this
+    // using a BranchProbability to re-use its scaling math.
+    const BranchProbability ColdProb(1, 5); // 20%
+    ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
+
     // If the target claims to have no vector registers don't attempt
     // vectorization.
     if (!TTI->getNumberOfRegisters(true))
@@ -973,6 +1046,32 @@ struct LoopVectorize : public LoopPass {
       return false;
     }
 
+    // Build up a worklist of inner-loops to vectorize. This is necessary as
+    // the act of vectorizing or partially unrolling a loop creates new loops
+    // and can invalidate iterators across the loops.
+    SmallVector<Loop *, 8> Worklist;
+
+    for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
+      addInnerLoop(*I, Worklist);
+
+    // Now walk the identified inner loops.
+    bool Changed = false;
+    while (!Worklist.empty())
+      Changed |= processLoop(Worklist.pop_back_val());
+
+    // Process each loop nest in the function.
+    return Changed;
+  }
+
+  bool processLoop(Loop *L) {
+    // We only handle inner loops, so if there are children just recurse.
+    if (!L->empty()) {
+      bool Changed = false;
+      for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+        Changed |= processLoop(*I);
+      return Changed;
+    }
+
     DEBUG(dbgs() << "LV: Checking a loop in \"" <<
           L->getHeader()->getParent()->getName() << "\"\n");
 
@@ -1006,14 +1105,25 @@ struct LoopVectorize : public LoopPass {
     // Check the function attributes to find out if this function should be
     // optimized for size.
     Function *F = L->getHeader()->getParent();
-    Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
-    Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
-    unsigned FnIndex = AttributeSet::FunctionIndex;
-    bool OptForSize = Hints.Force != 1 &&
-                      F->getAttributes().hasAttribute(FnIndex, SzAttr);
-    bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);
-
-    if (NoFloat) {
+    bool OptForSize =
+        Hints.Force != 1 && F->hasFnAttribute(Attribute::OptimizeForSize);
+
+    // Compute the weighted frequency of this loop being executed and see if it
+    // is less than 20% of the function entry baseline frequency. Note that we
+    // always have a canonical loop here because we think we *can* vectorize.
+    // FIXME: This is hidden behind a flag due to pervasive problems with
+    // exactly what block frequency models.
+    if (LoopVectorizeWithBlockFrequency) {
+      BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
+      if (Hints.Force != 1 && LoopEntryFreq < ColdEntryFreq)
+        OptForSize = true;
+    }
+
+    // Check the function attributes to see if implicit floats are allowed.
+    // FIXME: This check doesn't seem possibly correct -- what if the loop is
+    // an integer loop and the vector instructions selected are purely integer
+    // vector instructions?
+    if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
       DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
             " attribute is used.\n");
       return false;
@@ -1051,10 +1161,10 @@ struct LoopVectorize : public LoopPass {
     return true;
   }
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-    LoopPass::getAnalysisUsage(AU);
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequiredID(LoopSimplifyID);
     AU.addRequiredID(LCSSAID);
+    AU.addRequired<BlockFrequencyInfo>();
    AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<LoopInfo>();
     AU.addRequired<ScalarEvolution>();
@@ -1133,7 +1243,9 @@ void LoopVectorizationLegality::RuntimePointerCheck::insert(
 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
   // We need to place the broadcast of invariant variables outside the loop.
   Instruction *Instr = dyn_cast<Instruction>(V);
-  bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
+  bool NewInstr =
+      (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
+                          Instr->getParent()) != LoopVectorBody.end());
   bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
 
   // Place the code for broadcasting invariant variables in the new preheader.
@@ -1173,7 +1285,7 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
 /// \brief Find the operand of the GEP that should be checked for consecutive
 /// stores. This ignores trailing indices that have no effect on the final
 /// pointer.
-static unsigned getGEPInductionOperand(DataLayout *DL,
+static unsigned getGEPInductionOperand(const DataLayout *DL,
                                        const GetElementPtrInst *Gep) {
   unsigned LastOperand = Gep->getNumOperands() - 1;
   unsigned GEPAllocSize = DL->getTypeAllocSize(
@@ -1338,6 +1450,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
   unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
   unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
 
+  if (SI && Legal->blockNeedsPredication(SI->getParent()))
+    return scalarizeInstruction(Instr, true);
+
   if (ScalarAllocatedSize != VectorElementSize)
     return scalarizeInstruction(Instr);
 
@@ -1456,7 +1571,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
   }
 }
 
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {
   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
   // Holds vector parameters or scalars, in case of uniform vals.
   SmallVector<VectorParts, 4> Params;
@@ -1501,10 +1616,38 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
   // Create a new entry in the WidenMap and initialize it to Undef or Null.
   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
 
+  Instruction *InsertPt = Builder.GetInsertPoint();
+  BasicBlock *IfBlock = Builder.GetInsertBlock();
+  BasicBlock *CondBlock = 0;
+
+  VectorParts Cond;
+  Loop *VectorLp = 0;
+  if (IfPredicateStore) {
+    assert(Instr->getParent()->getSinglePredecessor() &&
+           "Only support single predecessor blocks");
+    Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
+                          Instr->getParent());
+    VectorLp = LI->getLoopFor(IfBlock);
+    assert(VectorLp && "Must have a loop for this block");
+  }
+
   // For each vector unroll 'part':
   for (unsigned Part = 0; Part < UF; ++Part) {
     // For each scalar that we create:
     for (unsigned Width = 0; Width < VF; ++Width) {
+
+      // Start if-block.
+      Value *Cmp = 0;
+      if (IfPredicateStore) {
+        Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
+        Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
+        CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+        LoopVectorBody.push_back(CondBlock);
+        VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
+        // Update Builder with newly created basic block.
+        Builder.SetInsertPoint(InsertPt);
+      }
+
       Instruction *Cloned = Instr->clone();
       if (!IsVoidRetTy)
         Cloned->setName(Instr->getName() + ".cloned");
@@ -1525,6 +1668,17 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
       if (!IsVoidRetTy)
         VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
                                                        Builder.getInt32(Width));
+      // End if-block.
+      if (IfPredicateStore) {
+        BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+        LoopVectorBody.push_back(NewIfBlock);
+        VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
+        Builder.SetInsertPoint(InsertPt);
+        Instruction *OldBr = IfBlock->getTerminator();
+        BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+        OldBr->eraseFromParent();
+        IfBlock = NewIfBlock;
+      }
     }
   }
 }
@@ -1824,7 +1978,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
   // sequence of instructions that form a check.
   Instruction *StrideCheck;
   Instruction *FirstCheckInst;
-  tie(FirstCheckInst, StrideCheck) =
+  std::tie(FirstCheckInst, StrideCheck) =
       addStrideCheck(BypassBlock->getTerminator());
   if (StrideCheck) {
     // Create a new block containing the stride check.
@@ -1848,7 +2002,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
   // checks into a separate block to make the more common case of few elements
   // faster.
   Instruction *MemRuntimeCheck;
-  tie(FirstCheckInst, MemRuntimeCheck) =
+  std::tie(FirstCheckInst, MemRuntimeCheck) =
       addRuntimeCheck(LastBypassBlock->getTerminator());
   if (MemRuntimeCheck) {
     // Create a new block containing the memory check.
@@ -2028,7 +2182,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
   LoopScalarPreHeader = ScalarPH;
   LoopMiddleBlock = MiddleBlock;
   LoopExitBlock = ExitBlock;
-  LoopVectorBody = VecBody;
+  LoopVectorBody.push_back(VecBody);
   LoopScalarBody = OldBasicBlock;
 
   LoopVectorizeHints Hints(Lp, true);
@@ -2296,26 +2450,53 @@ struct CSEDenseMapInfo {
 };
 }
 
+/// \brief Check whether this block is a predicated block.
+/// Due to if predication of stores we might create a sequence of "if(pred) a[i]
+/// = ...; " blocks. We start with one vectorized basic block. For every
+/// conditional block we split this vectorized block. Therefore, every second
+/// block will be a predicated one.
+static bool isPredicatedBlock(unsigned BlockNum) {
+  return BlockNum % 2;
+}
+
 ///\brief Perform cse of induction variable instructions.
-static void cse(BasicBlock *BB) {
+static void cse(SmallVector<BasicBlock *, 4> &BBs) {
   // Perform simple cse.
   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
-  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
-    Instruction *In = I++;
+  for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
+    BasicBlock *BB = BBs[i];
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+      Instruction *In = I++;
 
-    if (!CSEDenseMapInfo::canHandle(In))
-      continue;
+      if (!CSEDenseMapInfo::canHandle(In))
+        continue;
 
-    // Check if we can replace this instruction with any of the
-    // visited instructions.
-    if (Instruction *V = CSEMap.lookup(In)) {
-      In->replaceAllUsesWith(V);
-      In->eraseFromParent();
-      continue;
+      // Check if we can replace this instruction with any of the
+      // visited instructions.
+      if (Instruction *V = CSEMap.lookup(In)) {
+        In->replaceAllUsesWith(V);
+        In->eraseFromParent();
+        continue;
+      }
+      // Ignore instructions in conditional blocks. We create "if (pred) a[i] =
+      // ...;" blocks for predicated stores. Every second block is a predicated
+      // block.
+      if (isPredicatedBlock(i))
+        continue;
+
+      CSEMap[In] = In;
     }
+  }
+}
 
-    CSEMap[In] = In;
+/// \brief Adds a 'fast' flag to floating point operations.
+static Value *addFastMathFlag(Value *V) {
+  if (isa<FPMathOperator>(V)) {
+    FastMathFlags Flags;
+    Flags.setUnsafeAlgebra();
+    cast<Instruction>(V)->setFastMathFlags(Flags);
   }
+  return V;
 }
 
 void InnerLoopVectorizer::vectorizeLoop() {
@@ -2371,7 +2552,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
     setDebugLocFromInst(Builder, RdxDesc.StartValue);
 
     // We need to generate a reduction vector from the incoming scalar.
-    // To do so, we need to generate the 'identity' vector and overide
+    // To do so, we need to generate the 'identity' vector and override
     // one of the elements with the incoming scalar reduction. We need
     // to do it in the vector-loop preheader.
     Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
@@ -2430,7 +2611,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
       // first unroll part.
       Value *StartVal = (part == 0) ? VectorStart : Identity;
       cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
-      cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
+      cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
+                                                  LoopVectorBody.back());
     }
 
     // Before each round, move the insertion point right between
@@ -2449,7 +2631,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
       Value *StartVal = (part == 0) ? VectorStart : Identity;
       for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
         NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
-      NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
+      NewPhi->addIncoming(RdxExitVal[part],
+                          LoopVectorBody.back());
       RdxParts.push_back(NewPhi);
     }
 
@@ -2459,9 +2642,10 @@ void InnerLoopVectorizer::vectorizeLoop() {
     setDebugLocFromInst(Builder, ReducedPartRdx);
     for (unsigned part = 1; part < UF; ++part) {
       if (Op != Instruction::ICmp && Op != Instruction::FCmp)
-        ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
-                                             RdxParts[part], ReducedPartRdx,
-                                             "bin.rdx");
+        // Floating point operations had to be 'fast' to enable the reduction.
+        ReducedPartRdx = addFastMathFlag(
+            Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
+                                ReducedPartRdx, "bin.rdx"));
       else
         ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
                                         ReducedPartRdx, RdxParts[part]);
@@ -2491,8 +2675,9 @@ void InnerLoopVectorizer::vectorizeLoop() {
                                     "rdx.shuf");
 
       if (Op != Instruction::ICmp && Op != Instruction::FCmp)
-        TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
-                                     "bin.rdx");
+        // Floating point operations had to be 'fast' to enable the reduction.
+        TmpVec = addFastMathFlag(Builder.CreateBinOp(
+            (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
       else
         TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
     }
@@ -2622,7 +2807,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
         Type *VecTy = (VF == 1) ? PN->getType() :
         VectorType::get(PN->getType(), VF);
         Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
-                                      LoopVectorBody-> getFirstInsertionPt());
+                                      LoopVectorBody.back()->getFirstInsertionPt());
       }
       PV->push_back(P);
       return;
@@ -2826,6 +3011,10 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
       if (VecOp && isa<PossiblyExactOperator>(VecOp))
         VecOp->setIsExact(BinOp->isExact());
 
+      // Copy the fast-math flags.
+      if (VecOp && isa<FPMathOperator>(V))
+        VecOp->setFastMathFlags(it->getFastMathFlags());
+
       Entry[Part] = V;
     }
     break;
@@ -2971,7 +3160,19 @@ void InnerLoopVectorizer::updateAnalysis() {
   for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
     DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
   DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
-  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
+
+  // Due to if predication of stores we might create a sequence of "if(pred)
+  // a[i] = ...; " blocks.
+  for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
+    if (i == 0)
+      DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
+    else if (isPredicatedBlock(i)) {
+      DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
+    } else {
+      DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
+    }
+  }
+
   DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
   DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
@@ -3115,7 +3316,7 @@ bool LoopVectorizationLegality::canVectorize() {
   return true;
 }
 
-static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) {
+static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
   if (Ty->isPointerTy())
     return DL.getIntPtrType(Ty);
 
@@ -3127,7 +3328,7 @@ static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) {
   return Ty;
 }
 
-static Type* getWiderType(DataLayout &DL, Type *Ty0, Type *Ty1) {
+static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
   Ty0 = convertPointerToIntegerType(DL, Ty0);
   Ty1 = convertPointerToIntegerType(DL, Ty1);
   if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
@@ -3325,7 +3526,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 ///\brief Remove GEPs whose indices but the last one are loop invariant and
 /// return the induction operand of the gep pointer.
 static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE,
-                                 DataLayout *DL, Loop *Lp) {
+                                 const DataLayout *DL, Loop *Lp) {
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   if (!GEP)
     return Ptr;
@@ -3361,7 +3562,7 @@ static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
 /// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a
 /// pointer to the Value, or null otherwise.
 static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
-                                   DataLayout *DL, Loop *Lp) {
+                                   const DataLayout *DL, Loop *Lp) {
   const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
   if (!PtrTy || PtrTy->isAggregateType())
     return 0;
@@ -3496,7 +3697,7 @@ public:
   /// \brief Set of potential dependent memory accesses.
   typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
 
-  AccessAnalysis(DataLayout *Dl, DepCandidates &DA) :
+  AccessAnalysis(const DataLayout *Dl, DepCandidates &DA) :
     DL(Dl), DepCands(DA), AreAllWritesIdentified(true),
     AreAllReadsIdentified(true), IsRTCheckNeeded(false) {}
 
@@ -3562,7 +3763,7 @@ private:
   /// Set of underlying objects already written to.
   SmallPtrSet<Value*, 16> WriteObjects;
 
-  DataLayout *DL;
+  const DataLayout *DL;
 
   /// Sets of potentially dependent accesses - members of one set share an
   /// underlying pointer. The set "CheckDeps" identifies which sets really need a
@@ -3589,7 +3790,7 @@ static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides,
 
 /// \brief Check the stride of the pointer and ensure that it does not wrap in
 /// the address space.
-static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
                         const Loop *Lp, ValueToValueMap &StridesMap);
 
 bool AccessAnalysis::canCheckPtrAtRT(
@@ -3713,8 +3914,8 @@ void AccessAnalysis::processMemAccesses(bool UseDeferred) {
     }
 
     bool NeedDepCheck = false;
-    // Check whether there is the possiblity of dependency because of underlying
-    // objects being the same.
+    // Check whether there is the possibility of dependency because of
+    // underlying objects being the same.
   typedef SmallVector<Value *, 4> ValueVector;
   ValueVector TempObjects;
   GetUnderlyingObjects(Ptr, TempObjects, DL);
@@ -3809,7 +4010,7 @@ public:
   typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
   typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
 
-  MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L)
+  MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L)
       : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
         ShouldRetryWithRuntimeCheck(false) {}
 
@@ -3847,7 +4048,7 @@ public:
 private:
   ScalarEvolution *SE;
-  DataLayout *DL;
+  const DataLayout *DL;
   const Loop *InnermostLoop;
 
   /// \brief Maps access locations (ptr, read/write) to program order.
@@ -3896,7 +4097,7 @@ static bool isInBoundsGep(Value *Ptr) {
 }
 
 /// \brief Check whether the access through \p Ptr has a constant stride.
-static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
                         const Loop *Lp, ValueToValueMap &StridesMap) {
   const Type *Ty = Ptr->getType();
   assert(Ty->isPointerTy() && "Unexpected non-ptr");
@@ -4155,7 +4356,7 @@ bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
   // Check every access pair.
   while (AI != AE) {
     CheckDeps.erase(*AI);
-    EquivalenceClasses<MemAccessInfo>::member_iterator OI = llvm::next(AI);
+    EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
     while (OI != AE) {
       // Check every accessing instruction pair in program order.
       for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
@@ -4219,6 +4420,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
         DEBUG(dbgs() << "LV: Found a non-simple load.\n");
         return false;
       }
+      NumLoads++;
      Loads.push_back(Ld);
       DepChecker.addAccess(Ld);
       continue;
@@ -4232,6 +4434,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
         DEBUG(dbgs() << "LV: Found a non-simple store.\n");
         return false;
       }
+      NumStores++;
       Stores.push_back(St);
       DepChecker.addAccess(St);
     }
@@ -4535,13 +4738,22 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
       continue;
     }
 
-    // Process instructions only once (termination).
+    // Process instructions only once (termination). Each reduction cycle
+    // value must only be used once, except by phi nodes and min/max
+    // reductions which are represented as a cmp followed by a select.
+    ReductionInstDesc IgnoredVal(false, 0);
     if (VisitedInsts.insert(Usr)) {
       if (isa<PHINode>(Usr))
         PHIs.push_back(Usr);
       else
         NonPHIs.push_back(Usr);
-    }
+    } else if (!isa<PHINode>(Usr) &&
+               ((!isa<FCmpInst>(Usr) &&
+                 !isa<ICmpInst>(Usr) &&
+                 !isa<SelectInst>(Usr)) ||
+                !isMinMaxSelectCmpPattern(Usr, IgnoredVal).IsReduction))
+      return false;
+
     // Remember that we completed the cycle.
     if (Usr == Phi)
       FoundStartPHI = true;
@@ -4734,7 +4946,16 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
     }
 
     // We don't predicate stores at the moment.
-    if (it->mayWriteToMemory() || it->mayThrow())
+    if (it->mayWriteToMemory()) {
+      StoreInst *SI = dyn_cast<StoreInst>(it);
+      // We only support predication of stores in basic blocks with one
+      // predecessor.
+      if (!SI || ++NumPredStores > NumberOfStoresToPredicate ||
+          !SafePtrs.count(SI->getPointerOperand()) ||
+          !SI->getParent()->getSinglePredecessor())
+        return false;
+    }
+    if (it->mayThrow())
       return false;
 
     // Check that we don't have a constant expression that can trap as operand.
@@ -4769,6 +4990,11 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
     return Factor;
   }
 
+  if (!EnableCondStoresVectorization && Legal->NumPredStores) {
+    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
+    return Factor;
+  }
+
   // Find the trip count.
unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); @@ -4924,9 +5150,17 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, if (TC > 1 && TC < TinyTripCountUnrollThreshold) return 1; - unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true); - DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters << - " vector registers\n"); + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); + DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters << + " registers\n"); + + if (VF == 1) { + if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumScalarRegs; + } else { + if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumVectorRegs; + } LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); // We divide by these constants so assume that we have at least one @@ -4939,12 +5173,29 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, // registers. These registers are used by all of the unrolled instances. // Next, divide the remaining registers by the number of registers that is // required by the loop, in order to estimate how many parallel instances - // fit without causing spills. - unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers; + // fit without causing spills. All of this is rounded down if necessary to be + // a power of two. We want power of two unroll factors to simplify any + // addressing operations or alignment considerations. + unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / + R.MaxLocalUsers); + + // Don't count the induction variable as unrolled. + if (EnableIndVarRegisterHeur) + UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / + std::max(1U, (R.MaxLocalUsers - 1))); // Clamp the unroll factor ranges to reasonable factors. unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor(); + // Check if the user has overridden the unroll max. + if (VF == 1) { + if (ForceTargetMaxScalarUnrollFactor.getNumOccurrences() > 0) + MaxUnrollSize = ForceTargetMaxScalarUnrollFactor; + } else { + if (ForceTargetMaxVectorUnrollFactor.getNumOccurrences() > 0) + MaxUnrollSize = ForceTargetMaxVectorUnrollFactor; + } + // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. if (LoopCost == 0) @@ -4957,32 +5208,40 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, else if (UF < 1) UF = 1; - bool HasReductions = Legal->getReductionVars()->size(); - - // Decide if we want to unroll if we decided that it is legal to vectorize - // but not profitable. - if (VF == 1) { - if (TheLoop->getNumBlocks() > 1 || !HasReductions || - LoopCost > SmallLoopCost) - return 1; - - return UF; - } - - if (HasReductions) { + // Unroll if we vectorized this loop and there is a reduction that could + // benefit from unrolling. + if (VF > 1 && Legal->getReductionVars()->size()) { DEBUG(dbgs() << "LV: Unrolling because of reductions.\n"); return UF; } - // We want to unroll tiny loops in order to reduce the loop overhead. - // We assume that the cost overhead is 1 and we use the cost model - // to estimate the cost of the loop and unroll until the cost of the - // loop overhead is about 5% of the cost of the loop. + // Note that if we've already vectorized the loop we will have done the + // runtime check and so unrolling won't require further checks. 
+  bool UnrollingRequiresRuntimePointerCheck =
+      (VF == 1 && Legal->getRuntimePointerCheck()->Need);
+
+  // We want to unroll small loops in order to reduce the loop overhead and
+  // potentially expose ILP opportunities.
   DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
-  if (LoopCost < SmallLoopCost) {
+  if (!UnrollingRequiresRuntimePointerCheck &&
+      LoopCost < SmallLoopCost) {
+    // We assume that the cost overhead is 1 and we use the cost model
+    // to estimate the cost of the loop and unroll until the cost of the
+    // loop overhead is about 5% of the cost of the loop.
+    unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
+
+    // Unroll until store/load ports (estimated by max unroll factor) are
+    // saturated.
+    unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1);
+    unsigned LoadsUF = UF / (Legal->NumLoads ? Legal->NumLoads : 1);
+
+    if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {
+      DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
+      return std::max(StoresUF, LoadsUF);
+    }
+
     DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
-    unsigned NewUF = SmallLoopCost / (LoopCost + 1);
-    return std::min(NewUF, UF);
+    return SmallUF;
   }
 
   DEBUG(dbgs() << "LV: Not Unrolling.\n");
@@ -5118,6 +5377,11 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
       continue;
 
     unsigned C = getInstructionCost(it, VF);
+
+    // Check if we should override the cost.
+    if (ForceTargetInstructionCost.getNumOccurrences() > 0)
+      C = ForceTargetInstructionCost;
+
     BlockCost += C;
     DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
           VF << " For instruction: " << *it << '\n');
@@ -5245,9 +5509,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
         TargetTransformInfo::OK_AnyValue;
     TargetTransformInfo::OperandValueKind Op2VK =
         TargetTransformInfo::OK_AnyValue;
+    Value *Op2 = I->getOperand(1);
 
-    if (isa<ConstantInt>(I->getOperand(1)))
+    // Check for a splat of a constant or for a non-uniform vector of constants.
+    if (isa<ConstantInt>(Op2))
       Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+    else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
+      Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+      if (cast<Constant>(Op2)->getSplatValue() != NULL)
+        Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+    }
 
     return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
   }
@@ -5391,6 +5662,7 @@ char LoopVectorize::ID = 0;
 static const char lv_name[] = "Loop Vectorization";
 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
@@ -5417,7 +5689,8 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
 }
 
 
-void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
+void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
+                                             bool IfPredicateStore) {
   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
   // Holds vector parameters or scalars, in case of uniform vals.
   SmallVector<VectorParts, 4> Params;
@@ -5462,10 +5735,40 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
   // Create a new entry in the WidenMap and initialize it to Undef or Null.
   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
 
+  Instruction *InsertPt = Builder.GetInsertPoint();
+  BasicBlock *IfBlock = Builder.GetInsertBlock();
+  BasicBlock *CondBlock = 0;
+
+  VectorParts Cond;
+  Loop *VectorLp = 0;
+  if (IfPredicateStore) {
+    assert(Instr->getParent()->getSinglePredecessor() &&
+           "Only support single predecessor blocks");
+    Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
+                          Instr->getParent());
+    VectorLp = LI->getLoopFor(IfBlock);
+    assert(VectorLp && "Must have a loop for this block");
+  }
+
   // For each vector unroll 'part':
   for (unsigned Part = 0; Part < UF; ++Part) {
     // For each scalar that we create:
 
+    // Start an "if (pred) a[i] = ..." block.
+    Value *Cmp = 0;
+    if (IfPredicateStore) {
+      if (Cond[Part]->getType()->isVectorTy())
+        Cond[Part] =
+            Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
+      Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
+                               ConstantInt::get(Cond[Part]->getType(), 1));
+      CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+      LoopVectorBody.push_back(CondBlock);
+      VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
+      // Update Builder with newly created basic block.
+      Builder.SetInsertPoint(InsertPt);
+    }
+
     Instruction *Cloned = Instr->clone();
     if (!IsVoidRetTy)
       Cloned->setName(Instr->getName() + ".cloned");
@@ -5482,11 +5785,26 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
     // so that future users will be able to use it.
     if (!IsVoidRetTy)
       VecResults[Part] = Cloned;
+
+    // End if-block.
+    if (IfPredicateStore) {
+      BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+      LoopVectorBody.push_back(NewIfBlock);
+      VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
+      Builder.SetInsertPoint(InsertPt);
+      Instruction *OldBr = IfBlock->getTerminator();
+      BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+      OldBr->eraseFromParent();
+      IfBlock = NewIfBlock;
+    }
   }
 }
 
 void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
-  return scalarizeInstruction(Instr);
+  StoreInst *SI = dyn_cast<StoreInst>(Instr);
+  bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
+
+  return scalarizeInstruction(Instr, IfPredicateStore);
}
 
 Value *InnerLoopUnroller::reverseVector(Value *Vec) {
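
A note for readers tracing the new control flow: per scalarized lane, the patch splits the current vector-body block twice and rewires the terminator so the store executes only when its predicate is true. The standalone sketch below isolates that splitBasicBlock/BranchInst::Create pattern. It is illustrative only, not code from the patch: predicateStore and Pred are invented names, and the LoopInfo/dominator-tree bookkeeping the patch performs is omitted.

// A minimal sketch, assuming SI lives in an already-built function:
// wrap one store in an "if (Pred) store" diamond, the same
// cond.store/else shape the patch emits per lane.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
#include <iterator>

using namespace llvm;

static void predicateStore(StoreInst *SI, Value *Pred) {
  BasicBlock *IfBlock = SI->getParent();
  // Everything from the store onward moves into "cond.store"; IfBlock now
  // ends in an unconditional branch to it.
  BasicBlock *CondBlock = IfBlock->splitBasicBlock(SI, "cond.store");
  // Split again right after the store; the remaining tail becomes "else".
  BasicBlock *NewIfBlock =
      CondBlock->splitBasicBlock(std::next(BasicBlock::iterator(SI)), "else");
  // Replace the unconditional branch with one that can skip the store.
  Instruction *OldBr = IfBlock->getTerminator();
  BranchInst::Create(CondBlock, NewIfBlock, Pred, OldBr);
  OldBr->eraseFromParent();
}

This shape is also why isPredicatedBlock(BlockNum) can simply test BlockNum % 2: each predicated store appends exactly two blocks to LoopVectorBody, and the odd-indexed one is the conditional "cond.store" block.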
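
Most of the selectUnrollFactor changes above are plain arithmetic. The compact model below reproduces the three-step calculation as a self-contained sketch (a simplification with invented names, not the LLVM source; the real code also honors the ForceTarget* flags, TTI's maximum unroll factor, and the reduction/runtime-check special cases): a power-of-two register-pressure bound with the induction variable counted once, the small-loop branch-overhead clamp, and the load/store port-saturation override.

// Build and run with any C++ compiler; values in main() are made up.
#include <algorithm>
#include <cstdio>

// Stands in for llvm::PowerOf2Floor in this sketch.
static unsigned powerOf2Floor(unsigned X) {
  unsigned P = 1;
  while (P * 2 != 0 && P * 2 <= X)
    P *= 2;
  return X ? P : 0;
}

// Assumes TargetNumRegisters > LoopInvariantRegs and MaxLocalUsers >= 1.
static unsigned pickUnrollFactor(unsigned TargetNumRegisters,
                                 unsigned LoopInvariantRegs,
                                 unsigned MaxLocalUsers,
                                 unsigned LoopCost, unsigned SmallLoopCost,
                                 unsigned NumStores, unsigned NumLoads) {
  // Registers left after loop-invariant values are pinned, divided by the
  // per-iteration demand; the induction variable is counted only once.
  unsigned UF = powerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                              std::max(1u, MaxLocalUsers - 1));
  if (LoopCost >= SmallLoopCost)
    return UF; // Not a small loop; no branch-overhead unrolling.

  // Unroll until the loop overhead is about 5% of the loop cost...
  unsigned SmallUF = std::min(UF, powerOf2Floor(SmallLoopCost / LoopCost));
  // ...unless saturating the store/load ports permits a larger factor.
  unsigned StoresUF = UF / std::max(1u, NumStores);
  unsigned LoadsUF = UF / std::max(1u, NumLoads);
  return std::max(SmallUF, std::max(StoresUF, LoadsUF));
}

int main() {
  // E.g. 16 vector registers, 2 pinned, 3 live values per iteration,
  // loop cost 4 vs. small-loop threshold 20, one store, two loads.
  std::printf("UF = %u\n", pickUnrollFactor(16, 2, 3, 4, 20, 1, 2));
  return 0;
}

Returning max(SmallUF, max(StoresUF, LoadsUF)) is equivalent to the patch's "return the port-saturating factor only when it beats SmallUF" branch, just folded into one expression.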
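
The PGO gate added to processLoop likewise reduces to one scaled comparison. A tiny numeric illustration, with plain integers standing in for llvm::BlockFrequency and BranchProbability(1, 5), and made-up frequencies:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t EntryFreq = 1000;                  // BFI->getEntryFreq()
  uint64_t ColdEntryFreq = EntryFreq * 1 / 5; // scaled by 20%
  uint64_t LoopEntryFreq = 150;               // BFI->getBlockFreq(preheader)
  bool OptForSize = LoopEntryFreq < ColdEntryFreq;
  std::printf("cold threshold = %llu, loop = %llu, OptForSize = %d\n",
              (unsigned long long)ColdEntryFreq,
              (unsigned long long)LoopEntryFreq, OptForSize);
  return 0;
}

Here 150 < 200, so the loop is treated as cold and vectorized as if optimizing for size; the real pass only applies this when the loop-vectorize-with-block-frequency flag is set and no pragma forces vectorization.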