X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTransforms%2FVectorize%2FLoopVectorize.cpp;h=c5b8b5b073d6779f110d4ace19c05f14620ebe33;hp=be5ceb5d953043a4e0edae38957b5874d9057c3b;hb=aaaedd7f8f7d68b41ad3845d050365c03163942f;hpb=8faf2a455ae8cd9a360ef95c341672450e113cdf diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index be5ceb5d953..c5b8b5b073d 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -48,7 +48,6 @@ #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" @@ -58,10 +57,13 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DemandedBits.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" @@ -99,6 +101,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include +#include #include #include @@ -123,6 +126,11 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), "trip count that is smaller than this " "value.")); +static cl::opt MaximizeBandwidth( + "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, + cl::desc("Maximize bandwidth when selecting vectorization factor which " + "will be determined by the smallest type in loop.")); + /// This enables versioning on the strides of symbolically striding memory /// accesses in code like the following. /// for (i = 0; i < N; ++i) @@ -136,7 +144,7 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), /// ... static cl::opt EnableMemAccessVersioning( "enable-mem-access-versioning", cl::init(true), cl::Hidden, - cl::desc("Enable symblic stride memory access versioning")); + cl::desc("Enable symbolic stride memory access versioning")); static cl::opt EnableInterleavedMemAccesses( "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, @@ -214,6 +222,20 @@ static cl::opt MaxNestedScalarReductionIC( cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop.")); +static cl::opt PragmaVectorizeMemoryCheckThreshold( + "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum allowed number of runtime memory checks with a " + "vectorize(enable) pragma.")); + +static cl::opt VectorizeSCEVCheckThreshold( + "vectorize-scev-check-threshold", cl::init(16), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed.")); + +static cl::opt PragmaVectorizeSCEVCheckThreshold( + "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed with a " + "vectorize(enable) pragma")); + namespace { // Forward declarations. @@ -246,6 +268,32 @@ static Type* ToVectorTy(Type *Scalar, unsigned VF) { return VectorType::get(Scalar, VF); } +/// A helper function that returns GEP instruction and knows to skip a +/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination +/// pointee types of the 'bitcast' have the same size. +/// For example: +/// bitcast double** %var to i64* - can be skipped +/// bitcast double** %var to i8* - can not +static GetElementPtrInst *getGEPInstruction(Value *Ptr) { + + if (isa(Ptr)) + return cast(Ptr); + + if (isa(Ptr) && + isa(cast(Ptr)->getOperand(0))) { + Type *BitcastTy = Ptr->getType(); + Type *GEPTy = cast(Ptr)->getSrcTy(); + if (!isa(BitcastTy) || !isa(GEPTy)) + return nullptr; + Type *Pointee1Ty = cast(BitcastTy)->getPointerElementType(); + Type *Pointee2Ty = cast(GEPTy)->getPointerElementType(); + const DataLayout &DL = cast(Ptr)->getModule()->getDataLayout(); + if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty)) + return cast(cast(Ptr)->getOperand(0)); + } + return nullptr; +} + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -265,22 +313,26 @@ public: InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, unsigned VecWidth, - unsigned UnrollFactor) + unsigned UnrollFactor, SCEVUnionPredicate &Preds) : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), - Legal(nullptr), AddedSafetyChecks(false) {} + TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr), + AddedSafetyChecks(false), Preds(Preds) {} // Perform the actual loop widening (vectorization). - void vectorize(LoopVectorizationLegality *L) { + // MinimumBitWidths maps scalar integer values to the smallest bitwidth they + // can be validly truncated to. The cost model has assumed this truncation + // will happen when vectorizing. + void vectorize(LoopVectorizationLegality *L, + MapVector MinimumBitWidths) { + MinBWs = MinimumBitWidths; Legal = L; // Create a new empty loop. Unlink the old loop and connect the new one. createEmptyLoop(); // Widen each instruction in the old loop to a new one in the new loop. // Use the Legality module to find the induction and reduction variables. vectorizeLoop(); - // Register the new loop and update the analysis passes. - updateAnalysis(); } // Return true if any runtime check is added. @@ -303,14 +355,11 @@ protected: typedef DenseMap, VectorParts> EdgeMaskCache; - /// \brief Add checks for strides that were assumed to be 1. - /// - /// Returns the last check instruction and the first check instruction in the - /// pair as (first, last). - std::pair addStrideCheck(Instruction *Loc); - /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(); + /// Create a new induction variable inside L. + PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, + Value *Step, Instruction *DL); /// Copy and widen the instructions from the old loop. virtual void vectorizeLoop(); @@ -320,6 +369,9 @@ protected: /// See PR14725. void fixLCSSAPHIs(); + /// Shrinks vector element sizes based on information in "MinBWs". + void truncateToMinimalBitwidths(); + /// A helper function that computes the predicate of the block BB, assuming /// that the header block of the loop is set to True. It returns the *entry* /// mask for the block BB. @@ -330,7 +382,7 @@ protected: /// A helper function to vectorize a single BB within the innermost loop. void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV); - + /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. @@ -375,6 +427,23 @@ protected: /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); + /// Returns (and creates if needed) the original loop trip count. + Value *getOrCreateTripCount(Loop *NewLoop); + + /// Returns (and creates if needed) the trip count of the widened loop. + Value *getOrCreateVectorTripCount(Loop *NewLoop); + + /// Emit a bypass check to see if the trip count would overflow, or we + /// wouldn't have enough iterations to execute one vector loop. + void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); + /// Emit a bypass check to see if the vector trip count is nonzero. + void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass); + /// Emit a bypass check to see if all of the SCEV assumptions we've + /// had to make are correct. + void emitSCEVChecks(Loop *L, BasicBlock *Bypass); + /// Emit bypass checks to check any memory assumptions we may have made. + void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); + /// This is a helper class that holds the vectorizer state. It maps scalar /// instructions to vector instructions. When the code is 'unrolled' then /// then a single scalar value is mapped to multiple vector parts. The parts @@ -463,24 +532,42 @@ protected: PHINode *Induction; /// The induction variable of the old basic block. PHINode *OldInduction; - /// Holds the extended (to the widest induction type) start index. - Value *ExtendedIdx; /// Maps scalars to widened vectors. ValueMap WidenMap; + /// Store instructions that should be predicated, as a pair + /// + SmallVector, 4> PredicatedStores; EdgeMaskCache MaskCache; - + /// Trip count of the original loop. + Value *TripCount; + /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) + Value *VectorTripCount; + + /// Map of scalar integer values to the smallest bitwidth they can be legally + /// represented as. The vector equivalents of these values should be truncated + /// to this type. + MapVector MinBWs; LoopVectorizationLegality *Legal; // Record whether runtime check is added. bool AddedSafetyChecks; + + /// The SCEV predicate containing all the SCEV-related assumptions. + /// The predicate is used to simplify existing expressions in the + /// context of existing SCEV assumptions. Since legality checking is + /// not done here, we don't need to use this predicate to record + /// further assumptions. + SCEVUnionPredicate &Preds; }; class InnerLoopUnroller : public InnerLoopVectorizer { public: InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, unsigned UnrollFactor) - : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {} + const TargetTransformInfo *TTI, unsigned UnrollFactor, + SCEVUnionPredicate &Preds) + : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor, + Preds) {} private: void scalarizeInstruction(Instruction *Instr, @@ -561,7 +648,8 @@ static void propagateMetadata(Instruction *To, const Instruction *From) { } /// \brief Propagate known metadata from one instruction to a vector of others. -static void propagateMetadata(SmallVectorImpl &To, const Instruction *From) { +static void propagateMetadata(SmallVectorImpl &To, + const Instruction *From) { for (Value *V : To) if (Instruction *I = dyn_cast(V)) propagateMetadata(I, From); @@ -701,8 +789,9 @@ private: /// between the member and the group in a map. class InterleavedAccessInfo { public: - InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT) - : SE(SE), TheLoop(L), DT(DT) {} + InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT, + SCEVUnionPredicate &Preds) + : SE(SE), TheLoop(L), DT(DT), Preds(Preds) {} ~InterleavedAccessInfo() { SmallSet DelSet; @@ -736,6 +825,13 @@ private: Loop *TheLoop; DominatorTree *DT; + /// The SCEV predicate containing all the SCEV-related assumptions. + /// The predicate is used to simplify SCEV expressions in the + /// context of existing SCEV assumptions. The interleaved access + /// analysis can also add new predicates (for example by versioning + /// strides of pointers). + SCEVUnionPredicate &Preds; + /// Holds the relationships between the members and the interleave group. DenseMap InterleaveGroupMap; @@ -929,6 +1025,15 @@ public: return DiagnosticInfo::AlwaysPrint; } + bool allowReordering() const { + // When enabling loop hints are provided we allow the vectorizer to change + // the order of operations that is given by the scalar loop. This is not + // enabled by default because can be unsafe or inefficient. For example, + // reordering floating-point operations will change the way round-off + // error accumulates in the loop. + return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1; + } + private: /// Find hints specified in the loop metadata and update local values. void getHintsFromMetadata() { @@ -1089,85 +1194,13 @@ public: Function *F, const TargetTransformInfo *TTI, LoopAccessAnalysis *LAA, LoopVectorizationRequirements *R, - const LoopVectorizeHints *H) + const LoopVectorizeHints *H, + SCEVUnionPredicate &Preds) : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F), - TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(SE, L, DT), - Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), - Requirements(R), Hints(H) {} - - /// This enum represents the kinds of inductions that we support. - enum InductionKind { - IK_NoInduction, ///< Not an induction variable. - IK_IntInduction, ///< Integer induction variable. Step = C. - IK_PtrInduction ///< Pointer induction var. Step = C / sizeof(elem). - }; - - /// A struct for saving information about induction variables. - struct InductionInfo { - InductionInfo(Value *Start, InductionKind K, ConstantInt *Step) - : StartValue(Start), IK(K), StepValue(Step) { - assert(IK != IK_NoInduction && "Not an induction"); - assert(StartValue && "StartValue is null"); - assert(StepValue && !StepValue->isZero() && "StepValue is zero"); - assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) && - "StartValue is not a pointer for pointer induction"); - assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) && - "StartValue is not an integer for integer induction"); - assert(StepValue->getType()->isIntegerTy() && - "StepValue is not an integer"); - } - InductionInfo() - : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {} - - /// Get the consecutive direction. Returns: - /// 0 - unknown or non-consecutive. - /// 1 - consecutive and increasing. - /// -1 - consecutive and decreasing. - int getConsecutiveDirection() const { - if (StepValue && (StepValue->isOne() || StepValue->isMinusOne())) - return StepValue->getSExtValue(); - return 0; - } - - /// Compute the transformed value of Index at offset StartValue using step - /// StepValue. - /// For integer induction, returns StartValue + Index * StepValue. - /// For pointer induction, returns StartValue[Index * StepValue]. - /// FIXME: The newly created binary instructions should contain nsw/nuw - /// flags, which can be found from the original scalar operations. - Value *transform(IRBuilder<> &B, Value *Index) const { - switch (IK) { - case IK_IntInduction: - assert(Index->getType() == StartValue->getType() && - "Index type does not match StartValue type"); - if (StepValue->isMinusOne()) - return B.CreateSub(StartValue, Index); - if (!StepValue->isOne()) - Index = B.CreateMul(Index, StepValue); - return B.CreateAdd(StartValue, Index); - - case IK_PtrInduction: - assert(Index->getType() == StepValue->getType() && - "Index type does not match StepValue type"); - if (StepValue->isMinusOne()) - Index = B.CreateNeg(Index); - else if (!StepValue->isOne()) - Index = B.CreateMul(Index, StepValue); - return B.CreateGEP(nullptr, StartValue, Index); - - case IK_NoInduction: - return nullptr; - } - llvm_unreachable("invalid enum"); - } - - /// Start value. - TrackingVH StartValue; - /// Induction kind. - InductionKind IK; - /// Step value. - ConstantInt *StepValue; - }; + TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), + InterleaveInfo(SE, L, DT, Preds), Induction(nullptr), + WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R), Hints(H), + Preds(Preds) {} /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. @@ -1175,7 +1208,7 @@ public: /// InductionList saves induction variables and maps them to the /// induction descriptor. - typedef MapVector InductionList; + typedef MapVector InductionList; /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this @@ -1197,6 +1230,9 @@ public: /// Returns True if V is an induction variable in this loop. bool isInductionVariable(const Value *V); + /// Returns True if PN is a reduction variable in this loop. + bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); } + /// Return true if the block BB needs to be predicated in order for the loop /// to be vectorized. bool blockNeedsPredication(BasicBlock *BB); @@ -1248,12 +1284,12 @@ public: /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. bool isLegalMaskedStore(Type *DataType, Value *Ptr) { - return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr)); + return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { - return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr)); + return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType); } /// Returns true if vector representation of the instruction \p I /// requires mask. @@ -1293,10 +1329,6 @@ private: /// and we know that we can read from them without segfault. bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl &SafePtrs); - /// Returns the induction kind of Phi and record the step. This function may - /// return NoInduction if the PHI is not an induction variable. - InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue); - /// \brief Collect memory access with loop invariant strides. /// /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop @@ -1370,7 +1402,14 @@ private: /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic - SmallPtrSet MaskedOp; + SmallPtrSet MaskedOp; + + /// The SCEV predicate containing all the SCEV-related assumptions. + /// The predicate is used to simplify SCEV expressions in the + /// context of existing SCEV assumptions. The analysis will also + /// add a minimal set of new predicates if this is required to + /// enable vectorization/unrolling. + SCEVUnionPredicate &Preds; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1385,12 +1424,13 @@ public: LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, - const TargetLibraryInfo *TLI, AssumptionCache *AC, - const Function *F, const LoopVectorizeHints *Hints) - : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), - TheFunction(F), Hints(Hints) { - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - } + const TargetLibraryInfo *TLI, DemandedBits *DB, + AssumptionCache *AC, const Function *F, + const LoopVectorizeHints *Hints, + SmallPtrSetImpl &ValuesToIgnore, + SCEVUnionPredicate &Preds) + : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), + TheFunction(F), Hints(Hints), ValuesToIgnore(ValuesToIgnore) {} /// Information about vectorization costs struct VectorizationFactor { @@ -1403,10 +1443,10 @@ public: /// possible. VectorizationFactor selectVectorizationFactor(bool OptForSize); - /// \return The size (in bits) of the widest type in the code that - /// needs to be vectorized. We ignore values that remain scalar such as + /// \return The size (in bits) of the smallest and widest types in the code + /// that needs to be vectorized. We ignore values that remain scalar such as /// 64 bit loop indices. - unsigned getWidestType(); + std::pair getSmallestAndWidestTypes(); /// \return The desired interleave count. /// If interleave count has been specified by metadata it will be returned. @@ -1433,8 +1473,10 @@ public: unsigned NumInstructions; }; - /// \return information about the register usage of the loop. - RegisterUsage calculateRegisterUsage(); + /// \return Returns information about the register usages of the loop for the + /// given vectorization factors. + SmallVector + calculateRegisterUsage(const SmallVector &VFs); private: /// Returns the expected execution cost. The unit of the cost does @@ -1459,8 +1501,11 @@ private: emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); } - /// Values used only by @llvm.assume calls. - SmallPtrSet EphValues; +public: + /// Map of scalar integer values to the smallest bitwidth they can be legally + /// represented as. The vector equivalents of these values should be truncated + /// to this type. + MapVector MinBWs; /// The loop that we evaluate. Loop *TheLoop; @@ -1474,9 +1519,13 @@ private: const TargetTransformInfo &TTI; /// Target Library Info. const TargetLibraryInfo *TLI; + /// Demanded bits analysis + DemandedBits *DB; const Function *TheFunction; // Loop Vectorize Hint. const LoopVectorizeHints *Hints; + // Values to ignore in the cost model. + const SmallPtrSetImpl &ValuesToIgnore; }; /// \brief This holds vectorization requirements that must be verified late in @@ -1507,29 +1556,25 @@ public: bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) { const char *Name = Hints.vectorizeAnalysisPassName(); bool Failed = false; - if (UnsafeAlgebraInst && - Hints.getForce() == LoopVectorizeHints::FK_Undefined && - Hints.getWidth() == 0) { + if (UnsafeAlgebraInst && !Hints.allowReordering()) { emitOptimizationRemarkAnalysisFPCommute( F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(), - VectorizationReport() << "vectorization requires changes in the " - "order of operations, however IEEE 754 " - "floating-point operations are not " - "commutative"); + VectorizationReport() << "cannot prove it is safe to reorder " + "floating-point operations"); Failed = true; } - if (NumRuntimePointerChecks > - VectorizerParams::RuntimeMemoryCheckThreshold) { + // Test if runtime memcheck thresholds are exceeded. + bool PragmaThresholdReached = + NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; + bool ThresholdReached = + NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; + if ((ThresholdReached && !Hints.allowReordering()) || + PragmaThresholdReached) { emitOptimizationRemarkAnalysisAliasing( F->getContext(), Name, *F, L->getStartLoc(), VectorizationReport() - << "cannot prove pointers refer to independent arrays in memory. " - "The loop requires " - << NumRuntimePointerChecks - << " runtime independence checks to vectorize the loop, but that " - "would exceed the limit of " - << VectorizerParams::RuntimeMemoryCheckThreshold << " checks"); + << "cannot prove it is safe to reorder memory operations"); DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); Failed = true; } @@ -1568,6 +1613,7 @@ struct LoopVectorize : public FunctionPass { DominatorTree *DT; BlockFrequencyInfo *BFI; TargetLibraryInfo *TLI; + DemandedBits *DB; AliasAnalysis *AA; AssumptionCache *AC; LoopAccessAnalysis *LAA; @@ -1584,9 +1630,10 @@ struct LoopVectorize : public FunctionPass { BFI = &getAnalysis().getBFI(); auto *TLIP = getAnalysisIfAvailable(); TLI = TLIP ? &TLIP->getTLI() : nullptr; - AA = &getAnalysis(); + AA = &getAnalysis().getAAResults(); AC = &getAnalysis().getAssumptionCache(F); LAA = &getAnalysis(); + DB = &getAnalysis(); // Compute some weights outside of the loop over the loops. Compute this // using a BranchProbability to re-use its scaling math. @@ -1711,18 +1758,31 @@ struct LoopVectorize : public FunctionPass { } } + SCEVUnionPredicate Preds; + // Check if it is legal to vectorize the loop. LoopVectorizationRequirements Requirements; LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA, - &Requirements, &Hints); + &Requirements, &Hints, Preds); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); emitMissedWarning(F, L, Hints); return false; } + // Collect values we want to ignore in the cost model. This includes + // type-promoting instructions we identified during reduction detection. + SmallPtrSet ValuesToIgnore; + CodeMetrics::collectEphemeralValues(L, AC, ValuesToIgnore); + for (auto &Reduction : *LVL.getReductionVars()) { + RecurrenceDescriptor &RedDes = Reduction.second; + SmallPtrSetImpl &Casts = RedDes.getCastInsts(); + ValuesToIgnore.insert(Casts.begin(), Casts.end()); + } + // Use the cost model. - LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints); + LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, DB, AC, F, &Hints, + ValuesToIgnore, Preds); // Check the function attributes to find out if this function should be // optimized for size. @@ -1833,16 +1893,16 @@ struct LoopVectorize : public FunctionPass { assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop then // interleave it. - InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC); - Unroller.vectorize(&LVL); + InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC, Preds); + Unroller.vectorize(&LVL, CM.MinBWs); emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(), Twine("interleaved loop (interleaved count: ") + Twine(IC) + ")"); } else { // If we decided that it is *legal* to vectorize the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC); - LB.vectorize(&LVL); + InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC, Preds); + LB.vectorize(&LVL, CM.MinBWs); ++LoopsVectorized; // Add metadata to disable runtime unrolling scalar loop when there's no @@ -1874,11 +1934,14 @@ struct LoopVectorize : public FunctionPass { AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addPreserved(); AU.addPreserved(); - AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); } }; @@ -1946,11 +2009,11 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // If this value is a pointer induction variable we know it is consecutive. PHINode *Phi = dyn_cast_or_null(Ptr); if (Phi && Inductions.count(Phi)) { - InductionInfo II = Inductions[Phi]; + InductionDescriptor II = Inductions[Phi]; return II.getConsecutiveDirection(); } - GetElementPtrInst *Gep = dyn_cast_or_null(Ptr); + GetElementPtrInst *Gep = getGEPInstruction(Ptr); if (!Gep) return 0; @@ -1971,7 +2034,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) return 0; - InductionInfo II = Inductions[Phi]; + InductionDescriptor II = Inductions[Phi]; return II.getConsecutiveDirection(); } @@ -1999,7 +2062,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // %idxprom = zext i32 %mul to i64 << Safe cast. // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom // - Last = replaceSymbolicStrideSCEV(SE, Strides, + Last = replaceSymbolicStrideSCEV(SE, Strides, Preds, Gep->getOperand(InductionOperand), Gep); if (const SCEVCastExpr *C = dyn_cast(Last)) Last = @@ -2343,7 +2406,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { VectorParts &Entry = WidenMap.get(Instr); // Handle consecutive loads/stores. - GetElementPtrInst *Gep = dyn_cast(Ptr); + GetElementPtrInst *Gep = getGEPInstruction(Ptr); if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { setDebugLocFromInst(Builder, Gep); Value *PtrOperand = Gep->getPointerOperand(); @@ -2403,7 +2466,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // We don't want to update the value in the map as it might be used in // another expression. So don't use a reference type for "StoredVal". VectorParts StoredVal = getVectorValue(SI->getValueOperand()); - + for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. Value *PartPtr = @@ -2464,7 +2527,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { } } -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) { +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, + bool IfPredicateStore) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. SmallVector Params; @@ -2509,19 +2573,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); - Instruction *InsertPt = Builder.GetInsertPoint(); - BasicBlock *IfBlock = Builder.GetInsertBlock(); - BasicBlock *CondBlock = nullptr; - VectorParts Cond; - Loop *VectorLp = nullptr; if (IfPredicateStore) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), Instr->getParent()); - VectorLp = LI->getLoopFor(IfBlock); - assert(VectorLp && "Must have a loop for this block"); } // For each vector unroll 'part': @@ -2533,12 +2590,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic Value *Cmp = nullptr; if (IfPredicateStore) { Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); - Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); - CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); - LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, *LI); - // Update Builder with newly created basic block. - Builder.SetInsertPoint(InsertPt); + Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, + ConstantInt::get(Cmp->getType(), 1)); } Instruction *Cloned = Instr->clone(); @@ -2562,65 +2615,200 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, Builder.getInt32(Width)); // End if-block. - if (IfPredicateStore) { - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); - Builder.SetInsertPoint(InsertPt); - ReplaceInstWithInst(IfBlock->getTerminator(), - BranchInst::Create(CondBlock, NewIfBlock, Cmp)); - IfBlock = NewIfBlock; - } + if (IfPredicateStore) + PredicatedStores.push_back(std::make_pair(cast(Cloned), + Cmp)); } } } -static Instruction *getFirstInst(Instruction *FirstInst, Value *V, - Instruction *Loc) { - if (FirstInst) - return FirstInst; - if (Instruction *I = dyn_cast(V)) - return I->getParent() == Loc->getParent() ? I : nullptr; - return nullptr; +PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, + Value *End, Value *Step, + Instruction *DL) { + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + // As we're just creating this loop, it's possible no latch exists + // yet. If so, use the header as this will be a single block loop. + if (!Latch) + Latch = Header; + + IRBuilder<> Builder(&*Header->getFirstInsertionPt()); + setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); + auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); + + Builder.SetInsertPoint(Latch->getTerminator()); + + // Create i+1 and fill the PHINode. + Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); + Induction->addIncoming(Start, L->getLoopPreheader()); + Induction->addIncoming(Next, Latch); + // Create the compare. + Value *ICmp = Builder.CreateICmpEQ(Next, End); + Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); + + // Now we have two terminators. Remove the old one from the block. + Latch->getTerminator()->eraseFromParent(); + + return Induction; } -std::pair -InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { - Instruction *tnullptr = nullptr; - if (!Legal->mustCheckStrides()) - return std::pair(tnullptr, tnullptr); - - IRBuilder<> ChkBuilder(Loc); - - // Emit checks. - Value *Check = nullptr; - Instruction *FirstInst = nullptr; - for (SmallPtrSet::iterator SI = Legal->strides_begin(), - SE = Legal->strides_end(); - SI != SE; ++SI) { - Value *Ptr = stripIntegerCast(*SI); - Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1), - "stride.chk"); - // Store the first instruction we create. - FirstInst = getFirstInst(FirstInst, C, Loc); - if (Check) - Check = ChkBuilder.CreateOr(Check, C); - else - Check = C; - } +Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { + if (TripCount) + return TripCount; - // We have to do this trickery because the IRBuilder might fold the check to a - // constant expression in which case there is no Instruction anchored in a - // the block. - LLVMContext &Ctx = Loc->getContext(); - Instruction *TheCheck = - BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx)); - ChkBuilder.Insert(TheCheck, "stride.not.one"); - FirstInst = getFirstInst(FirstInst, TheCheck, Loc); + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + // Find the loop boundaries. + const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(OrigLoop); + assert(BackedgeTakenCount != SE->getCouldNotCompute() && + "Invalid loop count"); + + Type *IdxTy = Legal->getWidestInductionType(); + + // The exit count might have the type of i64 while the phi is i32. This can + // happen if we have an induction variable that is sign extended before the + // compare. The only way that we get a backedge taken count is that the + // induction variable was signed and as such will not overflow. In such a case + // truncation is legal. + if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > + IdxTy->getPrimitiveSizeInBits()) + BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); + BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); + + // Get the total trip count from the count by adding 1. + const SCEV *ExitCount = SE->getAddExpr( + BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); - return std::make_pair(FirstInst, TheCheck); + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + + // Expand the trip count and place the new instructions in the preheader. + // Notice that the pre-header does not change, only the loop body. + SCEVExpander Exp(*SE, DL, "induction"); + + // Count holds the overall loop count (N). + TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), + L->getLoopPreheader()->getTerminator()); + + if (TripCount->getType()->isPointerTy()) + TripCount = + CastInst::CreatePointerCast(TripCount, IdxTy, + "exitcount.ptrcnt.to.int", + L->getLoopPreheader()->getTerminator()); + + return TripCount; +} + +Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { + if (VectorTripCount) + return VectorTripCount; + + Value *TC = getOrCreateTripCount(L); + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + + // Now we need to generate the expression for N - (N % VF), which is + // the part that the vectorized body will execute. + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Constant *Step = ConstantInt::get(TC->getType(), VF * UF); + Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); + VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); + + return VectorTripCount; } +void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, + BasicBlock *Bypass) { + Value *Count = getOrCreateTripCount(L); + BasicBlock *BB = L->getLoopPreheader(); + IRBuilder<> Builder(BB->getTerminator()); + + // Generate code to check that the loop's trip count that we computed by + // adding one to the backedge-taken count will not overflow. + Value *CheckMinIters = + Builder.CreateICmpULT(Count, + ConstantInt::get(Count->getType(), VF * UF), + "min.iters.check"); + + BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), + "min.iters.checked"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, CheckMinIters)); + LoopBypassBlocks.push_back(BB); +} + +void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L, + BasicBlock *Bypass) { + Value *TC = getOrCreateVectorTripCount(L); + BasicBlock *BB = L->getLoopPreheader(); + IRBuilder<> Builder(BB->getTerminator()); + + // Now, compare the new count to zero. If it is zero skip the vector loop and + // jump to the scalar loop. + Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()), + "cmp.zero"); + + // Generate code to check that the loop's trip count that we computed by + // adding one to the backedge-taken count will not overflow. + BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), + "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, Cmp)); + LoopBypassBlocks.push_back(BB); +} + +void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { + BasicBlock *BB = L->getLoopPreheader(); + + // Generate the code to check that the SCEV assumptions that we made. + // We want the new basic block to start at the first instruction in a + // sequence of instructions that form a check. + SCEVExpander Exp(*SE, Bypass->getModule()->getDataLayout(), "scev.check"); + Value *SCEVCheck = Exp.expandCodeForPredicate(&Preds, BB->getTerminator()); + + if (auto *C = dyn_cast(SCEVCheck)) + if (C->isZero()) + return; + + // Create a new block containing the stride check. + BB->setName("vector.scevcheck"); + auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, SCEVCheck)); + LoopBypassBlocks.push_back(BB); + AddedSafetyChecks = true; +} + +void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, + BasicBlock *Bypass) { + BasicBlock *BB = L->getLoopPreheader(); + + // Generate the code that checks in runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + Instruction *FirstCheckInst; + Instruction *MemRuntimeCheck; + std::tie(FirstCheckInst, MemRuntimeCheck) = + Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); + if (!MemRuntimeCheck) + return; + + // Create a new block containing the memory check. + BB->setName("vector.memcheck"); + auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); + LoopBypassBlocks.push_back(BB); + AddedSafetyChecks = true; +} + + void InnerLoopVectorizer::createEmptyLoop() { /* In this function we generate a new loop. The new loop will contain @@ -2634,13 +2822,13 @@ void InnerLoopVectorizer::createEmptyLoop() { | / | | / v || [ ] <-- vector pre header. - || | - || v - || [ ] \ - || [ ]_| <-- vector loop. - || | - | \ v - | >[ ] <--- middle-block. + |/ | + | v + | [ ] \ + | [ ]_| <-- vector loop. + | | + | v + | -[ ] <--- middle-block. | / | | / v -|- >[ ] <--- new preheader. @@ -2664,69 +2852,16 @@ void InnerLoopVectorizer::createEmptyLoop() { // don't. One example is c++ iterators that often have multiple pointer // induction variables. In the code below we also support a case where we // don't have a single induction variable. + // + // We try to obtain an induction variable from the original loop as hard + // as possible. However if we don't find one that: + // - is an integer + // - counts from zero, stepping by one + // - is the size of the widest induction variable type + // then we create a new one. OldInduction = Legal->getInduction(); Type *IdxTy = Legal->getWidestInductionType(); - // Find the loop boundaries. - const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); - assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); - - // The exit count might have the type of i64 while the phi is i32. This can - // happen if we have an induction variable that is sign extended before the - // compare. The only way that we get a backedge taken count is that the - // induction variable was signed and as such will not overflow. In such a case - // truncation is legal. - if (ExitCount->getType()->getPrimitiveSizeInBits() > - IdxTy->getPrimitiveSizeInBits()) - ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); - - const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); - // Get the total trip count from the count by adding 1. - ExitCount = SE->getAddExpr(BackedgeTakeCount, - SE->getConstant(BackedgeTakeCount->getType(), 1)); - - const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout(); - - // Expand the trip count and place the new instructions in the preheader. - // Notice that the pre-header does not change, only the loop body. - SCEVExpander Exp(*SE, DL, "induction"); - - // The loop minimum iterations check below is to ensure the loop has enough - // trip count so the generated vector loop will likely be executed and the - // preparation and rounding-off costs will likely be worthy. - // - // The minimum iteration check also covers case where the backedge-taken - // count is uint##_max. Adding one to it will cause overflow and an - // incorrect loop trip count being generated in the vector body. In this - // case we also want to directly jump to the scalar remainder loop. - Value *ExitCountValue = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - VectorPH->getTerminator()); - if (ExitCountValue->getType()->isPointerTy()) - ExitCountValue = CastInst::CreatePointerCast(ExitCountValue, IdxTy, - "exitcount.ptrcnt.to.int", - VectorPH->getTerminator()); - - Instruction *CheckMinIters = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, ExitCountValue, - ConstantInt::get(ExitCountValue->getType(), VF * UF), - "min.iters.check", VectorPH->getTerminator()); - - // The loop index does not have to start at Zero. Find the original start - // value from the induction PHI node. If we don't have an induction variable - // then we know that it starts at zero. - Builder.SetInsertPoint(VectorPH->getTerminator()); - Value *StartIdx = ExtendedIdx = - OldInduction - ? Builder.CreateZExt(OldInduction->getIncomingValueForBlock(VectorPH), - IdxTy) - : ConstantInt::get(IdxTy, 0); - - // Count holds the overall loop count (N). - Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - VectorPH->getTerminator()); - - LoopBypassBlocks.push_back(VectorPH); - // Split the single block loop into the two loop structure described above. BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); @@ -2750,117 +2885,36 @@ void InnerLoopVectorizer::createEmptyLoop() { } Lp->addBasicBlockToLoop(VecBody, *LI); - // Use this IR builder to create the loop instructions (Phi, Br, Cmp) - // inside the loop. - Builder.SetInsertPoint(VecBody->getFirstNonPHI()); - - // Generate the induction variable. - setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); - Induction = Builder.CreatePHI(IdxTy, 2, "index"); - // The loop step is equal to the vectorization factor (num of SIMD elements) - // times the unroll factor (num of SIMD instructions). - Constant *Step = ConstantInt::get(IdxTy, VF * UF); - - // Generate code to check that the loop's trip count is not less than the - // minimum loop iteration number threshold. - BasicBlock *NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "min.iters.checked"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - ReplaceInstWithInst(VectorPH->getTerminator(), - BranchInst::Create(ScalarPH, NewVectorPH, CheckMinIters)); - VectorPH = NewVectorPH; - - // This is the IR builder that we use to add all of the logic for bypassing - // the new vector loop. - IRBuilder<> BypassBuilder(VectorPH->getTerminator()); - setDebugLocFromInst(BypassBuilder, - getDebugLocFromInstOrOperands(OldInduction)); - - // We may need to extend the index in case there is a type mismatch. - // We know that the count starts at zero and does not overflow. - if (Count->getType() != IdxTy) { - // The exit count can be of pointer type. Convert it to the correct - // integer type. - if (ExitCount->getType()->isPointerTy()) - Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int"); - else - Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast"); - } - - // Add the start index to the loop count to get the new end index. - Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); + // Find the loop boundaries. + Value *Count = getOrCreateTripCount(Lp); - // Now we need to generate the expression for N - (N % VF), which is - // the part that the vectorized body will execute. - Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); - Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); - Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, - "end.idx.rnd.down"); + Value *StartIdx = ConstantInt::get(IdxTy, 0); + // We need to test whether the backedge-taken count is uint##_max. Adding one + // to it will cause overflow and an incorrect loop trip count in the vector + // body. In case of overflow we want to directly jump to the scalar remainder + // loop. + emitMinimumIterationCountCheck(Lp, ScalarPH); // Now, compare the new count to zero. If it is zero skip the vector loop and // jump to the scalar loop. - Value *Cmp = - BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - ReplaceInstWithInst(VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, Cmp)); - VectorPH = NewVectorPH; - - // Generate the code to check that the strides we assumed to be one are really - // one. We want the new basic block to start at the first instruction in a - // sequence of instructions that form a check. - Instruction *StrideCheck; - Instruction *FirstCheckInst; - std::tie(FirstCheckInst, StrideCheck) = - addStrideCheck(VectorPH->getTerminator()); - if (StrideCheck) { - AddedSafetyChecks = true; - // Create a new block containing the stride check. - VectorPH->setName("vector.stridecheck"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - - // Replace the branch into the memory check block with a conditional branch - // for the "few elements case". - ReplaceInstWithInst( - VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, StrideCheck)); - - VectorPH = NewVectorPH; - } + emitVectorLoopEnteredCheck(Lp, ScalarPH); + // Generate the code to check any assumptions that we've made for SCEV + // expressions. + emitSCEVChecks(Lp, ScalarPH); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - Instruction *MemRuntimeCheck; - std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeChecks(VectorPH->getTerminator()); - if (MemRuntimeCheck) { - AddedSafetyChecks = true; - // Create a new block containing the memory check. - VectorPH->setName("vector.memcheck"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - - // Replace the branch into the memory check block with a conditional branch - // for the "few elements case". - ReplaceInstWithInst( - VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, MemRuntimeCheck)); - - VectorPH = NewVectorPH; - } + emitMemRuntimeChecks(Lp, ScalarPH); + + // Generate the induction variable. + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + Constant *Step = ConstantInt::get(IdxTy, VF * UF); + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the @@ -2870,152 +2924,60 @@ void InnerLoopVectorizer::createEmptyLoop() { // If we come from a bypass edge then we need to start from the original // start value. - // This variable saves the new starting index for the scalar loop. - PHINode *ResumeIndex = nullptr; + // This variable saves the new starting index for the scalar loop. It is used + // to test if there are any tail iterations left once the vector loop has + // completed. LoopVectorizationLegality::InductionList::iterator I, E; LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); - // Set builder to point to last bypass block. - BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); for (I = List->begin(), E = List->end(); I != E; ++I) { PHINode *OrigPhi = I->first; - LoopVectorizationLegality::InductionInfo II = I->second; - - Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType(); - PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val", - MiddleBlock->getTerminator()); - // We might have extended the type of the induction variable but we need a - // truncated version for the scalar loop. - PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? - PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", - MiddleBlock->getTerminator()) : nullptr; + InductionDescriptor II = I->second; // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val", + PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, + "bc.resume.val", ScalarPH->getTerminator()); - BCResumeVal->addIncoming(ResumeVal, MiddleBlock); - - PHINode *BCTruncResumeVal = nullptr; + Value *EndValue; if (OrigPhi == OldInduction) { - BCTruncResumeVal = - PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val", - ScalarPH->getTerminator()); - BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock); - } - - Value *EndValue = nullptr; - switch (II.IK) { - case LoopVectorizationLegality::IK_NoInduction: - llvm_unreachable("Unknown induction"); - case LoopVectorizationLegality::IK_IntInduction: { - // Handle the integer induction counter. - assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); - - // We have the canonical induction variable. - if (OrigPhi == OldInduction) { - // Create a truncated version of the resume value for the scalar loop, - // we might have promoted the type to a larger width. - EndValue = - BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); - // The new PHI merges the original incoming value, in case of a bypass, - // or the value at the end of the vectorized loop. - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); - TruncResumeVal->addIncoming(EndValue, VecBody); - - BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); - - // We know what the end value is. - EndValue = IdxEndRoundDown; - // We also know which PHI node holds it. - ResumeIndex = ResumeVal; - break; - } - - // Not the canonical induction variable - add the vector loop count to the - // start value. - Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, - II.StartValue->getType(), - "cast.crd"); - EndValue = II.transform(BypassBuilder, CRD); + // We know what the end value is. + EndValue = CountRoundDown; + } else { + IRBuilder<> B(LoopBypassBlocks.back()->getTerminator()); + Value *CRD = B.CreateSExtOrTrunc(CountRoundDown, + II.getStepValue()->getType(), + "cast.crd"); + EndValue = II.transform(B, CRD); EndValue->setName("ind.end"); - break; - } - case LoopVectorizationLegality::IK_PtrInduction: { - Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, - II.StepValue->getType(), - "cast.crd"); - EndValue = II.transform(BypassBuilder, CRD); - EndValue->setName("ptr.ind.end"); - break; } - }// end of case // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) { - if (OrigPhi == OldInduction) - ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); - else - ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); - } - ResumeVal->addIncoming(EndValue, VecBody); + BCResumeVal->addIncoming(EndValue, MiddleBlock); // Fix the scalar body counter (PHI node). unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); // The old induction's phi node in the scalar body needs the truncated // value. - if (OrigPhi == OldInduction) { - BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]); - OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal); - } else { - BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); - OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); - } - } - - // If we are generating a new induction variable then we also need to - // generate the code that calculates the exit value. This value is not - // simply the end of the counter because we may skip the vectorized body - // in case of a runtime check. - if (!OldInduction){ - assert(!ResumeIndex && "Unexpected resume value found"); - ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", - MiddleBlock->getTerminator()); - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); - ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]); + OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); } - // Make sure that we found the index where scalar loop needs to continue. - assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && - "Invalid resume Index"); - // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. - Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, - ResumeIndex, "cmp.n", + Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, + CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); ReplaceInstWithInst(MiddleBlock->getTerminator(), BranchInst::Create(ExitBlock, ScalarPH, CmpN)); - // Create i+1 and fill the PHINode. - Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); - Induction->addIncoming(StartIdx, VectorPH); - Induction->addIncoming(NextIdx, VecBody); - // Create the compare. - Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); - Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); - - // Now we have two terminators. Remove the old one from the block. - VecBody->getTerminator()->eraseFromParent(); - // Get ready to start creating new instructions into the vectorized body. - Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); // Save the state. - LoopVectorPreHeader = VectorPH; + LoopVectorPreHeader = Lp->getLoopPreheader(); LoopScalarPreHeader = ScalarPH; LoopMiddleBlock = MiddleBlock; LoopExitBlock = ExitBlock; @@ -3068,7 +3030,7 @@ static void cse(SmallVector &BBs) { for (unsigned i = 0, e = BBs.size(); i != e; ++i) { BasicBlock *BB = BBs[i]; for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { - Instruction *In = I++; + Instruction *In = &*I++; if (!CSEDenseMapInfo::canHandle(In)) continue; @@ -3190,6 +3152,117 @@ static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); } +static Type *smallestIntegerVectorType(Type *T1, Type *T2) { + IntegerType *I1 = cast(T1->getVectorElementType()); + IntegerType *I2 = cast(T2->getVectorElementType()); + return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; +} +static Type *largestIntegerVectorType(Type *T1, Type *T2) { + IntegerType *I1 = cast(T1->getVectorElementType()); + IntegerType *I2 = cast(T2->getVectorElementType()); + return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; +} + +void InnerLoopVectorizer::truncateToMinimalBitwidths() { + // For every instruction `I` in MinBWs, truncate the operands, create a + // truncated version of `I` and reextend its result. InstCombine runs + // later and will remove any ext/trunc pairs. + // + for (auto &KV : MinBWs) { + VectorParts &Parts = WidenMap.get(KV.first); + for (Value *&I : Parts) { + if (I->use_empty()) + continue; + Type *OriginalTy = I->getType(); + Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(), + KV.second); + Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, + OriginalTy->getVectorNumElements()); + if (TruncatedTy == OriginalTy) + continue; + + IRBuilder<> B(cast(I)); + auto ShrinkOperand = [&](Value *V) -> Value* { + if (auto *ZI = dyn_cast(V)) + if (ZI->getSrcTy() == TruncatedTy) + return ZI->getOperand(0); + return B.CreateZExtOrTrunc(V, TruncatedTy); + }; + + // The actual instruction modification depends on the instruction type, + // unfortunately. + Value *NewI = nullptr; + if (BinaryOperator *BO = dyn_cast(I)) { + NewI = B.CreateBinOp(BO->getOpcode(), + ShrinkOperand(BO->getOperand(0)), + ShrinkOperand(BO->getOperand(1))); + cast(NewI)->copyIRFlags(I); + } else if (ICmpInst *CI = dyn_cast(I)) { + NewI = B.CreateICmp(CI->getPredicate(), + ShrinkOperand(CI->getOperand(0)), + ShrinkOperand(CI->getOperand(1))); + } else if (SelectInst *SI = dyn_cast(I)) { + NewI = B.CreateSelect(SI->getCondition(), + ShrinkOperand(SI->getTrueValue()), + ShrinkOperand(SI->getFalseValue())); + } else if (CastInst *CI = dyn_cast(I)) { + switch (CI->getOpcode()) { + default: llvm_unreachable("Unhandled cast!"); + case Instruction::Trunc: + NewI = ShrinkOperand(CI->getOperand(0)); + break; + case Instruction::SExt: + NewI = B.CreateSExtOrTrunc(CI->getOperand(0), + smallestIntegerVectorType(OriginalTy, + TruncatedTy)); + break; + case Instruction::ZExt: + NewI = B.CreateZExtOrTrunc(CI->getOperand(0), + smallestIntegerVectorType(OriginalTy, + TruncatedTy)); + break; + } + } else if (ShuffleVectorInst *SI = dyn_cast(I)) { + auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); + auto *O0 = + B.CreateZExtOrTrunc(SI->getOperand(0), + VectorType::get(ScalarTruncatedTy, Elements0)); + auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); + auto *O1 = + B.CreateZExtOrTrunc(SI->getOperand(1), + VectorType::get(ScalarTruncatedTy, Elements1)); + + NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); + } else if (isa(I)) { + // Don't do anything with the operands, just extend the result. + continue; + } else { + llvm_unreachable("Unhandled instruction type!"); + } + + // Lastly, extend the result. + NewI->takeName(cast(I)); + Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); + I->replaceAllUsesWith(Res); + cast(I)->eraseFromParent(); + I = Res; + } + } + + // We'll have created a bunch of ZExts that are now parentless. Clean up. + for (auto &KV : MinBWs) { + VectorParts &Parts = WidenMap.get(KV.first); + for (Value *&I : Parts) { + ZExtInst *Inst = dyn_cast(I); + if (Inst && Inst->use_empty()) { + Value *NewI = Inst->getOperand(0); + Inst->eraseFromParent(); + I = NewI; + } + } + } +} + void InnerLoopVectorizer::vectorizeLoop() { //===------------------------------------------------===// // @@ -3220,6 +3293,11 @@ void InnerLoopVectorizer::vectorizeLoop() { be = DFS.endRPO(); bb != be; ++bb) vectorizeBlockInLoop(*bb, &RdxPHIsToFix); + // Insert truncates and extends for any truncated instructions as hints to + // InstCombine. + if (VF > 1) + truncateToMinimalBitwidths(); + // At this point every instruction in the original loop is widened to // a vector form. We are almost done. Now, we need to fix the PHI nodes // that we vectorized. The PHI nodes are currently empty because we did @@ -3235,7 +3313,7 @@ void InnerLoopVectorizer::vectorizeLoop() { assert(RdxPhi && "Unable to recover vectorized PHI"); // Find the reduction variable descriptor. - assert(Legal->getReductionVars()->count(RdxPhi) && + assert(Legal->isReductionVariable(RdxPhi) && "Unable to find the reduction variable"); RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; @@ -3310,21 +3388,33 @@ void InnerLoopVectorizer::vectorizeLoop() { // the PHIs and the values we are going to write. // This allows us to write both PHINodes and the extractelement // instructions. - Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - VectorParts RdxParts; + VectorParts RdxParts = getVectorValue(LoopExitInst); setDebugLocFromInst(Builder, LoopExitInst); - for (unsigned part = 0; part < UF; ++part) { - // This PHINode contains the vectorized reduction variable, or - // the initial value vector, if we bypass the vector loop. - VectorParts &RdxExitVal = getVectorValue(LoopExitInst); - PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); - Value *StartVal = (part == 0) ? VectorStart : Identity; - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); - NewPhi->addIncoming(RdxExitVal[part], - LoopVectorBody.back()); - RdxParts.push_back(NewPhi); + + // If the vector reduction can be performed in a smaller type, we truncate + // then extend the loop exit value to enable InstCombine to evaluate the + // entire expression in the smaller type. + if (VF > 1 && RdxPhi->getType() != RdxDesc.getRecurrenceType()) { + Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); + Builder.SetInsertPoint(LoopVectorBody.back()->getTerminator()); + for (unsigned part = 0; part < UF; ++part) { + Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy); + Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) + : Builder.CreateZExt(Trunc, VecTy); + for (Value::user_iterator UI = RdxParts[part]->user_begin(); + UI != RdxParts[part]->user_end();) + if (*UI != Trunc) { + (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd); + RdxParts[part] = Extnd; + } else { + ++UI; + } + } + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); + for (unsigned part = 0; part < UF; ++part) + RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy); } // Reduce all of the unrolled parts into a single vector. @@ -3377,13 +3467,22 @@ void InnerLoopVectorizer::vectorizeLoop() { // The result is in the first element of the vector. ReducedPartRdx = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); + + // If the reduction can be performed in a smaller type, we need to extend + // the reduction to the wider type before we branch to the original loop. + if (RdxPhi->getType() != RdxDesc.getRecurrenceType()) + ReducedPartRdx = + RdxDesc.isSigned() + ? Builder.CreateSExt(ReducedPartRdx, RdxPhi->getType()) + : Builder.CreateZExt(ReducedPartRdx, RdxPhi->getType()); } // Create a phi node that merges control-flow from the backedge-taken check // block and the middle block. PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx", LoopScalarPreHeader->getTerminator()); - BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); // Now, we need to fix the users of the reduction variable @@ -3421,6 +3520,20 @@ void InnerLoopVectorizer::vectorizeLoop() { fixLCSSAPHIs(); + // Make sure DomTree is updated. + updateAnalysis(); + + // Predicate any stores. + for (auto KV : PredicatedStores) { + BasicBlock::iterator I(KV.first); + auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI); + auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, + /*BranchWeights=*/nullptr, DT); + I->moveBefore(T); + I->getParent()->setName("pred.store.if"); + BB->setName("pred.store.continue"); + } + DEBUG(DT->verifyDomTree()); // Remove redundant induction instructions. cse(LoopVectorBody); } @@ -3495,18 +3608,18 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { return BlockMask; } -void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, - InnerLoopVectorizer::VectorParts &Entry, - unsigned UF, unsigned VF, PhiVector *PV) { +void InnerLoopVectorizer::widenPHIInstruction( + Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF, + unsigned VF, PhiVector *PV) { PHINode* P = cast(PN); // Handle reduction variables: - if (Legal->getReductionVars()->count(P)) { + if (Legal->isReductionVariable(P)) { for (unsigned part = 0; part < UF; ++part) { // This is phase one of vectorizing PHIs. Type *VecTy = (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); - Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", - LoopVectorBody.back()-> getFirstInsertionPt()); + Entry[part] = PHINode::Create( + VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt()); } PV->push_back(P); return; @@ -3554,53 +3667,44 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); - LoopVectorizationLegality::InductionInfo II = - Legal->getInductionVars()->lookup(P); + InductionDescriptor II = Legal->getInductionVars()->lookup(P); // FIXME: The newly created binary instructions should contain nsw/nuw flags, // which can be found from the original scalar operations. - switch (II.IK) { - case LoopVectorizationLegality::IK_NoInduction: + switch (II.getKind()) { + case InductionDescriptor::IK_NoInduction: llvm_unreachable("Unknown induction"); - case LoopVectorizationLegality::IK_IntInduction: { - assert(P->getType() == II.StartValue->getType() && "Types must match"); - Type *PhiTy = P->getType(); - Value *Broadcasted; - if (P == OldInduction) { - // Handle the canonical induction variable. We might have had to - // extend the type. - Broadcasted = Builder.CreateTrunc(Induction, PhiTy); - } else { - // Handle other induction variables that are now based on the - // canonical one. - Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, - "normalized.idx"); - NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); - Broadcasted = II.transform(Builder, NormalizedIdx); - Broadcasted->setName("offset.idx"); + case InductionDescriptor::IK_IntInduction: { + assert(P->getType() == II.getStartValue()->getType() && + "Types must match"); + // Handle other induction variables that are now based on the + // canonical one. + Value *V = Induction; + if (P != OldInduction) { + V = Builder.CreateSExtOrTrunc(Induction, P->getType()); + V = II.transform(Builder, V); + V->setName("offset.idx"); } - Broadcasted = getBroadcastInstrs(Broadcasted); + Value *Broadcasted = getBroadcastInstrs(V); // After broadcasting the induction variable we need to make the vector // consecutive by adding 0, 1, 2, etc. for (unsigned part = 0; part < UF; ++part) - Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue); + Entry[part] = getStepVector(Broadcasted, VF * part, II.getStepValue()); return; } - case LoopVectorizationLegality::IK_PtrInduction: + case InductionDescriptor::IK_PtrInduction: // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); // This is the normalized GEP that starts counting at zero. - Value *NormalizedIdx = - Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx"); - NormalizedIdx = - Builder.CreateSExtOrTrunc(NormalizedIdx, II.StepValue->getType()); + Value *PtrInd = Induction; + PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStepValue()->getType()); // This is the vector of results. Notice that we don't generate // vector geps because scalar geps result in better code. for (unsigned part = 0; part < UF; ++part) { if (VF == 1) { int EltIndex = part; - Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex); - Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); + Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx); SclrGep->setName("next.gep"); Entry[part] = SclrGep; @@ -3610,8 +3714,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); for (unsigned int i = 0; i < VF; ++i) { int EltIndex = i + part * VF; - Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex); - Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); + Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx); SclrGep->setName("next.gep"); VecVal = Builder.CreateInsertElement(VecVal, SclrGep, @@ -3627,7 +3731,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // For each instruction in the old loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - VectorParts &Entry = WidenMap.get(it); + VectorParts &Entry = WidenMap.get(&*it); + switch (it->getOpcode()) { case Instruction::Br: // Nothing to do for PHIs and BR, since we already took care of the @@ -3635,7 +3740,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { continue; case Instruction::PHI: { // Vectorize PHINodes. - widenPHIInstruction(it, Entry, UF, VF, PV); + widenPHIInstruction(&*it, Entry, UF, VF, PV); continue; }// End of PHI. @@ -3673,7 +3778,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = V; } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } case Instruction::Select: { @@ -3682,7 +3787,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // instruction with a scalar condition. Otherwise, use vector-select. bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), OrigLoop); - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. @@ -3691,7 +3796,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { VectorParts &Cond = getVectorValue(it->getOperand(0)); VectorParts &Op0 = getVectorValue(it->getOperand(1)); VectorParts &Op1 = getVectorValue(it->getOperand(2)); - + Value *ScalarCond = (VF == 1) ? Cond[0] : Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); @@ -3702,7 +3807,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Op1[Part]); } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } @@ -3711,25 +3816,27 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Widen compares. Generate vector compares. bool FCmp = (it->getOpcode() == Instruction::FCmp); CmpInst *Cmp = dyn_cast(it); - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); VectorParts &A = getVectorValue(it->getOperand(0)); VectorParts &B = getVectorValue(it->getOperand(1)); for (unsigned Part = 0; Part < UF; ++Part) { Value *C = nullptr; - if (FCmp) + if (FCmp) { C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); - else + cast(C)->copyFastMathFlags(&*it); + } else { C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); + } Entry[Part] = C; } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } case Instruction::Store: case Instruction::Load: - vectorizeMemoryInstruction(it); + vectorizeMemoryInstruction(&*it); break; case Instruction::ZExt: case Instruction::SExt: @@ -3744,7 +3851,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { case Instruction::FPTrunc: case Instruction::BitCast: { CastInst *CI = dyn_cast(it); - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); /// Optimize the special case where the source is the induction /// variable. Notice that we can only optimize the 'trunc' case /// because: a. FP conversions lose precision, b. sext/zext may wrap, @@ -3754,13 +3861,13 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, CI->getType()); Value *Broadcasted = getBroadcastInstrs(ScalarCast); - LoopVectorizationLegality::InductionInfo II = + InductionDescriptor II = Legal->getInductionVars()->lookup(OldInduction); - Constant *Step = - ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue()); + Constant *Step = ConstantInt::getSigned( + CI->getType(), II.getStepValue()->getSExtValue()); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } /// Vectorize casts. @@ -3770,7 +3877,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { VectorParts &A = getVectorValue(it->getOperand(0)); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } @@ -3778,7 +3885,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Ignore dbg intrinsics. if (isa(it)) break; - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); Module *M = BB->getParent()->getParent(); CallInst *CI = cast(it); @@ -3794,7 +3901,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || ID == Intrinsic::lifetime_start)) { - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; } // The flag shows whether we use Intrinsic or a usual Call for vectorized @@ -3805,7 +3912,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { bool UseVectorIntrinsic = ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; if (!UseVectorIntrinsic && NeedToScalarize) { - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; } @@ -3846,13 +3953,13 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = Builder.CreateCall(VectorF, Args); } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } default: // All other instructions are unsupported. Scalarize them. - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; }// end of switch. }// end of for_each instr. @@ -3870,19 +3977,12 @@ void InnerLoopVectorizer::updateAnalysis() { DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]); DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back()); - // Due to if predication of stores we might create a sequence of "if(pred) - // a[i] = ...; " blocks. - for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) { - if (i == 0) - DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); - else if (isPredicatedBlock(i)) { - DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]); - } else { - DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]); - } - } + // We don't predicate stores by this point, so the vector body should be a + // single loop. + assert(LoopVectorBody.size() == 1 && "Expected single block loop!"); + DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); - DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]); + DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back()); DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); @@ -4056,7 +4156,19 @@ bool LoopVectorizationLegality::canVectorize() { // Analyze interleaved memory accesses. if (UseInterleaved) - InterleaveInfo.analyzeInterleaving(Strides); + InterleaveInfo.analyzeInterleaving(Strides); + + unsigned SCEVThreshold = VectorizeSCEVCheckThreshold; + if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) + SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; + + if (Preds.getComplexity() > SCEVThreshold) { + emitAnalysis(VectorizationReport() + << "Too many SCEV assumptions need to be made and checked " + << "at runtime"); + DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); + return false; + } // Okay! We can vectorize. At this point we don't have any other mem analysis // which may limit our maximum vectorization factor, so just return true with @@ -4104,7 +4216,6 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, } bool LoopVectorizationLegality::canVectorizeInstrs() { - BasicBlock *PreHeader = TheLoop->getLoopPreheader(); BasicBlock *Header = TheLoop->getHeader(); // Look for the attribute signaling the absence of NaNs. @@ -4128,7 +4239,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && !PhiTy->isPointerTy()) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "loop control flow is not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; @@ -4140,9 +4251,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (*bb != Header) { // Check that this instruction has no outside users or is an // identified reduction value with an outside user. - if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) + if (!hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) continue; - emitAnalysis(VectorizationReport(it) << + emitAnalysis(VectorizationReport(&*it) << "value could not be identified as " "an induction or reduction variable"); return false; @@ -4150,19 +4261,15 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // We only allow if-converted PHIs with exactly two incoming values. if (Phi->getNumIncomingValues() != 2) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "control flow not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; } - // This is the value coming from the preheader. - Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); - ConstantInt *StepValue = nullptr; - // Check if this is an induction variable. - InductionKind IK = isInductionVariable(Phi, StepValue); - - if (IK_NoInduction != IK) { + InductionDescriptor ID; + if (InductionDescriptor::isInductionPHI(Phi, SE, ID)) { + Inductions[Phi] = ID; // Get the widest type. if (!WidestIndTy) WidestIndTy = convertPointerToIntegerType(DL, PhiTy); @@ -4170,21 +4277,24 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); // Int inductions are special because we only allow one IV. - if (IK == IK_IntInduction && StepValue->isOne()) { + if (ID.getKind() == InductionDescriptor::IK_IntInduction && + ID.getStepValue()->isOne() && + isa(ID.getStartValue()) && + cast(ID.getStartValue())->isNullValue()) { // Use the phi node with the widest type as induction. Use the last // one if there are multiple (no good reason for doing this other - // than it is expedient). + // than it is expedient). We've checked that it begins at zero and + // steps by one, so this is a canonical induction variable. if (!Induction || PhiTy == WidestIndTy) Induction = Phi; } DEBUG(dbgs() << "LV: Found an induction variable.\n"); - Inductions[Phi] = InductionInfo(StartValue, IK, StepValue); // Until we explicitly handle the case of an induction variable with // an outside loop user we have to give up vectorizing this loop. - if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(VectorizationReport(it) << + if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) { + emitAnalysis(VectorizationReport(&*it) << "use of induction value outside of the " "loop is not handled by vectorizer"); return false; @@ -4202,7 +4312,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { continue; } - emitAnalysis(VectorizationReport(it) << + emitAnalysis(VectorizationReport(&*it) << "value that could not be identified as " "reduction is used outside the loop"); DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); @@ -4217,8 +4327,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa(CI) && !(CI->getCalledFunction() && TLI && TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { - emitAnalysis(VectorizationReport(it) << - "call instruction cannot be vectorized"); + emitAnalysis(VectorizationReport(&*it) + << "call instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); return false; } @@ -4228,7 +4338,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (CI && hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "intrinsic instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); return false; @@ -4239,7 +4349,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Also, we can't vectorize extractelement instructions. if ((!VectorType::isValidElementType(it->getType()) && !it->getType()->isVoidTy()) || isa(it)) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "instruction return type cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); return false; @@ -4263,8 +4373,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. - if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(VectorizationReport(it) << + if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) { + emitAnalysis(VectorizationReport(&*it) << "value cannot be used outside the loop"); return false; } @@ -4282,6 +4392,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } } + // Now we know the widest induction type, check if our found induction + // is the same size. If it's not, unset it here and InnerLoopVectorizer + // will create another. + if (Induction && WidestIndTy != Induction->getType()) + Induction = nullptr; + return true; } @@ -4320,7 +4436,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { BE = TheLoop->block_end(); B != BE; ++B) for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end(); I != IE; ++I) - if (I->getType()->isPointerTy() && isConsecutivePtr(I)) + if (I->getType()->isPointerTy() && isConsecutivePtr(&*I)) Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); while (!Worklist.empty()) { @@ -4358,24 +4474,11 @@ bool LoopVectorizationLegality::canVectorizeMemory() { } Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); + Preds.add(&LAI->Preds); return true; } -LoopVectorizationLegality::InductionKind -LoopVectorizationLegality::isInductionVariable(PHINode *Phi, - ConstantInt *&StepValue) { - if (!isInductionPHI(Phi, SE, StepValue)) - return IK_NoInduction; - - Type *PhiTy = Phi->getType(); - // Found an Integer induction variable. - if (PhiTy->isIntegerTy()) - return IK_IntInduction; - // Found an Pointer induction variable. - return IK_PtrInduction; -} - bool LoopVectorizationLegality::isInductionVariable(const Value *V) { Value *In0 = const_cast(V); PHINode *PN = dyn_cast_or_null(In0); @@ -4427,8 +4530,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr || !isSinglePredecessor) { - // Build a masked store if it is legal for the target, otherwise scalarize - // the block. + // Build a masked store if it is legal for the target, otherwise + // scalarize the block. bool isLegalMaskedOp = isLegalMaskedStore(SI->getValueOperand()->getType(), SI->getPointerOperand()); @@ -4486,7 +4589,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses( StoreInst *SI = dyn_cast(I); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); - int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides); + int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides, Preds); // The factor of the corresponding interleave group. unsigned Factor = std::abs(Stride); @@ -4495,7 +4598,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses( if (Factor < 2 || Factor > MaxInterleaveGroupFactor) continue; - const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); + const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Preds, Ptr); PointerType *PtrTy = dyn_cast(Ptr->getType()); unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); @@ -4642,7 +4745,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { unsigned TC = SE->getSmallConstantTripCount(TheLoop); DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - unsigned WidestType = getWidestType(); + MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); + unsigned SmallestType, WidestType; + std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); unsigned MaxSafeDepDist = -1U; if (Legal->getMaxSafeDepDistBytes() != -1U) @@ -4650,7 +4755,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { WidestRegister = ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist); unsigned MaxVectorSize = WidestRegister / WidestType; - DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); + + DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / " + << WidestType << " bits.\n"); DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"); @@ -4663,6 +4770,26 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { " into one vector!"); unsigned VF = MaxVectorSize; + if (MaximizeBandwidth && !OptForSize) { + // Collect all viable vectorization factors. + SmallVector VFs; + unsigned NewMaxVectorSize = WidestRegister / SmallestType; + for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2) + VFs.push_back(VS); + + // For each VF calculate its register usage. + auto RUs = calculateRegisterUsage(VFs); + + // Select the largest VF which doesn't require more registers than existing + // ones. + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); + for (int i = RUs.size() - 1; i >= 0; --i) { + if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { + VF = VFs[i]; + break; + } + } + } // If we optimize the program for size, avoid creating the tail loop. if (OptForSize) { @@ -4738,7 +4865,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { return Factor; } -unsigned LoopVectorizationCostModel::getWidestType() { +std::pair +LoopVectorizationCostModel::getSmallestAndWidestTypes() { + unsigned MinWidth = -1U; unsigned MaxWidth = 8; const DataLayout &DL = TheFunction->getParent()->getDataLayout(); @@ -4751,18 +4880,22 @@ unsigned LoopVectorizationCostModel::getWidestType() { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { Type *T = it->getType(); - // Ignore ephemeral values. - if (EphValues.count(it)) + // Skip ignored values. + if (ValuesToIgnore.count(&*it)) continue; // Only examine Loads, Stores and PHINodes. if (!isa(it) && !isa(it) && !isa(it)) continue; - // Examine PHI nodes that are reduction variables. - if (PHINode *PN = dyn_cast(it)) - if (!Legal->getReductionVars()->count(PN)) + // Examine PHI nodes that are reduction variables. Update the type to + // account for the recurrence type. + if (PHINode *PN = dyn_cast(it)) { + if (!Legal->isReductionVariable(PN)) continue; + RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; + T = RdxDesc.getRecurrenceType(); + } // Examine the stored values. if (StoreInst *ST = dyn_cast(it)) @@ -4771,15 +4904,17 @@ unsigned LoopVectorizationCostModel::getWidestType() { // Ignore loaded pointer types and stored pointer types that are not // consecutive. However, we do want to take consecutive stores/loads of // pointer vectors into account. - if (T->isPointerTy() && !isConsecutiveLoadOrStore(it)) + if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it)) continue; + MinWidth = std::min(MinWidth, + (unsigned)DL.getTypeSizeInBits(T->getScalarType())); MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(T->getScalarType())); } } - return MaxWidth; + return {MinWidth, MaxWidth}; } unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, @@ -4825,7 +4960,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, TargetNumRegisters = ForceTargetNumVectorRegs; } - LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage(); + RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); @@ -4923,8 +5058,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, } // Interleave if this is a large loop (small loops are already dealt with by - // this - // point) that could benefit from interleaving. + // this point) that could benefit from interleaving. bool HasReductions = (Legal->getReductionVars()->size() > 0); if (TTI.enableAggressiveInterleaving(HasReductions)) { DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); @@ -4935,8 +5069,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, return 1; } -LoopVectorizationCostModel::RegisterUsage -LoopVectorizationCostModel::calculateRegisterUsage() { +SmallVector +LoopVectorizationCostModel::calculateRegisterUsage( + const SmallVector &VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and @@ -4957,8 +5092,8 @@ LoopVectorizationCostModel::calculateRegisterUsage() { LoopBlocksDFS DFS(TheLoop); DFS.perform(LI); - RegisterUsage R; - R.NumInstructions = 0; + RegisterUsage RU; + RU.NumInstructions = 0; // Each 'key' in the map opens a new interval. The values // of the map are the index of the 'last seen' usage of the @@ -4977,15 +5112,13 @@ LoopVectorizationCostModel::calculateRegisterUsage() { unsigned Index = 0; for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), be = DFS.endRPO(); bb != be; ++bb) { - R.NumInstructions += (*bb)->size(); - for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; - ++it) { - Instruction *I = it; - IdxToInstr[Index++] = I; + RU.NumInstructions += (*bb)->size(); + for (Instruction &I : **bb) { + IdxToInstr[Index++] = &I; // Save the end location of each USE. - for (unsigned i = 0; i < I->getNumOperands(); ++i) { - Value *U = I->getOperand(i); + for (unsigned i = 0; i < I.getNumOperands(); ++i) { + Value *U = I.getOperand(i); Instruction *Instr = dyn_cast(U); // Ignore non-instruction values such as arguments, constants, etc. @@ -5014,42 +5147,81 @@ LoopVectorizationCostModel::calculateRegisterUsage() { TransposeEnds[it->second].push_back(it->first); SmallSet OpenIntervals; - unsigned MaxUsage = 0; + // Get the size of the widest register. + unsigned MaxSafeDepDist = -1U; + if (Legal->getMaxSafeDepDistBytes() != -1U) + MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; + unsigned WidestRegister = + std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist); + const DataLayout &DL = TheFunction->getParent()->getDataLayout(); + + SmallVector RUs(VFs.size()); + SmallVector MaxUsages(VFs.size(), 0); DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); + + // A lambda that gets the register usage for the given type and VF. + auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) { + unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); + return std::max(1, VF * TypeSize / WidestRegister); + }; + for (unsigned int i = 0; i < Index; ++i) { Instruction *I = IdxToInstr[i]; // Ignore instructions that are never used within the loop. if (!Ends.count(I)) continue; - // Ignore ephemeral values. - if (EphValues.count(I)) + // Skip ignored values. + if (ValuesToIgnore.count(I)) continue; // Remove all of the instructions that end at this location. InstrList &List = TransposeEnds[i]; - for (unsigned int j=0, e = List.size(); j < e; ++j) + for (unsigned int j = 0, e = List.size(); j < e; ++j) OpenIntervals.erase(List[j]); - // Count the number of live interals. - MaxUsage = std::max(MaxUsage, OpenIntervals.size()); + // For each VF find the maximum usage of registers. + for (unsigned j = 0, e = VFs.size(); j < e; ++j) { + if (VFs[j] == 1) { + MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); + continue; + } + + // Count the number of live intevals. + unsigned RegUsage = 0; + for (auto Inst : OpenIntervals) + RegUsage += GetRegUsage(Inst->getType(), VFs[j]); + MaxUsages[j] = std::max(MaxUsages[j], RegUsage); + } - DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " << - OpenIntervals.size() << '\n'); + DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " + << OpenIntervals.size() << '\n'); // Add the current instruction to the list of open intervals. OpenIntervals.insert(I); } - unsigned Invariant = LoopInvariants.size(); - DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n'); - DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); - DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n'); + for (unsigned i = 0, e = VFs.size(); i < e; ++i) { + unsigned Invariant = 0; + if (VFs[i] == 1) + Invariant = LoopInvariants.size(); + else { + for (auto Inst : LoopInvariants) + Invariant += GetRegUsage(Inst->getType(), VFs[i]); + } + + DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); + DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); + DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n'); + DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n'); + + RU.LoopInvariantRegs = Invariant; + RU.MaxLocalUsers = MaxUsages[i]; + RUs[i] = RU; + } - R.LoopInvariantRegs = Invariant; - R.MaxLocalUsers = MaxUsage; - return R; + return RUs; } unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { @@ -5067,11 +5239,11 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { if (isa(it)) continue; - // Ignore ephemeral values. - if (EphValues.count(it)) + // Skip ignored values. + if (ValuesToIgnore.count(&*it)) continue; - unsigned C = getInstructionCost(it, VF); + unsigned C = getInstructionCost(&*it, VF); // Check if we should override the cost. if (ForceTargetInstructionCost.getNumOccurrences() > 0) @@ -5148,9 +5320,8 @@ static bool isLikelyComplexAddressComputation(Value *Ptr, } static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { - if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1))) - return true; - return false; + return Legal->hasStride(I->getOperand(0)) || + Legal->hasStride(I->getOperand(1)); } unsigned @@ -5161,6 +5332,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { VF = 1; Type *RetTy = I->getType(); + if (VF > 1 && MinBWs.count(I)) + RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); Type *VectorTy = ToVectorTy(RetTy, VF); // TODO: We need to estimate the cost of intrinsic calls. @@ -5243,6 +5416,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::ICmp: case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); + Instruction *Op0AsInstruction = dyn_cast(I->getOperand(0)); + auto It = MinBWs.find(Op0AsInstruction); + if (VF > 1 && It != MinBWs.end()) + ValTy = IntegerType::get(ValTy->getContext(), It->second); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); } @@ -5366,8 +5543,28 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { Legal->isInductionVariable(I->getOperand(0))) return TTI.getCastInstrCost(I->getOpcode(), I->getType(), I->getOperand(0)->getType()); - - Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); + + Type *SrcScalarTy = I->getOperand(0)->getType(); + Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF); + if (VF > 1 && MinBWs.count(I)) { + // This cast is going to be shrunk. This may remove the cast or it might + // turn it into slightly different cast. For example, if MinBW == 16, + // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". + // + // Calculate the modified src and dest types. + Type *MinVecTy = VectorTy; + if (I->getOpcode() == Instruction::Trunc) { + SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); + VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF), + MinVecTy); + } else if (I->getOpcode() == Instruction::ZExt || + I->getOpcode() == Instruction::SExt) { + SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); + VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF), + MinVecTy); + } + } + return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } case Instruction::Call: { @@ -5407,7 +5604,9 @@ char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) @@ -5416,6 +5615,7 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) +INITIALIZE_PASS_DEPENDENCY(DemandedBits) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { @@ -5483,19 +5683,12 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); - Instruction *InsertPt = Builder.GetInsertPoint(); - BasicBlock *IfBlock = Builder.GetInsertBlock(); - BasicBlock *CondBlock = nullptr; - VectorParts Cond; - Loop *VectorLp = nullptr; if (IfPredicateStore) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), Instr->getParent()); - VectorLp = LI->getLoopFor(IfBlock); - assert(VectorLp && "Must have a loop for this block"); } // For each vector unroll 'part': @@ -5510,11 +5703,6 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part], ConstantInt::get(Cond[Part]->getType(), 1)); - CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); - LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, *LI); - // Update Builder with newly created basic block. - Builder.SetInsertPoint(InsertPt); } Instruction *Cloned = Instr->clone(); @@ -5534,16 +5722,10 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, if (!IsVoidRetTy) VecResults[Part] = Cloned; - // End if-block. - if (IfPredicateStore) { - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); - Builder.SetInsertPoint(InsertPt); - ReplaceInstWithInst(IfBlock->getTerminator(), - BranchInst::Create(CondBlock, NewIfBlock, Cmp)); - IfBlock = NewIfBlock; - } + // End if-block. + if (IfPredicateStore) + PredicatedStores.push_back(std::make_pair(cast(Cloned), + Cmp)); } }