X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTransforms%2FVectorize%2FLoopVectorize.cpp;h=a6dbeed6bc860f6f38820fd8628515c490ebffd7;hb=69c9697fa7da77feec29aef669df5a9fa7f388a4;hp=34d8a1053fa133892241bd383c227800173c2bf8;hpb=d16404a0e79a176b7698d9c7bc1ec146ae2f2f1b;p=oota-llvm.git diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 34d8a1053fa..a6dbeed6bc8 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -54,7 +54,11 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" @@ -102,14 +106,19 @@ using namespace llvm::PatternMatch; STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); -static cl::opt -VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, - cl::desc("Sets the SIMD width. Zero is autoselect.")); +static cl::opt +VectorizationFactor("force-vector-width", cl::Hidden, + cl::desc("Sets the SIMD width. Zero is autoselect."), + cl::location(VectorizerParams::VectorizationFactor)); +unsigned VectorizerParams::VectorizationFactor = 0; -static cl::opt -VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden, - cl::desc("Sets the vectorization unroll count. " - "Zero is autoselect.")); +static cl::opt +VectorizationInterleave("force-vector-interleave", cl::Hidden, + cl::desc("Sets the vectorization interleave count. " + "Zero is autoselect."), + cl::location( + VectorizerParams::VectorizationInterleave)); +unsigned VectorizerParams::VectorizationInterleave = 0; static cl::opt EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, @@ -143,10 +152,10 @@ static const unsigned TinyTripCountUnrollThreshold = 128; /// When performing memory disambiguation checks at runtime do not make more /// than this number of comparisons. -static const unsigned RuntimeMemoryCheckThreshold = 8; +const unsigned VectorizerParams::RuntimeMemoryCheckThreshold = 8; /// Maximum simd width. -static const unsigned MaxVectorWidth = 64; +const unsigned VectorizerParams::MaxVectorWidth = 64; static cl::opt ForceTargetNumScalarRegs( "force-target-num-scalar-regs", cl::init(0), cl::Hidden, @@ -156,17 +165,17 @@ static cl::opt ForceTargetNumVectorRegs( "force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers.")); -/// Maximum vectorization unroll count. -static const unsigned MaxUnrollFactor = 16; +/// Maximum vectorization interleave count. +static const unsigned MaxInterleaveFactor = 16; -static cl::opt ForceTargetMaxScalarUnrollFactor( - "force-target-max-scalar-unroll", cl::init(0), cl::Hidden, - cl::desc("A flag that overrides the target's max unroll factor for scalar " - "loops.")); +static cl::opt ForceTargetMaxScalarInterleaveFactor( + "force-target-max-scalar-interleave", cl::init(0), cl::Hidden, + cl::desc("A flag that overrides the target's max interleave factor for " + "scalar loops.")); -static cl::opt ForceTargetMaxVectorUnrollFactor( - "force-target-max-vector-unroll", cl::init(0), cl::Hidden, - cl::desc("A flag that overrides the target's max unroll factor for " +static cl::opt ForceTargetMaxVectorInterleaveFactor( + "force-target-max-vector-interleave", cl::init(0), cl::Hidden, + cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops.")); static cl::opt ForceTargetInstructionCost( @@ -203,11 +212,17 @@ static cl::opt EnableCondStoresVectorization( "enable-cond-stores-vec", cl::init(false), cl::Hidden, cl::desc("Enable if predication of stores during vectorization.")); +static cl::opt MaxNestedScalarReductionUF( + "max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden, + cl::desc("The maximum unroll factor to use when unrolling a scalar " + "reduction in a nested loop.")); + namespace { // Forward declarations. class LoopVectorizationLegality; class LoopVectorizationCostModel; +class LoopVectorizeHints; /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). @@ -261,13 +276,6 @@ protected: typedef DenseMap, VectorParts> EdgeMaskCache; - /// \brief Add code that checks at runtime if the accessed arrays overlap. - /// - /// Returns a pair of instructions where the first element is the first - /// instruction generated in possibly a sequence of instructions and the - /// second value is the final comparator value or NULL if no check is needed. - std::pair addRuntimeCheck(Instruction *Loc); - /// \brief Add checks for strides that where assumed to be 1. /// /// Returns the last check instruction and the first check instruction in the @@ -323,10 +331,9 @@ protected: /// element. virtual Value *getBroadcastInstrs(Value *V); - /// This function adds 0, 1, 2 ... to each vector element, starting at zero. - /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). - /// The sequence starts at StartIndex. - virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); + /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...) + /// to each vector element of Val. The sequence starts at StartIndex. + virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step); /// When we go over instructions in the basic block we rely on previous /// values within the current basic block or on loop invariant values. @@ -386,6 +393,8 @@ protected: LoopInfo *LI; /// Dominator Tree. DominatorTree *DT; + /// Alias Analysis. + AliasAnalysis *AA; /// Data Layout. const DataLayout *DL; /// Target Library Info. @@ -445,7 +454,7 @@ private: bool IfPredicateStore = false) override; void vectorizeMemoryInstruction(Instruction *Instr) override; Value *getBroadcastInstrs(Value *V) override; - Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate) override; + Value *getStepVector(Value *Val, int StartIdx, Value *Step) override; Value *reverseVector(Value *Vec) override; }; @@ -495,6 +504,36 @@ static std::string getDebugLocString(const Loop *L) { } #endif +/// \brief Propagate known metadata from one instruction to another. +static void propagateMetadata(Instruction *To, const Instruction *From) { + SmallVector, 4> Metadata; + From->getAllMetadataOtherThanDebugLoc(Metadata); + + for (auto M : Metadata) { + unsigned Kind = M.first; + + // These are safe to transfer (this is safe for TBAA, even when we + // if-convert, because should that metadata have had a control dependency + // on the condition, and thus actually aliased with some other + // non-speculated memory access when the condition was false, this would be + // caught by the runtime overlap checks). + if (Kind != LLVMContext::MD_tbaa && + Kind != LLVMContext::MD_alias_scope && + Kind != LLVMContext::MD_noalias && + Kind != LLVMContext::MD_fpmath) + continue; + + To->setMetadata(Kind, M.second); + } +} + +/// \brief Propagate known metadata from one instruction to a vector of others. +static void propagateMetadata(SmallVectorImpl &To, const Instruction *From) { + for (Value *V : To) + if (Instruction *I = dyn_cast(V)) + propagateMetadata(I, From); +} + /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and /// to what vectorization factor. /// This class does not look at the profitability of vectorization, only the @@ -510,15 +549,14 @@ static std::string getDebugLocString(const Loop *L) { /// induction variable and the different reduction variables. class LoopVectorizationLegality { public: - unsigned NumLoads; - unsigned NumStores; - unsigned NumPredStores; - LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL, - DominatorTree *DT, TargetLibraryInfo *TLI) - : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL), - DT(DT), TLI(TLI), Induction(nullptr), WidestIndTy(nullptr), - HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {} + DominatorTree *DT, TargetLibraryInfo *TLI, + AliasAnalysis *AA, Function *F, + const TargetTransformInfo *TTI, + LoopAccessAnalysis *LAA) + : NumPredStores(0), TheLoop(L), SE(SE), DL(DL), + TLI(TLI), TheFunction(F), TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), + Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} /// This enum represents the kinds of reductions that we support. enum ReductionKind { @@ -536,11 +574,9 @@ public: /// This enum represents the kinds of inductions that we support. enum InductionKind { - IK_NoInduction, ///< Not an induction variable. - IK_IntInduction, ///< Integer induction variable. Step = 1. - IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1. - IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem). - IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem). + IK_NoInduction, ///< Not an induction variable. + IK_IntInduction, ///< Integer induction variable. Step = C. + IK_PtrInduction ///< Pointer induction var. Step = C / sizeof(elem). }; // This enum represents the kind of minmax reduction. @@ -591,48 +627,69 @@ public: MinMaxReductionKind MinMaxKind; }; - /// This struct holds information about the memory runtime legality - /// check that a group of pointers do not overlap. - struct RuntimePointerCheck { - RuntimePointerCheck() : Need(false) {} - - /// Reset the state of the pointer runtime information. - void reset() { - Need = false; - Pointers.clear(); - Starts.clear(); - Ends.clear(); - IsWritePtr.clear(); - DependencySetId.clear(); + /// A struct for saving information about induction variables. + struct InductionInfo { + InductionInfo(Value *Start, InductionKind K, ConstantInt *Step) + : StartValue(Start), IK(K), StepValue(Step) { + assert(IK != IK_NoInduction && "Not an induction"); + assert(StartValue && "StartValue is null"); + assert(StepValue && !StepValue->isZero() && "StepValue is zero"); + assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) && + "StartValue is not a pointer for pointer induction"); + assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) && + "StartValue is not an integer for integer induction"); + assert(StepValue->getType()->isIntegerTy() && + "StepValue is not an integer"); + } + InductionInfo() + : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {} + + /// Get the consecutive direction. Returns: + /// 0 - unknown or non-consecutive. + /// 1 - consecutive and increasing. + /// -1 - consecutive and decreasing. + int getConsecutiveDirection() const { + if (StepValue && (StepValue->isOne() || StepValue->isMinusOne())) + return StepValue->getSExtValue(); + return 0; } - /// Insert a pointer and calculate the start and end SCEVs. - void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, - unsigned DepSetId, ValueToValueMap &Strides); - - /// This flag indicates if we need to add the runtime check. - bool Need; - /// Holds the pointers that we need to check. - SmallVector, 2> Pointers; - /// Holds the pointer value at the beginning of the loop. - SmallVector Starts; - /// Holds the pointer value at the end of the loop. - SmallVector Ends; - /// Holds the information if this pointer is used for writing to memory. - SmallVector IsWritePtr; - /// Holds the id of the set of pointers that could be dependent because of a - /// shared underlying object. - SmallVector DependencySetId; - }; + /// Compute the transformed value of Index at offset StartValue using step + /// StepValue. + /// For integer induction, returns StartValue + Index * StepValue. + /// For pointer induction, returns StartValue[Index * StepValue]. + /// FIXME: The newly created binary instructions should contain nsw/nuw + /// flags, which can be found from the original scalar operations. + Value *transform(IRBuilder<> &B, Value *Index) const { + switch (IK) { + case IK_IntInduction: + assert(Index->getType() == StartValue->getType() && + "Index type does not match StartValue type"); + if (StepValue->isMinusOne()) + return B.CreateSub(StartValue, Index); + if (!StepValue->isOne()) + Index = B.CreateMul(Index, StepValue); + return B.CreateAdd(StartValue, Index); + + case IK_PtrInduction: + if (StepValue->isMinusOne()) + Index = B.CreateNeg(Index); + else if (!StepValue->isOne()) + Index = B.CreateMul(Index, StepValue); + return B.CreateGEP(StartValue, Index); + + case IK_NoInduction: + return nullptr; + } + llvm_unreachable("invalid enum"); + } - /// A struct for saving information about induction variables. - struct InductionInfo { - InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} - InductionInfo() : StartValue(nullptr), IK(IK_NoInduction) {} /// Start value. TrackingVH StartValue; /// Induction kind. InductionKind IK; + /// Step value. + ConstantInt *StepValue; }; /// ReductionList contains the reduction descriptors for all @@ -684,13 +741,19 @@ public: bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); } /// Returns the information that we collected about runtime memory check. - RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; } + LoopAccessInfo::RuntimePointerCheck *getRuntimePointerCheck() { + return LAI->getRuntimePointerCheck(); + } + + LoopAccessInfo *getLAI() { + return LAI; + } /// This function returns the identity element (or neutral element) for /// the operation K. static Constant *getReductionIdentity(ReductionKind K, Type *Tp); - unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } + unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } bool hasStride(Value *V) { return StrideSet.count(V); } bool mustCheckStrides() { return !StrideSet.empty(); } @@ -699,6 +762,30 @@ public: } SmallPtrSet::iterator strides_end() { return StrideSet.end(); } + /// Returns true if the target machine supports masked store operation + /// for the given \p DataType and kind of access to \p Ptr. + bool isLegalMaskedStore(Type *DataType, Value *Ptr) { + return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr)); + } + /// Returns true if the target machine supports masked load operation + /// for the given \p DataType and kind of access to \p Ptr. + bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { + return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr)); + } + /// Returns true if vector representation of the instruction \p I + /// requires mask. + bool isMaskRequired(const Instruction* I) { + return (MaskedOp.count(I) != 0); + } + unsigned getNumStores() const { + return LAI->getNumStores(); + } + unsigned getNumLoads() const { + return LAI->getNumLoads(); + } + unsigned getNumPredStores() const { + return NumPredStores; + } private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -721,7 +808,7 @@ private: /// Return true if all of the instructions in the block can be speculatively /// executed. \p SafePtrs is a list of addresses that are known to be legal /// and we know that we can read from them without segfault. - bool blockCanBePredicated(BasicBlock *BB, SmallPtrSet& SafePtrs); + bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl &SafePtrs); /// Returns True, if 'Phi' is the kind of reduction variable for type /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. @@ -737,15 +824,23 @@ private: /// pattern corresponding to a min(X, Y) or max(X, Y). static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I, ReductionInstDesc &Prev); - /// Returns the induction kind of Phi. This function may return NoInduction - /// if the PHI is not an induction variable. - InductionKind isInductionVariable(PHINode *Phi); + /// Returns the induction kind of Phi and record the step. This function may + /// return NoInduction if the PHI is not an induction variable. + InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue); /// \brief Collect memory access with loop invariant strides. /// /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop /// invariant. - void collectStridedAcccess(Value *LoadOrStoreInst); + void collectStridedAccess(Value *LoadOrStoreInst); + + /// Report an analysis message to assist the user in diagnosing loops that are + /// not vectorized. + void emitAnalysis(const VectorizationReport &Message) { + VectorizationReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); + } + + unsigned NumPredStores; /// The loop that we evaluate. Loop *TheLoop; @@ -753,10 +848,19 @@ private: ScalarEvolution *SE; /// DataLayout analysis. const DataLayout *DL; - /// Dominators. - DominatorTree *DT; /// Target Library Info. TargetLibraryInfo *TLI; + /// Parent function + Function *TheFunction; + /// Target Transform Info + const TargetTransformInfo *TTI; + /// Dominator Tree. + DominatorTree *DT; + // LoopAccess analysis. + LoopAccessAnalysis *LAA; + // And the loop-accesses info corresponding to this loop. This pointer is + // null until canVectorizeMemory sets it up. + LoopAccessInfo *LAI; // --- vectorization state --- // @@ -778,16 +882,16 @@ private: /// This set holds the variables which are known to be uniform after /// vectorization. SmallPtrSet Uniforms; - /// We need to check that all of the pointers in this list are disjoint - /// at runtime. - RuntimePointerCheck PtrRtCheck; + /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; - unsigned MaxSafeDepDistBytes; - ValueToValueMap Strides; SmallPtrSet StrideSet; + + /// While vectorizing these instructions we have to generate a + /// call to the appropriate masked intrinsic + SmallPtrSet MaskedOp; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -802,8 +906,13 @@ public: LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, - const DataLayout *DL, const TargetLibraryInfo *TLI) - : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {} + const DataLayout *DL, const TargetLibraryInfo *TLI, + AssumptionCache *AC, const Function *F, + const LoopVectorizeHints *Hints) + : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI), + TheFunction(F), Hints(Hints) { + CodeMetrics::collectEphemeralValues(L, AC, EphValues); + } /// Information about vectorization costs struct VectorizationFactor { @@ -814,9 +923,7 @@ public: /// This method checks every power of two up to VF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. - VectorizationFactor selectVectorizationFactor(bool OptForSize, - unsigned UserVF, - bool ForceVectorization); + VectorizationFactor selectVectorizationFactor(bool OptForSize); /// \return The size (in bits) of the widest type in the code that /// needs to be vectorized. We ignore values that remain scalar such as @@ -828,8 +935,7 @@ public: /// based on register pressure and other parameters. /// VF and LoopCost are the selected vectorization factor and the cost of the /// selected VF. - unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF, - unsigned LoopCost); + unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost); /// \brief A struct that represents some properties of the register usage /// of a loop. @@ -865,6 +971,15 @@ private: /// as a vector operation. bool isConsecutiveLoadOrStore(Instruction *I); + /// Report an analysis message to assist the user in diagnosing loops that are + /// not vectorized. + void emitAnalysis(const VectorizationReport &Message) { + VectorizationReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); + } + + /// Values used only by @llvm.assume calls. + SmallPtrSet EphValues; + /// The loop that we evaluate. Loop *TheLoop; /// Scev analysis. @@ -879,11 +994,59 @@ private: const DataLayout *DL; /// Target Library Info. const TargetLibraryInfo *TLI; + const Function *TheFunction; + // Loop Vectorize Hint. + const LoopVectorizeHints *Hints; }; /// Utility class for getting and setting loop vectorizer hints in the form /// of loop metadata. +/// This class keeps a number of loop annotations locally (as member variables) +/// and can, upon request, write them back as metadata on the loop. It will +/// initially scan the loop for existing metadata, and will update the local +/// values based on information in the loop. +/// We cannot write all values to metadata, as the mere presence of some info, +/// for example 'force', means a decision has been made. So, we need to be +/// careful NOT to add them if the user hasn't specifically asked so. class LoopVectorizeHints { + enum HintKind { + HK_WIDTH, + HK_UNROLL, + HK_FORCE + }; + + /// Hint - associates name and validation with the hint value. + struct Hint { + const char * Name; + unsigned Value; // This may have to change for non-numeric values. + HintKind Kind; + + Hint(const char * Name, unsigned Value, HintKind Kind) + : Name(Name), Value(Value), Kind(Kind) { } + + bool validate(unsigned Val) { + switch (Kind) { + case HK_WIDTH: + return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; + case HK_UNROLL: + return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; + case HK_FORCE: + return (Val <= 1); + } + return false; + } + }; + + /// Vectorization width. + Hint Width; + /// Vectorization interleave factor. + Hint Interleave; + /// Vectorization forced + Hint Force; + + /// Return the loop metadata prefix. + static StringRef Prefix() { return "llvm.loop."; } + public: enum ForceKind { FK_Undefined = -1, ///< Not selected. @@ -891,65 +1054,57 @@ public: FK_Enabled = 1, ///< Forcing enabled. }; - LoopVectorizeHints(const Loop *L, bool DisableUnrolling) - : Width(VectorizationFactor), - Unroll(DisableUnrolling), - Force(FK_Undefined), - LoopID(L->getLoopID()) { - getHints(L); - // force-vector-unroll overrides DisableUnrolling. - if (VectorizationUnroll.getNumOccurrences() > 0) - Unroll = VectorizationUnroll; - - DEBUG(if (DisableUnrolling && Unroll == 1) dbgs() - << "LV: Unrolling disabled by the pass manager\n"); - } + LoopVectorizeHints(const Loop *L, bool DisableInterleaving) + : Width("vectorize.width", VectorizationFactor, HK_WIDTH), + Interleave("interleave.count", DisableInterleaving, HK_UNROLL), + Force("vectorize.enable", FK_Undefined, HK_FORCE), + TheLoop(L) { + // Populate values with existing loop metadata. + getHintsFromMetadata(); - /// Return the loop vectorizer metadata prefix. - static StringRef Prefix() { return "llvm.vectorizer."; } + // force-vector-interleave overrides DisableInterleaving. + if (VectorizationInterleave.getNumOccurrences() > 0) + Interleave.Value = VectorizationInterleave; - MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) const { - SmallVector Vals; - Vals.push_back(MDString::get(Context, Name)); - Vals.push_back(ConstantInt::get(Type::getInt32Ty(Context), V)); - return MDNode::get(Context, Vals); + DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() + << "LV: Interleaving disabled by the pass manager\n"); } /// Mark the loop L as already vectorized by setting the width to 1. - void setAlreadyVectorized(Loop *L) { - LLVMContext &Context = L->getHeader()->getContext(); - - Width = 1; - - // Create a new loop id with one more operand for the already_vectorized - // hint. If the loop already has a loop id then copy the existing operands. - SmallVector Vals(1); - if (LoopID) - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) - Vals.push_back(LoopID->getOperand(i)); - - Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width)); - Vals.push_back(createHint(Context, Twine(Prefix(), "unroll").str(), 1)); - - MDNode *NewLoopID = MDNode::get(Context, Vals); - // Set operand 0 to refer to the loop id itself. - NewLoopID->replaceOperandWith(0, NewLoopID); - - L->setLoopID(NewLoopID); - if (LoopID) - LoopID->replaceAllUsesWith(NewLoopID); + void setAlreadyVectorized() { + Width.Value = Interleave.Value = 1; + Hint Hints[] = {Width, Interleave}; + writeHintsToMetadata(Hints); + } + + /// Dumps all the hint information. + std::string emitRemark() const { + VectorizationReport R; + if (Force.Value == LoopVectorizeHints::FK_Disabled) + R << "vectorization is explicitly disabled"; + else { + R << "use -Rpass-analysis=loop-vectorize for more info"; + if (Force.Value == LoopVectorizeHints::FK_Enabled) { + R << " (Force=true"; + if (Width.Value != 0) + R << ", Vector Width=" << Width.Value; + if (Interleave.Value != 0) + R << ", Interleave Count=" << Interleave.Value; + R << ")"; + } + } - LoopID = NewLoopID; + return R.str(); } - unsigned getWidth() const { return Width; } - unsigned getUnroll() const { return Unroll; } - enum ForceKind getForce() const { return Force; } - MDNode *getLoopID() const { return LoopID; } + unsigned getWidth() const { return Width.Value; } + unsigned getInterleave() const { return Interleave.Value; } + enum ForceKind getForce() const { return (ForceKind)Force.Value; } private: - /// Find hints specified in the loop metadata. - void getHints(const Loop *L) { + /// Find hints specified in the loop metadata and update local values. + void getHintsFromMetadata() { + MDNode *LoopID = TheLoop->getLoopID(); if (!LoopID) return; @@ -959,7 +1114,7 @@ private: for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { const MDString *S = nullptr; - SmallVector Args; + SmallVector Args; // The expected hint is either a MDString or a MDNode with the first // operand a MDString. @@ -977,55 +1132,108 @@ private: if (!S) continue; - // Check if the hint starts with the vectorizer prefix. - StringRef Hint = S->getString(); - if (!Hint.startswith(Prefix())) - continue; - // Remove the prefix. - Hint = Hint.substr(Prefix().size(), StringRef::npos); - + // Check if the hint starts with the loop metadata prefix. + StringRef Name = S->getString(); if (Args.size() == 1) - getHint(Hint, Args[0]); + setHint(Name, Args[0]); } } - // Check string hint with one operand. - void getHint(StringRef Hint, Value *Arg) { - const ConstantInt *C = dyn_cast(Arg); + /// Checks string hint with one operand and set value if valid. + void setHint(StringRef Name, Metadata *Arg) { + if (!Name.startswith(Prefix())) + return; + Name = Name.substr(Prefix().size(), StringRef::npos); + + const ConstantInt *C = mdconst::dyn_extract(Arg); if (!C) return; unsigned Val = C->getZExtValue(); - if (Hint == "width") { - if (isPowerOf2_32(Val) && Val <= MaxVectorWidth) - Width = Val; - else - DEBUG(dbgs() << "LV: ignoring invalid width hint metadata\n"); - } else if (Hint == "unroll") { - if (isPowerOf2_32(Val) && Val <= MaxUnrollFactor) - Unroll = Val; - else - DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n"); - } else if (Hint == "enable") { - if (C->getBitWidth() == 1) - Force = Val == 1 ? LoopVectorizeHints::FK_Enabled - : LoopVectorizeHints::FK_Disabled; - else - DEBUG(dbgs() << "LV: ignoring invalid enable hint metadata\n"); - } else { - DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n'); + Hint *Hints[] = {&Width, &Interleave, &Force}; + for (auto H : Hints) { + if (Name == H->Name) { + if (H->validate(Val)) + H->Value = Val; + else + DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); + break; + } } } - /// Vectorization width. - unsigned Width; - /// Vectorization unroll factor. - unsigned Unroll; - /// Vectorization forced - enum ForceKind Force; + /// Create a new hint from name / value pair. + MDNode *createHintMetadata(StringRef Name, unsigned V) const { + LLVMContext &Context = TheLoop->getHeader()->getContext(); + Metadata *MDs[] = {MDString::get(Context, Name), + ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Context), V))}; + return MDNode::get(Context, MDs); + } + + /// Matches metadata with hint name. + bool matchesHintMetadataName(MDNode *Node, ArrayRef HintTypes) { + MDString* Name = dyn_cast(Node->getOperand(0)); + if (!Name) + return false; + + for (auto H : HintTypes) + if (Name->getString().endswith(H.Name)) + return true; + return false; + } + + /// Sets current hints into loop metadata, keeping other values intact. + void writeHintsToMetadata(ArrayRef HintTypes) { + if (HintTypes.size() == 0) + return; + + // Reserve the first element to LoopID (see below). + SmallVector MDs(1); + // If the loop already has metadata, then ignore the existing operands. + MDNode *LoopID = TheLoop->getLoopID(); + if (LoopID) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + MDNode *Node = cast(LoopID->getOperand(i)); + // If node in update list, ignore old value. + if (!matchesHintMetadataName(Node, HintTypes)) + MDs.push_back(Node); + } + } + + // Now, add the missing hints. + for (auto H : HintTypes) + MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); + + // Replace current metadata node with new one. + LLVMContext &Context = TheLoop->getHeader()->getContext(); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); - MDNode *LoopID; + TheLoop->setLoopID(NewLoopID); + } + + /// The loop these hints belong to. + const Loop *TheLoop; }; +static void emitMissedWarning(Function *F, Loop *L, + const LoopVectorizeHints &LH) { + emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F, + L->getStartLoc(), LH.emitRemark()); + + if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { + if (LH.getWidth() != 1) + emitLoopVectorizeWarning( + F->getContext(), *F, L->getStartLoc(), + "failed explicitly specified loop vectorization"); + else if (LH.getInterleave() != 1) + emitLoopInterleaveWarning( + F->getContext(), *F, L->getStartLoc(), + "failed explicitly specified loop interleaving"); + } +} + static void addInnerLoop(Loop &L, SmallVectorImpl &V) { if (L.empty()) return V.push_back(&L); @@ -1053,6 +1261,9 @@ struct LoopVectorize : public FunctionPass { DominatorTree *DT; BlockFrequencyInfo *BFI; TargetLibraryInfo *TLI; + AliasAnalysis *AA; + AssumptionCache *AC; + LoopAccessAnalysis *LAA; bool DisableUnrolling; bool AlwaysVectorize; @@ -1062,11 +1273,15 @@ struct LoopVectorize : public FunctionPass { SE = &getAnalysis(); DataLayoutPass *DLP = getAnalysisIfAvailable(); DL = DLP ? &DLP->getDataLayout() : nullptr; - LI = &getAnalysis(); - TTI = &getAnalysis(); + LI = &getAnalysis().getLoopInfo(); + TTI = &getAnalysis().getTTI(F); DT = &getAnalysis().getDomTree(); BFI = &getAnalysis(); - TLI = getAnalysisIfAvailable(); + auto *TLIP = getAnalysisIfAvailable(); + TLI = TLIP ? &TLIP->getTLI() : nullptr; + AA = &getAnalysis(); + AC = &getAnalysis().getAssumptionCache(F); + LAA = &getAnalysis(); // Compute some weights outside of the loop over the loops. Compute this // using a BranchProbability to re-use its scaling math. @@ -1123,27 +1338,45 @@ struct LoopVectorize : public FunctionPass { : (Hints.getForce() == LoopVectorizeHints::FK_Enabled ? "enabled" : "?")) << " width=" << Hints.getWidth() - << " unroll=" << Hints.getUnroll() << "\n"); + << " unroll=" << Hints.getInterleave() << "\n"); + + // Function containing loop + Function *F = L->getHeader()->getParent(); + + // Looking at the diagnostic output is the only way to determine if a loop + // was vectorized (other than looking at the IR or machine code), so it + // is important to generate an optimization remark for each loop. Most of + // these messages are generated by emitOptimizationRemarkAnalysis. Remarks + // generated by emitOptimizationRemark and emitOptimizationRemarkMissed are + // less verbose reporting vectorized loops and unvectorized loops that may + // benefit from vectorization, respectively. if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) { DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); + emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, + L->getStartLoc(), Hints.emitRemark()); return false; } if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) { DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); + emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, + L->getStartLoc(), Hints.emitRemark()); return false; } - if (Hints.getWidth() == 1 && Hints.getUnroll() == 1) { + if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) { DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); + emitOptimizationRemarkAnalysis( + F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + "loop not vectorized: vector width and interleave count are " + "explicitly set to 1"); return false; } // Check the loop for a trip count threshold: // do not vectorize loops with a tiny trip count. - BasicBlock *Latch = L->getLoopLatch(); - const unsigned TC = SE->getSmallConstantTripCount(L, Latch); + const unsigned TC = SE->getSmallConstantTripCount(L); if (TC > 0u && TC < TinyTripCountVectorThreshold) { DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is not worth vectorizing."); @@ -1151,23 +1384,27 @@ struct LoopVectorize : public FunctionPass { DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { DEBUG(dbgs() << "\n"); + emitOptimizationRemarkAnalysis( + F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + "vectorization is not beneficial and is not explicitly forced"); return false; } } // Check if it is legal to vectorize the loop. - LoopVectorizationLegality LVL(L, SE, DL, DT, TLI); + LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI, LAA); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); + emitMissedWarning(F, L, Hints); return false; } // Use the cost model. - LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI); + LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AC, F, + &Hints); // Check the function attributes to find out if this function should be // optimized for size. - Function *F = L->getHeader()->getParent(); bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->hasFnAttribute(Attribute::OptimizeForSize); @@ -1190,27 +1427,34 @@ struct LoopVectorize : public FunctionPass { if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" "attribute is used.\n"); + emitOptimizationRemarkAnalysis( + F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + "loop not vectorized due to NoImplicitFloat attribute"); + emitMissedWarning(F, L, Hints); return false; } // Select the optimal vectorization factor. const LoopVectorizationCostModel::VectorizationFactor VF = - CM.selectVectorizationFactor(OptForSize, Hints.getWidth(), - Hints.getForce() == - LoopVectorizeHints::FK_Enabled); + CM.selectVectorizationFactor(OptForSize); // Select the unroll factor. const unsigned UF = - CM.selectUnrollFactor(OptForSize, Hints.getUnroll(), VF.Width, VF.Cost); + CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost); DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " << DebugLocStr << '\n'); DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n'); if (VF.Width == 1) { - DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); - if (UF == 1) + DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n"); + + if (UF == 1) { + emitOptimizationRemarkAnalysis( + F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), + "not beneficial to vectorize and user disabled interleaving"); return false; + } DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n"); // Report the unrolling decision. @@ -1220,6 +1464,7 @@ struct LoopVectorize : public FunctionPass { " (vectorization not beneficial)")); // We decided not to vectorize, but we may want to unroll. + InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF); Unroller.vectorize(&LVL); } else { @@ -1236,22 +1481,26 @@ struct LoopVectorize : public FunctionPass { } // Mark the loop as already vectorized to avoid vectorizing again. - Hints.setAlreadyVectorized(L); + Hints.setAlreadyVectorized(); DEBUG(verifyFunction(*L->getHeader()->getParent())); return true; } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addRequired(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); AU.addPreserved(); + AU.addPreserved(); } }; @@ -1263,64 +1512,6 @@ struct LoopVectorize : public FunctionPass { // LoopVectorizationCostModel. //===----------------------------------------------------------------------===// -static Value *stripIntegerCast(Value *V) { - if (CastInst *CI = dyn_cast(V)) - if (CI->getOperand(0)->getType()->isIntegerTy()) - return CI->getOperand(0); - return V; -} - -///\brief Replaces the symbolic stride in a pointer SCEV expression by one. -/// -/// If \p OrigPtr is not null, use it to look up the stride value instead of -/// \p Ptr. -static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE, - ValueToValueMap &PtrToStride, - Value *Ptr, Value *OrigPtr = nullptr) { - - const SCEV *OrigSCEV = SE->getSCEV(Ptr); - - // If there is an entry in the map return the SCEV of the pointer with the - // symbolic stride replaced by one. - ValueToValueMap::iterator SI = PtrToStride.find(OrigPtr ? OrigPtr : Ptr); - if (SI != PtrToStride.end()) { - Value *StrideVal = SI->second; - - // Strip casts. - StrideVal = stripIntegerCast(StrideVal); - - // Replace symbolic stride by one. - Value *One = ConstantInt::get(StrideVal->getType(), 1); - ValueToValueMap RewriteMap; - RewriteMap[StrideVal] = One; - - const SCEV *ByOne = - SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true); - DEBUG(dbgs() << "LV: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne - << "\n"); - return ByOne; - } - - // Otherwise, just return the SCEV of the original pointer. - return SE->getSCEV(Ptr); -} - -void LoopVectorizationLegality::RuntimePointerCheck::insert( - ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId, - ValueToValueMap &Strides) { - // Get the stride replaced scev. - const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr); - const SCEVAddRecExpr *AR = dyn_cast(Sc); - assert(AR && "Invalid addrec expression"); - const SCEV *Ex = SE->getBackedgeTakenCount(Lp); - const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); - Pointers.push_back(Ptr); - Starts.push_back(AR->getStart()); - Ends.push_back(ScEnd); - IsWritePtr.push_back(WritePtr); - DependencySetId.push_back(DepSetId); -} - Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { // We need to place the broadcast of invariant variables outside the loop. Instruction *Instr = dyn_cast(V); @@ -1340,11 +1531,13 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { return Shuf; } -Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, - bool Negate) { +Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, + Value *Step) { assert(Val->getType()->isVectorTy() && "Must be a vector"); assert(Val->getType()->getScalarType()->isIntegerTy() && "Elem must be an integer"); + assert(Step->getType() == Val->getType()->getScalarType() && + "Step has wrong type"); // Create the types. Type *ITy = Val->getType()->getScalarType(); VectorType *Ty = cast(Val->getType()); @@ -1352,15 +1545,18 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, SmallVector Indices; // Create a vector of consecutive numbers from zero to VF. - for (int i = 0; i < VLen; ++i) { - int64_t Idx = Negate ? (-i) : i; - Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate)); - } + for (int i = 0; i < VLen; ++i) + Indices.push_back(ConstantInt::get(ITy, StartIdx + i)); // Add the consecutive indices to the vector value. Constant *Cv = ConstantVector::get(Indices); assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); - return Builder.CreateAdd(Val, Cv, "induction"); + Step = Builder.CreateVectorSplat(VLen, Step); + assert(Step->getType() == Val->getType() && "Invalid step vec"); + // FIXME: The newly created binary instructions should contain nsw/nuw flags, + // which can be found from the original scalar operations. + Step = Builder.CreateMul(Cv, Step); + return Builder.CreateAdd(Val, Step, "induction"); } /// \brief Find the operand of the GEP that should be checked for consecutive @@ -1398,10 +1594,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { PHINode *Phi = dyn_cast_or_null(Ptr); if (Phi && Inductions.count(Phi)) { InductionInfo II = Inductions[Phi]; - if (IK_PtrInduction == II.IK) - return 1; - else if (IK_ReversePtrInduction == II.IK) - return -1; + return II.getConsecutiveDirection(); } GetElementPtrInst *Gep = dyn_cast_or_null(Ptr); @@ -1426,10 +1619,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { return 0; InductionInfo II = Inductions[Phi]; - if (IK_PtrInduction == II.IK) - return 1; - else if (IK_ReversePtrInduction == II.IK) - return -1; + return II.getConsecutiveDirection(); } unsigned InductionOperand = getGEPInductionOperand(DL, Gep); @@ -1479,7 +1669,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { } bool LoopVectorizationLegality::isUniform(Value *V) { - return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); + return LAI->isUniform(V); } InnerLoopVectorizer::VectorParts& @@ -1531,7 +1721,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; - if (SI && Legal->blockNeedsPredication(SI->getParent())) + if (SI && Legal->blockNeedsPredication(SI->getParent()) && + !Legal->isMaskRequired(SI)) return scalarizeInstruction(Instr, true); if (ScalarAllocatedSize != VectorElementSize) @@ -1600,6 +1791,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); } + VectorParts Mask = createBlockInMask(Instr->getParent()); // Handle Stores: if (SI) { assert(!Legal->isUniform(SI->getPointerOperand()) && @@ -1608,7 +1800,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // We don't want to update the value in the map as it might be used in // another expression. So don't use a reference type for "StoredVal". VectorParts StoredVal = getVectorValue(SI->getValueOperand()); - + for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); @@ -1621,11 +1813,19 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // wide store needs to start at the last vector element. PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + Mask[Part] = reverseVector(Mask[Part]); } Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); - Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); + + Instruction *NewSI; + if (Legal->isMaskRequired(SI)) + NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment, + Mask[Part]); + else + NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment); + propagateMetadata(NewSI, SI); } return; } @@ -1639,16 +1839,23 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { if (Reverse) { // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. + // wide load needs to start at the last vector element. PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + Mask[Part] = reverseVector(Mask[Part]); } + Instruction* NewLI; Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); - Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); - cast(LI)->setAlignment(Alignment); - Entry[Part] = Reverse ? reverseVector(LI) : LI; + if (Legal->isMaskRequired(LI)) + NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], + UndefValue::get(DataTy), + "wide.masked.load"); + else + NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load"); + propagateMetadata(NewLI, LI); + Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI; } } @@ -1724,7 +1931,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); + VectorLp->addBasicBlockToLoop(CondBlock, *LI); // Update Builder with newly created basic block. Builder.SetInsertPoint(InsertPt); } @@ -1753,7 +1960,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic if (IfPredicateStore) { BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase()); + VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); Builder.SetInsertPoint(InsertPt); Instruction *OldBr = IfBlock->getTerminator(); BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); @@ -1810,119 +2017,29 @@ InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { return std::make_pair(FirstInst, TheCheck); } -std::pair -InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) { - LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = - Legal->getRuntimePointerCheck(); - - Instruction *tnullptr = nullptr; - if (!PtrRtCheck->Need) - return std::pair(tnullptr, tnullptr); - - unsigned NumPointers = PtrRtCheck->Pointers.size(); - SmallVector , 2> Starts; - SmallVector , 2> Ends; - - LLVMContext &Ctx = Loc->getContext(); - SCEVExpander Exp(*SE, "induction"); - Instruction *FirstInst = nullptr; - - for (unsigned i = 0; i < NumPointers; ++i) { - Value *Ptr = PtrRtCheck->Pointers[i]; - const SCEV *Sc = SE->getSCEV(Ptr); - - if (SE->isLoopInvariant(Sc, OrigLoop)) { - DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << - *Ptr <<"\n"); - Starts.push_back(Ptr); - Ends.push_back(Ptr); - } else { - DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n'); - unsigned AS = Ptr->getType()->getPointerAddressSpace(); - - // Use this type for pointer arithmetic. - Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS); - - Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); - Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); - Starts.push_back(Start); - Ends.push_back(End); - } - } - - IRBuilder<> ChkBuilder(Loc); - // Our instructions might fold to a constant. - Value *MemoryRuntimeCheck = nullptr; - for (unsigned i = 0; i < NumPointers; ++i) { - for (unsigned j = i+1; j < NumPointers; ++j) { - // No need to check if two readonly pointers intersect. - if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) - continue; - - // Only need to check pointers between two different dependency sets. - if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j]) - continue; - - unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace(); - unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace(); - - assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) && - (AS1 == Ends[i]->getType()->getPointerAddressSpace()) && - "Trying to bounds check pointers with different address spaces"); - - Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0); - Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1); - - Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc"); - Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc"); - Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc"); - Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc"); - - Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); - FirstInst = getFirstInst(FirstInst, Cmp0, Loc); - Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); - FirstInst = getFirstInst(FirstInst, Cmp1, Loc); - Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); - FirstInst = getFirstInst(FirstInst, IsConflict, Loc); - if (MemoryRuntimeCheck) { - IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, - "conflict.rdx"); - FirstInst = getFirstInst(FirstInst, IsConflict, Loc); - } - MemoryRuntimeCheck = IsConflict; - } - } - - // We have to do this trickery because the IRBuilder might fold the check to a - // constant expression in which case there is no Instruction anchored in a - // the block. - Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck, - ConstantInt::getTrue(Ctx)); - ChkBuilder.Insert(Check, "memcheck.conflict"); - FirstInst = getFirstInst(FirstInst, Check, Loc); - return std::make_pair(FirstInst, Check); -} - void InnerLoopVectorizer::createEmptyLoop() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- vector loop bypass (may consist of multiple blocks). - / | - / v - | [ ] <-- vector pre header. - | | - | v - | [ ] \ - | [ ]_| <-- vector loop. - | | - \ v - >[ ] <--- middle-block. - / | - / v - | [ ] <--- new preheader. + [ ] <-- Back-edge taken count overflow check. + / | + / v + | [ ] <-- vector loop bypass (may consist of multiple blocks). + | / | + | / v + || [ ] <-- vector pre header. + || | + || v + || [ ] \ + || [ ]_| <-- vector loop. + || | + | \ v + | >[ ] <--- middle-block. + | / | + | / v + -|- >[ ] <--- new preheader. | | | v | [ ] \ @@ -1936,6 +2053,7 @@ void InnerLoopVectorizer::createEmptyLoop() { BasicBlock *OldBasicBlock = OrigLoop->getHeader(); BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); BasicBlock *ExitBlock = OrigLoop->getExitBlock(); + assert(BypassBlock && "Invalid loop structure"); assert(ExitBlock && "Must have an exit block"); // Some loops have a single integer induction variable, while other loops @@ -1958,18 +2076,30 @@ void InnerLoopVectorizer::createEmptyLoop() { IdxTy->getPrimitiveSizeInBits()) ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); - ExitCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); + const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); // Get the total trip count from the count by adding 1. - ExitCount = SE->getAddExpr(ExitCount, - SE->getConstant(ExitCount->getType(), 1)); + ExitCount = SE->getAddExpr(BackedgeTakeCount, + SE->getConstant(BackedgeTakeCount->getType(), 1)); // Expand the trip count and place the new instructions in the preheader. // Notice that the pre-header does not change, only the loop body. SCEVExpander Exp(*SE, "induction"); - // Count holds the overall loop count (N). - Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - BypassBlock->getTerminator()); + // We need to test whether the backedge-taken count is uint##_max. Adding one + // to it will cause overflow and an incorrect loop trip count in the vector + // body. In case of overflow we want to directly jump to the scalar remainder + // loop. + Value *BackedgeCount = + Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(), + BypassBlock->getTerminator()); + if (BackedgeCount->getType()->isPointerTy()) + BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy, + "backedge.ptrcnt.to.int", + BypassBlock->getTerminator()); + Instruction *CheckBCOverflow = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount, + Constant::getAllOnesValue(BackedgeCount->getType()), + "backedge.overflow", BypassBlock->getTerminator()); // The loop index does not have to start at Zero. Find the original start // value from the induction PHI node. If we don't have an induction variable @@ -1980,7 +2110,18 @@ void InnerLoopVectorizer::createEmptyLoop() { IdxTy): ConstantInt::get(IdxTy, 0); - assert(BypassBlock && "Invalid loop structure"); + // We need an instruction to anchor the overflow check on. StartIdx needs to + // be defined before the overflow check branch. Because the scalar preheader + // is going to merge the start index and so the overflow branch block needs to + // contain a definition of the start index. + Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd( + StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor", + BypassBlock->getTerminator()); + + // Count holds the overall loop count (N). + Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), + BypassBlock->getTerminator()); + LoopBypassBlocks.push_back(BypassBlock); // Split the single block loop into the two loop structure described above. @@ -2001,13 +2142,13 @@ void InnerLoopVectorizer::createEmptyLoop() { // before calling any utilities such as SCEV that require valid LoopInfo. if (ParentLoop) { ParentLoop->addChildLoop(Lp); - ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); - ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); - ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); + ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); + ParentLoop->addBasicBlockToLoop(VectorPH, *LI); + ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); } else { LI->addTopLevelLoop(Lp); } - Lp->addBasicBlockToLoop(VecBody, LI->getBase()); + Lp->addBasicBlockToLoop(VecBody, *LI); // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. @@ -2049,29 +2190,45 @@ void InnerLoopVectorizer::createEmptyLoop() { // Now, compare the new count to zero. If it is zero skip the vector loop and // jump to the scalar loop. - Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, - "cmp.zero"); + Value *Cmp = + BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero"); BasicBlock *LastBypassBlock = BypassBlock; + // Generate code to check that the loops trip count that we computed by adding + // one to the backedge-taken count will not overflow. + { + auto PastOverflowCheck = + std::next(BasicBlock::iterator(OverflowCheckAnchor)); + BasicBlock *CheckBlock = + LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked"); + if (ParentLoop) + ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); + LoopBypassBlocks.push_back(CheckBlock); + Instruction *OldTerm = LastBypassBlock->getTerminator(); + BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm); + OldTerm->eraseFromParent(); + LastBypassBlock = CheckBlock; + } + // Generate the code to check that the strides we assumed to be one are really // one. We want the new basic block to start at the first instruction in a // sequence of instructions that form a check. Instruction *StrideCheck; Instruction *FirstCheckInst; std::tie(FirstCheckInst, StrideCheck) = - addStrideCheck(BypassBlock->getTerminator()); + addStrideCheck(LastBypassBlock->getTerminator()); if (StrideCheck) { // Create a new block containing the stride check. BasicBlock *CheckBlock = - BypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck"); + LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck"); if (ParentLoop) - ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); + ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); LoopBypassBlocks.push_back(CheckBlock); // Replace the branch into the memory check block with a conditional branch // for the "few elements case". - Instruction *OldTerm = BypassBlock->getTerminator(); + Instruction *OldTerm = LastBypassBlock->getTerminator(); BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); OldTerm->eraseFromParent(); @@ -2084,13 +2241,13 @@ void InnerLoopVectorizer::createEmptyLoop() { // faster. Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = - addRuntimeCheck(LastBypassBlock->getTerminator()); + Legal->getLAI()->addRuntimeCheck(LastBypassBlock->getTerminator()); if (MemRuntimeCheck) { // Create a new block containing the memory check. BasicBlock *CheckBlock = - LastBypassBlock->splitBasicBlock(MemRuntimeCheck, "vector.memcheck"); + LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.memcheck"); if (ParentLoop) - ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase()); + ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); LoopBypassBlocks.push_back(CheckBlock); // Replace the branch into the memory check block with a conditional branch @@ -2134,6 +2291,19 @@ void InnerLoopVectorizer::createEmptyLoop() { PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", MiddleBlock->getTerminator()) : nullptr; + // Create phi nodes to merge from the backedge-taken check block. + PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val", + ScalarPH->getTerminator()); + BCResumeVal->addIncoming(ResumeVal, MiddleBlock); + + PHINode *BCTruncResumeVal = nullptr; + if (OrigPhi == OldInduction) { + BCTruncResumeVal = + PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val", + ScalarPH->getTerminator()); + BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock); + } + Value *EndValue = nullptr; switch (II.IK) { case LoopVectorizationLegality::IK_NoInduction: @@ -2150,10 +2320,12 @@ void InnerLoopVectorizer::createEmptyLoop() { BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); TruncResumeVal->addIncoming(EndValue, VecBody); + BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); + // We know what the end value is. EndValue = IdxEndRoundDown; // We also know which PHI node holds it. @@ -2166,40 +2338,20 @@ void InnerLoopVectorizer::createEmptyLoop() { Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, II.StartValue->getType(), "cast.crd"); - EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end"); - break; - } - case LoopVectorizationLegality::IK_ReverseIntInduction: { - // Convert the CountRoundDown variable to the PHI size. - Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, - II.StartValue->getType(), - "cast.crd"); - // Handle reverse integer induction counter. - EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end"); + EndValue = II.transform(BypassBuilder, CRD); + EndValue->setName("ind.end"); break; } case LoopVectorizationLegality::IK_PtrInduction: { - // For pointer induction variables, calculate the offset using - // the end index. - EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown, - "ptr.ind.end"); - break; - } - case LoopVectorizationLegality::IK_ReversePtrInduction: { - // The value at the end of the loop for the reverse pointer is calculated - // by creating a GEP with a negative index starting from the start value. - Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0); - Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown, - "rev.ind.end"); - EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx, - "rev.ptr.ind.end"); + EndValue = II.transform(BypassBuilder, CountRoundDown); + EndValue->setName("ptr.ind.end"); break; } }// end of case // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) { + for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) { if (OrigPhi == OldInduction) ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); else @@ -2209,11 +2361,16 @@ void InnerLoopVectorizer::createEmptyLoop() { // Fix the scalar body counter (PHI node). unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); - // The old inductions phi node in the scalar body needs the truncated value. - if (OrigPhi == OldInduction) - OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal); - else - OrigPhi->setIncomingValue(BlockIdx, ResumeVal); + + // The old induction's phi node in the scalar body needs the truncated + // value. + if (OrigPhi == OldInduction) { + BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]); + OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal); + } else { + BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); + OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); + } } // If we are generating a new induction variable then we also need to @@ -2224,7 +2381,7 @@ void InnerLoopVectorizer::createEmptyLoop() { assert(!ResumeIndex && "Unexpected resume value found"); ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", MiddleBlock->getTerminator()); - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); } @@ -2267,7 +2424,7 @@ void InnerLoopVectorizer::createEmptyLoop() { LoopScalarBody = OldBasicBlock; LoopVectorizeHints Hints(Lp, true); - Hints.setAlreadyVectorized(Lp); + Hints.setAlreadyVectorized(); } /// This function returns the identity element (or neutral element) for @@ -2494,7 +2651,7 @@ void InnerLoopVectorizer::vectorizeLoop() { // To do so, we need to generate the 'identity' vector and override // one of the elements with the incoming scalar reduction. We need // to do it in the vector-loop preheader. - Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator()); + Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator()); // This is the vector-clone of the value that leaves the loop. VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr); @@ -2535,9 +2692,6 @@ void InnerLoopVectorizer::vectorizeLoop() { } // Fix the vector-loop phi. - // We created the induction variable so we know that the - // preheader is the first entry. - BasicBlock *VecPreheader = Induction->getIncomingBlock(0); // Reductions do not have to start at zero. They can start with // any loop invariant values. @@ -2549,7 +2703,8 @@ void InnerLoopVectorizer::vectorizeLoop() { // Make sure to add the reduction stat value only to the // first unroll part. Value *StartVal = (part == 0) ? VectorStart : Identity; - cast(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader); + cast(VecRdxPhi[part])->addIncoming(StartVal, + LoopVectorPreHeader); cast(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody.back()); } @@ -2568,7 +2723,7 @@ void InnerLoopVectorizer::vectorizeLoop() { VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr); PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); Value *StartVal = (part == 0) ? VectorStart : Identity; - for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody.back()); @@ -2626,6 +2781,13 @@ void InnerLoopVectorizer::vectorizeLoop() { Builder.getInt32(0)); } + // Create a phi node that merges control-flow from the backedge-taken check + // block and the middle block. + PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx", + LoopScalarPreHeader->getTerminator()); + BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]); + BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); + // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. // We know that the loop is in LCSSA form. We need to update the @@ -2655,7 +2817,7 @@ void InnerLoopVectorizer::vectorizeLoop() { assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); // Pick the other block. int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); - (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx); + (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); }// end of for each redux variable. @@ -2674,7 +2836,7 @@ void InnerLoopVectorizer::fixLCSSAPHIs() { LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), LoopMiddleBlock); } -} +} InnerLoopVectorizer::VectorParts InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { @@ -2797,6 +2959,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, LoopVectorizationLegality::InductionInfo II = Legal->getInductionVars()->lookup(P); + // FIXME: The newly created binary instructions should contain nsw/nuw flags, + // which can be found from the original scalar operations. switch (II.IK) { case LoopVectorizationLegality::IK_NoInduction: llvm_unreachable("Unknown induction"); @@ -2814,80 +2978,42 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx"); NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); - Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx, - "offset.idx"); + Broadcasted = II.transform(Builder, NormalizedIdx); + Broadcasted->setName("offset.idx"); } Broadcasted = getBroadcastInstrs(Broadcasted); // After broadcasting the induction variable we need to make the vector // consecutive by adding 0, 1, 2, etc. for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false); + Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue); return; } - case LoopVectorizationLegality::IK_ReverseIntInduction: case LoopVectorizationLegality::IK_PtrInduction: - case LoopVectorizationLegality::IK_ReversePtrInduction: - // Handle reverse integer and pointer inductions. - Value *StartIdx = ExtendedIdx; - // This is the normalized GEP that starts counting at zero. - Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, - "normalized.idx"); - - // Handle the reverse integer induction variable case. - if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) { - IntegerType *DstTy = cast(II.StartValue->getType()); - Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, - "resize.norm.idx"); - Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, - "reverse.idx"); - - // This is a new value so do not hoist it out. - Value *Broadcasted = getBroadcastInstrs(ReverseInd); - // After broadcasting the induction variable we need to make the - // vector consecutive by adding ... -3, -2, -1, 0. - for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, - true); - return; - } - // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); - - // Is this a reverse induction ptr or a consecutive induction ptr. - bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == - II.IK); - + // This is the normalized GEP that starts counting at zero. + Value *NormalizedIdx = + Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx"); // This is the vector of results. Notice that we don't generate // vector geps because scalar geps result in better code. for (unsigned part = 0; part < UF; ++part) { if (VF == 1) { - int EltIndex = (part) * (Reverse ? -1 : 1); + int EltIndex = part; Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); - Value *GlobalIdx; - if (Reverse) - GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); - else - GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); - - Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, - "next.gep"); + Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Value *SclrGep = II.transform(Builder, GlobalIdx); + SclrGep->setName("next.gep"); Entry[part] = SclrGep; continue; } Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); for (unsigned int i = 0; i < VF; ++i) { - int EltIndex = (i + part * VF) * (Reverse ? -1 : 1); + int EltIndex = i + part * VF; Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); - Value *GlobalIdx; - if (!Reverse) - GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); - else - GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); - - Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, - "next.gep"); + Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Value *SclrGep = II.transform(Builder, GlobalIdx); + SclrGep->setName("next.gep"); VecVal = Builder.CreateInsertElement(VecVal, SclrGep, Builder.getInt32(i), "insert.gep"); @@ -2907,7 +3033,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Nothing to do for PHIs and BR, since we already took care of the // loop control flow instructions. continue; - case Instruction::PHI:{ + case Instruction::PHI: { // Vectorize PHINodes. widenPHIInstruction(it, Entry, UF, VF, PV); continue; @@ -2941,21 +3067,13 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { for (unsigned Part = 0; Part < UF; ++Part) { Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); - // Update the NSW, NUW and Exact flags. Notice: V can be an Undef. - BinaryOperator *VecOp = dyn_cast(V); - if (VecOp && isa(BinOp)) { - VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); - VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); - } - if (VecOp && isa(VecOp)) - VecOp->setIsExact(BinOp->isExact()); - - // Copy the fast-math flags. - if (VecOp && isa(V)) - VecOp->setFastMathFlags(it->getFastMathFlags()); + if (BinaryOperator *VecOp = dyn_cast(V)) + VecOp->copyIRFlags(BinOp); Entry[Part] = V; } + + propagateMetadata(Entry, it); break; } case Instruction::Select: { @@ -2983,6 +3101,8 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Op0[Part], Op1[Part]); } + + propagateMetadata(Entry, it); break; } @@ -3002,6 +3122,8 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); Entry[Part] = C; } + + propagateMetadata(Entry, it); break; } @@ -3032,8 +3154,13 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, CI->getType()); Value *Broadcasted = getBroadcastInstrs(ScalarCast); + LoopVectorizationLegality::InductionInfo II = + Legal->getInductionVars()->lookup(OldInduction); + Constant *Step = + ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue()); for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false); + Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); + propagateMetadata(Entry, it); break; } /// Vectorize casts. @@ -3043,6 +3170,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { VectorParts &A = getVectorValue(it->getOperand(0)); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); + propagateMetadata(Entry, it); break; } @@ -3057,14 +3185,20 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); assert(ID && "Not an intrinsic call!"); switch (ID) { + case Intrinsic::assume: case Intrinsic::lifetime_end: case Intrinsic::lifetime_start: scalarizeInstruction(it); break; default: + bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1); for (unsigned Part = 0; Part < UF; ++Part) { SmallVector Args; for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + if (HasScalarOpd && i == 1) { + Args.push_back(CI->getArgOperand(i)); + continue; + } VectorParts &Arg = getVectorValue(CI->getArgOperand(i)); Args.push_back(Arg[Part]); } @@ -3075,6 +3209,8 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Function *F = Intrinsic::getDeclaration(M, ID, Tys); Entry[Part] = Builder.CreateCall(F, Args); } + + propagateMetadata(Entry, it); break; } break; @@ -3112,10 +3248,10 @@ void InnerLoopVectorizer::updateAnalysis() { } } - DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front()); - DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); + DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]); + DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); - DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); + DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); DEBUG(DT->verifyDomTree()); } @@ -3138,8 +3274,10 @@ static bool canIfConvertPHINodes(BasicBlock *BB) { } bool LoopVectorizationLegality::canVectorizeWithIfConvert() { - if (!EnableIfConversion) + if (!EnableIfConversion) { + emitAnalysis(VectorizationReport() << "if-conversion is disabled"); return false; + } assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); @@ -3169,16 +3307,24 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { BasicBlock *BB = *BI; // We don't support switch statements inside loops. - if (!isa(BB->getTerminator())) + if (!isa(BB->getTerminator())) { + emitAnalysis(VectorizationReport(BB->getTerminator()) + << "loop contains a switch statement"); return false; + } // We must be able to predicate all blocks that need to be predicated. if (blockNeedsPredication(BB)) { - if (!blockCanBePredicated(BB, SafePointes)) + if (!blockCanBePredicated(BB, SafePointes)) { + emitAnalysis(VectorizationReport(BB->getTerminator()) + << "control flow cannot be substituted for a select"); return false; - } else if (BB != Header && !canIfConvertPHINodes(BB)) + } + } else if (BB != Header && !canIfConvertPHINodes(BB)) { + emitAnalysis(VectorizationReport(BB->getTerminator()) + << "control flow cannot be substituted for a select"); return false; - + } } // We can if-convert this loop. @@ -3188,20 +3334,44 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { bool LoopVectorizationLegality::canVectorize() { // We must have a loop in canonical form. Loops with indirectbr in them cannot // be canonicalized. - if (!TheLoop->getLoopPreheader()) + if (!TheLoop->getLoopPreheader()) { + emitAnalysis( + VectorizationReport() << + "loop control flow is not understood by vectorizer"); return false; + } // We can only vectorize innermost loops. - if (TheLoop->getSubLoopsVector().size()) + if (!TheLoop->getSubLoopsVector().empty()) { + emitAnalysis(VectorizationReport() << "loop is not the innermost loop"); return false; + } // We must have a single backedge. - if (TheLoop->getNumBackEdges() != 1) + if (TheLoop->getNumBackEdges() != 1) { + emitAnalysis( + VectorizationReport() << + "loop control flow is not understood by vectorizer"); return false; + } // We must have a single exiting block. - if (!TheLoop->getExitingBlock()) + if (!TheLoop->getExitingBlock()) { + emitAnalysis( + VectorizationReport() << + "loop control flow is not understood by vectorizer"); + return false; + } + + // We only handle bottom-tested loops, i.e. loop in which the condition is + // checked at the end of each iteration. With that we can assume that all + // instructions in the loop are executed the same number of times. + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { + emitAnalysis( + VectorizationReport() << + "loop control flow is not understood by vectorizer"); return false; + } // We need to have a loop header. DEBUG(dbgs() << "LV: Found a loop: " << @@ -3217,6 +3387,8 @@ bool LoopVectorizationLegality::canVectorize() { // ScalarEvolution needs to be able to find the exit count. const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); if (ExitCount == SE->getCouldNotCompute()) { + emitAnalysis(VectorizationReport() << + "could not determine number of loop iterations"); DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } @@ -3237,7 +3409,8 @@ bool LoopVectorizationLegality::canVectorize() { collectLoopUniforms(); DEBUG(dbgs() << "LV: We can vectorize this loop" << - (PtrRtCheck.Need ? " (with a runtime bound check)" : "") + (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)" : + "") <<"!\n"); // Okay! We can vectorize. At this point we don't have any other mem analysis @@ -3269,7 +3442,7 @@ static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) { /// \brief Check that the instruction has outside loop users and is not an /// identified reduction variable. static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, - SmallPtrSet &Reductions) { + SmallPtrSetImpl &Reductions) { // Reduction instructions are allowed to have exit users. All other // instructions must not have external users. if (!Reductions.count(Inst)) @@ -3292,9 +3465,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Look for the attribute signaling the absence of NaNs. Function &F = *Header->getParent(); if (F.hasFnAttribute("no-nans-fp-math")) - HasFunNoNaNAttr = F.getAttributes().getAttribute( - AttributeSet::FunctionIndex, - "no-nans-fp-math").getValueAsString() == "true"; + HasFunNoNaNAttr = + F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; // For each block in the loop. for (Loop::block_iterator bb = TheLoop->block_begin(), @@ -3310,6 +3482,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && !PhiTy->isPointerTy()) { + emitAnalysis(VectorizationReport(it) + << "loop control flow is not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; } @@ -3320,21 +3494,27 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (*bb != Header) { // Check that this instruction has no outside users or is an // identified reduction value with an outside user. - if(!hasOutsideLoopUser(TheLoop, it, AllowedExit)) + if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) continue; + emitAnalysis(VectorizationReport(it) << + "value could not be identified as " + "an induction or reduction variable"); return false; } - // We only allow if-converted PHIs with more than two incoming values. + // We only allow if-converted PHIs with exactly two incoming values. if (Phi->getNumIncomingValues() != 2) { + emitAnalysis(VectorizationReport(it) + << "control flow not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; } // This is the value coming from the preheader. Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); + ConstantInt *StepValue = nullptr; // Check if this is an induction variable. - InductionKind IK = isInductionVariable(Phi); + InductionKind IK = isInductionVariable(Phi, StepValue); if (IK_NoInduction != IK) { // Get the widest type. @@ -3344,7 +3524,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy); // Int inductions are special because we only allow one IV. - if (IK == IK_IntInduction) { + if (IK == IK_IntInduction && StepValue->isOne()) { // Use the phi node with the widest type as induction. Use the last // one if there are multiple (no good reason for doing this other // than it is expedient). @@ -3353,12 +3533,16 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } DEBUG(dbgs() << "LV: Found an induction variable.\n"); - Inductions[Phi] = InductionInfo(StartValue, IK); + Inductions[Phi] = InductionInfo(StartValue, IK, StepValue); // Until we explicitly handle the case of an induction variable with // an outside loop user we have to give up vectorizing this loop. - if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) + if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { + emitAnalysis(VectorizationReport(it) << + "use of induction value outside of the " + "loop is not handled by vectorizer"); return false; + } continue; } @@ -3401,6 +3585,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { continue; } + emitAnalysis(VectorizationReport(it) << + "value that could not be identified as " + "reduction is used outside the loop"); DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); return false; }// end of PHI handling @@ -3409,14 +3596,30 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // calls and we do handle certain intrinsic and libm functions. CallInst *CI = dyn_cast(it); if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa(CI)) { + emitAnalysis(VectorizationReport(it) << + "call instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found a call site.\n"); return false; } + // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the + // second argument is the same (i.e. loop invariant) + if (CI && + hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { + if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { + emitAnalysis(VectorizationReport(it) + << "intrinsic instruction cannot be vectorized"); + DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); + return false; + } + } + // Check that the instruction return type is vectorizable. // Also, we can't vectorize extractelement instructions. if ((!VectorType::isValidElementType(it->getType()) && !it->getType()->isVoidTy()) || isa(it)) { + emitAnalysis(VectorizationReport(it) + << "instruction return type cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); return false; } @@ -3424,20 +3627,26 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Check that the stored type is vectorizable. if (StoreInst *ST = dyn_cast(it)) { Type *T = ST->getValueOperand()->getType(); - if (!VectorType::isValidElementType(T)) + if (!VectorType::isValidElementType(T)) { + emitAnalysis(VectorizationReport(ST) << + "store instruction cannot be vectorized"); return false; + } if (EnableMemAccessVersioning) - collectStridedAcccess(ST); + collectStridedAccess(ST); } if (EnableMemAccessVersioning) if (LoadInst *LI = dyn_cast(it)) - collectStridedAcccess(LI); + collectStridedAccess(LI); // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. - if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) + if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { + emitAnalysis(VectorizationReport(it) << + "value cannot be used outside the loop"); return false; + } } // next instr. @@ -3445,8 +3654,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!Induction) { DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); - if (Inductions.empty()) + if (Inductions.empty()) { + emitAnalysis(VectorizationReport() + << "loop induction variable could not be identified"); return false; + } } return true; @@ -3565,7 +3777,7 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, return Stride; } -void LoopVectorizationLegality::collectStridedAcccess(Value *MemAccess) { +void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { Value *Ptr = nullptr; if (LoadInst *LI = dyn_cast(MemAccess)) Ptr = LI->getPointerOperand(); @@ -3603,7 +3815,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { if (I->getType()->isPointerTy() && isConsecutivePtr(I)) Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); - while (Worklist.size()) { + while (!Worklist.empty()) { Instruction *I = dyn_cast(Worklist.back()); Worklist.pop_back(); @@ -3621,932 +3833,33 @@ void LoopVectorizationLegality::collectLoopUniforms() { } } -namespace { -/// \brief Analyses memory accesses in a loop. -/// -/// Checks whether run time pointer checks are needed and builds sets for data -/// dependence checking. -class AccessAnalysis { -public: - /// \brief Read or write access location. - typedef PointerIntPair MemAccessInfo; - typedef SmallPtrSet MemAccessInfoSet; - - /// \brief Set of potential dependent memory accesses. - typedef EquivalenceClasses DepCandidates; - - AccessAnalysis(const DataLayout *Dl, DepCandidates &DA) : - DL(Dl), DepCands(DA), AreAllWritesIdentified(true), - AreAllReadsIdentified(true), IsRTCheckNeeded(false) {} - - /// \brief Register a load and whether it is only read from. - void addLoad(Value *Ptr, bool IsReadOnly) { - Accesses.insert(MemAccessInfo(Ptr, false)); - if (IsReadOnly) - ReadOnlyPtr.insert(Ptr); - } - - /// \brief Register a store. - void addStore(Value *Ptr) { - Accesses.insert(MemAccessInfo(Ptr, true)); - } +bool LoopVectorizationLegality::canVectorizeMemory() { + LAI = &LAA->getInfo(TheLoop, Strides); + auto &OptionalReport = LAI->getReport(); + if (OptionalReport) + emitAnalysis(*OptionalReport); + return LAI->canVectorizeMemory(); +} - /// \brief Check whether we can check the pointers at runtime for - /// non-intersection. - bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck, - unsigned &NumComparisons, ScalarEvolution *SE, - Loop *TheLoop, ValueToValueMap &Strides, - bool ShouldCheckStride = false); - - /// \brief Goes over all memory accesses, checks whether a RT check is needed - /// and builds sets of dependent accesses. - void buildDependenceSets() { - // Process read-write pointers first. - processMemAccesses(false); - // Next, process read pointers. - processMemAccesses(true); +static bool hasMultipleUsesOf(Instruction *I, + SmallPtrSetImpl &Insts) { + unsigned NumUses = 0; + for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) { + if (Insts.count(dyn_cast(*Use))) + ++NumUses; + if (NumUses > 1) + return true; } - bool isRTCheckNeeded() { return IsRTCheckNeeded; } - - bool isDependencyCheckNeeded() { return !CheckDeps.empty(); } - void resetDepChecks() { CheckDeps.clear(); } + return false; +} - MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; } - -private: - typedef SetVector PtrAccessSet; - typedef DenseMap UnderlyingObjToAccessMap; - - /// \brief Go over all memory access or only the deferred ones if - /// \p UseDeferred is true and check whether runtime pointer checks are needed - /// and build sets of dependency check candidates. - void processMemAccesses(bool UseDeferred); - - /// Set of all accesses. - PtrAccessSet Accesses; - - /// Set of access to check after all writes have been processed. - PtrAccessSet DeferredAccesses; - - /// Map of pointers to last access encountered. - UnderlyingObjToAccessMap ObjToLastAccess; - - /// Set of accesses that need a further dependence check. - MemAccessInfoSet CheckDeps; - - /// Set of pointers that are read only. - SmallPtrSet ReadOnlyPtr; - - /// Set of underlying objects already written to. - SmallPtrSet WriteObjects; - - const DataLayout *DL; - - /// Sets of potentially dependent accesses - members of one set share an - /// underlying pointer. The set "CheckDeps" identfies which sets really need a - /// dependence check. - DepCandidates &DepCands; - - bool AreAllWritesIdentified; - bool AreAllReadsIdentified; - bool IsRTCheckNeeded; -}; - -} // end anonymous namespace - -/// \brief Check whether a pointer can participate in a runtime bounds check. -static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides, - Value *Ptr) { - const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); - const SCEVAddRecExpr *AR = dyn_cast(PtrScev); - if (!AR) - return false; - - return AR->isAffine(); -} - -/// \brief Check the stride of the pointer and ensure that it does not wrap in -/// the address space. -static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, - const Loop *Lp, ValueToValueMap &StridesMap); - -bool AccessAnalysis::canCheckPtrAtRT( - LoopVectorizationLegality::RuntimePointerCheck &RtCheck, - unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop, - ValueToValueMap &StridesMap, bool ShouldCheckStride) { - // Find pointers with computable bounds. We are going to use this information - // to place a runtime bound check. - unsigned NumReadPtrChecks = 0; - unsigned NumWritePtrChecks = 0; - bool CanDoRT = true; - - bool IsDepCheckNeeded = isDependencyCheckNeeded(); - // We assign consecutive id to access from different dependence sets. - // Accesses within the same set don't need a runtime check. - unsigned RunningDepId = 1; - DenseMap DepSetId; - - for (PtrAccessSet::iterator AI = Accesses.begin(), AE = Accesses.end(); - AI != AE; ++AI) { - const MemAccessInfo &Access = *AI; - Value *Ptr = Access.getPointer(); - bool IsWrite = Access.getInt(); - - // Just add write checks if we have both. - if (!IsWrite && Accesses.count(MemAccessInfo(Ptr, true))) - continue; - - if (IsWrite) - ++NumWritePtrChecks; - else - ++NumReadPtrChecks; - - if (hasComputableBounds(SE, StridesMap, Ptr) && - // When we run after a failing dependency check we have to make sure we - // don't have wrapping pointers. - (!ShouldCheckStride || - isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) { - // The id of the dependence set. - unsigned DepId; - - if (IsDepCheckNeeded) { - Value *Leader = DepCands.getLeaderValue(Access).getPointer(); - unsigned &LeaderId = DepSetId[Leader]; - if (!LeaderId) - LeaderId = RunningDepId++; - DepId = LeaderId; - } else - // Each access has its own dependence set. - DepId = RunningDepId++; - - RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, StridesMap); - - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n'); - } else { - CanDoRT = false; - } - } - - if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2) - NumComparisons = 0; // Only one dependence set. - else { - NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks + - NumWritePtrChecks - 1)); - } - - // If the pointers that we would use for the bounds comparison have different - // address spaces, assume the values aren't directly comparable, so we can't - // use them for the runtime check. We also have to assume they could - // overlap. In the future there should be metadata for whether address spaces - // are disjoint. - unsigned NumPointers = RtCheck.Pointers.size(); - for (unsigned i = 0; i < NumPointers; ++i) { - for (unsigned j = i + 1; j < NumPointers; ++j) { - // Only need to check pointers between two different dependency sets. - if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j]) - continue; - - Value *PtrI = RtCheck.Pointers[i]; - Value *PtrJ = RtCheck.Pointers[j]; - - unsigned ASi = PtrI->getType()->getPointerAddressSpace(); - unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); - if (ASi != ASj) { - DEBUG(dbgs() << "LV: Runtime check would require comparison between" - " different address spaces\n"); - return false; - } - } - } - - return CanDoRT; -} - -static bool isFunctionScopeIdentifiedObject(Value *Ptr) { - return isNoAliasArgument(Ptr) || isNoAliasCall(Ptr) || isa(Ptr); -} - -void AccessAnalysis::processMemAccesses(bool UseDeferred) { - // We process the set twice: first we process read-write pointers, last we - // process read-only pointers. This allows us to skip dependence tests for - // read-only pointers. - - PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses; - for (PtrAccessSet::iterator AI = S.begin(), AE = S.end(); AI != AE; ++AI) { - const MemAccessInfo &Access = *AI; - Value *Ptr = Access.getPointer(); - bool IsWrite = Access.getInt(); - - DepCands.insert(Access); - - // Memorize read-only pointers for later processing and skip them in the - // first round (they need to be checked after we have seen all write - // pointers). Note: we also mark pointer that are not consecutive as - // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need the - // second check for "!IsWrite". - bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; - if (!UseDeferred && IsReadOnlyPtr) { - DeferredAccesses.insert(Access); - continue; - } - - bool NeedDepCheck = false; - // Check whether there is the possibility of dependency because of - // underlying objects being the same. - typedef SmallVector ValueVector; - ValueVector TempObjects; - GetUnderlyingObjects(Ptr, TempObjects, DL); - for (ValueVector::iterator UI = TempObjects.begin(), UE = TempObjects.end(); - UI != UE; ++UI) { - Value *UnderlyingObj = *UI; - - // If this is a write then it needs to be an identified object. If this a - // read and all writes (so far) are identified function scope objects we - // don't need an identified underlying object but only an Argument (the - // next write is going to invalidate this assumption if it is - // unidentified). - // This is a micro-optimization for the case where all writes are - // identified and we have one argument pointer. - // Otherwise, we do need a runtime check. - if ((IsWrite && !isFunctionScopeIdentifiedObject(UnderlyingObj)) || - (!IsWrite && (!AreAllWritesIdentified || - !isa(UnderlyingObj)) && - !isIdentifiedObject(UnderlyingObj))) { - DEBUG(dbgs() << "LV: Found an unidentified " << - (IsWrite ? "write" : "read" ) << " ptr: " << *UnderlyingObj << - "\n"); - IsRTCheckNeeded = (IsRTCheckNeeded || - !isIdentifiedObject(UnderlyingObj) || - !AreAllReadsIdentified); - - if (IsWrite) - AreAllWritesIdentified = false; - if (!IsWrite) - AreAllReadsIdentified = false; - } - - // If this is a write - check other reads and writes for conflicts. If - // this is a read only check other writes for conflicts (but only if there - // is no other write to the ptr - this is an optimization to catch "a[i] = - // a[i] + " without having to do a dependence check). - if ((IsWrite || IsReadOnlyPtr) && WriteObjects.count(UnderlyingObj)) - NeedDepCheck = true; - - if (IsWrite) - WriteObjects.insert(UnderlyingObj); - - // Create sets of pointers connected by shared underlying objects. - UnderlyingObjToAccessMap::iterator Prev = - ObjToLastAccess.find(UnderlyingObj); - if (Prev != ObjToLastAccess.end()) - DepCands.unionSets(Access, Prev->second); - - ObjToLastAccess[UnderlyingObj] = Access; - } - - if (NeedDepCheck) - CheckDeps.insert(Access); - } -} - -namespace { -/// \brief Checks memory dependences among accesses to the same underlying -/// object to determine whether there vectorization is legal or not (and at -/// which vectorization factor). -/// -/// This class works under the assumption that we already checked that memory -/// locations with different underlying pointers are "must-not alias". -/// We use the ScalarEvolution framework to symbolically evalutate access -/// functions pairs. Since we currently don't restructure the loop we can rely -/// on the program order of memory accesses to determine their safety. -/// At the moment we will only deem accesses as safe for: -/// * A negative constant distance assuming program order. -/// -/// Safe: tmp = a[i + 1]; OR a[i + 1] = x; -/// a[i] = tmp; y = a[i]; -/// -/// The latter case is safe because later checks guarantuee that there can't -/// be a cycle through a phi node (that is, we check that "x" and "y" is not -/// the same variable: a header phi can only be an induction or a reduction, a -/// reduction can't have a memory sink, an induction can't have a memory -/// source). This is important and must not be violated (or we have to -/// resort to checking for cycles through memory). -/// -/// * A positive constant distance assuming program order that is bigger -/// than the biggest memory access. -/// -/// tmp = a[i] OR b[i] = x -/// a[i+2] = tmp y = b[i+2]; -/// -/// Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively. -/// -/// * Zero distances and all accesses have the same size. -/// -class MemoryDepChecker { -public: - typedef PointerIntPair MemAccessInfo; - typedef SmallPtrSet MemAccessInfoSet; - - MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L) - : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0), - ShouldRetryWithRuntimeCheck(false) {} - - /// \brief Register the location (instructions are given increasing numbers) - /// of a write access. - void addAccess(StoreInst *SI) { - Value *Ptr = SI->getPointerOperand(); - Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx); - InstMap.push_back(SI); - ++AccessIdx; - } - - /// \brief Register the location (instructions are given increasing numbers) - /// of a write access. - void addAccess(LoadInst *LI) { - Value *Ptr = LI->getPointerOperand(); - Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx); - InstMap.push_back(LI); - ++AccessIdx; - } - - /// \brief Check whether the dependencies between the accesses are safe. - /// - /// Only checks sets with elements in \p CheckDeps. - bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, - MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides); - - /// \brief The maximum number of bytes of a vector register we can vectorize - /// the accesses safely with. - unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; } - - /// \brief In same cases when the dependency check fails we can still - /// vectorize the loop with a dynamic array access check. - bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; } - -private: - ScalarEvolution *SE; - const DataLayout *DL; - const Loop *InnermostLoop; - - /// \brief Maps access locations (ptr, read/write) to program order. - DenseMap > Accesses; - - /// \brief Memory access instructions in program order. - SmallVector InstMap; - - /// \brief The program order index to be used for the next instruction. - unsigned AccessIdx; - - // We can access this many bytes in parallel safely. - unsigned MaxSafeDepDistBytes; - - /// \brief If we see a non-constant dependence distance we can still try to - /// vectorize this loop with runtime checks. - bool ShouldRetryWithRuntimeCheck; - - /// \brief Check whether there is a plausible dependence between the two - /// accesses. - /// - /// Access \p A must happen before \p B in program order. The two indices - /// identify the index into the program order map. - /// - /// This function checks whether there is a plausible dependence (or the - /// absence of such can't be proved) between the two accesses. If there is a - /// plausible dependence but the dependence distance is bigger than one - /// element access it records this distance in \p MaxSafeDepDistBytes (if this - /// distance is smaller than any other distance encountered so far). - /// Otherwise, this function returns true signaling a possible dependence. - bool isDependent(const MemAccessInfo &A, unsigned AIdx, - const MemAccessInfo &B, unsigned BIdx, - ValueToValueMap &Strides); - - /// \brief Check whether the data dependence could prevent store-load - /// forwarding. - bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize); -}; - -} // end anonymous namespace - -static bool isInBoundsGep(Value *Ptr) { - if (GetElementPtrInst *GEP = dyn_cast(Ptr)) - return GEP->isInBounds(); - return false; -} - -/// \brief Check whether the access through \p Ptr has a constant stride. -static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr, - const Loop *Lp, ValueToValueMap &StridesMap) { - const Type *Ty = Ptr->getType(); - assert(Ty->isPointerTy() && "Unexpected non-ptr"); - - // Make sure that the pointer does not point to aggregate types. - const PointerType *PtrTy = cast(Ty); - if (PtrTy->getElementType()->isAggregateType()) { - DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr << - "\n"); - return 0; - } - - const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr); - - const SCEVAddRecExpr *AR = dyn_cast(PtrScev); - if (!AR) { - DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " - << *Ptr << " SCEV: " << *PtrScev << "\n"); - return 0; - } - - // The accesss function must stride over the innermost loop. - if (Lp != AR->getLoop()) { - DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " << - *Ptr << " SCEV: " << *PtrScev << "\n"); - } - - // The address calculation must not wrap. Otherwise, a dependence could be - // inverted. - // An inbounds getelementptr that is a AddRec with a unit stride - // cannot wrap per definition. The unit stride requirement is checked later. - // An getelementptr without an inbounds attribute and unit stride would have - // to access the pointer value "0" which is undefined behavior in address - // space 0, therefore we can also vectorize this case. - bool IsInBoundsGEP = isInBoundsGep(Ptr); - bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask); - bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0; - if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) { - DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space " - << *Ptr << " SCEV: " << *PtrScev << "\n"); - return 0; - } - - // Check the step is constant. - const SCEV *Step = AR->getStepRecurrence(*SE); - - // Calculate the pointer stride and check if it is consecutive. - const SCEVConstant *C = dyn_cast(Step); - if (!C) { - DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr << - " SCEV: " << *PtrScev << "\n"); - return 0; - } - - int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType()); - const APInt &APStepVal = C->getValue()->getValue(); - - // Huge step value - give up. - if (APStepVal.getBitWidth() > 64) - return 0; - - int64_t StepVal = APStepVal.getSExtValue(); - - // Strided access. - int64_t Stride = StepVal / Size; - int64_t Rem = StepVal % Size; - if (Rem) - return 0; - - // If the SCEV could wrap but we have an inbounds gep with a unit stride we - // know we can't "wrap around the address space". In case of address space - // zero we know that this won't happen without triggering undefined behavior. - if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) && - Stride != 1 && Stride != -1) - return 0; - - return Stride; -} - -bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance, - unsigned TypeByteSize) { - // If loads occur at a distance that is not a multiple of a feasible vector - // factor store-load forwarding does not take place. - // Positive dependences might cause troubles because vectorizing them might - // prevent store-load forwarding making vectorized code run a lot slower. - // a[i] = a[i-3] ^ a[i-8]; - // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and - // hence on your typical architecture store-load forwarding does not take - // place. Vectorizing in such cases does not make sense. - // Store-load forwarding distance. - const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize; - // Maximum vector factor. - unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize; - if(MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues) - MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes; - - for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues; - vf *= 2) { - if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) { - MaxVFWithoutSLForwardIssues = (vf >>=1); - break; - } - } - - if (MaxVFWithoutSLForwardIssues< 2*TypeByteSize) { - DEBUG(dbgs() << "LV: Distance " << Distance << - " that could cause a store-load forwarding conflict\n"); - return true; - } - - if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes && - MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize) - MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues; - return false; -} - -bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, - const MemAccessInfo &B, unsigned BIdx, - ValueToValueMap &Strides) { - assert (AIdx < BIdx && "Must pass arguments in program order"); - - Value *APtr = A.getPointer(); - Value *BPtr = B.getPointer(); - bool AIsWrite = A.getInt(); - bool BIsWrite = B.getInt(); - - // Two reads are independent. - if (!AIsWrite && !BIsWrite) - return false; - - const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr); - const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr); - - int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides); - int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides); - - const SCEV *Src = AScev; - const SCEV *Sink = BScev; - - // If the induction step is negative we have to invert source and sink of the - // dependence. - if (StrideAPtr < 0) { - //Src = BScev; - //Sink = AScev; - std::swap(APtr, BPtr); - std::swap(Src, Sink); - std::swap(AIsWrite, BIsWrite); - std::swap(AIdx, BIdx); - std::swap(StrideAPtr, StrideBPtr); - } - - const SCEV *Dist = SE->getMinusSCEV(Sink, Src); - - DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink - << "(Induction step: " << StrideAPtr << ")\n"); - DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to " - << *InstMap[BIdx] << ": " << *Dist << "\n"); - - // Need consecutive accesses. We don't want to vectorize - // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in - // the address space. - if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){ - DEBUG(dbgs() << "Non-consecutive pointer access\n"); - return true; - } - - const SCEVConstant *C = dyn_cast(Dist); - if (!C) { - DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n"); - ShouldRetryWithRuntimeCheck = true; - return true; - } - - Type *ATy = APtr->getType()->getPointerElementType(); - Type *BTy = BPtr->getType()->getPointerElementType(); - unsigned TypeByteSize = DL->getTypeAllocSize(ATy); - - // Negative distances are not plausible dependencies. - const APInt &Val = C->getValue()->getValue(); - if (Val.isNegative()) { - bool IsTrueDataDependence = (AIsWrite && !BIsWrite); - if (IsTrueDataDependence && - (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) || - ATy != BTy)) - return true; - - DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n"); - return false; - } - - // Write to the same location with the same size. - // Could be improved to assert type sizes are the same (i32 == float, etc). - if (Val == 0) { - if (ATy == BTy) - return false; - DEBUG(dbgs() << "LV: Zero dependence difference but different types\n"); - return true; - } - - assert(Val.isStrictlyPositive() && "Expect a positive value"); - - // Positive distance bigger than max vectorization factor. - if (ATy != BTy) { - DEBUG(dbgs() << - "LV: ReadWrite-Write positive dependency with different types\n"); - return false; - } - - unsigned Distance = (unsigned) Val.getZExtValue(); - - // Bail out early if passed-in parameters make vectorization not feasible. - unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1; - unsigned ForcedUnroll = VectorizationUnroll ? VectorizationUnroll : 1; - - // The distance must be bigger than the size needed for a vectorized version - // of the operation and the size of the vectorized operation must not be - // bigger than the currrent maximum size. - if (Distance < 2*TypeByteSize || - 2*TypeByteSize > MaxSafeDepDistBytes || - Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { - DEBUG(dbgs() << "LV: Failure because of Positive distance " - << Val.getSExtValue() << '\n'); - return true; - } - - MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? - Distance : MaxSafeDepDistBytes; - - bool IsTrueDataDependence = (!AIsWrite && BIsWrite); - if (IsTrueDataDependence && - couldPreventStoreLoadForward(Distance, TypeByteSize)) - return true; - - DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() << - " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n'); - - return false; -} - -bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets, - MemAccessInfoSet &CheckDeps, - ValueToValueMap &Strides) { - - MaxSafeDepDistBytes = -1U; - while (!CheckDeps.empty()) { - MemAccessInfo CurAccess = *CheckDeps.begin(); - - // Get the relevant memory access set. - EquivalenceClasses::iterator I = - AccessSets.findValue(AccessSets.getLeaderValue(CurAccess)); - - // Check accesses within this set. - EquivalenceClasses::member_iterator AI, AE; - AI = AccessSets.member_begin(I), AE = AccessSets.member_end(); - - // Check every access pair. - while (AI != AE) { - CheckDeps.erase(*AI); - EquivalenceClasses::member_iterator OI = std::next(AI); - while (OI != AE) { - // Check every accessing instruction pair in program order. - for (std::vector::iterator I1 = Accesses[*AI].begin(), - I1E = Accesses[*AI].end(); I1 != I1E; ++I1) - for (std::vector::iterator I2 = Accesses[*OI].begin(), - I2E = Accesses[*OI].end(); I2 != I2E; ++I2) { - if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides)) - return false; - if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides)) - return false; - } - ++OI; - } - AI++; - } - } - return true; -} - -bool LoopVectorizationLegality::canVectorizeMemory() { - - typedef SmallVector ValueVector; - typedef SmallPtrSet ValueSet; - - // Holds the Load and Store *instructions*. - ValueVector Loads; - ValueVector Stores; - - // Holds all the different accesses in the loop. - unsigned NumReads = 0; - unsigned NumReadWrites = 0; - - PtrRtCheck.Pointers.clear(); - PtrRtCheck.Need = false; - - const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); - MemoryDepChecker DepChecker(SE, DL, TheLoop); - - // For each block. - for (Loop::block_iterator bb = TheLoop->block_begin(), - be = TheLoop->block_end(); bb != be; ++bb) { - - // Scan the BB and collect legal loads and stores. - for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; - ++it) { - - // If this is a load, save it. If this instruction can read from memory - // but is not a load, then we quit. Notice that we don't handle function - // calls that read or write. - if (it->mayReadFromMemory()) { - // Many math library functions read the rounding mode. We will only - // vectorize a loop if it contains known function calls that don't set - // the flag. Therefore, it is safe to ignore this read from memory. - CallInst *Call = dyn_cast(it); - if (Call && getIntrinsicIDForCall(Call, TLI)) - continue; - - LoadInst *Ld = dyn_cast(it); - if (!Ld) return false; - if (!Ld->isSimple() && !IsAnnotatedParallel) { - DEBUG(dbgs() << "LV: Found a non-simple load.\n"); - return false; - } - NumLoads++; - Loads.push_back(Ld); - DepChecker.addAccess(Ld); - continue; - } - - // Save 'store' instructions. Abort if other instructions write to memory. - if (it->mayWriteToMemory()) { - StoreInst *St = dyn_cast(it); - if (!St) return false; - if (!St->isSimple() && !IsAnnotatedParallel) { - DEBUG(dbgs() << "LV: Found a non-simple store.\n"); - return false; - } - NumStores++; - Stores.push_back(St); - DepChecker.addAccess(St); - } - } // Next instr. - } // Next block. - - // Now we have two lists that hold the loads and the stores. - // Next, we find the pointers that they use. - - // Check if we see any stores. If there are no stores, then we don't - // care if the pointers are *restrict*. - if (!Stores.size()) { - DEBUG(dbgs() << "LV: Found a read-only loop!\n"); - return true; - } - - AccessAnalysis::DepCandidates DependentAccesses; - AccessAnalysis Accesses(DL, DependentAccesses); - - // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects - // multiple times on the same object. If the ptr is accessed twice, once - // for read and once for write, it will only appear once (on the write - // list). This is okay, since we are going to check for conflicts between - // writes and between reads and writes, but not between reads and reads. - ValueSet Seen; - - ValueVector::iterator I, IE; - for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { - StoreInst *ST = cast(*I); - Value* Ptr = ST->getPointerOperand(); - - if (isUniform(Ptr)) { - DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); - return false; - } - - // If we did *not* see this pointer before, insert it to the read-write - // list. At this phase it is only a 'write' list. - if (Seen.insert(Ptr)) { - ++NumReadWrites; - Accesses.addStore(Ptr); - } - } - - if (IsAnnotatedParallel) { - DEBUG(dbgs() - << "LV: A loop annotated parallel, ignore memory dependency " - << "checks.\n"); - return true; - } - - for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { - LoadInst *LD = cast(*I); - Value* Ptr = LD->getPointerOperand(); - // If we did *not* see this pointer before, insert it to the - // read list. If we *did* see it before, then it is already in - // the read-write list. This allows us to vectorize expressions - // such as A[i] += x; Because the address of A[i] is a read-write - // pointer. This only works if the index of A[i] is consecutive. - // If the address of i is unknown (for example A[B[i]]) then we may - // read a few words, modify, and write a few words, and some of the - // words may be written to the same address. - bool IsReadOnlyPtr = false; - if (Seen.insert(Ptr) || !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) { - ++NumReads; - IsReadOnlyPtr = true; - } - Accesses.addLoad(Ptr, IsReadOnlyPtr); - } - - // If we write (or read-write) to a single destination and there are no - // other reads in this loop then is it safe to vectorize. - if (NumReadWrites == 1 && NumReads == 0) { - DEBUG(dbgs() << "LV: Found a write-only loop!\n"); - return true; - } - - // Build dependence sets and check whether we need a runtime pointer bounds - // check. - Accesses.buildDependenceSets(); - bool NeedRTCheck = Accesses.isRTCheckNeeded(); - - // Find pointers with computable bounds. We are going to use this information - // to place a runtime bound check. - unsigned NumComparisons = 0; - bool CanDoRT = false; - if (NeedRTCheck) - CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop, - Strides); - - DEBUG(dbgs() << "LV: We need to do " << NumComparisons << - " pointer comparisons.\n"); - - // If we only have one set of dependences to check pointers among we don't - // need a runtime check. - if (NumComparisons == 0 && NeedRTCheck) - NeedRTCheck = false; - - // Check that we did not collect too many pointers or found an unsizeable - // pointer. - if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { - PtrRtCheck.reset(); - CanDoRT = false; - } - - if (CanDoRT) { - DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); - } - - if (NeedRTCheck && !CanDoRT) { - DEBUG(dbgs() << "LV: We can't vectorize because we can't find " << - "the array bounds.\n"); - PtrRtCheck.reset(); - return false; - } - - PtrRtCheck.Need = NeedRTCheck; - - bool CanVecMem = true; - if (Accesses.isDependencyCheckNeeded()) { - DEBUG(dbgs() << "LV: Checking memory dependencies\n"); - CanVecMem = DepChecker.areDepsSafe( - DependentAccesses, Accesses.getDependenciesToCheck(), Strides); - MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes(); - - if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) { - DEBUG(dbgs() << "LV: Retrying with memory checks\n"); - NeedRTCheck = true; - - // Clear the dependency checks. We assume they are not needed. - Accesses.resetDepChecks(); - - PtrRtCheck.reset(); - PtrRtCheck.Need = true; - - CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, - TheLoop, Strides, true); - // Check that we did not collect too many pointers or found an unsizeable - // pointer. - if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { - DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n"); - PtrRtCheck.reset(); - return false; - } - - CanVecMem = true; - } - } - - DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") << - " need a runtime memory check.\n"); - - return CanVecMem; -} - -static bool hasMultipleUsesOf(Instruction *I, - SmallPtrSet &Insts) { - unsigned NumUses = 0; - for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) { - if (Insts.count(dyn_cast(*Use))) - ++NumUses; - if (NumUses > 1) - return true; - } - - return false; -} - -static bool areAllUsesIn(Instruction *I, SmallPtrSet &Set) { - for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) - if (!Set.count(dyn_cast(*Use))) - return false; - return true; -} +static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl &Set) { + for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) + if (!Set.count(dyn_cast(*Use))) + return false; + return true; +} bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, ReductionKind Kind) { @@ -4679,7 +3992,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // value must only be used once, except by phi nodes and min/max // reductions which are represented as a cmp followed by a select. ReductionInstDesc IgnoredVal(false, nullptr); - if (VisitedInsts.insert(UI)) { + if (VisitedInsts.insert(UI).second) { if (isa(UI)) PHIs.push_back(UI); else @@ -4781,7 +4094,7 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, ReductionKind Kind, ReductionInstDesc &Prev) { bool FP = I->getType()->isFloatingPointTy(); - bool FastMath = (FP && I->isCommutative() && I->isAssociative()); + bool FastMath = FP && I->hasUnsafeAlgebra(); switch (I->getOpcode()) { default: return ReductionInstDesc(false, I); @@ -4803,6 +4116,7 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, return ReductionInstDesc(Kind == RK_IntegerXor, I); case Instruction::FMul: return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I); + case Instruction::FSub: case Instruction::FAdd: return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I); case Instruction::FCmp: @@ -4816,7 +4130,8 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, } LoopVectorizationLegality::InductionKind -LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { +LoopVectorizationLegality::isInductionVariable(PHINode *Phi, + ConstantInt *&StepValue) { Type *PhiTy = Phi->getType(); // We only handle integer and pointer inductions variables. if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) @@ -4829,30 +4144,32 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); return IK_NoInduction; } - const SCEV *Step = AR->getStepRecurrence(*SE); - - // Integer inductions need to have a stride of one. - if (PhiTy->isIntegerTy()) { - if (Step->isOne()) - return IK_IntInduction; - if (Step->isAllOnesValue()) - return IK_ReverseIntInduction; - return IK_NoInduction; - } + const SCEV *Step = AR->getStepRecurrence(*SE); // Calculate the pointer stride and check if it is consecutive. const SCEVConstant *C = dyn_cast(Step); if (!C) return IK_NoInduction; + ConstantInt *CV = C->getValue(); + if (PhiTy->isIntegerTy()) { + StepValue = CV; + return IK_IntInduction; + } + assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); - uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType()); - if (C->getValue()->equalsInt(Size)) - return IK_PtrInduction; - else if (C->getValue()->equalsInt(0 - Size)) - return IK_ReversePtrInduction; + Type *PointerElementType = PhiTy->getPointerElementType(); + // The pointer stride cannot be determined if the pointer element type is not + // sized. + if (!PointerElementType->isSized()) + return IK_NoInduction; - return IK_NoInduction; + int64_t Size = static_cast(DL->getTypeAllocSize(PointerElementType)); + int64_t CVSize = CV->getSExtValue(); + if (CVSize % Size) + return IK_NoInduction; + StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size); + return IK_PtrInduction; } bool LoopVectorizationLegality::isInductionVariable(const Value *V) { @@ -4865,21 +4182,32 @@ bool LoopVectorizationLegality::isInductionVariable(const Value *V) { } bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { - assert(TheLoop->contains(BB) && "Unknown block used"); - - // Blocks that do not dominate the latch need predication. - BasicBlock* Latch = TheLoop->getLoopLatch(); - return !DT->dominates(BB, Latch); + return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); } bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, - SmallPtrSet& SafePtrs) { + SmallPtrSetImpl &SafePtrs) { + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + // Check that we don't have a constant expression that can trap as operand. + for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end(); + OI != OE; ++OI) { + if (Constant *C = dyn_cast(*OI)) + if (C->canTrap()) + return false; + } // We might be able to hoist the load. if (it->mayReadFromMemory()) { LoadInst *LI = dyn_cast(it); - if (!LI || !SafePtrs.count(LI->getPointerOperand())) + if (!LI) return false; + if (!SafePtrs.count(LI->getPointerOperand())) { + if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) { + MaskedOp.insert(LI); + continue; + } + return false; + } } // We don't predicate stores at the moment. @@ -4887,22 +4215,30 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, StoreInst *SI = dyn_cast(it); // We only support predication of stores in basic blocks with one // predecessor. - if (!SI || ++NumPredStores > NumberOfStoresToPredicate || - !SafePtrs.count(SI->getPointerOperand()) || - !SI->getParent()->getSinglePredecessor()) + if (!SI) return false; + + bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0); + bool isSinglePredecessor = SI->getParent()->getSinglePredecessor(); + + if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr || + !isSinglePredecessor) { + // Build a masked store if it is legal for the target, otherwise scalarize + // the block. + bool isLegalMaskedOp = + isLegalMaskedStore(SI->getValueOperand()->getType(), + SI->getPointerOperand()); + if (isLegalMaskedOp) { + --NumPredStores; + MaskedOp.insert(SI); + continue; + } + return false; + } } if (it->mayThrow()) return false; - // Check that we don't have a constant expression that can trap as operand. - for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end(); - OI != OE; ++OI) { - if (Constant *C = dyn_cast(*OI)) - if (C->canTrap()) - return false; - } - // The instructions below can trap. switch (it->getOpcode()) { default: continue; @@ -4910,7 +4246,7 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: - return false; + return false; } } @@ -4918,23 +4254,27 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, } LoopVectorizationCostModel::VectorizationFactor -LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, - unsigned UserVF, - bool ForceVectorization) { +LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { // Width 1 means no vectorize VectorizationFactor Factor = { 1U, 0U }; if (OptForSize && Legal->getRuntimePointerCheck()->Need) { + emitAnalysis(VectorizationReport() << + "runtime pointer checks needed. Enable vectorization of this " + "loop with '#pragma clang loop vectorize(enable)' when " + "compiling with -Os"); DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); return Factor; } - if (!EnableCondStoresVectorization && Legal->NumPredStores) { + if (!EnableCondStoresVectorization && Legal->getNumPredStores()) { + emitAnalysis(VectorizationReport() << + "store that is conditionally executed prevents vectorization"); DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n"); return Factor; } // Find the trip count. - unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); + unsigned TC = SE->getSmallConstantTripCount(TheLoop); DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); unsigned WidestType = getWidestType(); @@ -4954,7 +4294,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, MaxVectorSize = 1; } - assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements" + assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements" " into one vector!"); unsigned VF = MaxVectorSize; @@ -4963,6 +4303,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, if (OptForSize) { // If we are unable to calculate the trip count then don't try to vectorize. if (TC < 2) { + emitAnalysis + (VectorizationReport() << + "unable to calculate the loop count due to complex control flow"); DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); return Factor; } @@ -4976,11 +4319,17 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, // If the trip count that we found modulo the vectorization factor is not // zero then we require a tail. if (VF < 2) { + emitAnalysis(VectorizationReport() << + "cannot optimize for size and vectorize at the " + "same time. Enable vectorization of this loop " + "with '#pragma clang loop vectorize(enable)' " + "when compiling with -Os"); DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); return Factor; } } + int UserVF = Hints->getWidth(); if (UserVF != 0) { assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); @@ -4996,6 +4345,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, unsigned Width = 1; DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n"); + bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; // Ignore scalar width, because the user explicitly wants vectorization. if (ForceVectorization && VF > 1) { Width = 2; @@ -5036,6 +4386,10 @@ unsigned LoopVectorizationCostModel::getWidestType() { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { Type *T = it->getType(); + // Ignore ephemeral values. + if (EphValues.count(it)) + continue; + // Only examine Loads, Stores and PHINodes. if (!isa(it) && !isa(it) && !isa(it)) continue; @@ -5065,29 +4419,29 @@ unsigned LoopVectorizationCostModel::getWidestType() { unsigned LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, - unsigned UserUF, unsigned VF, unsigned LoopCost) { // -- The unroll heuristics -- // We unroll the loop in order to expose ILP and reduce the loop overhead. // There are many micro-architectural considerations that we can't predict - // at this level. For example frontend pressure (on decode or fetch) due to + // at this level. For example, frontend pressure (on decode or fetch) due to // code size, or the number and capabilities of the execution ports. // // We use the following heuristics to select the unroll factor: - // 1. If the code has reductions the we unroll in order to break the cross + // 1. If the code has reductions, then we unroll in order to break the cross // iteration dependency. - // 2. If the loop is really small then we unroll in order to reduce the loop + // 2. If the loop is really small, then we unroll in order to reduce the loop // overhead. // 3. We don't unroll if we think that we will spill registers to memory due // to the increased register pressure. // Use the user preference, unless 'auto' is selected. + int UserUF = Hints->getInterleave(); if (UserUF != 0) return UserUF; - // When we optimize for size we don't unroll. + // When we optimize for size, we don't unroll. if (OptForSize) return 1; @@ -5096,8 +4450,7 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, return 1; // Do not unroll loops with a relatively small trip count. - unsigned TC = SE->getSmallConstantTripCount(TheLoop, - TheLoop->getLoopLatch()); + unsigned TC = SE->getSmallConstantTripCount(TheLoop); if (TC > 1 && TC < TinyTripCountUnrollThreshold) return 1; @@ -5136,15 +4489,15 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, std::max(1U, (R.MaxLocalUsers - 1))); // Clamp the unroll factor ranges to reasonable factors. - unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor(); + unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(); // Check if the user has overridden the unroll max. if (VF == 1) { - if (ForceTargetMaxScalarUnrollFactor.getNumOccurrences() > 0) - MaxUnrollSize = ForceTargetMaxScalarUnrollFactor; + if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) + MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor; } else { - if (ForceTargetMaxVectorUnrollFactor.getNumOccurrences() > 0) - MaxUnrollSize = ForceTargetMaxVectorUnrollFactor; + if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) + MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor; } // If we did not calculate the cost for VF (because the user selected the VF) @@ -5154,8 +4507,8 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, // Clamp the calculated UF to be between the 1 and the max unroll factor // that the target allows. - if (UF > MaxUnrollSize) - UF = MaxUnrollSize; + if (UF > MaxInterleaveSize) + UF = MaxInterleaveSize; else if (UF < 1) UF = 1; @@ -5183,8 +4536,22 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, // Unroll until store/load ports (estimated by max unroll factor) are // saturated. - unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1); - unsigned LoadsUF = UF / (Legal->NumLoads ? Legal->NumLoads : 1); + unsigned NumStores = Legal->getNumStores(); + unsigned NumLoads = Legal->getNumLoads(); + unsigned StoresUF = UF / (NumStores ? NumStores : 1); + unsigned LoadsUF = UF / (NumLoads ? NumLoads : 1); + + // If we have a scalar reduction (vector reductions are already dealt with + // by this point), we can increase the critical path length if the loop + // we're unrolling is inside another loop. Limit, by default to 2, so the + // critical path only gets increased by one reduction operation. + if (Legal->getReductionVars()->size() && + TheLoop->getLoopDepth() > 1) { + unsigned F = static_cast(MaxNestedScalarReductionUF); + SmallUF = std::min(SmallUF, F); + StoresUF = std::min(StoresUF, F); + LoadsUF = std::min(LoadsUF, F); + } if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) { DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n"); @@ -5287,6 +4654,10 @@ LoopVectorizationCostModel::calculateRegisterUsage() { // Ignore instructions that are never used within the loop. if (!Ends.count(I)) continue; + // Ignore ephemeral values. + if (EphValues.count(I)) + continue; + // Remove all of the instructions that end at this location. InstrList &List = TransposeEnds[i]; for (unsigned int j=0, e = List.size(); j < e; ++j) @@ -5327,6 +4698,10 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { if (isa(it)) continue; + // Ignore ephemeral values. + if (EphValues.count(it)) + continue; + unsigned C = getInstructionCost(it, VF); // Check if we should override the cost. @@ -5460,18 +4835,31 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueProperties Op1VP = + TargetTransformInfo::OP_None; + TargetTransformInfo::OperandValueProperties Op2VP = + TargetTransformInfo::OP_None; Value *Op2 = I->getOperand(1); // Check for a splat of a constant or for a non uniform vector of constants. - if (isa(Op2)) + if (isa(Op2)) { + ConstantInt *CInt = cast(Op2); + if (CInt && CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_PowerOf2; Op2VK = TargetTransformInfo::OK_UniformConstantValue; - else if (isa(Op2) || isa(Op2)) { + } else if (isa(Op2) || isa(Op2)) { Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; - if (cast(Op2)->getSplatValue() != nullptr) + Constant *SplatValue = cast(Op2)->getSplatValue(); + if (SplatValue) { + ConstantInt *CInt = dyn_cast(SplatValue); + if (CInt && CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_PowerOf2; Op2VK = TargetTransformInfo::OK_UniformConstantValue; + } } - return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK); + return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, + Op1VP, Op2VP); } case Instruction::Select: { SelectInst *SI = cast(I); @@ -5539,7 +4927,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // Wide load/stores. unsigned Cost = TTI.getAddressComputationCost(VectorTy); - Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + if (Legal->isMaskRequired(I)) + Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, + AS); + else + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); if (Reverse) Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, @@ -5612,13 +5004,16 @@ Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) -INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { @@ -5715,7 +5110,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, ConstantInt::get(Cond[Part]->getType(), 1)); CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase()); + VectorLp->addBasicBlockToLoop(CondBlock, *LI); // Update Builder with newly created basic block. Builder.SetInsertPoint(InsertPt); } @@ -5741,7 +5136,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, if (IfPredicateStore) { BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase()); + VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); Builder.SetInsertPoint(InsertPt); Instruction *OldBr = IfBlock->getTerminator(); BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); @@ -5766,12 +5161,10 @@ Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } -Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx, - bool Negate) { +Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) { // When unrolling and the VF is 1, we only need to add a simple scalar. Type *ITy = Val->getType(); assert(!ITy->isVectorTy() && "Val must be a scalar"); - Constant *C = ConstantInt::get(ITy, StartIdx, Negate); - return Builder.CreateAdd(Val, C, "induction"); + Constant *C = ConstantInt::get(ITy, StartIdx); + return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction"); } -