X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTransforms%2FVectorize%2FLoopVectorize.cpp;h=ae5ec8cb88a84411c7c22f81b98464daf431860c;hp=b7faa204927d70803bd3391946dae459498b17de;hb=c287d4cc994b8f20905449213efb62b865c0bba5;hpb=cf0db29df20d9c665da7e82bb261bdd7cf7f1b2b diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index b7faa204927..ae5ec8cb88a 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -48,7 +48,6 @@ #include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" @@ -58,10 +57,13 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DemandedBits.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" @@ -96,9 +98,10 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/VectorUtils.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include +#include #include #include @@ -148,8 +151,9 @@ static cl::opt MaxInterleaveGroupFactor( cl::desc("Maximum factor for an interleaved access group (default = 8)"), cl::init(8)); -/// We don't unroll loops with a known constant trip count below this number. -static const unsigned TinyTripCountUnrollThreshold = 128; +/// We don't interleave loops with a known constant trip count below this +/// number. +static const unsigned TinyTripCountInterleaveThreshold = 128; static cl::opt ForceTargetNumScalarRegs( "force-target-num-scalar-regs", cl::init(0), cl::Hidden, @@ -180,7 +184,8 @@ static cl::opt ForceTargetInstructionCost( static cl::opt SmallLoopCost( "small-loop-cost", cl::init(20), cl::Hidden, - cl::desc("The cost of a loop that is considered 'small' by the unroller.")); + cl::desc( + "The cost of a loop that is considered 'small' by the interleaver.")); static cl::opt LoopVectorizeWithBlockFrequency( "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden, @@ -188,10 +193,11 @@ static cl::opt LoopVectorizeWithBlockFrequency( "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions.")); -// Runtime unroll loops for load/store throughput. -static cl::opt EnableLoadStoreRuntimeUnroll( - "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden, - cl::desc("Enable runtime unrolling until load/store ports are saturated")); +// Runtime interleave loops for load/store throughput. +static cl::opt EnableLoadStoreRuntimeInterleave( + "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, + cl::desc( + "Enable runtime interleaving until load/store ports are saturated")); /// The number of stores in a loop that are allowed to need predication. static cl::opt NumberOfStoresToPredicate( @@ -200,23 +206,29 @@ static cl::opt NumberOfStoresToPredicate( static cl::opt EnableIndVarRegisterHeur( "enable-ind-var-reg-heur", cl::init(true), cl::Hidden, - cl::desc("Count the induction variable only once when unrolling")); + cl::desc("Count the induction variable only once when interleaving")); static cl::opt EnableCondStoresVectorization( "enable-cond-stores-vec", cl::init(false), cl::Hidden, cl::desc("Enable if predication of stores during vectorization.")); -static cl::opt MaxNestedScalarReductionUF( - "max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden, - cl::desc("The maximum unroll factor to use when unrolling a scalar " +static cl::opt MaxNestedScalarReductionIC( + "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, + cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop.")); +static cl::opt PragmaVectorizeMemoryCheckThreshold( + "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden, + cl::desc("The maximum allowed number of runtime memory checks with a " + "vectorize(enable) pragma.")); + namespace { // Forward declarations. +class LoopVectorizeHints; class LoopVectorizationLegality; class LoopVectorizationCostModel; -class LoopVectorizeHints; +class LoopVectorizationRequirements; /// \brief This modifies LoopAccessReport to initialize message with /// loop-vectorizer-specific part. @@ -265,18 +277,22 @@ public: : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), - Legal(nullptr), AddedSafetyChecks(false) {} + TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr), + AddedSafetyChecks(false) {} // Perform the actual loop widening (vectorization). - void vectorize(LoopVectorizationLegality *L) { + // MinimumBitWidths maps scalar integer values to the smallest bitwidth they + // can be validly truncated to. The cost model has assumed this truncation + // will happen when vectorizing. + void vectorize(LoopVectorizationLegality *L, + DenseMap MinimumBitWidths) { + MinBWs = MinimumBitWidths; Legal = L; // Create a new empty loop. Unlink the old loop and connect the new one. createEmptyLoop(); // Widen each instruction in the old loop to a new one in the new loop. // Use the Legality module to find the induction and reduction variables. vectorizeLoop(); - // Register the new loop and update the analysis passes. - updateAnalysis(); } // Return true if any runtime check is added. @@ -307,6 +323,9 @@ protected: /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(); + /// Create a new induction variable inside L. + PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, + Value *Step, Instruction *DL); /// Copy and widen the instructions from the old loop. virtual void vectorizeLoop(); @@ -316,6 +335,9 @@ protected: /// See PR14725. void fixLCSSAPHIs(); + /// Shrinks vector element sizes based on information in "MinBWs". + void truncateToMinimalBitwidths(); + /// A helper function that computes the predicate of the block BB, assuming /// that the header block of the loop is set to True. It returns the *entry* /// mask for the block BB. @@ -326,7 +348,7 @@ protected: /// A helper function to vectorize a single BB within the innermost loop. void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV); - + /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. @@ -371,6 +393,22 @@ protected: /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); + /// Returns (and creates if needed) the original loop trip count. + Value *getOrCreateTripCount(Loop *NewLoop); + + /// Returns (and creates if needed) the trip count of the widened loop. + Value *getOrCreateVectorTripCount(Loop *NewLoop); + + /// Emit a bypass check to see if the trip count would overflow, or we + /// wouldn't have enough iterations to execute one vector loop. + void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); + /// Emit a bypass check to see if the vector trip count is nonzero. + void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass); + /// Emit bypass checks to check if strides we've assumed to be one really are. + void emitStrideChecks(Loop *L, BasicBlock *Bypass); + /// Emit bypass checks to check any memory assumptions we may have made. + void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); + /// This is a helper class that holds the vectorizer state. It maps scalar /// instructions to vector instructions. When the code is 'unrolled' then /// then a single scalar value is mapped to multiple vector parts. The parts @@ -459,12 +497,21 @@ protected: PHINode *Induction; /// The induction variable of the old basic block. PHINode *OldInduction; - /// Holds the extended (to the widest induction type) start index. - Value *ExtendedIdx; /// Maps scalars to widened vectors. ValueMap WidenMap; + /// Store instructions that should be predicated, as a pair + /// + SmallVector, 4> PredicatedStores; EdgeMaskCache MaskCache; - + /// Trip count of the original loop. + Value *TripCount; + /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) + Value *VectorTripCount; + + /// Map of scalar integer values to the smallest bitwidth they can be legally + /// represented as. The vector equivalents of these values should be truncated + /// to this type. + DenseMap MinBWs; LoopVectorizationLegality *Legal; // Record whether runtime check is added. @@ -548,7 +595,8 @@ static void propagateMetadata(Instruction *To, const Instruction *From) { if (Kind != LLVMContext::MD_tbaa && Kind != LLVMContext::MD_alias_scope && Kind != LLVMContext::MD_noalias && - Kind != LLVMContext::MD_fpmath) + Kind != LLVMContext::MD_fpmath && + Kind != LLVMContext::MD_nontemporal) continue; To->setMetadata(Kind, M.second); @@ -775,6 +823,304 @@ private: const ValueToValueMap &Strides); }; +/// Utility class for getting and setting loop vectorizer hints in the form +/// of loop metadata. +/// This class keeps a number of loop annotations locally (as member variables) +/// and can, upon request, write them back as metadata on the loop. It will +/// initially scan the loop for existing metadata, and will update the local +/// values based on information in the loop. +/// We cannot write all values to metadata, as the mere presence of some info, +/// for example 'force', means a decision has been made. So, we need to be +/// careful NOT to add them if the user hasn't specifically asked so. +class LoopVectorizeHints { + enum HintKind { + HK_WIDTH, + HK_UNROLL, + HK_FORCE + }; + + /// Hint - associates name and validation with the hint value. + struct Hint { + const char * Name; + unsigned Value; // This may have to change for non-numeric values. + HintKind Kind; + + Hint(const char * Name, unsigned Value, HintKind Kind) + : Name(Name), Value(Value), Kind(Kind) { } + + bool validate(unsigned Val) { + switch (Kind) { + case HK_WIDTH: + return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; + case HK_UNROLL: + return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; + case HK_FORCE: + return (Val <= 1); + } + return false; + } + }; + + /// Vectorization width. + Hint Width; + /// Vectorization interleave factor. + Hint Interleave; + /// Vectorization forced + Hint Force; + + /// Return the loop metadata prefix. + static StringRef Prefix() { return "llvm.loop."; } + +public: + enum ForceKind { + FK_Undefined = -1, ///< Not selected. + FK_Disabled = 0, ///< Forcing disabled. + FK_Enabled = 1, ///< Forcing enabled. + }; + + LoopVectorizeHints(const Loop *L, bool DisableInterleaving) + : Width("vectorize.width", VectorizerParams::VectorizationFactor, + HK_WIDTH), + Interleave("interleave.count", DisableInterleaving, HK_UNROLL), + Force("vectorize.enable", FK_Undefined, HK_FORCE), + TheLoop(L) { + // Populate values with existing loop metadata. + getHintsFromMetadata(); + + // force-vector-interleave overrides DisableInterleaving. + if (VectorizerParams::isInterleaveForced()) + Interleave.Value = VectorizerParams::VectorizationInterleave; + + DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() + << "LV: Interleaving disabled by the pass manager\n"); + } + + /// Mark the loop L as already vectorized by setting the width to 1. + void setAlreadyVectorized() { + Width.Value = Interleave.Value = 1; + Hint Hints[] = {Width, Interleave}; + writeHintsToMetadata(Hints); + } + + bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const { + if (getForce() == LoopVectorizeHints::FK_Disabled) { + DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); + emitOptimizationRemarkAnalysis(F->getContext(), + vectorizeAnalysisPassName(), *F, + L->getStartLoc(), emitRemark()); + return false; + } + + if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) { + DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); + emitOptimizationRemarkAnalysis(F->getContext(), + vectorizeAnalysisPassName(), *F, + L->getStartLoc(), emitRemark()); + return false; + } + + if (getWidth() == 1 && getInterleave() == 1) { + // FIXME: Add a separate metadata to indicate when the loop has already + // been vectorized instead of setting width and count to 1. + DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); + // FIXME: Add interleave.disable metadata. This will allow + // vectorize.disable to be used without disabling the pass and errors + // to differentiate between disabled vectorization and a width of 1. + emitOptimizationRemarkAnalysis( + F->getContext(), vectorizeAnalysisPassName(), *F, L->getStartLoc(), + "loop not vectorized: vectorization and interleaving are explicitly " + "disabled, or vectorize width and interleave count are both set to " + "1"); + return false; + } + + return true; + } + + /// Dumps all the hint information. + std::string emitRemark() const { + VectorizationReport R; + if (Force.Value == LoopVectorizeHints::FK_Disabled) + R << "vectorization is explicitly disabled"; + else { + R << "use -Rpass-analysis=loop-vectorize for more info"; + if (Force.Value == LoopVectorizeHints::FK_Enabled) { + R << " (Force=true"; + if (Width.Value != 0) + R << ", Vector Width=" << Width.Value; + if (Interleave.Value != 0) + R << ", Interleave Count=" << Interleave.Value; + R << ")"; + } + } + + return R.str(); + } + + unsigned getWidth() const { return Width.Value; } + unsigned getInterleave() const { return Interleave.Value; } + enum ForceKind getForce() const { return (ForceKind)Force.Value; } + const char *vectorizeAnalysisPassName() const { + // If hints are provided that don't disable vectorization use the + // AlwaysPrint pass name to force the frontend to print the diagnostic. + if (getWidth() == 1) + return LV_NAME; + if (getForce() == LoopVectorizeHints::FK_Disabled) + return LV_NAME; + if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0) + return LV_NAME; + return DiagnosticInfo::AlwaysPrint; + } + + bool allowReordering() const { + // When enabling loop hints are provided we allow the vectorizer to change + // the order of operations that is given by the scalar loop. This is not + // enabled by default because can be unsafe or inefficient. For example, + // reordering floating-point operations will change the way round-off + // error accumulates in the loop. + return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1; + } + +private: + /// Find hints specified in the loop metadata and update local values. + void getHintsFromMetadata() { + MDNode *LoopID = TheLoop->getLoopID(); + if (!LoopID) + return; + + // First operand should refer to the loop id itself. + assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + const MDString *S = nullptr; + SmallVector Args; + + // The expected hint is either a MDString or a MDNode with the first + // operand a MDString. + if (const MDNode *MD = dyn_cast(LoopID->getOperand(i))) { + if (!MD || MD->getNumOperands() == 0) + continue; + S = dyn_cast(MD->getOperand(0)); + for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) + Args.push_back(MD->getOperand(i)); + } else { + S = dyn_cast(LoopID->getOperand(i)); + assert(Args.size() == 0 && "too many arguments for MDString"); + } + + if (!S) + continue; + + // Check if the hint starts with the loop metadata prefix. + StringRef Name = S->getString(); + if (Args.size() == 1) + setHint(Name, Args[0]); + } + } + + /// Checks string hint with one operand and set value if valid. + void setHint(StringRef Name, Metadata *Arg) { + if (!Name.startswith(Prefix())) + return; + Name = Name.substr(Prefix().size(), StringRef::npos); + + const ConstantInt *C = mdconst::dyn_extract(Arg); + if (!C) return; + unsigned Val = C->getZExtValue(); + + Hint *Hints[] = {&Width, &Interleave, &Force}; + for (auto H : Hints) { + if (Name == H->Name) { + if (H->validate(Val)) + H->Value = Val; + else + DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); + break; + } + } + } + + /// Create a new hint from name / value pair. + MDNode *createHintMetadata(StringRef Name, unsigned V) const { + LLVMContext &Context = TheLoop->getHeader()->getContext(); + Metadata *MDs[] = {MDString::get(Context, Name), + ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Context), V))}; + return MDNode::get(Context, MDs); + } + + /// Matches metadata with hint name. + bool matchesHintMetadataName(MDNode *Node, ArrayRef HintTypes) { + MDString* Name = dyn_cast(Node->getOperand(0)); + if (!Name) + return false; + + for (auto H : HintTypes) + if (Name->getString().endswith(H.Name)) + return true; + return false; + } + + /// Sets current hints into loop metadata, keeping other values intact. + void writeHintsToMetadata(ArrayRef HintTypes) { + if (HintTypes.size() == 0) + return; + + // Reserve the first element to LoopID (see below). + SmallVector MDs(1); + // If the loop already has metadata, then ignore the existing operands. + MDNode *LoopID = TheLoop->getLoopID(); + if (LoopID) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + MDNode *Node = cast(LoopID->getOperand(i)); + // If node in update list, ignore old value. + if (!matchesHintMetadataName(Node, HintTypes)) + MDs.push_back(Node); + } + } + + // Now, add the missing hints. + for (auto H : HintTypes) + MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); + + // Replace current metadata node with new one. + LLVMContext &Context = TheLoop->getHeader()->getContext(); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + + TheLoop->setLoopID(NewLoopID); + } + + /// The loop these hints belong to. + const Loop *TheLoop; +}; + +static void emitAnalysisDiag(const Function *TheFunction, const Loop *TheLoop, + const LoopVectorizeHints &Hints, + const LoopAccessReport &Message) { + const char *Name = Hints.vectorizeAnalysisPassName(); + LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, Name); +} + +static void emitMissedWarning(Function *F, Loop *L, + const LoopVectorizeHints &LH) { + emitOptimizationRemarkMissed(F->getContext(), LV_NAME, *F, L->getStartLoc(), + LH.emitRemark()); + + if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { + if (LH.getWidth() != 1) + emitLoopVectorizeWarning( + F->getContext(), *F, L->getStartLoc(), + "failed explicitly specified loop vectorization"); + else if (LH.getInterleave() != 1) + emitLoopInterleaveWarning( + F->getContext(), *F, L->getStartLoc(), + "failed explicitly specified loop interleaving"); + } +} + /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and /// to what vectorization factor. /// This class does not look at the profitability of vectorization, only the @@ -793,82 +1139,13 @@ public: LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT, TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F, const TargetTransformInfo *TTI, - LoopAccessAnalysis *LAA) + LoopAccessAnalysis *LAA, + LoopVectorizationRequirements *R, + const LoopVectorizeHints *H) : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F), TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(SE, L, DT), - Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false) {} - - /// This enum represents the kinds of inductions that we support. - enum InductionKind { - IK_NoInduction, ///< Not an induction variable. - IK_IntInduction, ///< Integer induction variable. Step = C. - IK_PtrInduction ///< Pointer induction var. Step = C / sizeof(elem). - }; - - /// A struct for saving information about induction variables. - struct InductionInfo { - InductionInfo(Value *Start, InductionKind K, ConstantInt *Step) - : StartValue(Start), IK(K), StepValue(Step) { - assert(IK != IK_NoInduction && "Not an induction"); - assert(StartValue && "StartValue is null"); - assert(StepValue && !StepValue->isZero() && "StepValue is zero"); - assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) && - "StartValue is not a pointer for pointer induction"); - assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) && - "StartValue is not an integer for integer induction"); - assert(StepValue->getType()->isIntegerTy() && - "StepValue is not an integer"); - } - InductionInfo() - : StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {} - - /// Get the consecutive direction. Returns: - /// 0 - unknown or non-consecutive. - /// 1 - consecutive and increasing. - /// -1 - consecutive and decreasing. - int getConsecutiveDirection() const { - if (StepValue && (StepValue->isOne() || StepValue->isMinusOne())) - return StepValue->getSExtValue(); - return 0; - } - - /// Compute the transformed value of Index at offset StartValue using step - /// StepValue. - /// For integer induction, returns StartValue + Index * StepValue. - /// For pointer induction, returns StartValue[Index * StepValue]. - /// FIXME: The newly created binary instructions should contain nsw/nuw - /// flags, which can be found from the original scalar operations. - Value *transform(IRBuilder<> &B, Value *Index) const { - switch (IK) { - case IK_IntInduction: - assert(Index->getType() == StartValue->getType() && - "Index type does not match StartValue type"); - if (StepValue->isMinusOne()) - return B.CreateSub(StartValue, Index); - if (!StepValue->isOne()) - Index = B.CreateMul(Index, StepValue); - return B.CreateAdd(StartValue, Index); - - case IK_PtrInduction: - if (StepValue->isMinusOne()) - Index = B.CreateNeg(Index); - else if (!StepValue->isOne()) - Index = B.CreateMul(Index, StepValue); - return B.CreateGEP(nullptr, StartValue, Index); - - case IK_NoInduction: - return nullptr; - } - llvm_unreachable("invalid enum"); - } - - /// Start value. - TrackingVH StartValue; - /// Induction kind. - InductionKind IK; - /// Step value. - ConstantInt *StepValue; - }; + Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), + Requirements(R), Hints(H) {} /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. @@ -876,7 +1153,7 @@ public: /// InductionList saves induction variables and maps them to the /// induction descriptor. - typedef MapVector InductionList; + typedef MapVector InductionList; /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this @@ -919,8 +1196,8 @@ public: bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); } /// Returns the information that we collected about runtime memory check. - const LoopAccessInfo::RuntimePointerCheck *getRuntimePointerCheck() const { - return LAI->getRuntimePointerCheck(); + const RuntimePointerChecking *getRuntimePointerChecking() const { + return LAI->getRuntimePointerChecking(); } const LoopAccessInfo *getLAI() const { @@ -949,12 +1226,12 @@ public: /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. bool isLegalMaskedStore(Type *DataType, Value *Ptr) { - return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr)); + return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { - return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr)); + return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType); } /// Returns true if vector representation of the instruction \p I /// requires mask. @@ -994,10 +1271,6 @@ private: /// and we know that we can read from them without segfault. bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl &SafePtrs); - /// Returns the induction kind of Phi and record the step. This function may - /// return NoInduction if the PHI is not an induction variable. - InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue); - /// \brief Collect memory access with loop invariant strides. /// /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop @@ -1008,8 +1281,8 @@ private: /// not vectorized. These are handled as LoopAccessReport rather than /// VectorizationReport because the << operator of VectorizationReport returns /// LoopAccessReport. - void emitAnalysis(const LoopAccessReport &Message) { - LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); + void emitAnalysis(const LoopAccessReport &Message) const { + emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); } unsigned NumPredStores; @@ -1060,6 +1333,12 @@ private: /// Can we assume the absence of NaNs. bool HasFunNoNaNAttr; + /// Vectorization requirements that will go through late-evaluation. + LoopVectorizationRequirements *Requirements; + + /// Used to emit an analysis of any legality issues. + const LoopVectorizeHints *Hints; + ValueToValueMap Strides; SmallPtrSet StrideSet; @@ -1080,328 +1359,165 @@ public: LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, - const TargetLibraryInfo *TLI, AssumptionCache *AC, - const Function *F, const LoopVectorizeHints *Hints) - : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), - TheFunction(F), Hints(Hints) { - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - } + const TargetLibraryInfo *TLI, DemandedBits *DB, + AssumptionCache *AC, + const Function *F, const LoopVectorizeHints *Hints, + SmallPtrSetImpl &ValuesToIgnore) + : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), + TheFunction(F), Hints(Hints), ValuesToIgnore(ValuesToIgnore) {} /// Information about vectorization costs struct VectorizationFactor { unsigned Width; // Vector width with best cost unsigned Cost; // Cost of the loop with that width - }; - /// \return The most profitable vectorization factor and the cost of that VF. - /// This method checks every power of two up to VF. If UserVF is not ZERO - /// then this vectorization factor will be selected if vectorization is - /// possible. - VectorizationFactor selectVectorizationFactor(bool OptForSize); - - /// \return The size (in bits) of the widest type in the code that - /// needs to be vectorized. We ignore values that remain scalar such as - /// 64 bit loop indices. - unsigned getWidestType(); - - /// \return The most profitable unroll factor. - /// If UserUF is non-zero then this method finds the best unroll-factor - /// based on register pressure and other parameters. - /// VF and LoopCost are the selected vectorization factor and the cost of the - /// selected VF. - unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost); - - /// \brief A struct that represents some properties of the register usage - /// of a loop. - struct RegisterUsage { - /// Holds the number of loop invariant values that are used in the loop. - unsigned LoopInvariantRegs; - /// Holds the maximum number of concurrent live intervals in the loop. - unsigned MaxLocalUsers; - /// Holds the number of instructions in the loop. - unsigned NumInstructions; - }; - - /// \return information about the register usage of the loop. - RegisterUsage calculateRegisterUsage(); - -private: - /// Returns the expected execution cost. The unit of the cost does - /// not matter because we use the 'cost' units to compare different - /// vector widths. The cost that is returned is *not* normalized by - /// the factor width. - unsigned expectedCost(unsigned VF); - - /// Returns the execution time cost of an instruction for a given vector - /// width. Vector width of one means scalar. - unsigned getInstructionCost(Instruction *I, unsigned VF); - - /// Returns whether the instruction is a load or store and will be a emitted - /// as a vector operation. - bool isConsecutiveLoadOrStore(Instruction *I); - - /// Report an analysis message to assist the user in diagnosing loops that are - /// not vectorized. These are handled as LoopAccessReport rather than - /// VectorizationReport because the << operator of VectorizationReport returns - /// LoopAccessReport. - void emitAnalysis(const LoopAccessReport &Message) { - LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME); - } - - /// Values used only by @llvm.assume calls. - SmallPtrSet EphValues; - - /// The loop that we evaluate. - Loop *TheLoop; - /// Scev analysis. - ScalarEvolution *SE; - /// Loop Info analysis. - LoopInfo *LI; - /// Vectorization legality. - LoopVectorizationLegality *Legal; - /// Vector target information. - const TargetTransformInfo &TTI; - /// Target Library Info. - const TargetLibraryInfo *TLI; - const Function *TheFunction; - // Loop Vectorize Hint. - const LoopVectorizeHints *Hints; -}; - -/// Utility class for getting and setting loop vectorizer hints in the form -/// of loop metadata. -/// This class keeps a number of loop annotations locally (as member variables) -/// and can, upon request, write them back as metadata on the loop. It will -/// initially scan the loop for existing metadata, and will update the local -/// values based on information in the loop. -/// We cannot write all values to metadata, as the mere presence of some info, -/// for example 'force', means a decision has been made. So, we need to be -/// careful NOT to add them if the user hasn't specifically asked so. -class LoopVectorizeHints { - enum HintKind { - HK_WIDTH, - HK_UNROLL, - HK_FORCE - }; - - /// Hint - associates name and validation with the hint value. - struct Hint { - const char * Name; - unsigned Value; // This may have to change for non-numeric values. - HintKind Kind; - - Hint(const char * Name, unsigned Value, HintKind Kind) - : Name(Name), Value(Value), Kind(Kind) { } - - bool validate(unsigned Val) { - switch (Kind) { - case HK_WIDTH: - return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth; - case HK_UNROLL: - return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor; - case HK_FORCE: - return (Val <= 1); - } - return false; - } - }; - - /// Vectorization width. - Hint Width; - /// Vectorization interleave factor. - Hint Interleave; - /// Vectorization forced - Hint Force; - - /// Return the loop metadata prefix. - static StringRef Prefix() { return "llvm.loop."; } - -public: - enum ForceKind { - FK_Undefined = -1, ///< Not selected. - FK_Disabled = 0, ///< Forcing disabled. - FK_Enabled = 1, ///< Forcing enabled. - }; - - LoopVectorizeHints(const Loop *L, bool DisableInterleaving) - : Width("vectorize.width", VectorizerParams::VectorizationFactor, - HK_WIDTH), - Interleave("interleave.count", DisableInterleaving, HK_UNROLL), - Force("vectorize.enable", FK_Undefined, HK_FORCE), - TheLoop(L) { - // Populate values with existing loop metadata. - getHintsFromMetadata(); - - // force-vector-interleave overrides DisableInterleaving. - if (VectorizerParams::isInterleaveForced()) - Interleave.Value = VectorizerParams::VectorizationInterleave; - - DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs() - << "LV: Interleaving disabled by the pass manager\n"); - } - - /// Mark the loop L as already vectorized by setting the width to 1. - void setAlreadyVectorized() { - Width.Value = Interleave.Value = 1; - Hint Hints[] = {Width, Interleave}; - writeHintsToMetadata(Hints); - } - - /// Dumps all the hint information. - std::string emitRemark() const { - VectorizationReport R; - if (Force.Value == LoopVectorizeHints::FK_Disabled) - R << "vectorization is explicitly disabled"; - else { - R << "use -Rpass-analysis=loop-vectorize for more info"; - if (Force.Value == LoopVectorizeHints::FK_Enabled) { - R << " (Force=true"; - if (Width.Value != 0) - R << ", Vector Width=" << Width.Value; - if (Interleave.Value != 0) - R << ", Interleave Count=" << Interleave.Value; - R << ")"; - } - } - - return R.str(); - } - - unsigned getWidth() const { return Width.Value; } - unsigned getInterleave() const { return Interleave.Value; } - enum ForceKind getForce() const { return (ForceKind)Force.Value; } - -private: - /// Find hints specified in the loop metadata and update local values. - void getHintsFromMetadata() { - MDNode *LoopID = TheLoop->getLoopID(); - if (!LoopID) - return; - - // First operand should refer to the loop id itself. - assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); - assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); - - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - const MDString *S = nullptr; - SmallVector Args; - - // The expected hint is either a MDString or a MDNode with the first - // operand a MDString. - if (const MDNode *MD = dyn_cast(LoopID->getOperand(i))) { - if (!MD || MD->getNumOperands() == 0) - continue; - S = dyn_cast(MD->getOperand(0)); - for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) - Args.push_back(MD->getOperand(i)); - } else { - S = dyn_cast(LoopID->getOperand(i)); - assert(Args.size() == 0 && "too many arguments for MDString"); - } - - if (!S) - continue; - - // Check if the hint starts with the loop metadata prefix. - StringRef Name = S->getString(); - if (Args.size() == 1) - setHint(Name, Args[0]); - } - } - - /// Checks string hint with one operand and set value if valid. - void setHint(StringRef Name, Metadata *Arg) { - if (!Name.startswith(Prefix())) - return; - Name = Name.substr(Prefix().size(), StringRef::npos); - - const ConstantInt *C = mdconst::dyn_extract(Arg); - if (!C) return; - unsigned Val = C->getZExtValue(); - - Hint *Hints[] = {&Width, &Interleave, &Force}; - for (auto H : Hints) { - if (Name == H->Name) { - if (H->validate(Val)) - H->Value = Val; - else - DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n"); - break; - } - } - } + }; + /// \return The most profitable vectorization factor and the cost of that VF. + /// This method checks every power of two up to VF. If UserVF is not ZERO + /// then this vectorization factor will be selected if vectorization is + /// possible. + VectorizationFactor selectVectorizationFactor(bool OptForSize); - /// Create a new hint from name / value pair. - MDNode *createHintMetadata(StringRef Name, unsigned V) const { - LLVMContext &Context = TheLoop->getHeader()->getContext(); - Metadata *MDs[] = {MDString::get(Context, Name), - ConstantAsMetadata::get( - ConstantInt::get(Type::getInt32Ty(Context), V))}; - return MDNode::get(Context, MDs); - } + /// \return The size (in bits) of the widest type in the code that + /// needs to be vectorized. We ignore values that remain scalar such as + /// 64 bit loop indices. + unsigned getWidestType(); - /// Matches metadata with hint name. - bool matchesHintMetadataName(MDNode *Node, ArrayRef HintTypes) { - MDString* Name = dyn_cast(Node->getOperand(0)); - if (!Name) - return false; + /// \return The desired interleave count. + /// If interleave count has been specified by metadata it will be returned. + /// Otherwise, the interleave count is computed and returned. VF and LoopCost + /// are the selected vectorization factor and the cost of the selected VF. + unsigned selectInterleaveCount(bool OptForSize, unsigned VF, + unsigned LoopCost); - for (auto H : HintTypes) - if (Name->getString().endswith(H.Name)) - return true; - return false; - } + /// \return The most profitable unroll factor. + /// This method finds the best unroll-factor based on register pressure and + /// other parameters. VF and LoopCost are the selected vectorization factor + /// and the cost of the selected VF. + unsigned computeInterleaveCount(bool OptForSize, unsigned VF, + unsigned LoopCost); - /// Sets current hints into loop metadata, keeping other values intact. - void writeHintsToMetadata(ArrayRef HintTypes) { - if (HintTypes.size() == 0) - return; + /// \brief A struct that represents some properties of the register usage + /// of a loop. + struct RegisterUsage { + /// Holds the number of loop invariant values that are used in the loop. + unsigned LoopInvariantRegs; + /// Holds the maximum number of concurrent live intervals in the loop. + unsigned MaxLocalUsers; + /// Holds the number of instructions in the loop. + unsigned NumInstructions; + }; - // Reserve the first element to LoopID (see below). - SmallVector MDs(1); - // If the loop already has metadata, then ignore the existing operands. - MDNode *LoopID = TheLoop->getLoopID(); - if (LoopID) { - for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { - MDNode *Node = cast(LoopID->getOperand(i)); - // If node in update list, ignore old value. - if (!matchesHintMetadataName(Node, HintTypes)) - MDs.push_back(Node); - } - } + /// \return information about the register usage of the loop. + RegisterUsage calculateRegisterUsage(); - // Now, add the missing hints. - for (auto H : HintTypes) - MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); +private: + /// Returns the expected execution cost. The unit of the cost does + /// not matter because we use the 'cost' units to compare different + /// vector widths. The cost that is returned is *not* normalized by + /// the factor width. + unsigned expectedCost(unsigned VF); - // Replace current metadata node with new one. - LLVMContext &Context = TheLoop->getHeader()->getContext(); - MDNode *NewLoopID = MDNode::get(Context, MDs); - // Set operand 0 to refer to the loop id itself. - NewLoopID->replaceOperandWith(0, NewLoopID); + /// Returns the execution time cost of an instruction for a given vector + /// width. Vector width of one means scalar. + unsigned getInstructionCost(Instruction *I, unsigned VF); - TheLoop->setLoopID(NewLoopID); + /// Returns whether the instruction is a load or store and will be a emitted + /// as a vector operation. + bool isConsecutiveLoadOrStore(Instruction *I); + + /// Report an analysis message to assist the user in diagnosing loops that are + /// not vectorized. These are handled as LoopAccessReport rather than + /// VectorizationReport because the << operator of VectorizationReport returns + /// LoopAccessReport. + void emitAnalysis(const LoopAccessReport &Message) const { + emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message); } - /// The loop these hints belong to. - const Loop *TheLoop; +public: + /// Map of scalar integer values to the smallest bitwidth they can be legally + /// represented as. The vector equivalents of these values should be truncated + /// to this type. + DenseMap MinBWs; + + /// The loop that we evaluate. + Loop *TheLoop; + /// Scev analysis. + ScalarEvolution *SE; + /// Loop Info analysis. + LoopInfo *LI; + /// Vectorization legality. + LoopVectorizationLegality *Legal; + /// Vector target information. + const TargetTransformInfo &TTI; + /// Target Library Info. + const TargetLibraryInfo *TLI; + /// Demanded bits analysis + DemandedBits *DB; + const Function *TheFunction; + // Loop Vectorize Hint. + const LoopVectorizeHints *Hints; + // Values to ignore in the cost model. + const SmallPtrSetImpl &ValuesToIgnore; }; -static void emitMissedWarning(Function *F, Loop *L, - const LoopVectorizeHints &LH) { - emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F, - L->getStartLoc(), LH.emitRemark()); +/// \brief This holds vectorization requirements that must be verified late in +/// the process. The requirements are set by legalize and costmodel. Once +/// vectorization has been determined to be possible and profitable the +/// requirements can be verified by looking for metadata or compiler options. +/// For example, some loops require FP commutativity which is only allowed if +/// vectorization is explicitly specified or if the fast-math compiler option +/// has been provided. +/// Late evaluation of these requirements allows helpful diagnostics to be +/// composed that tells the user what need to be done to vectorize the loop. For +/// example, by specifying #pragma clang loop vectorize or -ffast-math. Late +/// evaluation should be used only when diagnostics can generated that can be +/// followed by a non-expert user. +class LoopVectorizationRequirements { +public: + LoopVectorizationRequirements() + : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr) {} + + void addUnsafeAlgebraInst(Instruction *I) { + // First unsafe algebra instruction. + if (!UnsafeAlgebraInst) + UnsafeAlgebraInst = I; + } + + void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; } + + bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) { + const char *Name = Hints.vectorizeAnalysisPassName(); + bool Failed = false; + if (UnsafeAlgebraInst && !Hints.allowReordering()) { + emitOptimizationRemarkAnalysisFPCommute( + F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(), + VectorizationReport() << "cannot prove it is safe to reorder " + "floating-point operations"); + Failed = true; + } - if (LH.getForce() == LoopVectorizeHints::FK_Enabled) { - if (LH.getWidth() != 1) - emitLoopVectorizeWarning( - F->getContext(), *F, L->getStartLoc(), - "failed explicitly specified loop vectorization"); - else if (LH.getInterleave() != 1) - emitLoopInterleaveWarning( - F->getContext(), *F, L->getStartLoc(), - "failed explicitly specified loop interleaving"); + // Test if runtime memcheck thresholds are exceeded. + bool PragmaThresholdReached = + NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold; + bool ThresholdReached = + NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold; + if ((ThresholdReached && !Hints.allowReordering()) || + PragmaThresholdReached) { + emitOptimizationRemarkAnalysisAliasing( + F->getContext(), Name, *F, L->getStartLoc(), + VectorizationReport() + << "cannot prove it is safe to reorder memory operations"); + DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); + Failed = true; + } + + return Failed; } -} + +private: + unsigned NumRuntimePointerChecks; + Instruction *UnsafeAlgebraInst; +}; static void addInnerLoop(Loop &L, SmallVectorImpl &V) { if (L.empty()) @@ -1429,6 +1545,7 @@ struct LoopVectorize : public FunctionPass { DominatorTree *DT; BlockFrequencyInfo *BFI; TargetLibraryInfo *TLI; + DemandedBits *DB; AliasAnalysis *AA; AssumptionCache *AC; LoopAccessAnalysis *LAA; @@ -1438,25 +1555,31 @@ struct LoopVectorize : public FunctionPass { BlockFrequency ColdEntryFreq; bool runOnFunction(Function &F) override { - SE = &getAnalysis(); + SE = &getAnalysis().getSE(); LI = &getAnalysis().getLoopInfo(); TTI = &getAnalysis().getTTI(F); DT = &getAnalysis().getDomTree(); - BFI = &getAnalysis(); + BFI = &getAnalysis().getBFI(); auto *TLIP = getAnalysisIfAvailable(); TLI = TLIP ? &TLIP->getTLI() : nullptr; - AA = &getAnalysis(); + AA = &getAnalysis().getAAResults(); AC = &getAnalysis().getAssumptionCache(F); LAA = &getAnalysis(); + DB = &getAnalysis(); // Compute some weights outside of the loop over the loops. Compute this // using a BranchProbability to re-use its scaling math. const BranchProbability ColdProb(1, 5); // 20% ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb; - // If the target claims to have no vector registers don't attempt - // vectorization. - if (!TTI->getNumberOfRegisters(true)) + // Don't attempt if + // 1. the target claims to have no vector registers, and + // 2. interleaving won't help ILP. + // + // The second condition is necessary because, even if the target has no + // vector registers, loop vectorization may still enable scalar + // interleaving. + if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) return false; // Build up a worklist of inner-loops to vectorize. This is necessary as @@ -1545,26 +1668,8 @@ struct LoopVectorize : public FunctionPass { // less verbose reporting vectorized loops and unvectorized loops that may // benefit from vectorization, respectively. - if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) { - DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); - emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, - L->getStartLoc(), Hints.emitRemark()); - return false; - } - - if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) { - DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); - emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F, - L->getStartLoc(), Hints.emitRemark()); - return false; - } - - if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) { - DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n"); - emitOptimizationRemarkAnalysis( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - "loop not vectorized: vector width and interleave count are " - "explicitly set to 1"); + if (!Hints.allowVectorization(F, L, AlwaysVectorize)) { + DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); return false; } @@ -1578,32 +1683,45 @@ struct LoopVectorize : public FunctionPass { DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { DEBUG(dbgs() << "\n"); - emitOptimizationRemarkAnalysis( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - "vectorization is not beneficial and is not explicitly forced"); + emitAnalysisDiag(F, L, Hints, VectorizationReport() + << "vectorization is not beneficial " + "and is not explicitly forced"); return false; } } // Check if it is legal to vectorize the loop. - LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA); + LoopVectorizationRequirements Requirements; + LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA, + &Requirements, &Hints); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); emitMissedWarning(F, L, Hints); return false; } + // Collect values we want to ignore in the cost model. This includes + // type-promoting instructions we identified during reduction detection. + SmallPtrSet ValuesToIgnore; + CodeMetrics::collectEphemeralValues(L, AC, ValuesToIgnore); + for (auto &Reduction : *LVL.getReductionVars()) { + RecurrenceDescriptor &RedDes = Reduction.second; + SmallPtrSetImpl &Casts = RedDes.getCastInsts(); + ValuesToIgnore.insert(Casts.begin(), Casts.end()); + } + // Use the cost model. - LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints); + LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, DB, AC, F, &Hints, + ValuesToIgnore); // Check the function attributes to find out if this function should be // optimized for size. bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled && - F->hasFnAttribute(Attribute::OptimizeForSize); + F->optForSize(); // Compute the weighted frequency of this loop being executed and see if it // is less than 20% of the function entry baseline frequency. Note that we - // always have a canonical loop here because we think we *can* vectoriez. + // always have a canonical loop here because we think we *can* vectorize. // FIXME: This is hidden behind a flag due to pervasive problems with // exactly what block frequency models. if (LoopVectorizeWithBlockFrequency) { @@ -1613,16 +1731,17 @@ struct LoopVectorize : public FunctionPass { OptForSize = true; } - // Check the function attributes to see if implicit floats are allowed.a + // Check the function attributes to see if implicit floats are allowed. // FIXME: This check doesn't seem possibly correct -- what if the loop is // an integer loop and the vector instructions selected are purely integer // vector instructions? if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" "attribute is used.\n"); - emitOptimizationRemarkAnalysis( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - "loop not vectorized due to NoImplicitFloat attribute"); + emitAnalysisDiag( + F, L, Hints, + VectorizationReport() + << "loop not vectorized due to NoImplicitFloat attribute"); emitMissedWarning(F, L, Hints); return false; } @@ -1631,39 +1750,89 @@ struct LoopVectorize : public FunctionPass { const LoopVectorizationCostModel::VectorizationFactor VF = CM.selectVectorizationFactor(OptForSize); - // Select the unroll factor. - const unsigned UF = - CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost); + // Select the interleave count. + unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); + + // Get user interleave count. + unsigned UserIC = Hints.getInterleave(); - DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " - << DebugLocStr << '\n'); - DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n'); + // Identify the diagnostic messages that should be produced. + std::string VecDiagMsg, IntDiagMsg; + bool VectorizeLoop = true, InterleaveLoop = true; + + if (Requirements.doesNotMeet(F, L, Hints)) { + DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " + "requirements.\n"); + emitMissedWarning(F, L, Hints); + return false; + } if (VF.Width == 1) { - DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n"); + DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); + VecDiagMsg = + "the cost-model indicates that vectorization is not beneficial"; + VectorizeLoop = false; + } - if (UF == 1) { - emitOptimizationRemarkAnalysis( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - "not beneficial to vectorize and user disabled interleaving"); - return false; - } - DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n"); + if (IC == 1 && UserIC <= 1) { + // Tell the user interleaving is not beneficial. + DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n"); + IntDiagMsg = + "the cost-model indicates that interleaving is not beneficial"; + InterleaveLoop = false; + if (UserIC == 1) + IntDiagMsg += + " and is explicitly disabled or interleave count is set to 1"; + } else if (IC > 1 && UserIC == 1) { + // Tell the user interleaving is beneficial, but it explicitly disabled. + DEBUG(dbgs() + << "LV: Interleaving is beneficial but is explicitly disabled."); + IntDiagMsg = "the cost-model indicates that interleaving is beneficial " + "but is explicitly disabled or interleave count is set to 1"; + InterleaveLoop = false; + } - // Report the unrolling decision. - emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - Twine("unrolled with interleaving factor " + - Twine(UF) + - " (vectorization not beneficial)")); + // Override IC if user provided an interleave count. + IC = UserIC > 0 ? UserIC : IC; + + // Emit diagnostic messages, if any. + const char *VAPassName = Hints.vectorizeAnalysisPassName(); + if (!VectorizeLoop && !InterleaveLoop) { + // Do not vectorize or interleaving the loop. + emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F, + L->getStartLoc(), VecDiagMsg); + emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F, + L->getStartLoc(), IntDiagMsg); + return false; + } else if (!VectorizeLoop && InterleaveLoop) { + DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); + emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F, + L->getStartLoc(), VecDiagMsg); + } else if (VectorizeLoop && !InterleaveLoop) { + DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " + << DebugLocStr << '\n'); + emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F, + L->getStartLoc(), IntDiagMsg); + } else if (VectorizeLoop && InterleaveLoop) { + DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in " + << DebugLocStr << '\n'); + DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); + } - // We decided not to vectorize, but we may want to unroll. + if (!VectorizeLoop) { + assert(IC > 1 && "interleave count should not be 1 or 0"); + // If we decided that it is not legal to vectorize the loop then + // interleave it. + InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC); + Unroller.vectorize(&LVL, CM.MinBWs); - InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, UF); - Unroller.vectorize(&LVL); + emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(), + Twine("interleaved loop (interleaved count: ") + + Twine(IC) + ")"); } else { // If we decided that it is *legal* to vectorize the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, UF); - LB.vectorize(&LVL); + InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC); + LB.vectorize(&LVL, CM.MinBWs); ++LoopsVectorized; // Add metadata to disable runtime unrolling scalar loop when there's no @@ -1673,10 +1842,10 @@ struct LoopVectorize : public FunctionPass { AddRuntimeUnrollDisableMetaData(L); // Report the vectorization decision. - emitOptimizationRemark( - F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(), - Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) + - ", unrolling interleave factor: " + Twine(UF) + ")"); + emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(), + Twine("vectorized loop (vectorization width: ") + + Twine(VF.Width) + ", interleaved count: " + + Twine(IC) + ")"); } // Mark the loop as already vectorized to avoid vectorizing again. @@ -1690,16 +1859,19 @@ struct LoopVectorize : public FunctionPass { AU.addRequired(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addPreserved(); AU.addPreserved(); - AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); } }; @@ -1758,31 +1930,6 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, return Builder.CreateAdd(Val, Step, "induction"); } -/// \brief Find the operand of the GEP that should be checked for consecutive -/// stores. This ignores trailing indices that have no effect on the final -/// pointer. -static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) { - const DataLayout &DL = Gep->getModule()->getDataLayout(); - unsigned LastOperand = Gep->getNumOperands() - 1; - unsigned GEPAllocSize = DL.getTypeAllocSize( - cast(Gep->getType()->getScalarType())->getElementType()); - - // Walk backwards and try to peel off zeros. - while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) { - // Find the type we're currently indexing into. - gep_type_iterator GEPTI = gep_type_begin(Gep); - std::advance(GEPTI, LastOperand - 1); - - // If it's a type with the same allocation size as the result of the GEP we - // can peel off the zero index. - if (DL.getTypeAllocSize(*GEPTI) != GEPAllocSize) - break; - --LastOperand; - } - - return LastOperand; -} - int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); // Make sure that the pointer does not point to structs. @@ -1792,7 +1939,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // If this value is a pointer induction variable we know it is consecutive. PHINode *Phi = dyn_cast_or_null(Ptr); if (Phi && Inductions.count(Phi)) { - InductionInfo II = Inductions[Phi]; + InductionDescriptor II = Inductions[Phi]; return II.getConsecutiveDirection(); } @@ -1817,7 +1964,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) return 0; - InductionInfo II = Inductions[Phi]; + InductionDescriptor II = Inductions[Phi]; return II.getConsecutiveDirection(); } @@ -1887,6 +2034,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) { // If this scalar is unknown, assume that it is a constant or that it is // loop invariant. Broadcast V and save the value for future uses. Value *B = getBroadcastInstrs(V); + return WidenMap.splat(V, B); } @@ -2249,14 +2397,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // We don't want to update the value in the map as it might be used in // another expression. So don't use a reference type for "StoredVal". VectorParts StoredVal = getVectorValue(SI->getValueOperand()); - + for (unsigned Part = 0; Part < UF; ++Part) { // Calculate the pointer for the specific unroll-part. Value *PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF)); if (Reverse) { - // If we store to reverse consecutive memory locations then we need + // If we store to reverse consecutive memory locations, then we need // to reverse the order of elements in the stored value. StoredVal[Part] = reverseVector(StoredVal[Part]); // If the address is consecutive but reversed, then the @@ -2330,7 +2478,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic // Try using previously calculated values. Instruction *SrcInst = dyn_cast(SrcOp); - // If the src is an instruction that appeared earlier in the basic block + // If the src is an instruction that appeared earlier in the basic block, // then it should already be vectorized. if (SrcInst && OrigLoop->contains(SrcInst)) { assert(WidenMap.has(SrcInst) && "Source operand is unavailable"); @@ -2355,19 +2503,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); - Instruction *InsertPt = Builder.GetInsertPoint(); - BasicBlock *IfBlock = Builder.GetInsertBlock(); - BasicBlock *CondBlock = nullptr; - VectorParts Cond; - Loop *VectorLp = nullptr; if (IfPredicateStore) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), Instr->getParent()); - VectorLp = LI->getLoopFor(IfBlock); - assert(VectorLp && "Must have a loop for this block"); } // For each vector unroll 'part': @@ -2380,11 +2521,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic if (IfPredicateStore) { Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); - CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); - LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, *LI); - // Update Builder with newly created basic block. - Builder.SetInsertPoint(InsertPt); } Instruction *Cloned = Instr->clone(); @@ -2399,95 +2535,276 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic Cloned->setOperand(op, Op); } - // Place the cloned scalar in the new loop. - Builder.Insert(Cloned); + // Place the cloned scalar in the new loop. + Builder.Insert(Cloned); + + // If the original scalar returns a value we need to place it in a vector + // so that future users will be able to use it. + if (!IsVoidRetTy) + VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, + Builder.getInt32(Width)); + // End if-block. + if (IfPredicateStore) + PredicatedStores.push_back(std::make_pair(cast(Cloned), + Cmp)); + } + } +} + +static Instruction *getFirstInst(Instruction *FirstInst, Value *V, + Instruction *Loc) { + if (FirstInst) + return FirstInst; + if (Instruction *I = dyn_cast(V)) + return I->getParent() == Loc->getParent() ? I : nullptr; + return nullptr; +} + +std::pair +InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { + Instruction *tnullptr = nullptr; + if (!Legal->mustCheckStrides()) + return std::pair(tnullptr, tnullptr); + + IRBuilder<> ChkBuilder(Loc); + + // Emit checks. + Value *Check = nullptr; + Instruction *FirstInst = nullptr; + for (SmallPtrSet::iterator SI = Legal->strides_begin(), + SE = Legal->strides_end(); + SI != SE; ++SI) { + Value *Ptr = stripIntegerCast(*SI); + Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1), + "stride.chk"); + // Store the first instruction we create. + FirstInst = getFirstInst(FirstInst, C, Loc); + if (Check) + Check = ChkBuilder.CreateOr(Check, C); + else + Check = C; + } + + // We have to do this trickery because the IRBuilder might fold the check to a + // constant expression in which case there is no Instruction anchored in a + // the block. + LLVMContext &Ctx = Loc->getContext(); + Instruction *TheCheck = + BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx)); + ChkBuilder.Insert(TheCheck, "stride.not.one"); + FirstInst = getFirstInst(FirstInst, TheCheck, Loc); + + return std::make_pair(FirstInst, TheCheck); +} + +PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, + Value *Start, + Value *End, + Value *Step, + Instruction *DL) { + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + // As we're just creating this loop, it's possible no latch exists + // yet. If so, use the header as this will be a single block loop. + if (!Latch) + Latch = Header; + + IRBuilder<> Builder(&*Header->getFirstInsertionPt()); + setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); + auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index"); + + Builder.SetInsertPoint(Latch->getTerminator()); + + // Create i+1 and fill the PHINode. + Value *Next = Builder.CreateAdd(Induction, Step, "index.next"); + Induction->addIncoming(Start, L->getLoopPreheader()); + Induction->addIncoming(Next, Latch); + // Create the compare. + Value *ICmp = Builder.CreateICmpEQ(Next, End); + Builder.CreateCondBr(ICmp, L->getExitBlock(), Header); + + // Now we have two terminators. Remove the old one from the block. + Latch->getTerminator()->eraseFromParent(); + + return Induction; +} + +Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { + if (TripCount) + return TripCount; + + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + // Find the loop boundaries. + const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(OrigLoop); + assert(BackedgeTakenCount != SE->getCouldNotCompute() && "Invalid loop count"); + + Type *IdxTy = Legal->getWidestInductionType(); + + // The exit count might have the type of i64 while the phi is i32. This can + // happen if we have an induction variable that is sign extended before the + // compare. The only way that we get a backedge taken count is that the + // induction variable was signed and as such will not overflow. In such a case + // truncation is legal. + if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() > + IdxTy->getPrimitiveSizeInBits()) + BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy); + BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy); + + // Get the total trip count from the count by adding 1. + const SCEV *ExitCount = SE->getAddExpr( + BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); + + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + + // Expand the trip count and place the new instructions in the preheader. + // Notice that the pre-header does not change, only the loop body. + SCEVExpander Exp(*SE, DL, "induction"); + + // Count holds the overall loop count (N). + TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), + L->getLoopPreheader()->getTerminator()); + + if (TripCount->getType()->isPointerTy()) + TripCount = + CastInst::CreatePointerCast(TripCount, IdxTy, + "exitcount.ptrcnt.to.int", + L->getLoopPreheader()->getTerminator()); + + return TripCount; +} + +Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { + if (VectorTripCount) + return VectorTripCount; + + Value *TC = getOrCreateTripCount(L); + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + + // Now we need to generate the expression for N - (N % VF), which is + // the part that the vectorized body will execute. + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Constant *Step = ConstantInt::get(TC->getType(), VF * UF); + Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); + VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); + + return VectorTripCount; +} - // If the original scalar returns a value we need to place it in a vector - // so that future users will be able to use it. - if (!IsVoidRetTy) - VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned, - Builder.getInt32(Width)); - // End if-block. - if (IfPredicateStore) { - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - IfBlock = NewIfBlock; - } - } - } +void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, + BasicBlock *Bypass) { + Value *Count = getOrCreateTripCount(L); + BasicBlock *BB = L->getLoopPreheader(); + IRBuilder<> Builder(BB->getTerminator()); + + // Generate code to check that the loop's trip count that we computed by + // adding one to the backedge-taken count will not overflow. + Value *CheckMinIters = + Builder.CreateICmpULT(Count, + ConstantInt::get(Count->getType(), VF * UF), + "min.iters.check"); + + BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), + "min.iters.checked"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, CheckMinIters)); + LoopBypassBlocks.push_back(BB); } -static Instruction *getFirstInst(Instruction *FirstInst, Value *V, - Instruction *Loc) { - if (FirstInst) - return FirstInst; - if (Instruction *I = dyn_cast(V)) - return I->getParent() == Loc->getParent() ? I : nullptr; - return nullptr; +void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L, + BasicBlock *Bypass) { + Value *TC = getOrCreateVectorTripCount(L); + BasicBlock *BB = L->getLoopPreheader(); + IRBuilder<> Builder(BB->getTerminator()); + + // Now, compare the new count to zero. If it is zero skip the vector loop and + // jump to the scalar loop. + Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()), + "cmp.zero"); + + // Generate code to check that the loop's trip count that we computed by + // adding one to the backedge-taken count will not overflow. + BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), + "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, Cmp)); + LoopBypassBlocks.push_back(BB); } -std::pair -InnerLoopVectorizer::addStrideCheck(Instruction *Loc) { - Instruction *tnullptr = nullptr; - if (!Legal->mustCheckStrides()) - return std::pair(tnullptr, tnullptr); +void InnerLoopVectorizer::emitStrideChecks(Loop *L, + BasicBlock *Bypass) { + BasicBlock *BB = L->getLoopPreheader(); + + // Generate the code to check that the strides we assumed to be one are really + // one. We want the new basic block to start at the first instruction in a + // sequence of instructions that form a check. + Instruction *StrideCheck; + Instruction *FirstCheckInst; + std::tie(FirstCheckInst, StrideCheck) = addStrideCheck(BB->getTerminator()); + if (!StrideCheck) + return; - IRBuilder<> ChkBuilder(Loc); + // Create a new block containing the stride check. + BB->setName("vector.stridecheck"); + auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, StrideCheck)); + LoopBypassBlocks.push_back(BB); + AddedSafetyChecks = true; +} - // Emit checks. - Value *Check = nullptr; - Instruction *FirstInst = nullptr; - for (SmallPtrSet::iterator SI = Legal->strides_begin(), - SE = Legal->strides_end(); - SI != SE; ++SI) { - Value *Ptr = stripIntegerCast(*SI); - Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1), - "stride.chk"); - // Store the first instruction we create. - FirstInst = getFirstInst(FirstInst, C, Loc); - if (Check) - Check = ChkBuilder.CreateOr(Check, C); - else - Check = C; - } +void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, + BasicBlock *Bypass) { + BasicBlock *BB = L->getLoopPreheader(); - // We have to do this trickery because the IRBuilder might fold the check to a - // constant expression in which case there is no Instruction anchored in a - // the block. - LLVMContext &Ctx = Loc->getContext(); - Instruction *TheCheck = - BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx)); - ChkBuilder.Insert(TheCheck, "stride.not.one"); - FirstInst = getFirstInst(FirstInst, TheCheck, Loc); + // Generate the code that checks in runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + Instruction *FirstCheckInst; + Instruction *MemRuntimeCheck; + std::tie(FirstCheckInst, MemRuntimeCheck) = + Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); + if (!MemRuntimeCheck) + return; - return std::make_pair(FirstInst, TheCheck); + // Create a new block containing the memory check. + BB->setName("vector.memcheck"); + auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); + LoopBypassBlocks.push_back(BB); + AddedSafetyChecks = true; } + void InnerLoopVectorizer::createEmptyLoop() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- Back-edge taken count overflow check. + [ ] <-- loop iteration number check. / | / v | [ ] <-- vector loop bypass (may consist of multiple blocks). | / | | / v || [ ] <-- vector pre header. - || | - || v - || [ ] \ - || [ ]_| <-- vector loop. - || | - | \ v - | >[ ] <--- middle-block. + |/ | + | v + | [ ] \ + | [ ]_| <-- vector loop. + | | + | v + | -[ ] <--- middle-block. | / | | / v -|- >[ ] <--- new preheader. @@ -2502,86 +2819,28 @@ void InnerLoopVectorizer::createEmptyLoop() { */ BasicBlock *OldBasicBlock = OrigLoop->getHeader(); - BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); + BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); BasicBlock *ExitBlock = OrigLoop->getExitBlock(); - assert(BypassBlock && "Invalid loop structure"); + assert(VectorPH && "Invalid loop structure"); assert(ExitBlock && "Must have an exit block"); // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer // induction variables. In the code below we also support a case where we // don't have a single induction variable. + // + // We try to obtain an induction variable from the original loop as hard + // as possible. However if we don't find one that: + // - is an integer + // - counts from zero, stepping by one + // - is the size of the widest induction variable type + // then we create a new one. OldInduction = Legal->getInduction(); Type *IdxTy = Legal->getWidestInductionType(); - // Find the loop boundaries. - const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); - assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); - - // The exit count might have the type of i64 while the phi is i32. This can - // happen if we have an induction variable that is sign extended before the - // compare. The only way that we get a backedge taken count is that the - // induction variable was signed and as such will not overflow. In such a case - // truncation is legal. - if (ExitCount->getType()->getPrimitiveSizeInBits() > - IdxTy->getPrimitiveSizeInBits()) - ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); - - const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); - // Get the total trip count from the count by adding 1. - ExitCount = SE->getAddExpr(BackedgeTakeCount, - SE->getConstant(BackedgeTakeCount->getType(), 1)); - - const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout(); - - // Expand the trip count and place the new instructions in the preheader. - // Notice that the pre-header does not change, only the loop body. - SCEVExpander Exp(*SE, DL, "induction"); - - // We need to test whether the backedge-taken count is uint##_max. Adding one - // to it will cause overflow and an incorrect loop trip count in the vector - // body. In case of overflow we want to directly jump to the scalar remainder - // loop. - Value *BackedgeCount = - Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(), - BypassBlock->getTerminator()); - if (BackedgeCount->getType()->isPointerTy()) - BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy, - "backedge.ptrcnt.to.int", - BypassBlock->getTerminator()); - Instruction *CheckBCOverflow = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount, - Constant::getAllOnesValue(BackedgeCount->getType()), - "backedge.overflow", BypassBlock->getTerminator()); - - // The loop index does not have to start at Zero. Find the original start - // value from the induction PHI node. If we don't have an induction variable - // then we know that it starts at zero. - Builder.SetInsertPoint(BypassBlock->getTerminator()); - Value *StartIdx = ExtendedIdx = OldInduction ? - Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock), - IdxTy): - ConstantInt::get(IdxTy, 0); - - // We need an instruction to anchor the overflow check on. StartIdx needs to - // be defined before the overflow check branch. Because the scalar preheader - // is going to merge the start index and so the overflow branch block needs to - // contain a definition of the start index. - Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd( - StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor", - BypassBlock->getTerminator()); - - // Count holds the overall loop count (N). - Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - BypassBlock->getTerminator()); - - LoopBypassBlocks.push_back(BypassBlock); - // Split the single block loop into the two loop structure described above. - BasicBlock *VectorPH = - BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); BasicBlock *VecBody = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); + VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); BasicBlock *ScalarPH = @@ -2596,128 +2855,42 @@ void InnerLoopVectorizer::createEmptyLoop() { if (ParentLoop) { ParentLoop->addChildLoop(Lp); ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); - ParentLoop->addBasicBlockToLoop(VectorPH, *LI); ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); } else { LI->addTopLevelLoop(Lp); } Lp->addBasicBlockToLoop(VecBody, *LI); - // Use this IR builder to create the loop instructions (Phi, Br, Cmp) - // inside the loop. - Builder.SetInsertPoint(VecBody->getFirstNonPHI()); - - // Generate the induction variable. - setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction)); - Induction = Builder.CreatePHI(IdxTy, 2, "index"); - // The loop step is equal to the vectorization factor (num of SIMD elements) - // times the unroll factor (num of SIMD instructions). - Constant *Step = ConstantInt::get(IdxTy, VF * UF); - - // This is the IR builder that we use to add all of the logic for bypassing - // the new vector loop. - IRBuilder<> BypassBuilder(BypassBlock->getTerminator()); - setDebugLocFromInst(BypassBuilder, - getDebugLocFromInstOrOperands(OldInduction)); - - // We may need to extend the index in case there is a type mismatch. - // We know that the count starts at zero and does not overflow. - if (Count->getType() != IdxTy) { - // The exit count can be of pointer type. Convert it to the correct - // integer type. - if (ExitCount->getType()->isPointerTy()) - Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int"); - else - Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast"); - } - - // Add the start index to the loop count to get the new end index. - Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); + // Find the loop boundaries. + Value *Count = getOrCreateTripCount(Lp); - // Now we need to generate the expression for N - (N % VF), which is - // the part that the vectorized body will execute. - Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); - Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); - Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, - "end.idx.rnd.down"); + Value *StartIdx = ConstantInt::get(IdxTy, 0); + // We need to test whether the backedge-taken count is uint##_max. Adding one + // to it will cause overflow and an incorrect loop trip count in the vector + // body. In case of overflow we want to directly jump to the scalar remainder + // loop. + emitMinimumIterationCountCheck(Lp, ScalarPH); // Now, compare the new count to zero. If it is zero skip the vector loop and // jump to the scalar loop. - Value *Cmp = - BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero"); - - BasicBlock *LastBypassBlock = BypassBlock; - - // Generate code to check that the loops trip count that we computed by adding - // one to the backedge-taken count will not overflow. - { - auto PastOverflowCheck = - std::next(BasicBlock::iterator(OverflowCheckAnchor)); - BasicBlock *CheckBlock = - LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); - LoopBypassBlocks.push_back(CheckBlock); - Instruction *OldTerm = LastBypassBlock->getTerminator(); - BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm); - OldTerm->eraseFromParent(); - LastBypassBlock = CheckBlock; - } - + emitVectorLoopEnteredCheck(Lp, ScalarPH); // Generate the code to check that the strides we assumed to be one are really // one. We want the new basic block to start at the first instruction in a // sequence of instructions that form a check. - Instruction *StrideCheck; - Instruction *FirstCheckInst; - std::tie(FirstCheckInst, StrideCheck) = - addStrideCheck(LastBypassBlock->getTerminator()); - if (StrideCheck) { - AddedSafetyChecks = true; - // Create a new block containing the stride check. - BasicBlock *CheckBlock = - LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); - LoopBypassBlocks.push_back(CheckBlock); - - // Replace the branch into the memory check block with a conditional branch - // for the "few elements case". - Instruction *OldTerm = LastBypassBlock->getTerminator(); - BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); - OldTerm->eraseFromParent(); - - Cmp = StrideCheck; - LastBypassBlock = CheckBlock; - } - + emitStrideChecks(Lp, ScalarPH); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - Instruction *MemRuntimeCheck; - std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeCheck(LastBypassBlock->getTerminator()); - if (MemRuntimeCheck) { - AddedSafetyChecks = true; - // Create a new block containing the memory check. - BasicBlock *CheckBlock = - LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.memcheck"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(CheckBlock, *LI); - LoopBypassBlocks.push_back(CheckBlock); - - // Replace the branch into the memory check block with a conditional branch - // for the "few elements case". - Instruction *OldTerm = LastBypassBlock->getTerminator(); - BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); - OldTerm->eraseFromParent(); - - Cmp = MemRuntimeCheck; - LastBypassBlock = CheckBlock; - } - - LastBypassBlock->getTerminator()->eraseFromParent(); - BranchInst::Create(MiddleBlock, VectorPH, Cmp, - LastBypassBlock); + emitMemRuntimeChecks(Lp, ScalarPH); + + // Generate the induction variable. + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + Constant *Step = ConstantInt::get(IdxTy, VF * UF); + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the @@ -2727,151 +2900,60 @@ void InnerLoopVectorizer::createEmptyLoop() { // If we come from a bypass edge then we need to start from the original // start value. - // This variable saves the new starting index for the scalar loop. - PHINode *ResumeIndex = nullptr; + // This variable saves the new starting index for the scalar loop. It is used + // to test if there are any tail iterations left once the vector loop has + // completed. LoopVectorizationLegality::InductionList::iterator I, E; LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); - // Set builder to point to last bypass block. - BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); for (I = List->begin(), E = List->end(); I != E; ++I) { PHINode *OrigPhi = I->first; - LoopVectorizationLegality::InductionInfo II = I->second; - - Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType(); - PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val", - MiddleBlock->getTerminator()); - // We might have extended the type of the induction variable but we need a - // truncated version for the scalar loop. - PHINode *TruncResumeVal = (OrigPhi == OldInduction) ? - PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val", - MiddleBlock->getTerminator()) : nullptr; + InductionDescriptor II = I->second; // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val", + PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, + "bc.resume.val", ScalarPH->getTerminator()); - BCResumeVal->addIncoming(ResumeVal, MiddleBlock); - - PHINode *BCTruncResumeVal = nullptr; + Value *EndValue; if (OrigPhi == OldInduction) { - BCTruncResumeVal = - PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val", - ScalarPH->getTerminator()); - BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock); - } - - Value *EndValue = nullptr; - switch (II.IK) { - case LoopVectorizationLegality::IK_NoInduction: - llvm_unreachable("Unknown induction"); - case LoopVectorizationLegality::IK_IntInduction: { - // Handle the integer induction counter. - assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); - - // We have the canonical induction variable. - if (OrigPhi == OldInduction) { - // Create a truncated version of the resume value for the scalar loop, - // we might have promoted the type to a larger width. - EndValue = - BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType()); - // The new PHI merges the original incoming value, in case of a bypass, - // or the value at the end of the vectorized loop. - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); - TruncResumeVal->addIncoming(EndValue, VecBody); - - BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); - - // We know what the end value is. - EndValue = IdxEndRoundDown; - // We also know which PHI node holds it. - ResumeIndex = ResumeVal; - break; - } - - // Not the canonical induction variable - add the vector loop count to the - // start value. - Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown, - II.StartValue->getType(), - "cast.crd"); - EndValue = II.transform(BypassBuilder, CRD); + // We know what the end value is. + EndValue = CountRoundDown; + } else { + IRBuilder<> B(LoopBypassBlocks.back()->getTerminator()); + Value *CRD = B.CreateSExtOrTrunc(CountRoundDown, + II.getStepValue()->getType(), + "cast.crd"); + EndValue = II.transform(B, CRD); EndValue->setName("ind.end"); - break; } - case LoopVectorizationLegality::IK_PtrInduction: { - EndValue = II.transform(BypassBuilder, CountRoundDown); - EndValue->setName("ptr.ind.end"); - break; - } - }// end of case // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) { - if (OrigPhi == OldInduction) - ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]); - else - ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); - } - ResumeVal->addIncoming(EndValue, VecBody); + BCResumeVal->addIncoming(EndValue, MiddleBlock); // Fix the scalar body counter (PHI node). unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); // The old induction's phi node in the scalar body needs the truncated // value. - if (OrigPhi == OldInduction) { - BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]); - OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal); - } else { - BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]); - OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); - } - } - - // If we are generating a new induction variable then we also need to - // generate the code that calculates the exit value. This value is not - // simply the end of the counter because we may skip the vectorized body - // in case of a runtime check. - if (!OldInduction){ - assert(!ResumeIndex && "Unexpected resume value found"); - ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", - MiddleBlock->getTerminator()); - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); - ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]); + OrigPhi->setIncomingValue(BlockIdx, BCResumeVal); } - // Make sure that we found the index where scalar loop needs to continue. - assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && - "Invalid resume Index"); - // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. - Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, - ResumeIndex, "cmp.n", + Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, + CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); - - BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); - // Remove the old terminator. - MiddleBlock->getTerminator()->eraseFromParent(); - - // Create i+1 and fill the PHINode. - Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); - Induction->addIncoming(StartIdx, VectorPH); - Induction->addIncoming(NextIdx, VecBody); - // Create the compare. - Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); - Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); - - // Now we have two terminators. Remove the old one from the block. - VecBody->getTerminator()->eraseFromParent(); + ReplaceInstWithInst(MiddleBlock->getTerminator(), + BranchInst::Create(ExitBlock, ScalarPH, CmpN)); // Get ready to start creating new instructions into the vectorized body. - Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); // Save the state. - LoopVectorPreHeader = VectorPH; + LoopVectorPreHeader = Lp->getLoopPreheader(); LoopScalarPreHeader = ScalarPH; LoopMiddleBlock = MiddleBlock; LoopExitBlock = ExitBlock; @@ -2906,7 +2988,7 @@ struct CSEDenseMapInfo { return LHS->isIdenticalTo(RHS); } }; -} // namespace +} /// \brief Check whether this block is a predicated block. /// Due to if predication of stores we might create a sequence of "if(pred) a[i] @@ -2924,7 +3006,7 @@ static void cse(SmallVector &BBs) { for (unsigned i = 0, e = BBs.size(); i != e; ++i) { BasicBlock *BB = BBs[i]; for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { - Instruction *In = I++; + Instruction *In = &*I++; if (!CSEDenseMapInfo::canHandle(In)) continue; @@ -3046,6 +3128,117 @@ static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, return TTI.getIntrinsicInstrCost(ID, RetTy, Tys); } +static Type *smallestIntegerVectorType(Type *T1, Type *T2) { + IntegerType *I1 = cast(T1->getVectorElementType()); + IntegerType *I2 = cast(T2->getVectorElementType()); + return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; +} +static Type *largestIntegerVectorType(Type *T1, Type *T2) { + IntegerType *I1 = cast(T1->getVectorElementType()); + IntegerType *I2 = cast(T2->getVectorElementType()); + return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; +} + +void InnerLoopVectorizer::truncateToMinimalBitwidths() { + // For every instruction `I` in MinBWs, truncate the operands, create a + // truncated version of `I` and reextend its result. InstCombine runs + // later and will remove any ext/trunc pairs. + // + for (auto &KV : MinBWs) { + VectorParts &Parts = WidenMap.get(KV.first); + for (Value *&I : Parts) { + if (I->use_empty()) + continue; + Type *OriginalTy = I->getType(); + Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(), + KV.second); + Type *TruncatedTy = VectorType::get(ScalarTruncatedTy, + OriginalTy->getVectorNumElements()); + if (TruncatedTy == OriginalTy) + continue; + + IRBuilder<> B(cast(I)); + auto ShrinkOperand = [&](Value *V) -> Value* { + if (auto *ZI = dyn_cast(V)) + if (ZI->getSrcTy() == TruncatedTy) + return ZI->getOperand(0); + return B.CreateZExtOrTrunc(V, TruncatedTy); + }; + + // The actual instruction modification depends on the instruction type, + // unfortunately. + Value *NewI = nullptr; + if (BinaryOperator *BO = dyn_cast(I)) { + NewI = B.CreateBinOp(BO->getOpcode(), + ShrinkOperand(BO->getOperand(0)), + ShrinkOperand(BO->getOperand(1))); + cast(NewI)->copyIRFlags(I); + } else if (ICmpInst *CI = dyn_cast(I)) { + NewI = B.CreateICmp(CI->getPredicate(), + ShrinkOperand(CI->getOperand(0)), + ShrinkOperand(CI->getOperand(1))); + } else if (SelectInst *SI = dyn_cast(I)) { + NewI = B.CreateSelect(SI->getCondition(), + ShrinkOperand(SI->getTrueValue()), + ShrinkOperand(SI->getFalseValue())); + } else if (CastInst *CI = dyn_cast(I)) { + switch (CI->getOpcode()) { + default: llvm_unreachable("Unhandled cast!"); + case Instruction::Trunc: + NewI = ShrinkOperand(CI->getOperand(0)); + break; + case Instruction::SExt: + NewI = B.CreateSExtOrTrunc(CI->getOperand(0), + smallestIntegerVectorType(OriginalTy, + TruncatedTy)); + break; + case Instruction::ZExt: + NewI = B.CreateZExtOrTrunc(CI->getOperand(0), + smallestIntegerVectorType(OriginalTy, + TruncatedTy)); + break; + } + } else if (ShuffleVectorInst *SI = dyn_cast(I)) { + auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements(); + auto *O0 = + B.CreateZExtOrTrunc(SI->getOperand(0), + VectorType::get(ScalarTruncatedTy, Elements0)); + auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements(); + auto *O1 = + B.CreateZExtOrTrunc(SI->getOperand(1), + VectorType::get(ScalarTruncatedTy, Elements1)); + + NewI = B.CreateShuffleVector(O0, O1, SI->getMask()); + } else if (isa(I)) { + // Don't do anything with the operands, just extend the result. + continue; + } else { + llvm_unreachable("Unhandled instruction type!"); + } + + // Lastly, extend the result. + NewI->takeName(cast(I)); + Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy); + I->replaceAllUsesWith(Res); + cast(I)->eraseFromParent(); + I = Res; + } + } + + // We'll have created a bunch of ZExts that are now parentless. Clean up. + for (auto &KV : MinBWs) { + VectorParts &Parts = WidenMap.get(KV.first); + for (Value *&I : Parts) { + ZExtInst *Inst = dyn_cast(I); + if (Inst && Inst->use_empty()) { + Value *NewI = Inst->getOperand(0); + Inst->eraseFromParent(); + I = NewI; + } + } + } +} + void InnerLoopVectorizer::vectorizeLoop() { //===------------------------------------------------===// // @@ -3076,6 +3269,11 @@ void InnerLoopVectorizer::vectorizeLoop() { be = DFS.endRPO(); bb != be; ++bb) vectorizeBlockInLoop(*bb, &RdxPHIsToFix); + // Insert truncates and extends for any truncated instructions as hints to + // InstCombine. + if (VF > 1) + truncateToMinimalBitwidths(); + // At this point every instruction in the original loop is widened to // a vector form. We are almost done. Now, we need to fix the PHI nodes // that we vectorized. The PHI nodes are currently empty because we did @@ -3166,21 +3364,33 @@ void InnerLoopVectorizer::vectorizeLoop() { // the PHIs and the values we are going to write. // This allows us to write both PHINodes and the extractelement // instructions. - Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); - VectorParts RdxParts; + VectorParts RdxParts = getVectorValue(LoopExitInst); setDebugLocFromInst(Builder, LoopExitInst); - for (unsigned part = 0; part < UF; ++part) { - // This PHINode contains the vectorized reduction variable, or - // the initial value vector, if we bypass the vector loop. - VectorParts &RdxExitVal = getVectorValue(LoopExitInst); - PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); - Value *StartVal = (part == 0) ? VectorStart : Identity; - for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) - NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); - NewPhi->addIncoming(RdxExitVal[part], - LoopVectorBody.back()); - RdxParts.push_back(NewPhi); + + // If the vector reduction can be performed in a smaller type, we truncate + // then extend the loop exit value to enable InstCombine to evaluate the + // entire expression in the smaller type. + if (VF > 1 && RdxPhi->getType() != RdxDesc.getRecurrenceType()) { + Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); + Builder.SetInsertPoint(LoopVectorBody.back()->getTerminator()); + for (unsigned part = 0; part < UF; ++part) { + Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy); + Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy) + : Builder.CreateZExt(Trunc, VecTy); + for (Value::user_iterator UI = RdxParts[part]->user_begin(); + UI != RdxParts[part]->user_end();) + if (*UI != Trunc) { + (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd); + RdxParts[part] = Extnd; + } else { + ++UI; + } + } + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt()); + for (unsigned part = 0; part < UF; ++part) + RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy); } // Reduce all of the unrolled parts into a single vector. @@ -3233,13 +3443,22 @@ void InnerLoopVectorizer::vectorizeLoop() { // The result is in the first element of the vector. ReducedPartRdx = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); + + // If the reduction can be performed in a smaller type, we need to extend + // the reduction to the wider type before we branch to the original loop. + if (RdxPhi->getType() != RdxDesc.getRecurrenceType()) + ReducedPartRdx = + RdxDesc.isSigned() + ? Builder.CreateSExt(ReducedPartRdx, RdxPhi->getType()) + : Builder.CreateZExt(ReducedPartRdx, RdxPhi->getType()); } // Create a phi node that merges control-flow from the backedge-taken check // block and the middle block. PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx", LoopScalarPreHeader->getTerminator()); - BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]); BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock); // Now, we need to fix the users of the reduction variable @@ -3277,6 +3496,20 @@ void InnerLoopVectorizer::vectorizeLoop() { fixLCSSAPHIs(); + // Make sure DomTree is updated. + updateAnalysis(); + + // Predicate any stores. + for (auto KV : PredicatedStores) { + BasicBlock::iterator I(KV.first); + auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI); + auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, + /*BranchWeights=*/nullptr, DT); + I->moveBefore(T); + I->getParent()->setName("pred.store.if"); + BB->setName("pred.store.continue"); + } + DEBUG(DT->verifyDomTree()); // Remove redundant induction instructions. cse(LoopVectorBody); } @@ -3361,8 +3594,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, // This is phase one of vectorizing PHIs. Type *VecTy = (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); - Entry[part] = PHINode::Create(VecTy, 2, "vec.phi", - LoopVectorBody.back()-> getFirstInsertionPt()); + Entry[part] = PHINode::Create( + VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt()); } PV->push_back(P); return; @@ -3410,51 +3643,43 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, assert(Legal->getInductionVars()->count(P) && "Not an induction variable"); - LoopVectorizationLegality::InductionInfo II = - Legal->getInductionVars()->lookup(P); + InductionDescriptor II = Legal->getInductionVars()->lookup(P); // FIXME: The newly created binary instructions should contain nsw/nuw flags, // which can be found from the original scalar operations. - switch (II.IK) { - case LoopVectorizationLegality::IK_NoInduction: + switch (II.getKind()) { + case InductionDescriptor::IK_NoInduction: llvm_unreachable("Unknown induction"); - case LoopVectorizationLegality::IK_IntInduction: { - assert(P->getType() == II.StartValue->getType() && "Types must match"); - Type *PhiTy = P->getType(); - Value *Broadcasted; - if (P == OldInduction) { - // Handle the canonical induction variable. We might have had to - // extend the type. - Broadcasted = Builder.CreateTrunc(Induction, PhiTy); - } else { - // Handle other induction variables that are now based on the - // canonical one. - Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx, - "normalized.idx"); - NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy); - Broadcasted = II.transform(Builder, NormalizedIdx); - Broadcasted->setName("offset.idx"); + case InductionDescriptor::IK_IntInduction: { + assert(P->getType() == II.getStartValue()->getType() && "Types must match"); + // Handle other induction variables that are now based on the + // canonical one. + Value *V = Induction; + if (P != OldInduction) { + V = Builder.CreateSExtOrTrunc(Induction, P->getType()); + V = II.transform(Builder, V); + V->setName("offset.idx"); } - Broadcasted = getBroadcastInstrs(Broadcasted); + Value *Broadcasted = getBroadcastInstrs(V); // After broadcasting the induction variable we need to make the vector // consecutive by adding 0, 1, 2, etc. for (unsigned part = 0; part < UF; ++part) - Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue); + Entry[part] = getStepVector(Broadcasted, VF * part, II.getStepValue()); return; } - case LoopVectorizationLegality::IK_PtrInduction: + case InductionDescriptor::IK_PtrInduction: // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); // This is the normalized GEP that starts counting at zero. - Value *NormalizedIdx = - Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx"); + Value *PtrInd = Induction; + PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStepValue()->getType()); // This is the vector of results. Notice that we don't generate // vector geps because scalar geps result in better code. for (unsigned part = 0; part < UF; ++part) { if (VF == 1) { int EltIndex = part; - Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); - Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); + Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx); SclrGep->setName("next.gep"); Entry[part] = SclrGep; @@ -3464,8 +3689,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); for (unsigned int i = 0; i < VF; ++i) { int EltIndex = i + part * VF; - Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); - Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx); + Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex); + Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx); SclrGep->setName("next.gep"); VecVal = Builder.CreateInsertElement(VecVal, SclrGep, @@ -3481,7 +3706,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // For each instruction in the old loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - VectorParts &Entry = WidenMap.get(it); + VectorParts &Entry = WidenMap.get(&*it); + switch (it->getOpcode()) { case Instruction::Br: // Nothing to do for PHIs and BR, since we already took care of the @@ -3489,7 +3715,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { continue; case Instruction::PHI: { // Vectorize PHINodes. - widenPHIInstruction(it, Entry, UF, VF, PV); + widenPHIInstruction(&*it, Entry, UF, VF, PV); continue; }// End of PHI. @@ -3527,7 +3753,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = V; } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } case Instruction::Select: { @@ -3536,7 +3762,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // instruction with a scalar condition. Otherwise, use vector-select. bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), OrigLoop); - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. @@ -3545,7 +3771,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { VectorParts &Cond = getVectorValue(it->getOperand(0)); VectorParts &Op0 = getVectorValue(it->getOperand(1)); VectorParts &Op1 = getVectorValue(it->getOperand(2)); - + Value *ScalarCond = (VF == 1) ? Cond[0] : Builder.CreateExtractElement(Cond[0], Builder.getInt32(0)); @@ -3556,7 +3782,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Op1[Part]); } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } @@ -3565,25 +3791,27 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Widen compares. Generate vector compares. bool FCmp = (it->getOpcode() == Instruction::FCmp); CmpInst *Cmp = dyn_cast(it); - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); VectorParts &A = getVectorValue(it->getOperand(0)); VectorParts &B = getVectorValue(it->getOperand(1)); for (unsigned Part = 0; Part < UF; ++Part) { Value *C = nullptr; - if (FCmp) + if (FCmp) { C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]); - else + cast(C)->copyFastMathFlags(&*it); + } else { C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]); + } Entry[Part] = C; } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } case Instruction::Store: case Instruction::Load: - vectorizeMemoryInstruction(it); + vectorizeMemoryInstruction(&*it); break; case Instruction::ZExt: case Instruction::SExt: @@ -3598,7 +3826,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { case Instruction::FPTrunc: case Instruction::BitCast: { CastInst *CI = dyn_cast(it); - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); /// Optimize the special case where the source is the induction /// variable. Notice that we can only optimize the 'trunc' case /// because: a. FP conversions lose precision, b. sext/zext may wrap, @@ -3608,13 +3836,12 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, CI->getType()); Value *Broadcasted = getBroadcastInstrs(ScalarCast); - LoopVectorizationLegality::InductionInfo II = - Legal->getInductionVars()->lookup(OldInduction); + InductionDescriptor II = Legal->getInductionVars()->lookup(OldInduction); Constant *Step = - ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue()); + ConstantInt::getSigned(CI->getType(), II.getStepValue()->getSExtValue()); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } /// Vectorize casts. @@ -3624,7 +3851,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { VectorParts &A = getVectorValue(it->getOperand(0)); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy); - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } @@ -3632,7 +3859,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Ignore dbg intrinsics. if (isa(it)) break; - setDebugLocFromInst(Builder, it); + setDebugLocFromInst(Builder, &*it); Module *M = BB->getParent()->getParent(); CallInst *CI = cast(it); @@ -3648,7 +3875,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || ID == Intrinsic::lifetime_start)) { - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; } // The flag shows whether we use Intrinsic or a usual Call for vectorized @@ -3659,7 +3886,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { bool UseVectorIntrinsic = ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; if (!UseVectorIntrinsic && NeedToScalarize) { - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; } @@ -3700,13 +3927,13 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Entry[Part] = Builder.CreateCall(VectorF, Args); } - propagateMetadata(Entry, it); + propagateMetadata(Entry, &*it); break; } default: // All other instructions are unsupported. Scalarize them. - scalarizeInstruction(it); + scalarizeInstruction(&*it); break; }// end of switch. }// end of for_each instr. @@ -3724,19 +3951,12 @@ void InnerLoopVectorizer::updateAnalysis() { DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]); DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back()); - // Due to if predication of stores we might create a sequence of "if(pred) - // a[i] = ...; " blocks. - for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) { - if (i == 0) - DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); - else if (isPredicatedBlock(i)) { - DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]); - } else { - DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]); - } - } + // We don't predicate stores by this point, so the vector body should be a + // single loop. + assert(LoopVectorBody.size() == 1 && "Expected single block loop!"); + DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader); - DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]); + DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back()); DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); @@ -3830,7 +4050,7 @@ bool LoopVectorizationLegality::canVectorize() { } // We can only vectorize innermost loops. - if (!TheLoop->getSubLoopsVector().empty()) { + if (!TheLoop->empty()) { emitAnalysis(VectorizationReport() << "loop is not the innermost loop"); return false; } @@ -3896,14 +4116,21 @@ bool LoopVectorizationLegality::canVectorize() { // Collect all of the variables that remain uniform after vectorization. collectLoopUniforms(); - DEBUG(dbgs() << "LV: We can vectorize this loop" << - (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)" : - "") - <<"!\n"); + DEBUG(dbgs() << "LV: We can vectorize this loop" + << (LAI->getRuntimePointerChecking()->Need + ? " (with a runtime bound check)" + : "") + << "!\n"); + + bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); + + // If an override option has been passed in for interleaved accesses, use it. + if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) + UseInterleaved = EnableInterleavedMemAccesses; // Analyze interleaved memory accesses. - if (EnableInterleavedMemAccesses) - InterleaveInfo.analyzeInterleaving(Strides); + if (UseInterleaved) + InterleaveInfo.analyzeInterleaving(Strides); // Okay! We can vectorize. At this point we don't have any other mem analysis // which may limit our maximum vectorization factor, so just return true with @@ -3951,7 +4178,6 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, } bool LoopVectorizationLegality::canVectorizeInstrs() { - BasicBlock *PreHeader = TheLoop->getLoopPreheader(); BasicBlock *Header = TheLoop->getHeader(); // Look for the attribute signaling the absence of NaNs. @@ -3975,7 +4201,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && !PhiTy->isPointerTy()) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "loop control flow is not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; @@ -3987,9 +4213,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (*bb != Header) { // Check that this instruction has no outside users or is an // identified reduction value with an outside user. - if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) + if (!hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) continue; - emitAnalysis(VectorizationReport(it) << + emitAnalysis(VectorizationReport(&*it) << "value could not be identified as " "an induction or reduction variable"); return false; @@ -3997,19 +4223,15 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // We only allow if-converted PHIs with exactly two incoming values. if (Phi->getNumIncomingValues() != 2) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "control flow not understood by vectorizer"); DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; } - // This is the value coming from the preheader. - Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); - ConstantInt *StepValue = nullptr; - // Check if this is an induction variable. - InductionKind IK = isInductionVariable(Phi, StepValue); - - if (IK_NoInduction != IK) { + InductionDescriptor ID; + if (InductionDescriptor::isInductionPHI(Phi, SE, ID)) { + Inductions[Phi] = ID; // Get the widest type. if (!WidestIndTy) WidestIndTy = convertPointerToIntegerType(DL, PhiTy); @@ -4017,21 +4239,24 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); // Int inductions are special because we only allow one IV. - if (IK == IK_IntInduction && StepValue->isOne()) { + if (ID.getKind() == InductionDescriptor::IK_IntInduction && + ID.getStepValue()->isOne() && + isa(ID.getStartValue()) && + cast(ID.getStartValue())->isNullValue()) { // Use the phi node with the widest type as induction. Use the last // one if there are multiple (no good reason for doing this other - // than it is expedient). + // than it is expedient). We've checked that it begins at zero and + // steps by one, so this is a canonical induction variable. if (!Induction || PhiTy == WidestIndTy) Induction = Phi; } DEBUG(dbgs() << "LV: Found an induction variable.\n"); - Inductions[Phi] = InductionInfo(StartValue, IK, StepValue); // Until we explicitly handle the case of an induction variable with // an outside loop user we have to give up vectorizing this loop. - if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(VectorizationReport(it) << + if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) { + emitAnalysis(VectorizationReport(&*it) << "use of induction value outside of the " "loop is not handled by vectorizer"); return false; @@ -4042,11 +4267,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, Reductions[Phi])) { + if (Reductions[Phi].hasUnsafeAlgebra()) + Requirements->addUnsafeAlgebraInst( + Reductions[Phi].getUnsafeAlgebraInst()); AllowedExit.insert(Reductions[Phi].getLoopExitInstr()); continue; } - emitAnalysis(VectorizationReport(it) << + emitAnalysis(VectorizationReport(&*it) << "value that could not be identified as " "reduction is used outside the loop"); DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); @@ -4061,8 +4289,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa(CI) && !(CI->getCalledFunction() && TLI && TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) { - emitAnalysis(VectorizationReport(it) << - "call instruction cannot be vectorized"); + emitAnalysis(VectorizationReport(&*it) + << "call instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n"); return false; } @@ -4072,7 +4300,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (CI && hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "intrinsic instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); return false; @@ -4083,7 +4311,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Also, we can't vectorize extractelement instructions. if ((!VectorType::isValidElementType(it->getType()) && !it->getType()->isVoidTy()) || isa(it)) { - emitAnalysis(VectorizationReport(it) + emitAnalysis(VectorizationReport(&*it) << "instruction return type cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable type.\n"); return false; @@ -4107,8 +4335,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. - if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) { - emitAnalysis(VectorizationReport(it) << + if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) { + emitAnalysis(VectorizationReport(&*it) << "value cannot be used outside the loop"); return false; } @@ -4126,119 +4354,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } } - return true; -} - -///\brief Remove GEPs whose indices but the last one are loop invariant and -/// return the induction operand of the gep pointer. -static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { - GetElementPtrInst *GEP = dyn_cast(Ptr); - if (!GEP) - return Ptr; - - unsigned InductionOperand = getGEPInductionOperand(GEP); - - // Check that all of the gep indices are uniform except for our induction - // operand. - for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) - if (i != InductionOperand && - !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp)) - return Ptr; - return GEP->getOperand(InductionOperand); -} - -///\brief Look for a cast use of the passed value. -static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) { - Value *UniqueCast = nullptr; - for (User *U : Ptr->users()) { - CastInst *CI = dyn_cast(U); - if (CI && CI->getType() == Ty) { - if (!UniqueCast) - UniqueCast = CI; - else - return nullptr; - } - } - return UniqueCast; -} - -///\brief Get the stride of a pointer access in a loop. -/// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a -/// pointer to the Value, or null otherwise. -static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) { - const PointerType *PtrTy = dyn_cast(Ptr->getType()); - if (!PtrTy || PtrTy->isAggregateType()) - return nullptr; - - // Try to remove a gep instruction to make the pointer (actually index at this - // point) easier analyzable. If OrigPtr is equal to Ptr we are analzying the - // pointer, otherwise, we are analyzing the index. - Value *OrigPtr = Ptr; - - // The size of the pointer access. - int64_t PtrAccessSize = 1; - - Ptr = stripGetElementPtr(Ptr, SE, Lp); - const SCEV *V = SE->getSCEV(Ptr); - - if (Ptr != OrigPtr) - // Strip off casts. - while (const SCEVCastExpr *C = dyn_cast(V)) - V = C->getOperand(); - - const SCEVAddRecExpr *S = dyn_cast(V); - if (!S) - return nullptr; - - V = S->getStepRecurrence(*SE); - if (!V) - return nullptr; - - // Strip off the size of access multiplication if we are still analyzing the - // pointer. - if (OrigPtr == Ptr) { - const DataLayout &DL = Lp->getHeader()->getModule()->getDataLayout(); - DL.getTypeAllocSize(PtrTy->getElementType()); - if (const SCEVMulExpr *M = dyn_cast(V)) { - if (M->getOperand(0)->getSCEVType() != scConstant) - return nullptr; - - const APInt &APStepVal = - cast(M->getOperand(0))->getValue()->getValue(); - - // Huge step value - give up. - if (APStepVal.getBitWidth() > 64) - return nullptr; - - int64_t StepVal = APStepVal.getSExtValue(); - if (PtrAccessSize != StepVal) - return nullptr; - V = M->getOperand(1); - } - } - - // Strip off casts. - Type *StripedOffRecurrenceCast = nullptr; - if (const SCEVCastExpr *C = dyn_cast(V)) { - StripedOffRecurrenceCast = C->getType(); - V = C->getOperand(); - } - - // Look for the loop invariant symbolic value. - const SCEVUnknown *U = dyn_cast(V); - if (!U) - return nullptr; - - Value *Stride = U->getValue(); - if (!Lp->isLoopInvariant(Stride)) - return nullptr; - - // If we have stripped off the recurrence cast we have to make sure that we - // return the value that is used in this loop so that we can replace it later. - if (StripedOffRecurrenceCast) - Stride = getUniqueCastUse(Stride, Lp, StripedOffRecurrenceCast); + // Now we know the widest induction type, check if our found induction + // is the same size. If it's not, unset it here and InnerLoopVectorizer + // will create another. + if (Induction && WidestIndTy != Induction->getType()) + Induction = nullptr; - return Stride; + return true; } void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { @@ -4276,7 +4398,7 @@ void LoopVectorizationLegality::collectLoopUniforms() { BE = TheLoop->block_end(); B != BE; ++B) for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end(); I != IE; ++I) - if (I->getType()->isPointerTy() && isConsecutivePtr(I)) + if (I->getType()->isPointerTy() && isConsecutivePtr(&*I)) Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); while (!Worklist.empty()) { @@ -4313,30 +4435,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return false; } - if (LAI->getNumRuntimePointerChecks() > - VectorizerParams::RuntimeMemoryCheckThreshold) { - emitAnalysis(VectorizationReport() - << LAI->getNumRuntimePointerChecks() << " exceeds limit of " - << VectorizerParams::RuntimeMemoryCheckThreshold - << " dependent memory operations checked at runtime"); - DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); - return false; - } - return true; -} + Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); -LoopVectorizationLegality::InductionKind -LoopVectorizationLegality::isInductionVariable(PHINode *Phi, - ConstantInt *&StepValue) { - if (!isInductionPHI(Phi, SE, StepValue)) - return IK_NoInduction; - - Type *PhiTy = Phi->getType(); - // Found an Integer induction variable. - if (PhiTy->isIntegerTy()) - return IK_IntInduction; - // Found an Pointer induction variable. - return IK_PtrInduction; + return true; } bool LoopVectorizationLegality::isInductionVariable(const Value *V) { @@ -4584,12 +4685,13 @@ LoopVectorizationCostModel::VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { // Width 1 means no vectorize VectorizationFactor Factor = { 1U, 0U }; - if (OptForSize && Legal->getRuntimePointerCheck()->Need) { + if (OptForSize && Legal->getRuntimePointerChecking()->Need) { emitAnalysis(VectorizationReport() << "runtime pointer checks needed. Enable vectorization of this " "loop with '#pragma clang loop vectorize(enable)' when " - "compiling with -Os"); - DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); + "compiling with -Os/-Oz"); + DEBUG(dbgs() << + "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); return Factor; } @@ -4604,6 +4706,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { unsigned TC = SE->getSmallConstantTripCount(TheLoop); DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); + MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned WidestType = getWidestType(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); unsigned MaxSafeDepDist = -1U; @@ -4633,7 +4736,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { emitAnalysis (VectorizationReport() << "unable to calculate the loop count due to complex control flow"); - DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); + DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); return Factor; } @@ -4642,16 +4745,15 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { if (VF == 0) VF = MaxVectorSize; - - // If the trip count that we found modulo the vectorization factor is not - // zero then we require a tail. - if (VF < 2) { + else { + // If the trip count that we found modulo the vectorization factor is not + // zero then we require a tail. emitAnalysis(VectorizationReport() << "cannot optimize for size and vectorize at the " "same time. Enable vectorization of this loop " "with '#pragma clang loop vectorize(enable)' " - "when compiling with -Os"); - DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); + "when compiling with -Os/-Oz"); + DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n"); return Factor; } } @@ -4714,18 +4816,22 @@ unsigned LoopVectorizationCostModel::getWidestType() { for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { Type *T = it->getType(); - // Ignore ephemeral values. - if (EphValues.count(it)) + // Skip ignored values. + if (ValuesToIgnore.count(&*it)) continue; // Only examine Loads, Stores and PHINodes. if (!isa(it) && !isa(it) && !isa(it)) continue; - // Examine PHI nodes that are reduction variables. - if (PHINode *PN = dyn_cast(it)) + // Examine PHI nodes that are reduction variables. Update the type to + // account for the recurrence type. + if (PHINode *PN = dyn_cast(it)) { if (!Legal->getReductionVars()->count(PN)) continue; + RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; + T = RdxDesc.getRecurrenceType(); + } // Examine the stored values. if (StoreInst *ST = dyn_cast(it)) @@ -4734,7 +4840,7 @@ unsigned LoopVectorizationCostModel::getWidestType() { // Ignore loaded pointer types and stored pointer types that are not // consecutive. However, we do want to take consecutive stores/loads of // pointer vectors into account. - if (T->isPointerTy() && !isConsecutiveLoadOrStore(it)) + if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it)) continue; MaxWidth = std::max(MaxWidth, @@ -4745,41 +4851,35 @@ unsigned LoopVectorizationCostModel::getWidestType() { return MaxWidth; } -unsigned -LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, - unsigned VF, - unsigned LoopCost) { +unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, + unsigned VF, + unsigned LoopCost) { - // -- The unroll heuristics -- - // We unroll the loop in order to expose ILP and reduce the loop overhead. + // -- The interleave heuristics -- + // We interleave the loop in order to expose ILP and reduce the loop overhead. // There are many micro-architectural considerations that we can't predict // at this level. For example, frontend pressure (on decode or fetch) due to // code size, or the number and capabilities of the execution ports. // - // We use the following heuristics to select the unroll factor: - // 1. If the code has reductions, then we unroll in order to break the cross + // We use the following heuristics to select the interleave count: + // 1. If the code has reductions, then we interleave to break the cross // iteration dependency. - // 2. If the loop is really small, then we unroll in order to reduce the loop + // 2. If the loop is really small, then we interleave to reduce the loop // overhead. - // 3. We don't unroll if we think that we will spill registers to memory due - // to the increased register pressure. + // 3. We don't interleave if we think that we will spill registers to memory + // due to the increased register pressure. - // Use the user preference, unless 'auto' is selected. - int UserUF = Hints->getInterleave(); - if (UserUF != 0) - return UserUF; - - // When we optimize for size, we don't unroll. + // When we optimize for size, we don't interleave. if (OptForSize) return 1; - // We used the distance for the unroll factor. + // We used the distance for the interleave count. if (Legal->getMaxSafeDepDistBytes() != -1U) return 1; - // Do not unroll loops with a relatively small trip count. + // Do not interleave loops with a relatively small trip count. unsigned TC = SE->getSmallConstantTripCount(TheLoop); - if (TC > 1 && TC < TinyTripCountUnrollThreshold) + if (TC > 1 && TC < TinyTripCountInterleaveThreshold) return 1; unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); @@ -4800,32 +4900,32 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); R.NumInstructions = std::max(R.NumInstructions, 1U); - // We calculate the unroll factor using the following formula. + // We calculate the interleave count using the following formula. // Subtract the number of loop invariants from the number of available - // registers. These registers are used by all of the unrolled instances. + // registers. These registers are used by all of the interleaved instances. // Next, divide the remaining registers by the number of registers that is // required by the loop, in order to estimate how many parallel instances // fit without causing spills. All of this is rounded down if necessary to be - // a power of two. We want power of two unroll factors to simplify any + // a power of two. We want power of two interleave count to simplify any // addressing operations or alignment considerations. - unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / + unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers); - // Don't count the induction variable as unrolled. + // Don't count the induction variable as interleaved. if (EnableIndVarRegisterHeur) - UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / + IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / std::max(1U, (R.MaxLocalUsers - 1))); - // Clamp the unroll factor ranges to reasonable factors. - unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor(VF); + // Clamp the interleave ranges to reasonable counts. + unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); - // Check if the user has overridden the unroll max. + // Check if the user has overridden the max. if (VF == 1) { if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) - MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor; + MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; } else { if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) - MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor; + MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } // If we did not calculate the cost for VF (because the user selected the VF) @@ -4833,72 +4933,74 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, if (LoopCost == 0) LoopCost = expectedCost(VF); - // Clamp the calculated UF to be between the 1 and the max unroll factor + // Clamp the calculated IC to be between the 1 and the max interleave count // that the target allows. - if (UF > MaxInterleaveSize) - UF = MaxInterleaveSize; - else if (UF < 1) - UF = 1; + if (IC > MaxInterleaveCount) + IC = MaxInterleaveCount; + else if (IC < 1) + IC = 1; - // Unroll if we vectorized this loop and there is a reduction that could - // benefit from unrolling. + // Interleave if we vectorized this loop and there is a reduction that could + // benefit from interleaving. if (VF > 1 && Legal->getReductionVars()->size()) { - DEBUG(dbgs() << "LV: Unrolling because of reductions.\n"); - return UF; + DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); + return IC; } // Note that if we've already vectorized the loop we will have done the - // runtime check and so unrolling won't require further checks. - bool UnrollingRequiresRuntimePointerCheck = - (VF == 1 && Legal->getRuntimePointerCheck()->Need); + // runtime check and so interleaving won't require further checks. + bool InterleavingRequiresRuntimePointerCheck = + (VF == 1 && Legal->getRuntimePointerChecking()->Need); - // We want to unroll small loops in order to reduce the loop overhead and + // We want to interleave small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'); - if (!UnrollingRequiresRuntimePointerCheck && - LoopCost < SmallLoopCost) { + if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { // We assume that the cost overhead is 1 and we use the cost model - // to estimate the cost of the loop and unroll until the cost of the + // to estimate the cost of the loop and interleave until the cost of the // loop overhead is about 5% of the cost of the loop. - unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); + unsigned SmallIC = + std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost)); - // Unroll until store/load ports (estimated by max unroll factor) are + // Interleave until store/load ports (estimated by max interleave count) are // saturated. unsigned NumStores = Legal->getNumStores(); unsigned NumLoads = Legal->getNumLoads(); - unsigned StoresUF = UF / (NumStores ? NumStores : 1); - unsigned LoadsUF = UF / (NumLoads ? NumLoads : 1); + unsigned StoresIC = IC / (NumStores ? NumStores : 1); + unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); // If we have a scalar reduction (vector reductions are already dealt with // by this point), we can increase the critical path length if the loop - // we're unrolling is inside another loop. Limit, by default to 2, so the + // we're interleaving is inside another loop. Limit, by default to 2, so the // critical path only gets increased by one reduction operation. if (Legal->getReductionVars()->size() && TheLoop->getLoopDepth() > 1) { - unsigned F = static_cast(MaxNestedScalarReductionUF); - SmallUF = std::min(SmallUF, F); - StoresUF = std::min(StoresUF, F); - LoadsUF = std::min(LoadsUF, F); + unsigned F = static_cast(MaxNestedScalarReductionIC); + SmallIC = std::min(SmallIC, F); + StoresIC = std::min(StoresIC, F); + LoadsIC = std::min(LoadsIC, F); } - if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) { - DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n"); - return std::max(StoresUF, LoadsUF); + if (EnableLoadStoreRuntimeInterleave && + std::max(StoresIC, LoadsIC) > SmallIC) { + DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n"); + return std::max(StoresIC, LoadsIC); } - DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n"); - return SmallUF; + DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n"); + return SmallIC; } - // Unroll if this is a large loop (small loops are already dealt with by this - // point) that could benefit from interleaved unrolling. + // Interleave if this is a large loop (small loops are already dealt with by + // this + // point) that could benefit from interleaving. bool HasReductions = (Legal->getReductionVars()->size() > 0); if (TTI.enableAggressiveInterleaving(HasReductions)) { - DEBUG(dbgs() << "LV: Unrolling to expose ILP.\n"); - return UF; + DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); + return IC; } - DEBUG(dbgs() << "LV: Not Unrolling.\n"); + DEBUG(dbgs() << "LV: Not Interleaving.\n"); return 1; } @@ -4945,14 +5047,12 @@ LoopVectorizationCostModel::calculateRegisterUsage() { for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), be = DFS.endRPO(); bb != be; ++bb) { R.NumInstructions += (*bb)->size(); - for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; - ++it) { - Instruction *I = it; - IdxToInstr[Index++] = I; + for (Instruction &I : **bb) { + IdxToInstr[Index++] = &I; // Save the end location of each USE. - for (unsigned i = 0; i < I->getNumOperands(); ++i) { - Value *U = I->getOperand(i); + for (unsigned i = 0; i < I.getNumOperands(); ++i) { + Value *U = I.getOperand(i); Instruction *Instr = dyn_cast(U); // Ignore non-instruction values such as arguments, constants, etc. @@ -4990,8 +5090,8 @@ LoopVectorizationCostModel::calculateRegisterUsage() { // Ignore instructions that are never used within the loop. if (!Ends.count(I)) continue; - // Ignore ephemeral values. - if (EphValues.count(I)) + // Skip ignored values. + if (ValuesToIgnore.count(I)) continue; // Remove all of the instructions that end at this location. @@ -5034,11 +5134,11 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { if (isa(it)) continue; - // Ignore ephemeral values. - if (EphValues.count(it)) + // Skip ignored values. + if (ValuesToIgnore.count(&*it)) continue; - unsigned C = getInstructionCost(it, VF); + unsigned C = getInstructionCost(&*it, VF); // Check if we should override the cost. if (ForceTargetInstructionCost.getNumOccurrences() > 0) @@ -5115,9 +5215,8 @@ static bool isLikelyComplexAddressComputation(Value *Ptr, } static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { - if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1))) - return true; - return false; + return Legal->hasStride(I->getOperand(0)) || + Legal->hasStride(I->getOperand(1)); } unsigned @@ -5128,6 +5227,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { VF = 1; Type *RetTy = I->getType(); + if (VF > 1 && MinBWs.count(I)) + RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); Type *VectorTy = ToVectorTy(RetTy, VF); // TODO: We need to estimate the cost of intrinsic calls. @@ -5210,6 +5311,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::ICmp: case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); + if (VF > 1 && MinBWs.count(dyn_cast(I->getOperand(0)))) + ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); } @@ -5333,8 +5436,28 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { Legal->isInductionVariable(I->getOperand(0))) return TTI.getCastInstrCost(I->getOpcode(), I->getType(), I->getOperand(0)->getType()); - - Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); + + Type *SrcScalarTy = I->getOperand(0)->getType(); + Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF); + if (VF > 1 && MinBWs.count(I)) { + // This cast is going to be shrunk. This may remove the cast or it might + // turn it into slightly different cast. For example, if MinBW == 16, + // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". + // + // Calculate the modified src and dest types. + Type *MinVecTy = VectorTy; + if (I->getOpcode() == Instruction::Trunc) { + SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy); + VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF), + MinVecTy); + } else if (I->getOpcode() == Instruction::ZExt || + I->getOpcode() == Instruction::SExt) { + SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy); + VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF), + MinVecTy); + } + } + return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } case Instruction::Call: { @@ -5374,15 +5497,18 @@ char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) +INITIALIZE_PASS_DEPENDENCY(DemandedBits) INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false) namespace llvm { @@ -5450,19 +5576,12 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); - Instruction *InsertPt = Builder.GetInsertPoint(); - BasicBlock *IfBlock = Builder.GetInsertBlock(); - BasicBlock *CondBlock = nullptr; - VectorParts Cond; - Loop *VectorLp = nullptr; if (IfPredicateStore) { assert(Instr->getParent()->getSinglePredecessor() && "Only support single predecessor blocks"); Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(), Instr->getParent()); - VectorLp = LI->getLoopFor(IfBlock); - assert(VectorLp && "Must have a loop for this block"); } // For each vector unroll 'part': @@ -5477,11 +5596,6 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part], ConstantInt::get(Cond[Part]->getType(), 1)); - CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); - LoopVectorBody.push_back(CondBlock); - VectorLp->addBasicBlockToLoop(CondBlock, *LI); - // Update Builder with newly created basic block. - Builder.SetInsertPoint(InsertPt); } Instruction *Cloned = Instr->clone(); @@ -5501,17 +5615,10 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, if (!IsVoidRetTy) VecResults[Part] = Cloned; - // End if-block. - if (IfPredicateStore) { - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - LoopVectorBody.push_back(NewIfBlock); - VectorLp->addBasicBlockToLoop(NewIfBlock, *LI); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - IfBlock = NewIfBlock; - } + // End if-block. + if (IfPredicateStore) + PredicatedStores.push_back(std::make_pair(cast(Cloned), + Cmp)); } }