X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTransforms%2FVectorize%2FLoopVectorize.cpp;h=8986932309a5fa2167292a2e978ba5b9046af58c;hb=0973b7ddb8f8267132147c8b24dae7b2dfa1fd02;hp=66f2ad72fceeb7a3765f98fec357607a05ba3bcc;hpb=4926cfb1da523dc47a68f20ca0d8901db2b35f96;p=oota-llvm.git

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 66f2ad72fce..8986932309a 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -58,6 +58,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -92,6 +93,7 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/VectorUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
 #include <algorithm>
 #include <map>
 #include <tuple>
@@ -105,15 +107,6 @@ using namespace llvm::PatternMatch;
 STATISTIC(LoopsVectorized, "Number of loops vectorized");
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 
-static cl::opt<unsigned>
-VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
-                    cl::desc("Sets the SIMD width. Zero is autoselect."));
-
-static cl::opt<unsigned>
-VectorizationInterleave("force-vector-interleave", cl::init(0), cl::Hidden,
-                        cl::desc("Sets the vectorization interleave count. "
-                                 "Zero is autoselect."));
-
 static cl::opt<bool>
 EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                    cl::desc("Enable if-conversion during vectorization."));
@@ -144,13 +137,6 @@ static cl::opt<bool> EnableMemAccessVersioning(
 /// We don't unroll loops with a known constant trip count below this number.
 static const unsigned TinyTripCountUnrollThreshold = 128;
 
-/// When performing memory disambiguation checks at runtime do not make more
-/// than this number of comparisons.
-static const unsigned RuntimeMemoryCheckThreshold = 8;
-
-/// Maximum simd width.
-static const unsigned MaxVectorWidth = 64;
-
 static cl::opt<unsigned> ForceTargetNumScalarRegs(
     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));
@@ -218,35 +204,30 @@ class LoopVectorizationLegality;
 class LoopVectorizationCostModel;
 class LoopVectorizeHints;
 
-/// Optimization analysis message produced during vectorization. Messages inform
-/// the user why vectorization did not occur.
-class VectorizationReport {
-  std::string Message;
-  raw_string_ostream Out;
-  Instruction *Instr;
-
+/// \brief This modifies LoopAccessReport to initialize message with
+/// loop-vectorizer-specific part.
+class VectorizationReport : public LoopAccessReport {
 public:
-  VectorizationReport(Instruction *I = nullptr) : Out(Message), Instr(I) {
-    Out << "loop not vectorized: ";
-  }
-
-  template <typename A> VectorizationReport &operator<<(const A &Value) {
-    Out << Value;
-    return *this;
-  }
-
-  Instruction *getInstr() { return Instr; }
-
-  std::string &str() { return Out.str(); }
-  operator Twine() { return Out.str(); }
-
-  /// \brief Emit an analysis note with the debug location from the instruction
-  /// in \p Message if available.  Otherwise use the location of \p TheLoop.
-  static void emitAnalysis(VectorizationReport &Message,
-                           const Function *TheFunction,
-                           const Loop *TheLoop);
+  VectorizationReport(Instruction *I = nullptr)
+      : LoopAccessReport("loop not vectorized: ", I) {}
+
+  /// \brief This allows promotion of the loop-access analysis report into the
+  /// loop-vectorizer report.  It modifies the message to add the
+  /// loop-vectorizer-specific part of the message.
+  explicit VectorizationReport(const LoopAccessReport &R)
+      : LoopAccessReport(Twine("loop not vectorized: ") + R.str(),
+                         R.getInstr()) {}
 };
 
+/// A helper function for converting Scalar types to vector types.
+/// If the incoming type is void, we return void. If the VF is 1, we return
+/// the scalar type.
+static Type* ToVectorTy(Type *Scalar, unsigned VF) {
+  if (Scalar->isVoidTy() || VF == 1)
+    return Scalar;
+  return VectorType::get(Scalar, VF);
+}
+
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
 /// This class performs the widening of scalars into vectors, or multiple
@@ -264,13 +245,13 @@ public:
 class InnerLoopVectorizer {
 public:
   InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
-                      DominatorTree *DT, const DataLayout *DL,
-                      const TargetLibraryInfo *TLI, unsigned VecWidth,
+                      DominatorTree *DT, const TargetLibraryInfo *TLI,
+                      const TargetTransformInfo *TTI, unsigned VecWidth,
                       unsigned UnrollFactor)
-      : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
+      : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
         VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
         Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
-        Legal(nullptr) {}
+        Legal(nullptr), AddedSafetyChecks(false) {}
 
   // Perform the actual loop widening (vectorization).
   void vectorize(LoopVectorizationLegality *L) {
@@ -284,6 +265,11 @@ public:
     updateAnalysis();
   }
 
+  // Return true if any runtime check is added.
+  bool IsSafetyChecksAdded() {
+    return AddedSafetyChecks;
+  }
+
   virtual ~InnerLoopVectorizer() {}
 
 protected:
@@ -299,13 +285,6 @@ protected:
   typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>, VectorParts>
     EdgeMaskCache;
 
-  /// \brief Add code that checks at runtime if the accessed arrays overlap.
-  ///
-  /// Returns a pair of instructions where the first element is the first
-  /// instruction generated in possibly a sequence of instructions and the
-  /// second value is the final comparator value or NULL if no check is needed.
-  std::pair<Instruction *, Instruction *> addRuntimeCheck(Instruction *Loc);
-
   /// \brief Add checks for strides that were assumed to be 1.
   ///
   /// Returns the last check instruction and the first check instruction in the
@@ -425,10 +404,10 @@ protected:
   DominatorTree *DT;
   /// Alias Analysis.
   AliasAnalysis *AA;
-  /// Data Layout.
-  const DataLayout *DL;
   /// Target Library Info.
   const TargetLibraryInfo *TLI;
+  /// Target Transform Info.
+  const TargetTransformInfo *TTI;
 
   /// The vectorization SIMD factor to use. Each vector will have this many
   /// vector elements.
@@ -470,14 +449,17 @@ protected:
   EdgeMaskCache MaskCache;
 
   LoopVectorizationLegality *Legal;
+
+  // Record whether runtime check is added.
+  bool AddedSafetyChecks;
 };
 
 class InnerLoopUnroller : public InnerLoopVectorizer {
 public:
   InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
-                    DominatorTree *DT, const DataLayout *DL,
-                    const TargetLibraryInfo *TLI, unsigned UnrollFactor) :
-    InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }
+                    DominatorTree *DT, const TargetLibraryInfo *TLI,
+                    const TargetTransformInfo *TTI, unsigned UnrollFactor)
+      : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
 
 private:
   void scalarizeInstruction(Instruction *Instr,
@@ -522,9 +504,8 @@ static std::string getDebugLocString(const Loop *L) {
   std::string Result;
   if (L) {
     raw_string_ostream OS(Result);
-    const DebugLoc LoopDbgLoc = L->getStartLoc();
-    if (!LoopDbgLoc.isUnknown())
-      LoopDbgLoc.print(L->getHeader()->getContext(), OS);
+    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
+      LoopDbgLoc.print(OS);
     else
       // Just print the module name.
       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
@@ -557,16 +538,6 @@ static void propagateMetadata(Instruction *To, const Instruction *From) {
   }
 }
 
-void VectorizationReport::emitAnalysis(VectorizationReport &Message,
-                                       const Function *TheFunction,
-                                       const Loop *TheLoop) {
-  DebugLoc DL = TheLoop->getStartLoc();
-  if (Instruction *I = Message.getInstr())
-    DL = I->getDebugLoc();
-  emitOptimizationRemarkAnalysis(TheFunction->getContext(), DEBUG_TYPE,
-                                 *TheFunction, DL, Message.str());
-}
-
 /// \brief Propagate known metadata from one instruction to a vector of others.
 static void propagateMetadata(SmallVectorImpl<Value *> &To,
                               const Instruction *From) {
   for (Value *V : To)
@@ -574,133 +545,6 @@ static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *F
    if (Instruction *I = dyn_cast<Instruction>(V))
      propagateMetadata(I, From);
 }
 
-namespace {
-/// This struct holds information about the memory runtime legality
-/// check that a group of pointers do not overlap.
-struct RuntimePointerCheck {
-  RuntimePointerCheck() : Need(false) {}
-
-  /// Reset the state of the pointer runtime information.
-  void reset() {
-    Need = false;
-    Pointers.clear();
-    Starts.clear();
-    Ends.clear();
-    IsWritePtr.clear();
-    DependencySetId.clear();
-    AliasSetId.clear();
-  }
-
-  /// Insert a pointer and calculate the start and end SCEVs.
-  void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr,
-              unsigned DepSetId, unsigned ASId, ValueToValueMap &Strides);
-
-  /// This flag indicates if we need to add the runtime check.
-  bool Need;
-  /// Holds the pointers that we need to check.
-  SmallVector<TrackingVH<Value>, 2> Pointers;
-  /// Holds the pointer value at the beginning of the loop.
-  SmallVector<const SCEV*, 2> Starts;
-  /// Holds the pointer value at the end of the loop.
-  SmallVector<const SCEV*, 2> Ends;
-  /// Holds the information if this pointer is used for writing to memory.
-  SmallVector<bool, 2> IsWritePtr;
-  /// Holds the id of the set of pointers that could be dependent because of a
-  /// shared underlying object.
-  SmallVector<unsigned, 2> DependencySetId;
-  /// Holds the id of the disjoint alias set to which this pointer belongs.
-  SmallVector<unsigned, 2> AliasSetId;
-};
-
-/// \brief Drive the analysis of memory accesses in the loop
-///
-/// This class is responsible for analyzing the memory accesses of a loop.  It
-/// collects the accesses and then its main helper the AccessAnalysis class
-/// finds and categorizes the dependences in buildDependenceSets.
-///
-/// For memory dependences that can be analyzed at compile time, it determines
-/// whether the dependence is part of a cycle inhibiting vectorization.  This work
-/// is delegated to the MemoryDepChecker class.
-///
-/// For memory dependences that cannot be determined at compile time, it
-/// generates run-time checks to prove independence.  This is done by
-/// AccessAnalysis::canCheckPtrAtRT and the checks are maintained by the
-/// RuntimePointerCheck class.
-class LoopAccessAnalysis {
-public:
-  /// \brief Collection of parameters used from the vectorizer.
-  struct VectorizerParams {
-    /// \brief Maximum simd width.
-    unsigned MaxVectorWidth;
-
-    /// \brief VF as overridden by the user.
-    unsigned VectorizationFactor;
-    /// \brief Interleave factor as overridden by the user.
-    unsigned VectorizationInterleave;
-
-    /// \brief When performing memory disambiguation checks at runtime do not
-    /// make more than this number of comparisons.
-    unsigned RuntimeMemoryCheckThreshold;
-
-    VectorizerParams(unsigned MaxVectorWidth,
-                     unsigned VectorizationFactor,
-                     unsigned VectorizationInterleave,
-                     unsigned RuntimeMemoryCheckThreshold) :
-        MaxVectorWidth(MaxVectorWidth),
-        VectorizationFactor(VectorizationFactor),
-        VectorizationInterleave(VectorizationInterleave),
-        RuntimeMemoryCheckThreshold(RuntimeMemoryCheckThreshold) {}
-  };
-
-  LoopAccessAnalysis(Function *F, Loop *L, ScalarEvolution *SE,
-                     const DataLayout *DL, const TargetLibraryInfo *TLI,
-                     AliasAnalysis *AA, DominatorTree *DT,
-                     const VectorizerParams &VectParams) :
-      TheFunction(F), TheLoop(L), SE(SE), DL(DL), TLI(TLI), AA(AA), DT(DT),
-      NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1U),
-      VectParams(VectParams) {}
-
-  /// Return true if we can analyze the memory accesses in the loop and there
-  /// are no memory dependence cycles.  Replaces symbolic strides using Strides.
-  bool canVectorizeMemory(ValueToValueMap &Strides);
-
-  RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }
-
-  /// Return true if the block BB needs to be predicated in order for the loop
-  /// to be vectorized.
-  bool blockNeedsPredication(BasicBlock *BB);
-
-  /// Returns true if the value V is uniform within the loop.
-  bool isUniform(Value *V);
-
-  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
-
-private:
-  void emitAnalysis(Report &Message) {
-    emitLoopAnalysis(Message, TheFunction, TheLoop);
-  }
-
-  /// We need to check that all of the pointers in this list are disjoint
-  /// at runtime.
-  RuntimePointerCheck PtrRtCheck;
-  Function *TheFunction;
-  Loop *TheLoop;
-  ScalarEvolution *SE;
-  const DataLayout *DL;
-  const TargetLibraryInfo *TLI;
-  AliasAnalysis *AA;
-  DominatorTree *DT;
-
-  unsigned NumLoads;
-  unsigned NumStores;
-
-  unsigned MaxSafeDepDistBytes;
-
-  /// \brief Vectorizer parameters used by the analysis.
-  VectorizerParams VectParams;
-};
-} // end anonymous namespace
-
 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
 /// to what vectorization factor.
 /// This class does not look at the profitability of vectorization, only the
@@ -716,17 +560,13 @@ private:
 /// induction variable and the different reduction variables.
 class LoopVectorizationLegality {
 public:
-  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
-                            DominatorTree *DT, TargetLibraryInfo *TLI,
-                            AliasAnalysis *AA, Function *F,
-                            const TargetTransformInfo *TTI)
-      : NumPredStores(0), TheLoop(L), SE(SE), DL(DL), TLI(TLI), TheFunction(F),
-        TTI(TTI), Induction(nullptr), WidestIndTy(nullptr),
-        LAA(F, L, SE, DL, TLI, AA, DT,
-            {MaxVectorWidth, VectorizationFactor, VectorizationInterleave,
-             RuntimeMemoryCheckThreshold}),
-        HasFunNoNaNAttr(false) {
-  }
+  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
+                            TargetLibraryInfo *TLI, AliasAnalysis *AA,
+                            Function *F, const TargetTransformInfo *TTI,
+                            LoopAccessAnalysis *LAA)
+      : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F),
+        TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), Induction(nullptr),
+        WidestIndTy(nullptr), HasFunNoNaNAttr(false) {}
 
   /// This enum represents the kinds of reductions that we support.
   enum ReductionKind {
@@ -846,7 +686,7 @@ public:
       Index = B.CreateNeg(Index);
     else if (!StepValue->isOne())
       Index = B.CreateMul(Index, StepValue);
-      return B.CreateGEP(StartValue, Index);
+      return B.CreateGEP(nullptr, StartValue, Index);
 
     case IK_NoInduction:
       return nullptr;
@@ -911,15 +751,19 @@ public:
  bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
 
   /// Returns the information that we collected about runtime memory check.
-  RuntimePointerCheck *getRuntimePointerCheck() {
-    return LAA.getRuntimePointerCheck();
+  const LoopAccessInfo::RuntimePointerCheck *getRuntimePointerCheck() const {
+    return LAI->getRuntimePointerCheck();
+  }
+
+  const LoopAccessInfo *getLAI() const {
+    return LAI;
   }
 
   /// This function returns the identity element (or neutral element) for
   /// the operation K.
   static Constant *getReductionIdentity(ReductionKind K, Type *Tp);
 
-  unsigned getMaxSafeDepDistBytes() { return LAA.getMaxSafeDepDistBytes(); }
+  unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
 
   bool hasStride(Value *V) { return StrideSet.count(V); }
   bool mustCheckStrides() { return !StrideSet.empty(); }
@@ -944,10 +788,10 @@ public:
     return (MaskedOp.count(I) != 0);
   }
   unsigned getNumStores() const {
-    return NumStores;
+    return LAI->getNumStores();
   }
   unsigned getNumLoads() const {
-    return NumLoads;
+    return LAI->getNumLoads();
   }
   unsigned getNumPredStores() const {
     return NumPredStores;
   }
@@ -1001,27 +845,32 @@ private:
   void collectStridedAccess(Value *LoadOrStoreInst);
 
  /// Report an analysis message to assist the user in diagnosing loops that are
-  /// not vectorized.
-  void emitAnalysis(VectorizationReport &Message) {
-    VectorizationReport::emitAnalysis(Message, TheFunction, TheLoop);
+  /// not vectorized.  These are handled as LoopAccessReport rather than
+  /// VectorizationReport because the << operator of VectorizationReport returns
+  /// LoopAccessReport.
+  void emitAnalysis(const LoopAccessReport &Message) {
+    LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME);
  }
 
-  unsigned NumLoads;
-  unsigned NumStores;
   unsigned NumPredStores;
 
   /// The loop that we evaluate.
   Loop *TheLoop;
   /// Scev analysis.
   ScalarEvolution *SE;
-  /// DataLayout analysis.
-  const DataLayout *DL;
   /// Target Library Info.
   TargetLibraryInfo *TLI;
   /// Parent function
   Function *TheFunction;
   /// Target Transform Info
   const TargetTransformInfo *TTI;
+  /// Dominator Tree.
+  DominatorTree *DT;
+  // LoopAccess analysis.
+  LoopAccessAnalysis *LAA;
+  // And the loop-accesses info corresponding to this loop.  This pointer is
+  // null until canVectorizeMemory sets it up.
+  const LoopAccessInfo *LAI;
 
   // ---  vectorization state --- //
 
@@ -1043,13 +892,13 @@ private:
   /// This set holds the variables which are known to be uniform after
   /// vectorization.
   SmallPtrSet<Instruction*, 4> Uniforms;
-  LoopAccessAnalysis LAA;
+
   /// Can we assume the absence of NaNs.
   bool HasFunNoNaNAttr;
 
   ValueToValueMap Strides;
   SmallPtrSet<Value *, 8> StrideSet;
-  
+
   /// While vectorizing these instructions we have to generate a
   /// call to the appropriate masked intrinsic
   SmallPtrSet<const Instruction*, 8> MaskedOp;
 };
@@ -1067,10 +916,9 @@ public:
   LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
                              LoopVectorizationLegality *Legal,
                              const TargetTransformInfo &TTI,
-                             const DataLayout *DL, const TargetLibraryInfo *TLI,
-                             AssumptionCache *AC, const Function *F,
-                             const LoopVectorizeHints *Hints)
-      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI),
+                             const TargetLibraryInfo *TLI, AssumptionCache *AC,
+                             const Function *F, const LoopVectorizeHints *Hints)
+      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI),
         TheFunction(F), Hints(Hints) {
     CodeMetrics::collectEphemeralValues(L, AC, EphValues);
   }
@@ -1123,19 +971,16 @@ private:
   /// width. Vector width of one means scalar.
   unsigned getInstructionCost(Instruction *I, unsigned VF);
 
-  /// A helper function for converting Scalar types to vector types.
-  /// If the incoming type is void, we return void. If the VF is 1, we return
-  /// the scalar type.
-  static Type* ToVectorTy(Type *Scalar, unsigned VF);
-
   /// Returns whether the instruction is a load or store and will be emitted
   /// as a vector operation.
   bool isConsecutiveLoadOrStore(Instruction *I);
 
  /// Report an analysis message to assist the user in diagnosing loops that are
-  /// not vectorized.
-  void emitAnalysis(VectorizationReport &Message) {
-    VectorizationReport::emitAnalysis(Message, TheFunction, TheLoop);
+  /// not vectorized.  These are handled as LoopAccessReport rather than
+  /// VectorizationReport because the << operator of VectorizationReport returns
+  /// LoopAccessReport.
+  void emitAnalysis(const LoopAccessReport &Message) {
+    LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME);
  }
 
   /// Values used only by @llvm.assume calls.
@@ -1151,8 +996,6 @@ private:
   LoopVectorizationLegality *Legal;
   /// Vector target information.
   const TargetTransformInfo &TTI;
-  /// Target data layout information.
-  const DataLayout *DL;
   /// Target Library Info.
   const TargetLibraryInfo *TLI;
   const Function *TheFunction;
@@ -1188,7 +1031,7 @@ class LoopVectorizeHints {
     bool validate(unsigned Val) {
       switch (Kind) {
       case HK_WIDTH:
-        return isPowerOf2_32(Val) && Val <= MaxVectorWidth;
+        return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
       case HK_UNROLL:
         return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
       case HK_FORCE:
@@ -1216,7 +1059,8 @@ public:
   };
 
   LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
-      : Width("vectorize.width", VectorizationFactor, HK_WIDTH),
+      : Width("vectorize.width", VectorizerParams::VectorizationFactor,
+              HK_WIDTH),
         Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
         Force("vectorize.enable", FK_Undefined, HK_FORCE),
         TheLoop(L) {
@@ -1224,8 +1068,8 @@ public:
     getHintsFromMetadata();
 
     // force-vector-interleave overrides DisableInterleaving.
-    if (VectorizationInterleave.getNumOccurrences() > 0)
-      Interleave.Value = VectorizationInterleave;
+    if (VectorizerParams::isInterleaveForced())
+      Interleave.Value = VectorizerParams::VectorizationInterleave;
 
     DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
           << "LV: Interleaving disabled by the pass manager\n");
@@ -1416,7 +1260,6 @@ struct LoopVectorize : public FunctionPass {
   }
 
   ScalarEvolution *SE;
-  const DataLayout *DL;
   LoopInfo *LI;
   TargetTransformInfo *TTI;
   DominatorTree *DT;
@@ -1424,6 +1267,7 @@ struct LoopVectorize : public FunctionPass {
   TargetLibraryInfo *TLI;
   AliasAnalysis *AA;
   AssumptionCache *AC;
+  LoopAccessAnalysis *LAA;
   bool DisableUnrolling;
   bool AlwaysVectorize;
 
@@ -1431,8 +1275,6 @@ struct LoopVectorize : public FunctionPass {
 
   bool runOnFunction(Function &F) override {
     SE = &getAnalysis<ScalarEvolution>();
-    DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-    DL = DLP ? &DLP->getDataLayout() : nullptr;
     LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
     TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -1441,6 +1283,7 @@ struct LoopVectorize : public FunctionPass {
    TLI = TLIP ? &TLIP->getTLI() : nullptr;
    AA = &getAnalysis<AliasAnalysis>();
    AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    LAA = &getAnalysis<LoopAccessAnalysis>();
 
     // Compute some weights outside of the loop over the loops. Compute this
     // using a BranchProbability to re-use its scaling math.
@@ -1452,12 +1295,6 @@ struct LoopVectorize : public FunctionPass {
     if (!TTI->getNumberOfRegisters(true))
       return false;
 
-    if (!DL) {
-      DEBUG(dbgs() << "\nLV: Not vectorizing " << F.getName()
-                   << ": Missing data layout\n");
-      return false;
-    }
-
    // Build up a worklist of inner-loops to vectorize. This is necessary as
    // the act of vectorizing or partially unrolling a loop creates new loops
    // and can invalidate iterators across the loops.
@@ -1477,6 +1314,40 @@ struct LoopVectorize : public FunctionPass {
     return Changed;
   }
 
+  static void AddRuntimeUnrollDisableMetaData(Loop *L) {
+    SmallVector<Metadata *, 4> MDs;
+    // Reserve first location for self reference to the LoopID metadata node.
+    MDs.push_back(nullptr);
+    bool IsUnrollMetadata = false;
+    MDNode *LoopID = L->getLoopID();
+    if (LoopID) {
+      // First find existing loop unrolling disable metadata.
+      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+        MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+        if (MD) {
+          const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+          IsUnrollMetadata =
+              S && S->getString().startswith("llvm.loop.unroll.disable");
+        }
+        MDs.push_back(LoopID->getOperand(i));
+      }
+    }
+
+    if (!IsUnrollMetadata) {
+      // Add runtime unroll disable metadata.
+      LLVMContext &Context = L->getHeader()->getContext();
+      SmallVector<Metadata *, 1> DisableOperands;
+      DisableOperands.push_back(
+          MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
+      MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+      MDs.push_back(DisableNode);
+      MDNode *NewLoopID = MDNode::get(Context, MDs);
+      // Set operand 0 to refer to the loop id itself.
+      NewLoopID->replaceOperandWith(0, NewLoopID);
+      L->setLoopID(NewLoopID);
+    }
+  }
+
   bool processLoop(Loop *L) {
     assert(L->empty() && "Only process inner loops.");
 
@@ -1551,7 +1422,7 @@ struct LoopVectorize : public FunctionPass {
     }
 
     // Check if it is legal to vectorize the loop.
-    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI, AA, F, TTI);
+    LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA);
     if (!LVL.canVectorize()) {
       DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
       emitMissedWarning(F, L, Hints);
@@ -1559,8 +1430,7 @@ struct LoopVectorize : public FunctionPass {
     }
 
     // Use the cost model.
-    LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AC, F,
-                                  &Hints);
+    LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints);
 
     // Check the function attributes to find out if this function should be
     // optimized for size.
@@ -1624,14 +1494,20 @@ struct LoopVectorize : public FunctionPass {
 
       // We decided not to vectorize, but we may want to unroll.
 
-      InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
+      InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, UF);
       Unroller.vectorize(&LVL);
     } else {
       // If we decided that it is *legal* to vectorize the loop then do it.
-      InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
+      InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, UF);
       LB.vectorize(&LVL);
       ++LoopsVectorized;
 
+      // Add metadata to disable runtime unrolling of the scalar loop when
+      // there is no runtime check for strides and memory: in that situation
+      // the scalar loop is rarely executed and is not worth unrolling.
+      if (!LB.IsSafetyChecksAdded())
+        AddRuntimeUnrollDisableMetaData(L);
+
       // Report the vectorization decision.
       emitOptimizationRemark(
           F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
@@ -1656,6 +1532,7 @@ struct LoopVectorize : public FunctionPass {
    AU.addRequired<ScalarEvolution>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AliasAnalysis>();
+    AU.addRequired<LoopAccessAnalysis>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<AliasAnalysis>();
@@ -1670,65 +1547,6 @@ struct LoopVectorize : public FunctionPass {
 // LoopVectorizationCostModel.
 //===----------------------------------------------------------------------===//
 
-static Value *stripIntegerCast(Value *V) {
-  if (CastInst *CI = dyn_cast<CastInst>(V))
-    if (CI->getOperand(0)->getType()->isIntegerTy())
-      return CI->getOperand(0);
-  return V;
-}
-
-///\brief Replaces the symbolic stride in a pointer SCEV expression by one.
-///
-/// If \p OrigPtr is not null, use it to look up the stride value instead of
-/// \p Ptr.
-static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE,
-                                             ValueToValueMap &PtrToStride,
-                                             Value *Ptr,
-                                             Value *OrigPtr = nullptr) {
-
-  const SCEV *OrigSCEV = SE->getSCEV(Ptr);
-
-  // If there is an entry in the map return the SCEV of the pointer with the
-  // symbolic stride replaced by one.
-  ValueToValueMap::iterator SI = PtrToStride.find(OrigPtr ? OrigPtr : Ptr);
-  if (SI != PtrToStride.end()) {
-    Value *StrideVal = SI->second;
-
-    // Strip casts.
-    StrideVal = stripIntegerCast(StrideVal);
-
-    // Replace symbolic stride by one.
-    Value *One = ConstantInt::get(StrideVal->getType(), 1);
-    ValueToValueMap RewriteMap;
-    RewriteMap[StrideVal] = One;
-
-    const SCEV *ByOne =
-        SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true);
-    DEBUG(dbgs() << "LV: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne
-                 << "\n");
-    return ByOne;
-  }
-
-  // Otherwise, just return the SCEV of the original pointer.
-  return SE->getSCEV(Ptr);
-}
-
-void RuntimePointerCheck::insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr,
-                                 bool WritePtr, unsigned DepSetId,
-                                 unsigned ASId, ValueToValueMap &Strides) {
-  // Get the stride replaced scev.
-  const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
-  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
-  assert(AR && "Invalid addrec expression");
-  const SCEV *Ex = SE->getBackedgeTakenCount(Lp);
-  const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
-  Pointers.push_back(Ptr);
-  Starts.push_back(AR->getStart());
-  Ends.push_back(ScEnd);
-  IsWritePtr.push_back(WritePtr);
-  DependencySetId.push_back(DepSetId);
-  AliasSetId.push_back(ASId);
-}
-
 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
   // We need to place the broadcast of invariant variables outside the loop.
   Instruction *Instr = dyn_cast<Instruction>(V);
@@ -1779,10 +1597,10 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
 /// \brief Find the operand of the GEP that should be checked for consecutive
 /// stores. This ignores trailing indices that have no effect on the final
 /// pointer.
-static unsigned getGEPInductionOperand(const DataLayout *DL,
-                                       const GetElementPtrInst *Gep) {
+static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {
+  const DataLayout &DL = Gep->getModule()->getDataLayout();
   unsigned LastOperand = Gep->getNumOperands() - 1;
-  unsigned GEPAllocSize = DL->getTypeAllocSize(
+  unsigned GEPAllocSize = DL.getTypeAllocSize(
      cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
 
   // Walk backwards and try to peel off zeros.
@@ -1793,7 +1611,7 @@ static unsigned getGEPInductionOperand(const DataLayout *DL,
 
    // If it's a type with the same allocation size as the result of the GEP we
    // can peel off the zero index.
-    if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize)
+    if (DL.getTypeAllocSize(*GEPTI) != GEPAllocSize)
      break;
    --LastOperand;
  }
@@ -1839,7 +1657,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
     return II.getConsecutiveDirection();
   }
 
-  unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
+  unsigned InductionOperand = getGEPInductionOperand(Gep);
 
   // Check that all of the gep indices are uniform except for our induction
   // operand.
@@ -1885,12 +1703,8 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
   return 0;
 }
 
-bool LoopAccessAnalysis::isUniform(Value *V) {
-  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
-}
-
 bool LoopVectorizationLegality::isUniform(Value *V) {
-  return LAA.isUniform(V);
+  return LAI->isUniform(V);
 }
 
 InnerLoopVectorizer::VectorParts&
@@ -1936,11 +1750,12 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
   unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
  // An alignment of 0 means target abi alignment. We need to use the scalar's
  // target abi alignment in such a case.
+  const DataLayout &DL = Instr->getModule()->getDataLayout();
   if (!Alignment)
-    Alignment = DL->getABITypeAlignment(ScalarDataTy);
+    Alignment = DL.getABITypeAlignment(ScalarDataTy);
   unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
-  unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
-  unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
+  unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy);
+  unsigned VectorElementSize = DL.getTypeStoreSize(DataTy) / VF;
 
   if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
       !Legal->isMaskRequired(SI))
@@ -1981,7 +1796,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
    // The last index does not have to be the induction. It can be
    // consecutive and be a function of the index.  For example A[I+1];
    unsigned NumOperands = Gep->getNumOperands();
-    unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
+    unsigned InductionOperand = getGEPInductionOperand(Gep);
    // Create the new GEP with the new induction variable.
    GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
 
@@ -2024,7 +1839,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
 
    for (unsigned Part = 0; Part < UF; ++Part) {
      // Calculate the pointer for the specific unroll-part.
-      Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
+      Value *PartPtr =
+          Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
 
      if (Reverse) {
        // If we store to reverse consecutive memory locations then we need
@@ -2032,8 +1848,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
        StoredVal[Part] = reverseVector(StoredVal[Part]);
        // If the address is consecutive but reversed, then the
        // wide store needs to start at the last vector element.
-        PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
-        PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+        PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
+        PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
        Mask[Part] = reverseVector(Mask[Part]);
      }
 
@@ -2056,13 +1872,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    // Calculate the pointer for the specific unroll-part.
-    Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
+    Value *PartPtr =
+        Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
 
    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide load needs to start at the last vector element.
-      PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
-      PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+      PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
+      PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
      Mask[Part] = reverseVector(Mask[Part]);
    }
 
@@ -2238,101 +2055,6 @@ InnerLoopVectorizer::addStrideCheck(Instruction *Loc) {
   return std::make_pair(FirstInst, TheCheck);
 }
 
-std::pair<Instruction *, Instruction *>
-InnerLoopVectorizer::addRuntimeCheck(Instruction *Loc) {
-  RuntimePointerCheck *PtrRtCheck = Legal->getRuntimePointerCheck();
-
-  Instruction *tnullptr = nullptr;
-  if (!PtrRtCheck->Need)
-    return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
-
-  unsigned NumPointers = PtrRtCheck->Pointers.size();
-  SmallVector<TrackingVH<Value>, 2> Starts;
-  SmallVector<TrackingVH<Value>, 2> Ends;
-
-  LLVMContext &Ctx = Loc->getContext();
-  SCEVExpander Exp(*SE, "induction");
-  Instruction *FirstInst = nullptr;
-
-  for (unsigned i = 0; i < NumPointers; ++i) {
-    Value *Ptr = PtrRtCheck->Pointers[i];
-    const SCEV *Sc = SE->getSCEV(Ptr);
-
-    if (SE->isLoopInvariant(Sc, OrigLoop)) {
-      DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
-            *Ptr <<"\n");
-      Starts.push_back(Ptr);
-      Ends.push_back(Ptr);
-    } else {
-      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n');
-      unsigned AS = Ptr->getType()->getPointerAddressSpace();
-
-      // Use this type for pointer arithmetic.
-      Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
-
-      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
-      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
-      Starts.push_back(Start);
-      Ends.push_back(End);
-    }
-  }
-
-  IRBuilder<> ChkBuilder(Loc);
-  // Our instructions might fold to a constant.
-  Value *MemoryRuntimeCheck = nullptr;
-  for (unsigned i = 0; i < NumPointers; ++i) {
-    for (unsigned j = i+1; j < NumPointers; ++j) {
-      // No need to check if two readonly pointers intersect.
-      if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
-        continue;
-
-      // Only need to check pointers between two different dependency sets.
-      if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
-        continue;
-      // Only need to check pointers in the same alias set.
-      if (PtrRtCheck->AliasSetId[i] != PtrRtCheck->AliasSetId[j])
-        continue;
-
-      unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
-      unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();
-
-      assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&
-             (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&
-             "Trying to bounds check pointers with different address spaces");
-
-      Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
-      Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
-
-      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc");
-      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc");
-      Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy1, "bc");
-      Value *End1 =   ChkBuilder.CreateBitCast(Ends[j],   PtrArithTy0, "bc");
-
-      Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
-      FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
-      Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
-      FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
-      Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
-      FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
-      if (MemoryRuntimeCheck) {
-        IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
-                                         "conflict.rdx");
-        FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
-      }
-      MemoryRuntimeCheck = IsConflict;
-    }
-  }
-
-  // We have to do this trickery because the IRBuilder might fold the check to a
-  // constant expression in which case there is no Instruction anchored in
-  // the block.
-  Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
-                                                 ConstantInt::getTrue(Ctx));
-  ChkBuilder.Insert(Check, "memcheck.conflict");
-  FirstInst = getFirstInst(FirstInst, Check, Loc);
-  return std::make_pair(FirstInst, Check);
-}
-
 void InnerLoopVectorizer::createEmptyLoop() {
   /*
    In this function we generate a new loop. The new loop will contain
@@ -2397,9 +2119,11 @@ void InnerLoopVectorizer::createEmptyLoop() {
   ExitCount = SE->getAddExpr(BackedgeTakeCount,
                              SE->getConstant(BackedgeTakeCount->getType(), 1));
 
+  const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout();
+
  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
-  SCEVExpander Exp(*SE, "induction");
+  SCEVExpander Exp(*SE, DL, "induction");
 
  // We need to test whether the backedge-taken count is uint##_max.  Adding one
  // to it will cause overflow and an incorrect loop trip count in the vector
@@ -2535,6 +2259,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
   std::tie(FirstCheckInst, StrideCheck) =
       addStrideCheck(LastBypassBlock->getTerminator());
   if (StrideCheck) {
+    AddedSafetyChecks = true;
     // Create a new block containing the stride check.
     BasicBlock *CheckBlock =
        LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
@@ -2557,11 +2282,12 @@ void InnerLoopVectorizer::createEmptyLoop() {
   // faster.
   Instruction *MemRuntimeCheck;
   std::tie(FirstCheckInst, MemRuntimeCheck) =
-    addRuntimeCheck(LastBypassBlock->getTerminator());
+      Legal->getLAI()->addRuntimeCheck(LastBypassBlock->getTerminator());
   if (MemRuntimeCheck) {
+    AddedSafetyChecks = true;
     // Create a new block containing the memory check.
     BasicBlock *CheckBlock =
-        LastBypassBlock->splitBasicBlock(MemRuntimeCheck, "vector.memcheck");
+        LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.memcheck");
     if (ParentLoop)
       ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
     LoopBypassBlocks.push_back(CheckBlock);
@@ -2797,10 +2523,9 @@ getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
   }
 }
 
-Value *createMinMaxOp(IRBuilder<> &Builder,
-                      LoopVectorizationLegality::MinMaxReductionKind RK,
-                      Value *Left,
-                      Value *Right) {
+static Value *createMinMaxOp(IRBuilder<> &Builder,
+                             LoopVectorizationLegality::MinMaxReductionKind RK,
+                             Value *Left, Value *Right) {
   CmpInst::Predicate P = CmpInst::ICMP_NE;
   switch (RK) {
   default:
@@ -2911,6 +2636,95 @@ static Value *addFastMathFlag(Value *V) {
   return V;
 }
 
+/// Estimate the overhead of scalarizing a value. Insert and Extract are set if
+/// the result needs to be inserted and/or extracted from vectors.
+static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
+                                         const TargetTransformInfo &TTI) {
+  if (Ty->isVoidTy())
+    return 0;
+
+  assert(Ty->isVectorTy() && "Can only scalarize vectors");
+  unsigned Cost = 0;
+
+  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+    if (Insert)
+      Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, i);
+    if (Extract)
+      Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+  }
+
+  return Cost;
+}
+
+// Estimate cost of a call instruction CI if it were vectorized with factor VF.
+// Return the cost of the instruction, including scalarization overhead if it's
+// needed. The flag NeedToScalarize shows if the call needs to be scalarized -
+// i.e. either vector version isn't available, or is too expensive.
+static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
+                                  const TargetTransformInfo &TTI,
+                                  const TargetLibraryInfo *TLI,
+                                  bool &NeedToScalarize) {
+  Function *F = CI->getCalledFunction();
+  StringRef FnName = CI->getCalledFunction()->getName();
+  Type *ScalarRetTy = CI->getType();
+  SmallVector<Type *, 4> Tys, ScalarTys;
+  for (auto &ArgOp : CI->arg_operands())
+    ScalarTys.push_back(ArgOp->getType());
+
+  // Estimate cost of scalarized vector call. The source operands are assumed
+  // to be vectors, so we need to extract individual elements from there,
+  // execute VF scalar calls, and then gather the result into the vector return
+  // value.
+  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
+  if (VF == 1)
+    return ScalarCallCost;
+
+  // Compute corresponding vector type for return value and arguments.
+  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
+  for (unsigned i = 0, ie = ScalarTys.size(); i != ie; ++i)
+    Tys.push_back(ToVectorTy(ScalarTys[i], VF));
+
+  // Compute costs of unpacking argument values for the scalar calls and
+  // packing the return values to a vector.
+  unsigned ScalarizationCost =
+      getScalarizationOverhead(RetTy, true, false, TTI);
+  for (unsigned i = 0, ie = Tys.size(); i != ie; ++i)
+    ScalarizationCost += getScalarizationOverhead(Tys[i], false, true, TTI);
+
+  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
+
+  // If we can't emit a vector call for this function, then the currently found
+  // cost is the cost we need to return.
+  NeedToScalarize = true;
+  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
+    return Cost;
+
+  // If the corresponding vector cost is cheaper, return its cost.
+  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
+  if (VectorCallCost < Cost) {
+    NeedToScalarize = false;
+    return VectorCallCost;
+  }
+  return Cost;
+}
+
+// Estimate cost of an intrinsic call instruction CI if it were vectorized with
+// factor VF. Return the cost of the instruction, including scalarization
+// overhead if it's needed.
+static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
+                                       const TargetTransformInfo &TTI,
+                                       const TargetLibraryInfo *TLI) {
+  Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+  assert(ID && "Expected intrinsic call!");
+
+  Type *RetTy = ToVectorTy(CI->getType(), VF);
+  SmallVector<Type *, 4> Tys;
+  for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
+    Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
+
+  return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
+}
+
 void InnerLoopVectorizer::vectorizeLoop() {
   //===------------------------------------------------===//
   //
@@ -3498,37 +3312,71 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
      Module *M = BB->getParent()->getParent();
      CallInst *CI = cast<CallInst>(it);
+
+      StringRef FnName = CI->getCalledFunction()->getName();
+      Function *F = CI->getCalledFunction();
+      Type *RetTy = ToVectorTy(CI->getType(), VF);
+      SmallVector<Type *, 4> Tys;
+      for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
+        Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
+
      Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
-      assert(ID && "Not an intrinsic call!");
-      switch (ID) {
-      case Intrinsic::assume:
-      case Intrinsic::lifetime_end:
-      case Intrinsic::lifetime_start:
+      if (ID &&
+          (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+           ID == Intrinsic::lifetime_start)) {
        scalarizeInstruction(it);
        break;
-      default:
-        bool HasScalarOpd = hasVectorInstrinsicScalarOpd(ID, 1);
-        for (unsigned Part = 0; Part < UF; ++Part) {
-          SmallVector<Value *, 4> Args;
-          for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
-            if (HasScalarOpd && i == 1) {
-              Args.push_back(CI->getArgOperand(i));
-              continue;
-            }
-            VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
-            Args.push_back(Arg[Part]);
-          }
-          Type *Tys[] = {CI->getType()};
-          if (VF > 1)
-            Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+      }
+      // The flag shows whether we use Intrinsic or a usual Call for vectorized
+      // version of the instruction.
+      // Is it beneficial to perform intrinsic call compared to lib call?
+      bool NeedToScalarize;
+      unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+      bool UseVectorIntrinsic =
+          ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+      if (!UseVectorIntrinsic && NeedToScalarize) {
+        scalarizeInstruction(it);
+        break;
+      }
 
+      for (unsigned Part = 0; Part < UF; ++Part) {
+        SmallVector<Value *, 4> Args;
+        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+          Value *Arg = CI->getArgOperand(i);
+          // Some intrinsics have a scalar argument - don't replace it with a
+          // vector.
+          if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
+            VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
+            Arg = VectorArg[Part];
+          }
+          Args.push_back(Arg);
+        }
 
-          Function *F = Intrinsic::getDeclaration(M, ID, Tys);
-          Entry[Part] = Builder.CreateCall(F, Args);
+        Function *VectorF;
+        if (UseVectorIntrinsic) {
+          // Use vector version of the intrinsic.
+          Type *TysForDecl[] = {CI->getType()};
+          if (VF > 1)
+            TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+          VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+        } else {
+          // Use vector version of the library call.
+          StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
+          assert(!VFnName.empty() && "Vector function name is empty.");
+          VectorF = M->getFunction(VFnName);
+          if (!VectorF) {
+            // Generate a declaration
+            FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
+            VectorF =
+                Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
+            VectorF->copyAttributesFrom(F);
+          }
        }
+        assert(VectorF && "Can't create vector function.");
+        Entry[Part] = Builder.CreateCall(VectorF, Args);
      }
+
      propagateMetadata(Entry, it);
      break;
    }
@@ -3725,7 +3573,7 @@ bool LoopVectorizationLegality::canVectorize() {
   collectLoopUniforms();
 
   DEBUG(dbgs() << "LV: We can vectorize this loop" <<
-        (LAA.getRuntimePointerCheck()->Need ? " (with a runtime bound check)" :
+        (LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)" :
          "")
        <<"!\n");
 
@@ -3780,10 +3628,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
   // Look for the attribute signaling the absence of NaNs.
   Function &F = *Header->getParent();
+  const DataLayout &DL = F.getParent()->getDataLayout();
   if (F.hasFnAttribute("no-nans-fp-math"))
-    HasFunNoNaNAttr = F.getAttributes().getAttribute(
-        AttributeSet::FunctionIndex,
-        "no-nans-fp-math").getValueAsString() == "true";
+    HasFunNoNaNAttr =
+        F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
 
   // For each block in the loop.
   for (Loop::block_iterator bb = TheLoop->block_begin(),
@@ -3836,9 +3684,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
        if (IK_NoInduction != IK) {
          // Get the widest type.
          if (!WidestIndTy)
-            WidestIndTy = convertPointerToIntegerType(*DL, PhiTy);
+            WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
          else
-            WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy);
+            WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
 
          // Int inductions are special because we only allow one IV.
          if (IK == IK_IntInduction && StepValue->isOne()) {
@@ -3909,13 +3757,17 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
          return false;
        }// end of PHI handling
 
-        // We still don't handle functions. However, we can ignore dbg intrinsic
-        // calls and we do handle certain intrinsic and libm functions.
+        // We handle calls that:
+        //   * Are debug info intrinsics.
+        //   * Have a mapping to an IR intrinsic.
+        //   * Have a vector version available.
        CallInst *CI = dyn_cast<CallInst>(it);
-        if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
+        if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) &&
+            !(CI->getCalledFunction() && TLI &&
+              TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
          emitAnalysis(VectorizationReport(it) <<
                       "call instruction cannot be vectorized");
-          DEBUG(dbgs() << "LV: Found a call site.\n");
+          DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
          return false;
        }
 
@@ -3983,13 +3835,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
 ///\brief Remove GEPs whose indices, except the last one, are loop invariant and
 /// return the induction operand of the gep pointer.
-static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE,
-                                 const DataLayout *DL, Loop *Lp) {
+static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   if (!GEP)
     return Ptr;
 
-  unsigned InductionOperand = getGEPInductionOperand(DL, GEP);
+  unsigned InductionOperand = getGEPInductionOperand(GEP);
 
   // Check that all of the gep indices are uniform except for our induction
   // operand.
@@ -4018,8 +3869,7 @@ static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
 ///\brief Get the stride of a pointer access in a loop.
 /// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a
 /// pointer to the Value, or null otherwise.
-static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
-                                   const DataLayout *DL, Loop *Lp) {
+static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
   const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
   if (!PtrTy || PtrTy->isAggregateType())
     return nullptr;
@@ -4032,7 +3882,7 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
   // The size of the pointer access.
   int64_t PtrAccessSize = 1;
 
-  Ptr = stripGetElementPtr(Ptr, SE, DL, Lp);
+  Ptr = stripGetElementPtr(Ptr, SE, Lp);
   const SCEV *V = SE->getSCEV(Ptr);
 
   if (Ptr != OrigPtr)
@@ -4051,7 +3901,8 @@ static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
  // Strip off the size of access multiplication if we are still analyzing the
  // pointer.
   if (OrigPtr == Ptr) {
-    DL->getTypeAllocSize(PtrTy->getElementType());
+    const DataLayout &DL = Lp->getHeader()->getModule()->getDataLayout();
+    DL.getTypeAllocSize(PtrTy->getElementType());
    if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
      if (M->getOperand(0)->getSCEVType() != scConstant)
        return nullptr;
@@ -4103,7 +3954,7 @@ void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
   else
     return;
 
-  Value *Stride = getStrideFromPointer(Ptr, SE, DL, TheLoop);
+  Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop);
   if (!Stride)
     return;
 
@@ -4150,985 +4001,34 @@ void LoopVectorizationLegality::collectLoopUniforms() {
   }
 }
 
-namespace {
-/// \brief Analyses memory accesses in a loop.
-///
-/// Checks whether run time pointer checks are needed and builds sets for data
-/// dependence checking.
-class AccessAnalysis {
-public:
-  /// \brief Read or write access location.
-  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
-  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
-
-  /// \brief Set of potential dependent memory accesses.
-  typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
-
-  AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) :
-    DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {}
-
-  /// \brief Register a load and whether it is only read from.
-  void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) {
-    Value *Ptr = const_cast<Value*>(Loc.Ptr);
-    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
-    Accesses.insert(MemAccessInfo(Ptr, false));
-    if (IsReadOnly)
-      ReadOnlyPtr.insert(Ptr);
-  }
-
-  /// \brief Register a store.
-  void addStore(AliasAnalysis::Location &Loc) {
-    Value *Ptr = const_cast<Value*>(Loc.Ptr);
-    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
-    Accesses.insert(MemAccessInfo(Ptr, true));
-  }
-
-  /// \brief Check whether we can check the pointers at runtime for
-  /// non-intersection.
-  bool canCheckPtrAtRT(RuntimePointerCheck &RtCheck, unsigned &NumComparisons,
-                       ScalarEvolution *SE, Loop *TheLoop,
-                       ValueToValueMap &Strides,
-                       bool ShouldCheckStride = false);
-
-  /// \brief Goes over all memory accesses, checks whether a RT check is needed
-  /// and builds sets of dependent accesses.
-  void buildDependenceSets() {
-    processMemAccesses();
-  }
-
-  bool isRTCheckNeeded() { return IsRTCheckNeeded; }
-
-  bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
-  void resetDepChecks() { CheckDeps.clear(); }
-
-  MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
-
-private:
-  typedef SetVector<MemAccessInfo> PtrAccessSet;
-
-  /// \brief Go over all memory accesses and check whether runtime pointer
-  /// checks are needed and build sets of dependency check candidates.
-  void processMemAccesses();
-
-  /// Set of all accesses.
-  PtrAccessSet Accesses;
-
-  /// Set of accesses that need a further dependence check.
-  MemAccessInfoSet CheckDeps;
-
-  /// Set of pointers that are read only.
-  SmallPtrSet<Value*, 16> ReadOnlyPtr;
-
-  const DataLayout *DL;
-
-  /// An alias set tracker to partition the access set by underlying object and
-  // intrinsic property (such as TBAA metadata).
-  AliasSetTracker AST;
-
-  /// Sets of potentially dependent accesses - members of one set share an
-  /// underlying pointer. The set "CheckDeps" identifies which sets really need
-  /// a dependence check.
-  DepCandidates &DepCands;
-
-  bool IsRTCheckNeeded;
-};
-
-} // end anonymous namespace
-
-/// \brief Check whether a pointer can participate in a runtime bounds check.
-static bool hasComputableBounds(ScalarEvolution *SE, ValueToValueMap &Strides,
-                                Value *Ptr) {
-  const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
-  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
-  if (!AR)
-    return false;
-
-  return AR->isAffine();
-}
-
-/// \brief Check the stride of the pointer and ensure that it does not wrap in
-/// the address space.
-static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
-                        const Loop *Lp, ValueToValueMap &StridesMap);
-
-bool AccessAnalysis::canCheckPtrAtRT(
-    RuntimePointerCheck &RtCheck,
-    unsigned &NumComparisons, ScalarEvolution *SE, Loop *TheLoop,
-    ValueToValueMap &StridesMap, bool ShouldCheckStride) {
-  // Find pointers with computable bounds. We are going to use this information
-  // to place a runtime bound check.
-  bool CanDoRT = true;
-
-  bool IsDepCheckNeeded = isDependencyCheckNeeded();
-  NumComparisons = 0;
-
-  // We assign a consecutive id to accesses from different alias sets.
-  // Accesses between different groups don't need to be checked.
-  unsigned ASId = 1;
-  for (auto &AS : AST) {
-    unsigned NumReadPtrChecks = 0;
-    unsigned NumWritePtrChecks = 0;
-
-    // We assign consecutive ids to accesses from different dependence sets.
-    // Accesses within the same set don't need a runtime check.
-    unsigned RunningDepId = 1;
-    DenseMap<Value *, unsigned> DepSetId;
-
-    for (auto A : AS) {
-      Value *Ptr = A.getValue();
-      bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
-      MemAccessInfo Access(Ptr, IsWrite);
-
-      if (IsWrite)
-        ++NumWritePtrChecks;
-      else
-        ++NumReadPtrChecks;
-
-      if (hasComputableBounds(SE, StridesMap, Ptr) &&
-          // When we run after a failing dependency check we have to make sure
-          // we don't have wrapping pointers.
-          (!ShouldCheckStride ||
-           isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) {
-        // The id of the dependence set.
-        unsigned DepId;
-
-        if (IsDepCheckNeeded) {
-          Value *Leader = DepCands.getLeaderValue(Access).getPointer();
-          unsigned &LeaderId = DepSetId[Leader];
-          if (!LeaderId)
-            LeaderId = RunningDepId++;
-          DepId = LeaderId;
-        } else
-          // Each access has its own dependence set.
-          DepId = RunningDepId++;
-
-        RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap);
-
-        DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n');
-      } else {
-        CanDoRT = false;
-      }
-    }
-
-    if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
-      NumComparisons += 0; // Only one dependence set.
-    else {
-      NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks +
-                                              NumWritePtrChecks - 1));
-    }
-
-    ++ASId;
-  }
-
-  // If the pointers that we would use for the bounds comparison have different
-  // address spaces, assume the values aren't directly comparable, so we can't
-  // use them for the runtime check. We also have to assume they could
-  // overlap. In the future there should be metadata for whether address spaces
-  // are disjoint.
-  unsigned NumPointers = RtCheck.Pointers.size();
-  for (unsigned i = 0; i < NumPointers; ++i) {
-    for (unsigned j = i + 1; j < NumPointers; ++j) {
-      // Only need to check pointers between two different dependency sets.
-      if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
-        continue;
-      // Only need to check pointers in the same alias set.
-      if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j])
-        continue;
-
-      Value *PtrI = RtCheck.Pointers[i];
-      Value *PtrJ = RtCheck.Pointers[j];
-
-      unsigned ASi = PtrI->getType()->getPointerAddressSpace();
-      unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
-      if (ASi != ASj) {
-        DEBUG(dbgs() << "LV: Runtime check would require comparison between"
-                        " different address spaces\n");
-        return false;
-      }
-    }
-  }
-
-  return CanDoRT;
-}
-
-void AccessAnalysis::processMemAccesses() {
-  // We process the set twice: first we process read-write pointers, last we
-  // process read-only pointers. This allows us to skip dependence tests for
-  // read-only pointers.
-
-  DEBUG(dbgs() << "LV: Processing memory accesses...\n");
-  DEBUG(dbgs() << "  AST: "; AST.dump());
-  DEBUG(dbgs() << "LV:   Accesses:\n");
-  DEBUG({
-    for (auto A : Accesses)
-      dbgs() << "\t" << *A.getPointer() << " (" <<
-                (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?
-                                         "read-only" : "read")) << ")\n";
-  });
-
-  // The AliasSetTracker has nicely partitioned our pointers by metadata
-  // compatibility and potential for underlying-object overlap. As a result, we
-  // only need to check for potential pointer dependencies within each alias
-  // set.
-  for (auto &AS : AST) {
-    // Note that both the alias-set tracker and the alias sets themselves use
-    // linked lists internally and so the iteration order here is deterministic
-    // (matching the original instruction order within each set).
-
-    bool SetHasWrite = false;
-
-    // Map of pointers to last access encountered.
-    // Map of pointers to last access encountered.
-    typedef DenseMap<Value *, MemAccessInfo> UnderlyingObjToAccessMap;
-    UnderlyingObjToAccessMap ObjToLastAccess;
-
-    // Set of accesses to check after all writes have been processed.
-    PtrAccessSet DeferredAccesses;
-
-    // Iterate over each alias set twice, once to process read/write
-    // pointers, and then to process read-only pointers.
-    for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
-      bool UseDeferred = SetIteration > 0;
-      PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
-
-      for (auto AV : AS) {
-        Value *Ptr = AV.getValue();
-
-        // For a single memory access in AliasSetTracker, Accesses may
-        // contain both read and write, and they both need to be handled for
-        // CheckDeps.
-        for (auto AC : S) {
-          if (AC.getPointer() != Ptr)
-            continue;
-
-          bool IsWrite = AC.getInt();
-
-          // If we're using the deferred access set, then it contains only
-          // reads.
-          bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
-          if (UseDeferred && !IsReadOnlyPtr)
-            continue;
-          // Otherwise, the pointer must be in the PtrAccessSet, either as a
-          // read or a write.
-          assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
-                  S.count(MemAccessInfo(Ptr, false))) &&
-                 "Alias-set pointer not in the access set?");
-
-          MemAccessInfo Access(Ptr, IsWrite);
-          DepCands.insert(Access);
-
-          // Memorize read-only pointers for later processing and skip them
-          // in the first round (they need to be checked after we have seen
-          // all write pointers). Note: we also mark pointers that are not
-          // consecutive as "read-only" pointers (so that we check
-          // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
-          if (!UseDeferred && IsReadOnlyPtr) {
-            DeferredAccesses.insert(Access);
-            continue;
-          }
-
-          // If this is a write - check other reads and writes for conflicts.
-          // If this is a read only check other writes for conflicts (but
-          // only if there is no other write to the ptr - this is an
-          // optimization to catch "a[i] = a[i] + ..." without having to do
-          // a dependence check).
-          if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
-            CheckDeps.insert(Access);
-            IsRTCheckNeeded = true;
-          }
-
-          if (IsWrite)
-            SetHasWrite = true;
-
-          // Create sets of pointers connected by a shared alias set and
-          // underlying object.
-          typedef SmallVector<Value *, 16> ValueVector;
-          ValueVector TempObjects;
-          GetUnderlyingObjects(Ptr, TempObjects, DL);
-          for (Value *UnderlyingObj : TempObjects) {
-            UnderlyingObjToAccessMap::iterator Prev =
-                ObjToLastAccess.find(UnderlyingObj);
-            if (Prev != ObjToLastAccess.end())
-              DepCands.unionSets(Access, Prev->second);
-
-            ObjToLastAccess[UnderlyingObj] = Access;
-          }
-        }
-      }
-    }
-  }
-}
-
-namespace {
-/// \brief Checks memory dependences among accesses to the same underlying
-/// object to determine whether vectorization is legal or not (and at which
-/// vectorization factor).
-///
-/// This class works under the assumption that we already checked that memory
-/// locations with different underlying pointers are "must-not alias".
-/// We use the ScalarEvolution framework to symbolically evaluate access
-/// function pairs. Since we currently don't restructure the loop we can rely
-/// on the program order of memory accesses to determine their safety.
-/// At the moment we will only deem accesses as safe for:
-///  * A negative constant distance assuming program order.
-///
-///      Safe: tmp = a[i + 1];      OR     a[i + 1] = x;
-///            a[i] = tmp;                 y = a[i];
-///
-///   The latter case is safe because later checks guarantee that there
-///   can't be a cycle through a phi node (that is, we check that "x" and
-///   "y" is not the same variable: a header phi can only be an induction or
-///   a reduction, a reduction can't have a memory sink, an induction can't
-///   have a memory source). This is important and must not be violated (or
-///   we have to resort to checking for cycles through memory).
-///
-///  * A positive constant distance assuming program order that is bigger
-///    than the biggest memory access.
-///
-///     tmp = a[i]        OR              b[i] = x
-///     a[i+2] = tmp                      y = b[i+2];
-///
-///     Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
-///
-///  * Zero distances and all accesses have the same size.
-///
-class MemoryDepChecker {
-public:
-  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
-  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
-
-  MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L,
-                   const LoopAccessAnalysis::VectorizerParams &VectParams)
-      : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
-        ShouldRetryWithRuntimeCheck(false), VectParams(VectParams) {}
-
-  /// \brief Register the location (instructions are given increasing
-  /// numbers) of a write access.
-  void addAccess(StoreInst *SI) {
-    Value *Ptr = SI->getPointerOperand();
-    Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
-    InstMap.push_back(SI);
-    ++AccessIdx;
-  }
-
-  /// \brief Register the location (instructions are given increasing
-  /// numbers) of a read access.
-  void addAccess(LoadInst *LI) {
-    Value *Ptr = LI->getPointerOperand();
-    Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
-    InstMap.push_back(LI);
-    ++AccessIdx;
-  }
-
-  /// \brief Check whether the dependencies between the accesses are safe.
-  ///
-  /// Only checks sets with elements in \p CheckDeps.
-  bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
-                   MemAccessInfoSet &CheckDeps, ValueToValueMap &Strides);
-
-  /// \brief The maximum number of bytes of a vector register we can
-  /// vectorize the accesses safely with.
-  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
-
-  /// \brief In some cases when the dependency check fails we can still
-  /// vectorize the loop with a dynamic array access check.
-  bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
-
-private:
-  ScalarEvolution *SE;
-  const DataLayout *DL;
-  const Loop *InnermostLoop;
-
-  /// \brief Maps access locations (ptr, read/write) to program order.
-  DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
-
-  /// \brief Memory access instructions in program order.
-  SmallVector<Instruction *, 16> InstMap;
-
-  /// \brief The program order index to be used for the next instruction.
-  unsigned AccessIdx;
-
-  // We can access this many bytes in parallel safely.
-  unsigned MaxSafeDepDistBytes;
-
-  /// \brief If we see a non-constant dependence distance we can still try to
-  /// vectorize this loop with runtime checks.
-  bool ShouldRetryWithRuntimeCheck;
-
-  /// \brief Vectorizer parameters used by the analysis.
-  LoopAccessAnalysis::VectorizerParams VectParams;
-
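  // A minimal driver sketch (assumed, for illustration) showing how a client
  // such as the legality analysis below feeds the checker: accesses are
  // registered strictly in program order, so every instruction receives an
  // increasing index that isDependent() later uses to orient source and sink.
  //
  //   void registerLoopAccesses(MemoryDepChecker &DepChecker, BasicBlock *BB) {
  //     for (Instruction &I : *BB) {
  //       if (LoadInst *LI = dyn_cast<LoadInst>(&I))
  //         DepChecker.addAccess(LI);   // read: gets index AccessIdx, then ++
  //       else if (StoreInst *SI = dyn_cast<StoreInst>(&I))
  //         DepChecker.addAccess(SI);   // write: same numbering scheme
  //     }
  //   }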
-  /// \brief Check whether there is a plausible dependence between the two
-  /// accesses.
-  ///
-  /// Access \p A must happen before \p B in program order. The two indices
-  /// identify the index into the program order map.
-  ///
-  /// This function checks whether there is a plausible dependence (or the
-  /// absence of such can't be proved) between the two accesses. If there is
-  /// a plausible dependence but the dependence distance is bigger than one
-  /// element access it records this distance in \p MaxSafeDepDistBytes (if
-  /// this distance is smaller than any other distance encountered so far).
-  /// Otherwise, this function returns true signaling a possible dependence.
-  bool isDependent(const MemAccessInfo &A, unsigned AIdx,
-                   const MemAccessInfo &B, unsigned BIdx,
-                   ValueToValueMap &Strides);
-
-  /// \brief Check whether the data dependence could prevent store-load
-  /// forwarding.
-  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
-};
-
-} // end anonymous namespace
-
-static bool isInBoundsGep(Value *Ptr) {
-  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
-    return GEP->isInBounds();
-  return false;
-}
-
-/// \brief Check whether the access through \p Ptr has a constant stride.
-static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
-                        const Loop *Lp, ValueToValueMap &StridesMap) {
-  const Type *Ty = Ptr->getType();
-  assert(Ty->isPointerTy() && "Unexpected non-ptr");
-
-  // Make sure that the pointer does not point to aggregate types.
-  const PointerType *PtrTy = cast<PointerType>(Ty);
-  if (PtrTy->getElementType()->isAggregateType()) {
-    DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type "
-                 << *Ptr << "\n");
-    return 0;
-  }
-
-  const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr);
-
-  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
-  if (!AR) {
-    DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer " << *Ptr
-                 << " SCEV: " << *PtrScev << "\n");
-    return 0;
-  }
-
-  // The access function must stride over the innermost loop.
-  if (Lp != AR->getLoop()) {
-    DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop "
-                 << *Ptr << " SCEV: " << *PtrScev << "\n");
-  }
-
-  // The address calculation must not wrap. Otherwise, a dependence could be
-  // inverted.
-  // An inbounds getelementptr that is an AddRec with a unit stride cannot
-  // wrap per definition. The unit stride requirement is checked later.
-  // A getelementptr without an inbounds attribute and unit stride would have
-  // to access the pointer value "0" which is undefined behavior in address
-  // space 0, therefore we can also vectorize this case.
-  bool IsInBoundsGEP = isInBoundsGep(Ptr);
-  bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask);
-  bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0;
-  if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) {
-    DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space "
-                 << *Ptr << " SCEV: " << *PtrScev << "\n");
-    return 0;
-  }
-
-  // Check the step is constant.
-  const SCEV *Step = AR->getStepRecurrence(*SE);
-
-  // Calculate the pointer stride and check if it is consecutive.
-  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
-  if (!C) {
-    DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr
-                 << " SCEV: " << *PtrScev << "\n");
-    return 0;
-  }
-
-  int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType());
-  const APInt &APStepVal = C->getValue()->getValue();
-
-  // Huge step value - give up.
-  if (APStepVal.getBitWidth() > 64)
-    return 0;
-
-  int64_t StepVal = APStepVal.getSExtValue();
-
-  // Strided access.
-  int64_t Stride = StepVal / Size;
-  int64_t Rem = StepVal % Size;
-  if (Rem)
-    return 0;
-
-  // If the SCEV could wrap but we have an inbounds gep with a unit stride we
-  // know we can't "wrap around the address space". In case of address space
-  // zero we know that this won't happen without triggering undefined
-  // behavior.
-  if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) &&
-      Stride != 1 && Stride != -1)
-    return 0;
-
-  return Stride;
-}
-
-bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
-                                                    unsigned TypeByteSize) {
-  // If loads occur at a distance that is not a multiple of a feasible vector
-  // factor store-load forwarding does not take place.
-  // Positive dependences might cause troubles because vectorizing them might
-  // prevent store-load forwarding making vectorized code run a lot slower.
-  //   a[i] = a[i-3] ^ a[i-8];
-  // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and
-  // hence on your typical architecture store-load forwarding does not take
-  // place. Vectorizing in such cases does not make sense.
-  // Store-load forwarding distance.
-  const unsigned NumCyclesForStoreLoadThroughMemory = 8 * TypeByteSize;
-  // Maximum vector factor.
-  unsigned MaxVFWithoutSLForwardIssues =
-      VectParams.MaxVectorWidth * TypeByteSize;
-  if (MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
-    MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes;
-
-  for (unsigned vf = 2 * TypeByteSize; vf <= MaxVFWithoutSLForwardIssues;
-       vf *= 2) {
-    if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) {
-      MaxVFWithoutSLForwardIssues = (vf >>= 1);
-      break;
-    }
-  }
-
-  if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
-    DEBUG(dbgs() << "LV: Distance " << Distance
-                 << " that could cause a store-load forwarding conflict\n");
-    return true;
-  }
-
-  if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
-      MaxVFWithoutSLForwardIssues != VectParams.MaxVectorWidth * TypeByteSize)
-    MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
-  return false;
-}
-
-bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
-                                   const MemAccessInfo &B, unsigned BIdx,
-                                   ValueToValueMap &Strides) {
-  assert(AIdx < BIdx && "Must pass arguments in program order");
-
-  Value *APtr = A.getPointer();
-  Value *BPtr = B.getPointer();
-  bool AIsWrite = A.getInt();
-  bool BIsWrite = B.getInt();
-
-  // Two reads are independent.
-  if (!AIsWrite && !BIsWrite)
+bool LoopVectorizationLegality::canVectorizeMemory() {
+  LAI = &LAA->getInfo(TheLoop, Strides);
+  auto &OptionalReport = LAI->getReport();
+  if (OptionalReport)
+    emitAnalysis(VectorizationReport(*OptionalReport));
+  if (!LAI->canVectorizeMemory())
     return false;
 
-  // We cannot check pointers in different address spaces.
-  if (APtr->getType()->getPointerAddressSpace() !=
-      BPtr->getType()->getPointerAddressSpace())
-    return true;
-
-  const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr);
-  const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr);
-
-  int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides);
-  int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides);
-
-  const SCEV *Src = AScev;
-  const SCEV *Sink = BScev;
-
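  // A worked example of the distance computed next (illustrative, 4-byte
  // elements assumed): in "for (i = 0; i < n; ++i) a[i+2] = a[i];" the load
  // is the source and the store the sink, so Dist = Sink - Src = +8 bytes,
  // i.e. two elements - exactly the "safe distance: 2 x sizeof(a[0])" case
  // from the class comment above, capping the vector width at two lanes.
  constexpr unsigned ExElemBytes = 4;
  constexpr unsigned ExDistBytes = 2 * ExElemBytes;          // Sink - Src
  constexpr unsigned ExMaxLanes = ExDistBytes / ExElemBytes; // == 2 lanes
  static_assert(ExDistBytes >= 2 * ExElemBytes && ExMaxLanes == 2,
                "a distance of two elements permits a VF of at most 2");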
-  // If the induction step is negative we have to invert source and sink of
-  // the dependence.
-  if (StrideAPtr < 0) {
-    //Src = BScev;
-    //Sink = AScev;
-    std::swap(APtr, BPtr);
-    std::swap(Src, Sink);
-    std::swap(AIsWrite, BIsWrite);
-    std::swap(AIdx, BIdx);
-    std::swap(StrideAPtr, StrideBPtr);
-  }
-
-  const SCEV *Dist = SE->getMinusSCEV(Sink, Src);
-
-  DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink
-               << "(Induction step: " << StrideAPtr << ")\n");
-  DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to "
-               << *InstMap[BIdx] << ": " << *Dist << "\n");
-
-  // Need consecutive accesses. We don't want to vectorize
-  // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap
-  // in the address space.
-  if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
-    DEBUG(dbgs() << "Non-consecutive pointer access\n");
-    return true;
-  }
-
-  const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
-  if (!C) {
-    DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n");
-    ShouldRetryWithRuntimeCheck = true;
-    return true;
-  }
-
-  Type *ATy = APtr->getType()->getPointerElementType();
-  Type *BTy = BPtr->getType()->getPointerElementType();
-  unsigned TypeByteSize = DL->getTypeAllocSize(ATy);
-
-  // Negative distances are not plausible dependencies.
-  const APInt &Val = C->getValue()->getValue();
-  if (Val.isNegative()) {
-    bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
-    if (IsTrueDataDependence &&
-        (couldPreventStoreLoadForward(Val.abs().getZExtValue(),
-                                      TypeByteSize) ||
-         ATy != BTy))
-      return true;
-
-    DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n");
+  if (LAI->hasStoreToLoopInvariantAddress()) {
+    emitAnalysis(
+        VectorizationReport()
+        << "write to a loop invariant address could not be vectorized");
+    DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
     return false;
   }
 
-  // Write to the same location with the same size.
-  // Could be improved to assert type sizes are the same (i32 == float, etc).
-  if (Val == 0) {
-    if (ATy == BTy)
-      return false;
-    DEBUG(dbgs() << "LV: Zero dependence difference but different types\n");
-    return true;
-  }
-
-  assert(Val.isStrictlyPositive() && "Expect a positive value");
-
-  // Positive distance bigger than max vectorization factor.
-  if (ATy != BTy) {
-    DEBUG(dbgs()
-          << "LV: ReadWrite-Write positive dependency with different types\n");
+  if (LAI->getNumRuntimePointerChecks() >
+      VectorizerParams::RuntimeMemoryCheckThreshold) {
+    emitAnalysis(VectorizationReport()
+                 << LAI->getNumRuntimePointerChecks() << " exceeds limit of "
+                 << VectorizerParams::RuntimeMemoryCheckThreshold
+                 << " dependent memory operations checked at runtime");
+    DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
     return false;
   }
 
-  unsigned Distance = (unsigned) Val.getZExtValue();
-
-  // Bail out early if passed-in parameters make vectorization not feasible.
-  unsigned ForcedFactor = (VectParams.VectorizationFactor ?
-                           VectParams.VectorizationFactor : 1);
-  unsigned ForcedUnroll = (VectParams.VectorizationInterleave ?
-                           VectParams.VectorizationInterleave : 1);
-
-  // The distance must be bigger than the size needed for a vectorized
-  // version of the operation and the size of the vectorized operation must
-  // not be bigger than the current maximum size.
-  if (Distance < 2 * TypeByteSize ||
-      2 * TypeByteSize > MaxSafeDepDistBytes ||
-      Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
-    DEBUG(dbgs() << "LV: Failure because of Positive distance "
-                 << Val.getSExtValue() << '\n');
-    return true;
-  }
-
-  MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
-      Distance : MaxSafeDepDistBytes;
-
-  bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
-  if (IsTrueDataDependence &&
-      couldPreventStoreLoadForward(Distance, TypeByteSize))
-    return true;
-
-  DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue()
-               << " with max VF = " << MaxSafeDepDistBytes / TypeByteSize
-               << '\n');
-
-  return false;
-}
-
-bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
-                                   MemAccessInfoSet &CheckDeps,
-                                   ValueToValueMap &Strides) {
-
-  MaxSafeDepDistBytes = -1U;
-  while (!CheckDeps.empty()) {
-    MemAccessInfo CurAccess = *CheckDeps.begin();
-
-    // Get the relevant memory access set.
-    EquivalenceClasses<MemAccessInfo>::iterator I =
-        AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
-
-    // Check accesses within this set.
-    EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
-    AI = AccessSets.member_begin(I), AE = AccessSets.member_end();
-
-    // Check every access pair.
-    while (AI != AE) {
-      CheckDeps.erase(*AI);
-      EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
-      while (OI != AE) {
-        // Check every accessing instruction pair in program order.
-        for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
-             I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
-          for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
-               I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
-            if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides))
-              return false;
-            if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides))
-              return false;
-          }
-        ++OI;
-      }
-      AI++;
-    }
-  }
   return true;
 }
 
-bool LoopAccessAnalysis::canVectorizeMemory(ValueToValueMap &Strides) {
-
-  typedef SmallVector<Value *, 16> ValueVector;
-  typedef SmallPtrSet<Value *, 16> ValueSet;
-
-  // Holds the Load and Store *instructions*.
-  ValueVector Loads;
-  ValueVector Stores;
-
-  // Holds all the different accesses in the loop.
-  unsigned NumReads = 0;
-  unsigned NumReadWrites = 0;
-
-  PtrRtCheck.Pointers.clear();
-  PtrRtCheck.Need = false;
-
-  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
-  MemoryDepChecker DepChecker(SE, DL, TheLoop, VectParams);
-
-  // For each block.
-  for (Loop::block_iterator bb = TheLoop->block_begin(),
-       be = TheLoop->block_end(); bb != be; ++bb) {
-
-    // Scan the BB and collect legal loads and stores.
-    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
-         ++it) {
-
-      // If this is a load, save it. If this instruction can read from memory
-      // but is not a load, then we quit. Notice that we don't handle
-      // function calls that read or write.
-      if (it->mayReadFromMemory()) {
-        // Many math library functions read the rounding mode. We will only
-        // vectorize a loop if it contains known function calls that don't
-        // set the flag. Therefore, it is safe to ignore this read from
-        // memory.
-        CallInst *Call = dyn_cast<CallInst>(it);
-        if (Call && getIntrinsicIDForCall(Call, TLI))
-          continue;
-
-        LoadInst *Ld = dyn_cast<LoadInst>(it);
-        if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
-          emitAnalysis(VectorizationReport(Ld)
-                       << "read with atomic ordering or volatile read");
-          DEBUG(dbgs() << "LV: Found a non-simple load.\n");
-          return false;
-        }
-        NumLoads++;
-        Loads.push_back(Ld);
-        DepChecker.addAccess(Ld);
-        continue;
-      }
-
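      // For reference, a sketch of the "simple" predicate relied on above
      // and below (an aside mirroring LoadInst/StoreInst::isSimple()): an
      // access qualifies only if it is neither volatile nor atomic.
      auto IsSimpleAccess = [](Instruction *I) -> bool {
        if (LoadInst *LI = dyn_cast<LoadInst>(I))
          return LI->isSimple(); // !isVolatile() && !isAtomic()
        if (StoreInst *SI = dyn_cast<StoreInst>(I))
          return SI->isSimple();
        return false; // not a plain load or store
      };
      (void)IsSimpleAccess; // illustration only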
-      // Save 'store' instructions. Abort if other instructions write to
-      // memory.
-      if (it->mayWriteToMemory()) {
-        StoreInst *St = dyn_cast<StoreInst>(it);
-        if (!St) {
-          emitAnalysis(VectorizationReport(it)
-                       << "instruction cannot be vectorized");
-          return false;
-        }
-        if (!St->isSimple() && !IsAnnotatedParallel) {
-          emitAnalysis(VectorizationReport(St)
-                       << "write with atomic ordering or volatile write");
-          DEBUG(dbgs() << "LV: Found a non-simple store.\n");
-          return false;
-        }
-        NumStores++;
-        Stores.push_back(St);
-        DepChecker.addAccess(St);
-      }
-    } // Next instr.
-  } // Next block.
-
-  // Now we have two lists that hold the loads and the stores.
-  // Next, we find the pointers that they use.
-
-  // Check if we see any stores. If there are no stores, then we don't
-  // care if the pointers are *restrict*.
-  if (!Stores.size()) {
-    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
-    return true;
-  }
-
-  AccessAnalysis::DepCandidates DependentAccesses;
-  AccessAnalysis Accesses(DL, AA, DependentAccesses);
-
-  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
-  // multiple times on the same object. If the ptr is accessed twice, once
-  // for read and once for write, it will only appear once (on the write
-  // list). This is okay, since we are going to check for conflicts between
-  // writes and between reads and writes, but not between reads and reads.
-  ValueSet Seen;
-
-  ValueVector::iterator I, IE;
-  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
-    StoreInst *ST = cast<StoreInst>(*I);
-    Value *Ptr = ST->getPointerOperand();
-
-    if (isUniform(Ptr)) {
-      emitAnalysis(
-          VectorizationReport(ST)
-          << "write to a loop invariant address could not be vectorized");
-      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
-      return false;
-    }
-
-    // If we did *not* see this pointer before, insert it to the read-write
-    // list. At this phase it is only a 'write' list.
-    if (Seen.insert(Ptr).second) {
-      ++NumReadWrites;
-
-      AliasAnalysis::Location Loc = AA->getLocation(ST);
-      // The TBAA metadata could have a control dependency on the predication
-      // condition, so we cannot rely on it when determining whether or not
-      // we need runtime pointer checks.
-      if (blockNeedsPredication(ST->getParent()))
-        Loc.AATags.TBAA = nullptr;
-
-      Accesses.addStore(Loc);
-    }
-  }
-
-  if (IsAnnotatedParallel) {
-    DEBUG(dbgs() << "LV: A loop annotated parallel, ignore memory dependency "
-                 << "checks.\n");
-    return true;
-  }
-
-  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
-    LoadInst *LD = cast<LoadInst>(*I);
-    Value *Ptr = LD->getPointerOperand();
-    // If we did *not* see this pointer before, insert it to the
-    // read list. If we *did* see it before, then it is already in
-    // the read-write list. This allows us to vectorize expressions
-    // such as A[i] += x; because the address of A[i] is a read-write
-    // pointer. This only works if the index of A[i] is consecutive.
-    // If the address of i is unknown (for example A[B[i]]) then we may
-    // read a few words, modify, and write a few words, and some of the
-    // words may be written to the same address.
-    bool IsReadOnlyPtr = false;
-    if (Seen.insert(Ptr).second ||
-        !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) {
-      ++NumReads;
-      IsReadOnlyPtr = true;
-    }
-
-    AliasAnalysis::Location Loc = AA->getLocation(LD);
-    // The TBAA metadata could have a control dependency on the predication
-    // condition, so we cannot rely on it when determining whether or not we
-    // need runtime pointer checks.
-    if (blockNeedsPredication(LD->getParent()))
-      Loc.AATags.TBAA = nullptr;
-
-    Accesses.addLoad(Loc, IsReadOnlyPtr);
-  }
-
-  // If we write (or read-write) to a single destination and there are no
-  // other reads in this loop then it is safe to vectorize.
-  if (NumReadWrites == 1 && NumReads == 0) {
-    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
-    return true;
-  }
-
-  // Build dependence sets and check whether we need a runtime pointer
-  // bounds check.
-  Accesses.buildDependenceSets();
-  bool NeedRTCheck = Accesses.isRTCheckNeeded();
-
-  // Find pointers with computable bounds. We are going to use this
-  // information to place a runtime bound check.
-  unsigned NumComparisons = 0;
-  bool CanDoRT = false;
-  if (NeedRTCheck)
-    CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
-                                       TheLoop, Strides);
-
-  DEBUG(dbgs() << "LV: We need to do " << NumComparisons
-               << " pointer comparisons.\n");
-
-  // If we only have one set of dependences to check pointers among we don't
-  // need a runtime check.
-  if (NumComparisons == 0 && NeedRTCheck)
-    NeedRTCheck = false;
-
-  // Check that we did not collect too many pointers or found an unsizeable
-  // pointer.
-  if (!CanDoRT || NumComparisons > VectParams.RuntimeMemoryCheckThreshold) {
-    PtrRtCheck.reset();
-    CanDoRT = false;
-  }
-
-  if (CanDoRT) {
-    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
-  }
-
-  if (NeedRTCheck && !CanDoRT) {
-    emitAnalysis(VectorizationReport() << "cannot identify array bounds");
-    DEBUG(dbgs() << "LV: We can't vectorize because we can't find "
-                 << "the array bounds.\n");
-    PtrRtCheck.reset();
-    return false;
-  }
-
-  PtrRtCheck.Need = NeedRTCheck;
-
-  bool CanVecMem = true;
-  if (Accesses.isDependencyCheckNeeded()) {
-    DEBUG(dbgs() << "LV: Checking memory dependencies\n");
-    CanVecMem = DepChecker.areDepsSafe(
-        DependentAccesses, Accesses.getDependenciesToCheck(), Strides);
-    MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
-
-    if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
-      DEBUG(dbgs() << "LV: Retrying with memory checks\n");
-      NeedRTCheck = true;
-
-      // Clear the dependency checks. We assume they are not needed.
-      Accesses.resetDepChecks();
-
-      PtrRtCheck.reset();
-      PtrRtCheck.Need = true;
-
-      CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
-                                         TheLoop, Strides, true);
-      // Check that we did not collect too many pointers or found an
-      // unsizeable pointer.
-      if (!CanDoRT ||
-          NumComparisons > VectParams.RuntimeMemoryCheckThreshold) {
-        if (!CanDoRT && NumComparisons > 0)
-          emitAnalysis(VectorizationReport()
-                       << "cannot check memory dependencies at runtime");
-        else
-          emitAnalysis(VectorizationReport()
-                       << NumComparisons << " exceeds limit of "
-                       << VectParams.RuntimeMemoryCheckThreshold
-                       << " dependent memory operations checked at runtime");
-        DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n");
-        PtrRtCheck.reset();
-        return false;
-      }
-
-      CanVecMem = true;
-    }
-  }
-
-  if (!CanVecMem)
-    emitAnalysis(VectorizationReport()
-                 << "unsafe dependent memory operations in loop");
-
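  // An illustrative source-level case for the retry path above (hypothetical
  // example): when the dependence distance depends on a runtime value, the
  // first areDepsSafe() pass fails with ShouldRetryWithRuntimeCheck set, and
  // the analysis falls back to emitting runtime bound checks instead.
  //
  //   void f(int *a, long k, long n) {
  //     for (long i = 0; i < n; ++i)
  //       a[i + k] = a[i]; // distance k * sizeof(int) unknown at compile time
  //   }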
"" : " don't") << - " need a runtime memory check.\n"); - - return CanVecMem; -} - -bool LoopVectorizationLegality::canVectorizeMemory() { - return LAA.canVectorizeMemory(Strides); -} - static bool hasMultipleUsesOf(Instruction *I, SmallPtrSetImpl &Insts) { unsigned NumUses = 0; @@ -5417,32 +4317,31 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, } } -LoopVectorizationLegality::InductionKind -LoopVectorizationLegality::isInductionVariable(PHINode *Phi, - ConstantInt *&StepValue) { +bool llvm::isInductionPHI(PHINode *Phi, ScalarEvolution *SE, + ConstantInt *&StepValue) { Type *PhiTy = Phi->getType(); // We only handle integer and pointer inductions variables. if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) - return IK_NoInduction; + return false; // Check that the PHI is consecutive. const SCEV *PhiScev = SE->getSCEV(Phi); const SCEVAddRecExpr *AR = dyn_cast(PhiScev); if (!AR) { DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); - return IK_NoInduction; + return false; } const SCEV *Step = AR->getStepRecurrence(*SE); // Calculate the pointer stride and check if it is consecutive. const SCEVConstant *C = dyn_cast(Step); if (!C) - return IK_NoInduction; + return false; ConstantInt *CV = C->getValue(); if (PhiTy->isIntegerTy()) { StepValue = CV; - return IK_IntInduction; + return true; } assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); @@ -5450,13 +4349,28 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi, // The pointer stride cannot be determined if the pointer element type is not // sized. if (!PointerElementType->isSized()) - return IK_NoInduction; + return false; - int64_t Size = static_cast(DL->getTypeAllocSize(PointerElementType)); + const DataLayout &DL = Phi->getModule()->getDataLayout(); + int64_t Size = static_cast(DL.getTypeAllocSize(PointerElementType)); int64_t CVSize = CV->getSExtValue(); if (CVSize % Size) - return IK_NoInduction; + return false; StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size); + return true; +} + +LoopVectorizationLegality::InductionKind +LoopVectorizationLegality::isInductionVariable(PHINode *Phi, + ConstantInt *&StepValue) { + if (!isInductionPHI(Phi, SE, StepValue)) + return IK_NoInduction; + + Type *PhiTy = Phi->getType(); + // Found an Integer induction variable. + if (PhiTy->isIntegerTy()) + return IK_IntInduction; + // Found an Pointer induction variable. return IK_PtrInduction; } @@ -5469,16 +4383,8 @@ bool LoopVectorizationLegality::isInductionVariable(const Value *V) { return Inductions.count(PN); } -bool LoopAccessAnalysis::blockNeedsPredication(BasicBlock *BB) { - assert(TheLoop->contains(BB) && "Unknown block used"); - - // Blocks that do not dominate the latch need predication. - BasicBlock* Latch = TheLoop->getLoopLatch(); - return !DT->dominates(BB, Latch); -} - bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { - return LAA.blockNeedsPredication(BB); + return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT); } bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, @@ -5672,6 +4578,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { unsigned LoopVectorizationCostModel::getWidestType() { unsigned MaxWidth = 8; + const DataLayout &DL = TheFunction->getParent()->getDataLayout(); // For each block. 
   // For each block.
   for (Loop::block_iterator bb = TheLoop->block_begin(),
@@ -5706,7 +4613,7 @@ unsigned LoopVectorizationCostModel::getWidestType() {
         continue;
 
       MaxWidth = std::max(MaxWidth,
-                          (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
+                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
     }
   }
 
@@ -5858,6 +4765,14 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
     return SmallUF;
   }
 
+  // Unroll if this is a large loop (small loops are already dealt with by
+  // this point) that could benefit from interleaved unrolling.
+  bool HasReductions = (Legal->getReductionVars()->size() > 0);
+  if (TTI.enableAggressiveInterleaving(HasReductions)) {
+    DEBUG(dbgs() << "LV: Unrolling to expose ILP.\n");
+    return UF;
+  }
+
   DEBUG(dbgs() << "LV: Not Unrolling.\n");
   return 1;
 }
@@ -6195,8 +5110,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
       // Scalarized loads/stores.
       int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
       bool Reverse = ConsecutiveStride < 0;
-      unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
-      unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy) / VF;
+      const DataLayout &DL = I->getModule()->getDataLayout();
+      unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);
+      unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;
       if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
         bool IsComplexComputation =
             isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
@@ -6223,7 +5139,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
 
       // Wide load/stores.
      unsigned Cost = TTI.getAddressComputationCost(VectorTy);
-      Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+      if (Legal->isMaskRequired(I))
+        Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment,
+                                          AS);
+      else
+        Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
 
       if (Reverse)
         Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
@@ -6253,14 +5173,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
   }
   case Instruction::Call: {
+    bool NeedToScalarize;
     CallInst *CI = cast<CallInst>(I);
-    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
-    assert(ID && "Not an intrinsic call!");
-    Type *RetTy = ToVectorTy(CI->getType(), VF);
-    SmallVector<Type *, 4> Tys;
-    for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
-      Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
-    return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
+    unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
+    if (getIntrinsicIDForCall(CI, TLI))
+      return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
+    return CallCost;
   }
   default: {
     // We are scalarizing the instruction. Return the cost of the scalar
@@ -6287,12 +5205,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   } // end of switch.
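  // Usage sketch for the ToVectorTy() helper used throughout the cost
  // computations above (illustrative values):
  //   ToVectorTy(Type::getFloatTy(Ctx), 4) -> <4 x float>
  //   ToVectorTy(Type::getFloatTy(Ctx), 1) -> float   (VF == 1 stays scalar)
  //   ToVectorTy(Type::getVoidTy(Ctx), 4)  -> void    (void is never widened)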
 }
 
-Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
-  if (Scalar->isVoidTy() || VF == 1)
-    return Scalar;
-  return VectorType::get(Scalar, VF);
-}
-
 char LoopVectorize::ID = 0;
 static const char lv_name[] = "Loop Vectorization";
 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
@@ -6305,6 +5217,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
 
 namespace llvm {