X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;ds=sidebyside;f=lib%2FTransforms%2FVectorize%2FLoopVectorize.cpp;h=35b2ecf99ceaf96ca5efc05378bd59ae0c7c6cd1;hb=54786a0936bf0ba3d83be3e8fd32f1488ba9e709;hp=e0d8e939df1d1035e41af332815dd750b9c656ec;hpb=dca126522d7c13d8f4e3a42dc191250bd567550d;p=oota-llvm.git

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index e0d8e939df1..35b2ecf99ce 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -55,7 +55,9 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AssumptionTracker.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -108,8 +110,8 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                     cl::desc("Sets the SIMD width. Zero is autoselect."));
 
 static cl::opt<unsigned>
-VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden,
-                    cl::desc("Sets the vectorization unroll count. "
+VectorizationInterleave("force-vector-interleave", cl::init(0), cl::Hidden,
+                    cl::desc("Sets the vectorization interleave count. "
                              "Zero is autoselect."));
 
 static cl::opt<bool>
@@ -157,17 +159,17 @@ static cl::opt<unsigned> ForceTargetNumVectorRegs(
     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
     cl::desc("A flag that overrides the target's number of vector registers."));
 
-/// Maximum vectorization unroll count.
-static const unsigned MaxUnrollFactor = 16;
+/// Maximum vectorization interleave count.
+static const unsigned MaxInterleaveFactor = 16;
 
-static cl::opt<unsigned> ForceTargetMaxScalarUnrollFactor(
-    "force-target-max-scalar-unroll", cl::init(0), cl::Hidden,
-    cl::desc("A flag that overrides the target's max unroll factor for scalar "
-             "loops."));
+static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
+    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's max interleave factor for "
+             "scalar loops."));
 
-static cl::opt<unsigned> ForceTargetMaxVectorUnrollFactor(
-    "force-target-max-vector-unroll", cl::init(0), cl::Hidden,
-    cl::desc("A flag that overrides the target's max unroll factor for "
+static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
+    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
+    cl::desc("A flag that overrides the target's max interleave factor for "
              "vectorized loops."));
 
 static cl::opt<unsigned> ForceTargetInstructionCost(
@@ -204,6 +206,11 @@ static cl::opt<bool> EnableCondStoresVectorization(
     "enable-cond-stores-vec", cl::init(false), cl::Hidden,
     cl::desc("Enable if predication of stores during vectorization."));
 
+static cl::opt<unsigned> MaxNestedScalarReductionUF(
+    "max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden,
+    cl::desc("The maximum unroll factor to use when unrolling a scalar "
+             "reduction in a nested loop."));
+
 namespace {
 
 // Forward declarations.
@@ -783,7 +790,7 @@ private:
   /// Return true if all of the instructions in the block can be speculatively
   /// executed. \p SafePtrs is a list of addresses that are known to be legal
   /// and we know that we can read from them without segfault.
-  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSet<Value *, 8> &SafePtrs);
+  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
 
   /// Returns True, if 'Phi' is the kind of reduction variable for type
   /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
@@ -879,8 +886,12 @@ public:
                              LoopVectorizationLegality *Legal,
                              const TargetTransformInfo &TTI,
                              const DataLayout *DL, const TargetLibraryInfo *TLI,
-                             const Function *F, const LoopVectorizeHints *Hints)
-      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI), TheFunction(F), Hints(Hints) {}
+                             AssumptionTracker *AT, const Function *F,
+                             const LoopVectorizeHints *Hints)
+      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI),
+        TheFunction(F), Hints(Hints) {
+    CodeMetrics::collectEphemeralValues(L, AT, EphValues);
+  }
 
   /// Information about vectorization costs
   struct VectorizationFactor {
@@ -949,6 +960,9 @@ private:
                                    *TheFunction, DL, Message.str());
   }
 
+  /// Values used only by @llvm.assume calls.
+  SmallPtrSet<const Value *, 16> EphValues;
+
   /// The loop that we evaluate.
   Loop *TheLoop;
   /// Scev analysis.
@@ -998,7 +1012,7 @@ class LoopVectorizeHints {
     case HK_WIDTH:
       return isPowerOf2_32(Val) && Val <= MaxVectorWidth;
     case HK_UNROLL:
-      return isPowerOf2_32(Val) && Val <= MaxUnrollFactor;
+      return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
     case HK_FORCE:
       return (Val <= 1);
     }
@@ -1008,12 +1022,10 @@ class LoopVectorizeHints {
   /// Vectorization width.
   Hint Width;
-  /// Vectorization unroll factor.
-  Hint Unroll;
+  /// Vectorization interleave factor.
+  Hint Interleave;
   /// Vectorization forced
   Hint Force;
-  /// Array to help iterating through all hints.
-  Hint *Hints[3] = { &Width, &Unroll, &Force };
 
   /// Return the loop metadata prefix.
   static StringRef Prefix() { return "llvm.loop."; }
@@ -1025,26 +1037,27 @@ public:
     FK_Enabled = 1,    ///< Forcing enabled.
   };
 
-  LoopVectorizeHints(const Loop *L, bool DisableUnrolling)
+  LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
       : Width("vectorize.width", VectorizationFactor, HK_WIDTH),
-        Unroll("interleave.count", DisableUnrolling, HK_UNROLL),
+        Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
         Force("vectorize.enable", FK_Undefined, HK_FORCE),
         TheLoop(L) {
     // Populate values with existing loop metadata.
     getHintsFromMetadata();
 
-    // force-vector-unroll overrides DisableUnrolling.
-    if (VectorizationUnroll.getNumOccurrences() > 0)
-      Unroll.Value = VectorizationUnroll;
+    // force-vector-interleave overrides DisableInterleaving.
+    if (VectorizationInterleave.getNumOccurrences() > 0)
+      Interleave.Value = VectorizationInterleave;
 
-    DEBUG(if (DisableUnrolling && Unroll.Value == 1) dbgs()
-          << "LV: Unrolling disabled by the pass manager\n");
+    DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
+          << "LV: Interleaving disabled by the pass manager\n");
   }
 
   /// Mark the loop L as already vectorized by setting the width to 1.
   void setAlreadyVectorized() {
-    Width.Value = Unroll.Value = 1;
-    writeHintsToMetadata({ Width, Unroll });
+    Width.Value = Interleave.Value = 1;
+    Hint Hints[] = {Width, Interleave};
+    writeHintsToMetadata(Hints);
   }
 
   /// Dumps all the hint information.
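
For reference (illustrative, not part of the patch): the "vectorize.width", "interleave.count" and "vectorize.enable" hints handled by LoopVectorizeHints, under the "llvm.loop." prefix, are normally attached to the loop by the frontend from source-level pragmas. A minimal sketch of input that would produce them, assuming Clang's loop-hint pragma syntax (the function and its body are made up):

    // Hypothetical example: the pragma below is lowered to llvm.loop metadata
    // ("llvm.loop.vectorize.width" = 8, "llvm.loop.interleave.count" = 2),
    // which getHintsFromMetadata()/validate() above read and range-check.
    void saxpy(float *y, const float *x, float a, int n) {
    #pragma clang loop vectorize_width(8) interleave_count(2)
      for (int i = 0; i < n; ++i)
        y[i] = a * x[i] + y[i];
    }

validate() above rejects hint values that are not a power of two or that exceed MaxVectorWidth / MaxInterleaveFactor.
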
@@ -1058,8 +1071,8 @@ public:
       R << " (Force=true";
       if (Width.Value != 0)
         R << ", Vector Width=" << Width.Value;
-      if (Unroll.Value != 0)
-        R << ", Interleave Count=" << Unroll.Value;
+      if (Interleave.Value != 0)
+        R << ", Interleave Count=" << Interleave.Value;
       R << ")";
     }
   }
@@ -1068,7 +1081,7 @@ public:
   }
 
   unsigned getWidth() const { return Width.Value; }
-  unsigned getUnroll() const { return Unroll.Value; }
+  unsigned getInterleave() const { return Interleave.Value; }
   enum ForceKind getForce() const { return (ForceKind)Force.Value; }
 
 private:
@@ -1119,6 +1132,7 @@ private:
     if (!C) return;
     unsigned Val = C->getZExtValue();
 
+    Hint *Hints[] = {&Width, &Interleave, &Force};
     for (auto H : Hints) {
       if (Name == H->Name) {
         if (H->validate(Val))
@@ -1133,26 +1147,25 @@ private:
   /// Create a new hint from name / value pair.
   MDNode *createHintMetadata(StringRef Name, unsigned V) const {
     LLVMContext &Context = TheLoop->getHeader()->getContext();
-    SmallVector<Value*, 2> Vals;
-    Vals.push_back(MDString::get(Context, Name));
-    Vals.push_back(ConstantInt::get(Type::getInt32Ty(Context), V));
+    Value *Vals[] = {MDString::get(Context, Name),
+                     ConstantInt::get(Type::getInt32Ty(Context), V)};
     return MDNode::get(Context, Vals);
   }
 
   /// Matches metadata with hint name.
-  bool matchesHintMetadataName(MDNode *Node, std::vector<Hint> &HintTypes) {
+  bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
     MDString* Name = dyn_cast<MDString>(Node->getOperand(0));
     if (!Name)
       return false;
 
     for (auto H : HintTypes)
-      if (Name->getName().endswith(H.Name))
+      if (Name->getString().endswith(H.Name))
         return true;
     return false;
   }
 
   /// Sets current hints into loop metadata, keeping other values intact.
-  void writeHintsToMetadata(std::vector<Hint> HintTypes) {
+  void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
     if (HintTypes.size() == 0)
       return;
@@ -1200,7 +1213,7 @@ static void emitMissedWarning(Function *F, Loop *L,
     emitLoopVectorizeWarning(
         F->getContext(), *F, L->getStartLoc(),
         "failed explicitly specified loop vectorization");
-  else if (LH.getUnroll() != 1)
+  else if (LH.getInterleave() != 1)
     emitLoopInterleaveWarning(
         F->getContext(), *F, L->getStartLoc(),
         "failed explicitly specified loop interleaving");
@@ -1235,6 +1248,7 @@ struct LoopVectorize : public FunctionPass {
   BlockFrequencyInfo *BFI;
   TargetLibraryInfo *TLI;
   AliasAnalysis *AA;
+  AssumptionTracker *AT;
   bool DisableUnrolling;
   bool AlwaysVectorize;
 
@@ -1250,6 +1264,7 @@ struct LoopVectorize : public FunctionPass {
     BFI = &getAnalysis<BlockFrequencyInfo>();
     TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
     AA = &getAnalysis<AliasAnalysis>();
+    AT = &getAnalysis<AssumptionTracker>();
 
     // Compute some weights outside of the loop over the loops. Compute this
     // using a BranchProbability to re-use its scaling math.
@@ -1306,7 +1321,7 @@ struct LoopVectorize : public FunctionPass {
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?")) << " width=" << Hints.getWidth()
-                 << " unroll=" << Hints.getUnroll() << "\n");
+                 << " unroll=" << Hints.getInterleave() << "\n");
 
     // Function containing loop
     Function *F = L->getHeader()->getParent();
@@ -1333,7 +1348,7 @@ struct LoopVectorize : public FunctionPass {
       return false;
     }
 
-    if (Hints.getWidth() == 1 && Hints.getUnroll() == 1) {
+    if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) {
       DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
       emitOptimizationRemarkAnalysis(
           F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
@@ -1344,8 +1359,7 @@ struct LoopVectorize : public FunctionPass {
     // Check the loop for a trip count threshold:
     // do not vectorize loops with a tiny trip count.
-    BasicBlock *Latch = L->getLoopLatch();
-    const unsigned TC = SE->getSmallConstantTripCount(L, Latch);
+    const unsigned TC = SE->getSmallConstantTripCount(L);
     if (TC > 0u && TC < TinyTripCountVectorThreshold) {
       DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                    << "This loop is not worth vectorizing.");
@@ -1369,7 +1383,8 @@ struct LoopVectorize : public FunctionPass {
     }
 
     // Use the cost model.
-    LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, F, &Hints);
+    LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI, AT, F,
+                                  &Hints);
 
     // Check the function attributes to find out if this function should be
     // optimized for size.
@@ -1456,6 +1471,7 @@ struct LoopVectorize : public FunctionPass {
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionTracker>();
     AU.addRequiredID(LoopSimplifyID);
     AU.addRequiredID(LCSSAID);
     AU.addRequired();
@@ -2965,7 +2981,7 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
       LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
                             LoopMiddleBlock);
   }
-}
+}
 
 InnerLoopVectorizer::VectorParts
 InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
@@ -3232,18 +3248,8 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
       for (unsigned Part = 0; Part < UF; ++Part) {
        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
 
-        // Update the NSW, NUW and Exact flags. Notice: V can be an Undef.
-        BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V);
-        if (VecOp && isa<OverflowingBinaryOperator>(BinOp)) {
-          VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
-          VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
-        }
-        if (VecOp && isa<PossiblyExactOperator>(VecOp))
-          VecOp->setIsExact(BinOp->isExact());
-
-        // Copy the fast-math flags.
-        if (VecOp && isa<FPMathOperator>(V))
-          VecOp->setFastMathFlags(it->getFastMathFlags());
+        if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
+          VecOp->copyIRFlags(BinOp);
 
         Entry[Part] = V;
       }
@@ -3356,6 +3362,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
       assert(ID && "Not an intrinsic call!");
       switch (ID) {
+      case Intrinsic::assume:
       case Intrinsic::lifetime_end:
       case Intrinsic::lifetime_start:
         scalarizeInstruction(it);
@@ -3421,7 +3428,7 @@ void InnerLoopVectorizer::updateAnalysis() {
   DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
-  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
+  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
 
   DEBUG(DT->verifyDomTree());
 }
@@ -3597,7 +3604,7 @@ static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
 /// \brief Check that the instruction has outside loop users and is not an
 /// identified reduction variable.
 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
-                               SmallPtrSet<Instruction *, 4> &Reductions) {
+                               SmallPtrSetImpl<Instruction *> &Reductions) {
   // Reduction instructions are allowed to have exit users. All other
   // instructions must not have external users.
   if (!Reductions.count(Inst))
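
The change in vectorizeBlockInLoop() above folds the hand-written propagation of the nsw/nuw, exact and fast-math flags into a single VecOp->copyIRFlags(BinOp) call. As a rough standalone sketch of what that one call subsumes — a hypothetical helper that simply mirrors the lines removed above, not LLVM's actual implementation of copyIRFlags:

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Operator.h"
    using namespace llvm;

    // Hypothetical helper mirroring the removed inline code: carry the
    // wrapping (nsw/nuw), exactness and fast-math flags of the scalar BinOp
    // over to the widened VecOp.
    static void copyArithIRFlags(BinaryOperator *VecOp, BinaryOperator *BinOp) {
      if (isa<OverflowingBinaryOperator>(BinOp)) {
        VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
        VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
      }
      if (isa<PossiblyExactOperator>(VecOp))
        VecOp->setIsExact(BinOp->isExact());
      if (isa<FPMathOperator>(VecOp))
        VecOp->setFastMathFlags(BinOp->getFastMathFlags());
    }

Centralizing this in copyIRFlags keeps the vectorizer correct automatically if further instruction flags are introduced later.
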
@@ -4261,9 +4268,9 @@ void AccessAnalysis::processMemAccesses() {
         if (IsWrite)
           SetHasWrite = true;
 
-      // Create sets of pointers connected by a shared alias set and
-      // underlying object.
-      typedef SmallVector<Value *, 16> ValueVector;
+      // Create sets of pointers connected by a shared alias set and
+      // underlying object.
+      typedef SmallVector<Value *, 16> ValueVector;
       ValueVector TempObjects;
       GetUnderlyingObjects(Ptr, TempObjects, DL);
       for (Value *UnderlyingObj : TempObjects) {
@@ -4622,7 +4629,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
   // Bail out early if passed-in parameters make vectorization not feasible.
   unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1;
-  unsigned ForcedUnroll = VectorizationUnroll ? VectorizationUnroll : 1;
+  unsigned ForcedUnroll = VectorizationInterleave ? VectorizationInterleave : 1;
 
   // The distance must be bigger than the size needed for a vectorized version
   // of the operation and the size of the vectorized operation must not be
@@ -4794,7 +4801,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
 
       // If we did *not* see this pointer before, insert it to the read-write
       // list. At this phase it is only a 'write' list.
-      if (Seen.insert(Ptr)) {
+      if (Seen.insert(Ptr).second) {
         ++NumReadWrites;
 
         AliasAnalysis::Location Loc = AA->getLocation(ST);
@@ -4827,7 +4834,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     // read a few words, modify, and write a few words, and some of the
     // words may be written to the same address.
     bool IsReadOnlyPtr = false;
-    if (Seen.insert(Ptr) || !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) {
+    if (Seen.insert(Ptr).second ||
+        !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) {
      ++NumReads;
       IsReadOnlyPtr = true;
     }
@@ -4940,7 +4948,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
 }
 
 static bool hasMultipleUsesOf(Instruction *I,
-                              SmallPtrSet<Instruction *, 8> &Insts) {
+                              SmallPtrSetImpl<Instruction *> &Insts) {
   unsigned NumUses = 0;
   for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) {
     if (Insts.count(dyn_cast<Instruction>(*Use)))
@@ -4952,7 +4960,7 @@ static bool hasMultipleUsesOf(Instruction *I,
   return false;
 }
 
-static bool areAllUsesIn(Instruction *I, SmallPtrSet<Instruction *, 8> &Set) {
+static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl<Instruction *> &Set) {
   for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
     if (!Set.count(dyn_cast<Instruction>(*Use)))
       return false;
@@ -5090,7 +5098,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
     // value must only be used once, except by phi nodes and min/max
     // reductions which are represented as a cmp followed by a select.
     ReductionInstDesc IgnoredVal(false, nullptr);
-    if (VisitedInsts.insert(UI)) {
+    if (VisitedInsts.insert(UI).second) {
       if (isa<PHINode>(UI))
         PHIs.push_back(UI);
       else
@@ -5258,7 +5266,13 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
     return IK_NoInduction;
 
   assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
-  uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
+  Type *PointerElementType = PhiTy->getPointerElementType();
+  // The pointer stride cannot be determined if the pointer element type is not
+  // sized.
+  if (!PointerElementType->isSized())
+    return IK_NoInduction;
+
+  uint64_t Size = DL->getTypeAllocSize(PointerElementType);
   if (C->getValue()->equalsInt(Size))
     return IK_PtrInduction;
   else if (C->getValue()->equalsInt(0 - Size))
@@ -5285,7 +5299,7 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
 }
 
 bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
-                                            SmallPtrSet<Value *, 8> &SafePtrs) {
+                                           SmallPtrSetImpl<Value *> &SafePtrs) {
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
     // We might be able to hoist the load.
     if (it->mayReadFromMemory()) {
@@ -5346,7 +5360,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
   }
 
   // Find the trip count.
-  unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
+  unsigned TC = SE->getSmallConstantTripCount(TheLoop);
   DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
 
   unsigned WidestType = getWidestType();
@@ -5389,7 +5403,10 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
     // If the trip count that we found modulo the vectorization factor is not
     // zero then we require a tail.
     if (VF < 2) {
-      emitAnalysis(Report() << "cannot optimize for size and vectorize at the same time. Enable vectorization of this loop with '#pragma clang loop vectorize(enable)' when compiling with -Os");
+      emitAnalysis(Report() << "cannot optimize for size and vectorize at the "
+                               "same time. Enable vectorization of this loop "
+                               "with '#pragma clang loop vectorize(enable)' "
+                               "when compiling with -Os");
       DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
       return Factor;
     }
@@ -5452,6 +5469,10 @@ unsigned LoopVectorizationCostModel::getWidestType() {
     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
       Type *T = it->getType();
 
+      // Ignore ephemeral values.
+      if (EphValues.count(it))
+        continue;
+
       // Only examine Loads, Stores and PHINodes.
       if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
         continue;
@@ -5487,23 +5508,23 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
   // -- The unroll heuristics --
   // We unroll the loop in order to expose ILP and reduce the loop overhead.
   // There are many micro-architectural considerations that we can't predict
-  // at this level. For example frontend pressure (on decode or fetch) due to
+  // at this level. For example, frontend pressure (on decode or fetch) due to
   // code size, or the number and capabilities of the execution ports.
   //
   // We use the following heuristics to select the unroll factor:
-  // 1. If the code has reductions the we unroll in order to break the cross
+  // 1. If the code has reductions, then we unroll in order to break the cross
   // iteration dependency.
-  // 2. If the loop is really small then we unroll in order to reduce the loop
+  // 2. If the loop is really small, then we unroll in order to reduce the loop
   // overhead.
   // 3. We don't unroll if we think that we will spill registers to memory due
   // to the increased register pressure.
 
   // Use the user preference, unless 'auto' is selected.
-  int UserUF = Hints->getUnroll();
+  int UserUF = Hints->getInterleave();
   if (UserUF != 0)
     return UserUF;
 
-  // When we optimize for size we don't unroll.
+  // When we optimize for size, we don't unroll.
   if (OptForSize)
     return 1;
@@ -5512,8 +5533,7 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
     return 1;
 
   // Do not unroll loops with a relatively small trip count.
-  unsigned TC = SE->getSmallConstantTripCount(TheLoop,
-                                              TheLoop->getLoopLatch());
+  unsigned TC = SE->getSmallConstantTripCount(TheLoop);
   if (TC > 1 && TC < TinyTripCountUnrollThreshold)
     return 1;
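
Several hunks in this file (CodeMetrics::collectEphemeralValues in the cost-model constructor, and the EphValues checks in getWidestType above and in calculateRegisterUsage/expectedCost below) make the cost model skip "ephemeral" values: instructions whose only use is feeding an @llvm.assume call, which the Intrinsic::assume case earlier simply scalarizes and which produce no real work. A hedged source-level illustration of where such values come from, assuming Clang's __builtin_assume (the function is made up):

    // Illustrative only: the 'n % 8' and '== 0' computed for __builtin_assume
    // feed nothing but the resulting @llvm.assume call, so they are ephemeral
    // and should not influence the width, cost or register-pressure estimates.
    void scale(float *a, int n) {
      __builtin_assume(n % 8 == 0);
      for (int i = 0; i < n; ++i)
        a[i] *= 2.0f;
    }
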
@@ -5552,15 +5572,15 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
                               std::max(1U, (R.MaxLocalUsers - 1)));
 
   // Clamp the unroll factor ranges to reasonable factors.
-  unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
+  unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor();
 
   // Check if the user has overridden the unroll max.
   if (VF == 1) {
-    if (ForceTargetMaxScalarUnrollFactor.getNumOccurrences() > 0)
-      MaxUnrollSize = ForceTargetMaxScalarUnrollFactor;
+    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
+      MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor;
   } else {
-    if (ForceTargetMaxVectorUnrollFactor.getNumOccurrences() > 0)
-      MaxUnrollSize = ForceTargetMaxVectorUnrollFactor;
+    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
+      MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor;
   }
 
   // If we did not calculate the cost for VF (because the user selected the VF)
@@ -5570,8 +5590,8 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
 
   // Clamp the calculated UF to be between the 1 and the max unroll factor
   // that the target allows.
-  if (UF > MaxUnrollSize)
-    UF = MaxUnrollSize;
+  if (UF > MaxInterleaveSize)
+    UF = MaxInterleaveSize;
   else if (UF < 1)
     UF = 1;
@@ -5602,6 +5622,18 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
   unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1);
   unsigned LoadsUF = UF /  (Legal->NumLoads ? Legal->NumLoads : 1);
 
+  // If we have a scalar reduction (vector reductions are already dealt with
+  // by this point), we can increase the critical path length if the loop
+  // we're unrolling is inside another loop. Limit, by default to 2, so the
+  // critical path only gets increased by one reduction operation.
+  if (Legal->getReductionVars()->size() &&
+      TheLoop->getLoopDepth() > 1) {
+    unsigned F = static_cast<unsigned>(MaxNestedScalarReductionUF);
+    SmallUF = std::min(SmallUF, F);
+    StoresUF = std::min(StoresUF, F);
+    LoadsUF = std::min(LoadsUF, F);
+  }
+
   if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {
     DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
     return std::max(StoresUF, LoadsUF);
@@ -5703,6 +5735,10 @@ LoopVectorizationCostModel::calculateRegisterUsage() {
     // Ignore instructions that are never used within the loop.
     if (!Ends.count(I)) continue;
 
+    // Ignore ephemeral values.
+    if (EphValues.count(I))
+      continue;
+
     // Remove all of the instructions that end at this location.
     InstrList &List = TransposeEnds[i];
     for (unsigned int j=0, e = List.size(); j < e; ++j)
@@ -5743,6 +5779,10 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
       if (isa<DbgInfoIntrinsic>(it))
        continue;
 
+      // Ignore ephemeral values.
+      if (EphValues.count(it))
+        continue;
+
       unsigned C = getInstructionCost(it, VF);
 
       // Check if we should override the cost.
@@ -5876,18 +5916,31 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
         TargetTransformInfo::OK_AnyValue;
     TargetTransformInfo::OperandValueKind Op2VK =
         TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueProperties Op1VP =
+        TargetTransformInfo::OP_None;
+    TargetTransformInfo::OperandValueProperties Op2VP =
+        TargetTransformInfo::OP_None;
     Value *Op2 = I->getOperand(1);
 
     // Check for a splat of a constant or for a non uniform vector of constants.
-    if (isa<ConstantInt>(Op2))
+    if (isa<ConstantInt>(Op2)) {
+      ConstantInt *CInt = cast<ConstantInt>(Op2);
+      if (CInt && CInt->getValue().isPowerOf2())
+        Op2VP = TargetTransformInfo::OP_PowerOf2;
       Op2VK = TargetTransformInfo::OK_UniformConstantValue;
-    else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
+    } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
       Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
-      if (cast<Constant>(Op2)->getSplatValue() != nullptr)
+      Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
+      if (SplatValue) {
+        ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
+        if (CInt && CInt->getValue().isPowerOf2())
+          Op2VP = TargetTransformInfo::OP_PowerOf2;
         Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+      }
     }
 
-    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
+    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
+                                      Op1VP, Op2VP);
   }
   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
@@ -6030,6 +6083,7 @@ static const char lv_name[] = "Loop Vectorization";
 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
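
The last cost-model hunk threads operand value properties (OP_PowerOf2) through to TTI.getArithmeticInstrCost, letting a target price, say, a division or shift by a power-of-two constant more accurately. A condensed standalone sketch of the classification it adds — hypothetical helper name; the real code does this inline and only for the second operand:

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Constants.h"
    using namespace llvm;

    // Hypothetical standalone version of the operand classification added above.
    static void classifyOperand(Value *Op,
                                TargetTransformInfo::OperandValueKind &OVK,
                                TargetTransformInfo::OperandValueProperties &OVP) {
      OVK = TargetTransformInfo::OK_AnyValue;
      OVP = TargetTransformInfo::OP_None;
      if (ConstantInt *CInt = dyn_cast<ConstantInt>(Op)) {
        // A scalar integer constant is uniform by construction.
        OVK = TargetTransformInfo::OK_UniformConstantValue;
        if (CInt->getValue().isPowerOf2())
          OVP = TargetTransformInfo::OP_PowerOf2;
      } else if (isa<ConstantVector>(Op) || isa<ConstantDataVector>(Op)) {
        OVK = TargetTransformInfo::OK_NonUniformConstantValue;
        // A splat of a single constant is effectively uniform again.
        if (Constant *Splat = cast<Constant>(Op)->getSplatValue()) {
          OVK = TargetTransformInfo::OK_UniformConstantValue;
          if (ConstantInt *SplatInt = dyn_cast<ConstantInt>(Splat))
            if (SplatInt->getValue().isPowerOf2())
              OVP = TargetTransformInfo::OP_PowerOf2;
        }
      }
    }

The extra Op1VP/Op2VP arguments are what allow a target's getArithmeticInstrCost override to distinguish, for example, 'x / 4' from 'x / 7'.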