X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTransforms%2FVectorize%2FLoopVectorize.cpp;h=62b1cce48bc2f93362be47626afad09ac2c23a29;hp=40d9488fddcb8fd00424108accec3ad9bdbb57f4;hb=c731de4630f28b4a4e0fba7d60fce473f7837783;hpb=c895fd0d677be116efaac45af934b4270e4a35a1 diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 40d9488fddc..62b1cce48bc 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -144,7 +144,7 @@ static cl::opt MaximizeBandwidth( /// ... static cl::opt EnableMemAccessVersioning( "enable-mem-access-versioning", cl::init(true), cl::Hidden, - cl::desc("Enable symblic stride memory access versioning")); + cl::desc("Enable symbolic stride memory access versioning")); static cl::opt EnableInterleavedMemAccesses( "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, @@ -268,6 +268,32 @@ static Type* ToVectorTy(Type *Scalar, unsigned VF) { return VectorType::get(Scalar, VF); } +/// A helper function that returns GEP instruction and knows to skip a +/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination +/// pointee types of the 'bitcast' have the same size. +/// For example: +/// bitcast double** %var to i64* - can be skipped +/// bitcast double** %var to i8* - can not +static GetElementPtrInst *getGEPInstruction(Value *Ptr) { + + if (isa(Ptr)) + return cast(Ptr); + + if (isa(Ptr) && + isa(cast(Ptr)->getOperand(0))) { + Type *BitcastTy = Ptr->getType(); + Type *GEPTy = cast(Ptr)->getSrcTy(); + if (!isa(BitcastTy) || !isa(GEPTy)) + return nullptr; + Type *Pointee1Ty = cast(BitcastTy)->getPointerElementType(); + Type *Pointee2Ty = cast(GEPTy)->getPointerElementType(); + const DataLayout &DL = cast(Ptr)->getModule()->getDataLayout(); + if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty)) + return cast(cast(Ptr)->getOperand(0)); + } + return nullptr; +} + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -284,22 +310,23 @@ static Type* ToVectorTy(Type *Scalar, unsigned VF) { /// and reduction variables that were found to a given vectorization factor. class InnerLoopVectorizer { public: - InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, - DominatorTree *DT, const TargetLibraryInfo *TLI, + InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, + LoopInfo *LI, DominatorTree *DT, + const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, unsigned VecWidth, - unsigned UnrollFactor, SCEVUnionPredicate &Preds) - : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), - VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), + unsigned UnrollFactor) + : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), + VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr), - AddedSafetyChecks(false), Preds(Preds) {} + AddedSafetyChecks(false) {} // Perform the actual loop widening (vectorization). // MinimumBitWidths maps scalar integer values to the smallest bitwidth they // can be validly truncated to. The cost model has assumed this truncation // will happen when vectorizing. void vectorize(LoopVectorizationLegality *L, - DenseMap MinimumBitWidths) { + MapVector MinimumBitWidths) { MinBWs = MinimumBitWidths; Legal = L; // Create a new empty loop. Unlink the old loop and connect the new one. @@ -460,8 +487,10 @@ protected: /// The original loop. Loop *OrigLoop; - /// Scev analysis to use. - ScalarEvolution *SE; + /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies + /// dynamic knowledge to simplify SCEV expressions and converts them to a + /// more usable form. + PredicatedScalarEvolution &PSE; /// Loop Info. LoopInfo *LI; /// Dominator Tree. @@ -520,28 +549,20 @@ protected: /// Map of scalar integer values to the smallest bitwidth they can be legally /// represented as. The vector equivalents of these values should be truncated /// to this type. - DenseMap MinBWs; + MapVector MinBWs; LoopVectorizationLegality *Legal; // Record whether runtime check is added. bool AddedSafetyChecks; - - /// The SCEV predicate containing all the SCEV-related assumptions. - /// The predicate is used to simplify existing expressions in the - /// context of existing SCEV assumptions. Since legality checking is - /// not done here, we don't need to use this predicate to record - /// further assumptions. - SCEVUnionPredicate &Preds; }; class InnerLoopUnroller : public InnerLoopVectorizer { public: - InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, - DominatorTree *DT, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, unsigned UnrollFactor, - SCEVUnionPredicate &Preds) - : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor, - Preds) {} + InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, + LoopInfo *LI, DominatorTree *DT, + const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, unsigned UnrollFactor) + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, 1, UnrollFactor) {} private: void scalarizeInstruction(Instruction *Instr, @@ -622,7 +643,8 @@ static void propagateMetadata(Instruction *To, const Instruction *From) { } /// \brief Propagate known metadata from one instruction to a vector of others. -static void propagateMetadata(SmallVectorImpl &To, const Instruction *From) { +static void propagateMetadata(SmallVectorImpl &To, + const Instruction *From) { for (Value *V : To) if (Instruction *I = dyn_cast(V)) propagateMetadata(I, From); @@ -762,9 +784,9 @@ private: /// between the member and the group in a map. class InterleavedAccessInfo { public: - InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT, - SCEVUnionPredicate &Preds) - : SE(SE), TheLoop(L), DT(DT), Preds(Preds) {} + InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, + DominatorTree *DT) + : PSE(PSE), TheLoop(L), DT(DT) {} ~InterleavedAccessInfo() { SmallSet DelSet; @@ -794,17 +816,14 @@ public: } private: - ScalarEvolution *SE; + /// A wrapper around ScalarEvolution, used to add runtime SCEV checks. + /// Simplifies SCEV expressions in the context of existing SCEV assumptions. + /// The interleaved access analysis can also add new predicates (for example + /// by versioning strides of pointers). + PredicatedScalarEvolution &PSE; Loop *TheLoop; DominatorTree *DT; - /// The SCEV predicate containing all the SCEV-related assumptions. - /// The predicate is used to simplify SCEV expressions in the - /// context of existing SCEV assumptions. The interleaved access - /// analysis can also add new predicates (for example by versioning - /// strides of pointers). - SCEVUnionPredicate &Preds; - /// Holds the relationships between the members and the interleave group. DenseMap InterleaveGroupMap; @@ -1162,18 +1181,17 @@ static void emitMissedWarning(Function *F, Loop *L, /// induction variable and the different reduction variables. class LoopVectorizationLegality { public: - LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT, - TargetLibraryInfo *TLI, AliasAnalysis *AA, - Function *F, const TargetTransformInfo *TTI, + LoopVectorizationLegality(Loop *L, PredicatedScalarEvolution &PSE, + DominatorTree *DT, TargetLibraryInfo *TLI, + AliasAnalysis *AA, Function *F, + const TargetTransformInfo *TTI, LoopAccessAnalysis *LAA, LoopVectorizationRequirements *R, - const LoopVectorizeHints *H, - SCEVUnionPredicate &Preds) - : NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F), - TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), - InterleaveInfo(SE, L, DT, Preds), Induction(nullptr), - WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R), Hints(H), - Preds(Preds) {} + const LoopVectorizeHints *H) + : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F), + TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT), + Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false), + Requirements(R), Hints(H) {} /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. @@ -1203,6 +1221,9 @@ public: /// Returns True if V is an induction variable in this loop. bool isInductionVariable(const Value *V); + /// Returns True if PN is a reduction variable in this loop. + bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); } + /// Return true if the block BB needs to be predicated in order for the loop /// to be vectorized. bool blockNeedsPredication(BasicBlock *BB); @@ -1317,8 +1338,12 @@ private: /// The loop that we evaluate. Loop *TheLoop; - /// Scev analysis. - ScalarEvolution *SE; + /// A wrapper around ScalarEvolution used to add runtime SCEV checks. + /// Applies dynamic knowledge to simplify SCEV expressions in the context + /// of existing SCEV assumptions. The analysis will also add a minimal set + /// of new predicates if this is required to enable vectorization and + /// unrolling. + PredicatedScalarEvolution &PSE; /// Target Library Info. TargetLibraryInfo *TLI; /// Parent function @@ -1373,13 +1398,6 @@ private: /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic SmallPtrSet MaskedOp; - - /// The SCEV predicate containing all the SCEV-related assumptions. - /// The predicate is used to simplify SCEV expressions in the - /// context of existing SCEV assumptions. The analysis will also - /// add a minimal set of new predicates if this is required to - /// enable vectorization/unrolling. - SCEVUnionPredicate &Preds; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1391,16 +1409,14 @@ private: /// different operations. class LoopVectorizationCostModel { public: - LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, - LoopVectorizationLegality *Legal, + LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, + LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, const Function *F, - const LoopVectorizeHints *Hints, - SmallPtrSetImpl &ValuesToIgnore, - SCEVUnionPredicate &Preds) - : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), - TheFunction(F), Hints(Hints), ValuesToIgnore(ValuesToIgnore) {} + const LoopVectorizeHints *Hints) + : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), + AC(AC), TheFunction(F), Hints(Hints) {} /// Information about vectorization costs struct VectorizationFactor { @@ -1448,6 +1464,9 @@ public: SmallVector calculateRegisterUsage(const SmallVector &VFs); + /// Collect values we want to ignore in the cost model. + void collectValuesToIgnore(); + private: /// Returns the expected execution cost. The unit of the cost does /// not matter because we use the 'cost' units to compare different @@ -1475,12 +1494,12 @@ public: /// Map of scalar integer values to the smallest bitwidth they can be legally /// represented as. The vector equivalents of these values should be truncated /// to this type. - DenseMap MinBWs; + MapVector MinBWs; /// The loop that we evaluate. Loop *TheLoop; - /// Scev analysis. - ScalarEvolution *SE; + /// Predicated scalar evolution analysis. + PredicatedScalarEvolution &PSE; /// Loop Info analysis. LoopInfo *LI; /// Vectorization legality. @@ -1489,13 +1508,17 @@ public: const TargetTransformInfo &TTI; /// Target Library Info. const TargetLibraryInfo *TLI; - /// Demanded bits analysis + /// Demanded bits analysis. DemandedBits *DB; + /// Assumption cache. + AssumptionCache *AC; const Function *TheFunction; - // Loop Vectorize Hint. + /// Loop Vectorize Hint. const LoopVectorizeHints *Hints; - // Values to ignore in the cost model. - const SmallPtrSetImpl &ValuesToIgnore; + /// Values to ignore in the cost model. + SmallPtrSet ValuesToIgnore; + /// Values to ignore in the cost model when VF > 1. + SmallPtrSet VecValuesToIgnore; }; /// \brief This holds vectorization requirements that must be verified late in @@ -1728,31 +1751,22 @@ struct LoopVectorize : public FunctionPass { } } - SCEVUnionPredicate Preds; + PredicatedScalarEvolution PSE(*SE); // Check if it is legal to vectorize the loop. LoopVectorizationRequirements Requirements; - LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA, - &Requirements, &Hints, Preds); + LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA, + &Requirements, &Hints); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); emitMissedWarning(F, L, Hints); return false; } - // Collect values we want to ignore in the cost model. This includes - // type-promoting instructions we identified during reduction detection. - SmallPtrSet ValuesToIgnore; - CodeMetrics::collectEphemeralValues(L, AC, ValuesToIgnore); - for (auto &Reduction : *LVL.getReductionVars()) { - RecurrenceDescriptor &RedDes = Reduction.second; - SmallPtrSetImpl &Casts = RedDes.getCastInsts(); - ValuesToIgnore.insert(Casts.begin(), Casts.end()); - } - // Use the cost model. - LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, DB, AC, F, &Hints, - ValuesToIgnore, Preds); + LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, F, + &Hints); + CM.collectValuesToIgnore(); // Check the function attributes to find out if this function should be // optimized for size. @@ -1863,7 +1877,7 @@ struct LoopVectorize : public FunctionPass { assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop then // interleave it. - InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC, Preds); + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, IC); Unroller.vectorize(&LVL, CM.MinBWs); emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(), @@ -1871,7 +1885,7 @@ struct LoopVectorize : public FunctionPass { Twine(IC) + ")"); } else { // If we decided that it is *legal* to vectorize the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC, Preds); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, VF.Width, IC); LB.vectorize(&LVL, CM.MinBWs); ++LoopsVectorized; @@ -1972,6 +1986,7 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); + auto *SE = PSE.getSE(); // Make sure that the pointer does not point to structs. if (Ptr->getType()->getPointerElementType()->isAggregateType()) return 0; @@ -1983,7 +1998,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { return II.getConsecutiveDirection(); } - GetElementPtrInst *Gep = dyn_cast_or_null(Ptr); + GetElementPtrInst *Gep = getGEPInstruction(Ptr); if (!Gep) return 0; @@ -2001,7 +2016,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // Make sure that all of the index operands are loop invariant. for (unsigned i = 1; i < NumOperands; ++i) - if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) return 0; InductionDescriptor II = Inductions[Phi]; @@ -2014,14 +2029,14 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // operand. for (unsigned i = 0; i != NumOperands; ++i) if (i != InductionOperand && - !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + !SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) return 0; // We can emit wide load/stores only if the last non-zero index is the // induction variable. const SCEV *Last = nullptr; if (!Strides.count(Gep)) - Last = SE->getSCEV(Gep->getOperand(InductionOperand)); + Last = PSE.getSCEV(Gep->getOperand(InductionOperand)); else { // Because of the multiplication by a stride we can have a s/zext cast. // We are going to replace this stride by 1 so the cast is safe to ignore. @@ -2032,7 +2047,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { // %idxprom = zext i32 %mul to i64 << Safe cast. // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom // - Last = replaceSymbolicStrideSCEV(SE, Strides, Preds, + Last = replaceSymbolicStrideSCEV(PSE, Strides, Gep->getOperand(InductionOperand), Gep); if (const SCEVCastExpr *C = dyn_cast(Last)) Last = @@ -2074,7 +2089,6 @@ InnerLoopVectorizer::getVectorValue(Value *V) { // If this scalar is unknown, assume that it is a constant or that it is // loop invariant. Broadcast V and save the value for future uses. Value *B = getBroadcastInstrs(V); - return WidenMap.splat(V, B); } @@ -2377,7 +2391,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { VectorParts &Entry = WidenMap.get(Instr); // Handle consecutive loads/stores. - GetElementPtrInst *Gep = dyn_cast(Ptr); + GetElementPtrInst *Gep = getGEPInstruction(Ptr); if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { setDebugLocFromInst(Builder, Gep); Value *PtrOperand = Gep->getPointerOperand(); @@ -2391,8 +2405,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Ptr = Builder.Insert(Gep2); } else if (Gep) { setDebugLocFromInst(Builder, Gep); - assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), - OrigLoop) && "Base ptr must be invariant"); + assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()), + OrigLoop) && + "Base ptr must be invariant"); // The last index does not have to be the induction. It can be // consecutive and be a function of the index. For example A[I+1]; @@ -2409,7 +2424,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { if (i == InductionOperand || (GepOperandInst && OrigLoop->contains(GepOperandInst))) { assert((i == InductionOperand || - SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) && + PSE.getSE()->isLoopInvariant(PSE.getSCEV(GepOperandInst), + OrigLoop)) && "Must be last index or loop invariant"); VectorParts &GEPParts = getVectorValue(GepOperand); @@ -2498,7 +2514,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { } } -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) { +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, + bool IfPredicateStore) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. SmallVector Params; @@ -2560,7 +2577,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic Value *Cmp = nullptr; if (IfPredicateStore) { Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width)); - Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1)); + Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, + ConstantInt::get(Cmp->getType(), 1)); } Instruction *Cloned = Instr->clone(); @@ -2627,8 +2645,10 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); // Find the loop boundaries. + ScalarEvolution *SE = PSE.getSE(); const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(OrigLoop); - assert(BackedgeTakenCount != SE->getCouldNotCompute() && "Invalid loop count"); + assert(BackedgeTakenCount != SE->getCouldNotCompute() && + "Invalid loop count"); Type *IdxTy = Legal->getWidestInductionType(); @@ -2733,8 +2753,10 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { // Generate the code to check that the SCEV assumptions that we made. // We want the new basic block to start at the first instruction in a // sequence of instructions that form a check. - SCEVExpander Exp(*SE, Bypass->getModule()->getDataLayout(), "scev.check"); - Value *SCEVCheck = Exp.expandCodeForPredicate(&Preds, BB->getTerminator()); + SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), + "scev.check"); + Value *SCEVCheck = + Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); if (auto *C = dyn_cast(SCEVCheck)) if (C->isZero()) @@ -3281,7 +3303,7 @@ void InnerLoopVectorizer::vectorizeLoop() { assert(RdxPhi && "Unable to recover vectorized PHI"); // Find the reduction variable descriptor. - assert(Legal->getReductionVars()->count(RdxPhi) && + assert(Legal->isReductionVariable(RdxPhi) && "Unable to find the reduction variable"); RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; @@ -3576,12 +3598,12 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { return BlockMask; } -void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, - InnerLoopVectorizer::VectorParts &Entry, - unsigned UF, unsigned VF, PhiVector *PV) { +void InnerLoopVectorizer::widenPHIInstruction( + Instruction *PN, InnerLoopVectorizer::VectorParts &Entry, unsigned UF, + unsigned VF, PhiVector *PV) { PHINode* P = cast(PN); // Handle reduction variables: - if (Legal->getReductionVars()->count(P)) { + if (Legal->isReductionVariable(P)) { for (unsigned part = 0; part < UF; ++part) { // This is phase one of vectorizing PHIs. Type *VecTy = (VF == 1) ? PN->getType() : @@ -3643,7 +3665,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, case InductionDescriptor::IK_NoInduction: llvm_unreachable("Unknown induction"); case InductionDescriptor::IK_IntInduction: { - assert(P->getType() == II.getStartValue()->getType() && "Types must match"); + assert(P->getType() == II.getStartValue()->getType() && + "Types must match"); // Handle other induction variables that are now based on the // canonical one. Value *V = Induction; @@ -3752,8 +3775,9 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // Widen selects. // If the selector is loop invariant we can create a select // instruction with a scalar condition. Otherwise, use vector-select. - bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)), - OrigLoop); + auto *SE = PSE.getSE(); + bool InvariantCond = + SE->isLoopInvariant(PSE.getSCEV(it->getOperand(0)), OrigLoop); setDebugLocFromInst(Builder, &*it); // The condition can be loop invariant but still defined inside the @@ -3828,9 +3852,10 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, CI->getType()); Value *Broadcasted = getBroadcastInstrs(ScalarCast); - InductionDescriptor II = Legal->getInductionVars()->lookup(OldInduction); - Constant *Step = - ConstantInt::getSigned(CI->getType(), II.getStepValue()->getSExtValue()); + InductionDescriptor II = + Legal->getInductionVars()->lookup(OldInduction); + Constant *Step = ConstantInt::getSigned( + CI->getType(), II.getStepValue()->getSExtValue()); for (unsigned Part = 0; Part < UF; ++Part) Entry[Part] = getStepVector(Broadcasted, VF * Part, Step); propagateMetadata(Entry, &*it); @@ -3933,7 +3958,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { void InnerLoopVectorizer::updateAnalysis() { // Forget the original basic block. - SE->forgetLoop(OrigLoop); + PSE.getSE()->forgetLoop(OrigLoop); // Update the dominator tree information. assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && @@ -4085,10 +4110,10 @@ bool LoopVectorizationLegality::canVectorize() { } // ScalarEvolution needs to be able to find the exit count. - const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop); - if (ExitCount == SE->getCouldNotCompute()) { - emitAnalysis(VectorizationReport() << - "could not determine number of loop iterations"); + const SCEV *ExitCount = PSE.getSE()->getBackedgeTakenCount(TheLoop); + if (ExitCount == PSE.getSE()->getCouldNotCompute()) { + emitAnalysis(VectorizationReport() + << "could not determine number of loop iterations"); DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } @@ -4128,7 +4153,7 @@ bool LoopVectorizationLegality::canVectorize() { if (Hints->getForce() == LoopVectorizeHints::FK_Enabled) SCEVThreshold = PragmaVectorizeSCEVCheckThreshold; - if (Preds.getComplexity() > SCEVThreshold) { + if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { emitAnalysis(VectorizationReport() << "Too many SCEV assumptions need to be made and checked " << "at runtime"); @@ -4234,7 +4259,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } InductionDescriptor ID; - if (InductionDescriptor::isInductionPHI(Phi, SE, ID)) { + if (InductionDescriptor::isInductionPHI(Phi, PSE.getSE(), ID)) { Inductions[Phi] = ID; // Get the widest type. if (!WidestIndTy) @@ -4303,7 +4328,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // second argument is the same (i.e. loop invariant) if (CI && hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) { - if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) { + auto *SE = PSE.getSE(); + if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) { emitAnalysis(VectorizationReport(&*it) << "intrinsic instruction cannot be vectorized"); DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n"); @@ -4376,7 +4402,7 @@ void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { else return; - Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop); + Value *Stride = getStrideFromPointer(Ptr, PSE.getSE(), TheLoop); if (!Stride) return; @@ -4440,7 +4466,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { } Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); - Preds.add(&LAI->Preds); + PSE.addPredicate(LAI->PSE.getUnionPredicate()); return true; } @@ -4496,8 +4522,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB, if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr || !isSinglePredecessor) { - // Build a masked store if it is legal for the target, otherwise scalarize - // the block. + // Build a masked store if it is legal for the target, otherwise + // scalarize the block. bool isLegalMaskedOp = isLegalMaskedStore(SI->getValueOperand()->getType(), SI->getPointerOperand()); @@ -4555,7 +4581,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses( StoreInst *SI = dyn_cast(I); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); - int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides, Preds); + int Stride = isStridedPtr(PSE, Ptr, TheLoop, Strides); // The factor of the corresponding interleave group. unsigned Factor = std::abs(Stride); @@ -4564,7 +4590,7 @@ void InterleavedAccessInfo::collectConstStridedAccesses( if (Factor < 2 || Factor > MaxInterleaveGroupFactor) continue; - const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Preds, Ptr); + const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); PointerType *PtrTy = dyn_cast(Ptr->getType()); unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); @@ -4651,8 +4677,8 @@ void InterleavedAccessInfo::analyzeInterleaving( continue; // Calculate the distance and prepare for the rule 3. - const SCEVConstant *DistToA = - dyn_cast(SE->getMinusSCEV(DesB.Scev, DesA.Scev)); + const SCEVConstant *DistToA = dyn_cast( + PSE.getSE()->getMinusSCEV(DesB.Scev, DesA.Scev)); if (!DistToA) continue; @@ -4708,7 +4734,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) { } // Find the trip count. - unsigned TC = SE->getSmallConstantTripCount(TheLoop); + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); @@ -4857,7 +4883,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { // Examine PHI nodes that are reduction variables. Update the type to // account for the recurrence type. if (PHINode *PN = dyn_cast(it)) { - if (!Legal->getReductionVars()->count(PN)) + if (!Legal->isReductionVariable(PN)) continue; RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN]; T = RdxDesc.getRecurrenceType(); @@ -4910,7 +4936,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, return 1; // Do not interleave loops with a relatively small trip count. - unsigned TC = SE->getSmallConstantTripCount(TheLoop); + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); if (TC > 1 && TC < TinyTripCountInterleaveThreshold) return 1; @@ -5024,8 +5050,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, } // Interleave if this is a large loop (small loops are already dealt with by - // this - // point) that could benefit from interleaving. + // this point) that could benefit from interleaving. bool HasReductions = (Legal->getReductionVars()->size() > 0); if (TTI.enableAggressiveInterleaving(HasReductions)) { DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); @@ -5139,15 +5164,15 @@ LoopVectorizationCostModel::calculateRegisterUsage( // Ignore instructions that are never used within the loop. if (!Ends.count(I)) continue; - // Skip ignored values. - if (ValuesToIgnore.count(I)) - continue; - // Remove all of the instructions that end at this location. InstrList &List = TransposeEnds[i]; for (unsigned int j = 0, e = List.size(); j < e; ++j) OpenIntervals.erase(List[j]); + // Skip ignored values. + if (ValuesToIgnore.count(I)) + continue; + // For each VF find the maximum usage of registers. for (unsigned j = 0, e = VFs.size(); j < e; ++j) { if (VFs[j] == 1) { @@ -5155,10 +5180,14 @@ LoopVectorizationCostModel::calculateRegisterUsage( continue; } - // Count the number of live interals. + // Count the number of live intervals. unsigned RegUsage = 0; - for (auto Inst : OpenIntervals) + for (auto Inst : OpenIntervals) { + // Skip ignored values for VF > 1. + if (VecValuesToIgnore.count(Inst)) + continue; RegUsage += GetRegUsage(Inst->getType(), VFs[j]); + } MaxUsages[j] = std::max(MaxUsages[j], RegUsage); } @@ -5302,6 +5331,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { if (VF > 1 && MinBWs.count(I)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); Type *VectorTy = ToVectorTy(RetTy, VF); + auto SE = PSE.getSE(); // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { @@ -5383,8 +5413,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::ICmp: case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); - if (VF > 1 && MinBWs.count(dyn_cast(I->getOperand(0)))) - ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]); + Instruction *Op0AsInstruction = dyn_cast(I->getOperand(0)); + auto It = MinBWs.find(Op0AsInstruction); + if (VF > 1 && It != MinBWs.end()) + ValTy = IntegerType::get(ValTy->getContext(), It->second); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); } @@ -5601,6 +5633,79 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { return false; } +void LoopVectorizationCostModel::collectValuesToIgnore() { + // Ignore ephemeral values. + CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore); + + // Ignore type-promoting instructions we identified during reduction + // detection. + for (auto &Reduction : *Legal->getReductionVars()) { + RecurrenceDescriptor &RedDes = Reduction.second; + SmallPtrSetImpl &Casts = RedDes.getCastInsts(); + VecValuesToIgnore.insert(Casts.begin(), Casts.end()); + } + + // Ignore induction phis that are only used in either GetElementPtr or ICmp + // instruction to exit loop. Induction variables usually have large types and + // can have big impact when estimating register usage. + // This is for when VF > 1. + for (auto &Induction : *Legal->getInductionVars()) { + auto *PN = Induction.first; + auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch()); + + // Check that the PHI is only used by the induction increment (UpdateV) or + // by GEPs. Then check that UpdateV is only used by a compare instruction or + // the loop header PHI. + // FIXME: Need precise def-use analysis to determine if this instruction + // variable will be vectorized. + if (std::all_of(PN->user_begin(), PN->user_end(), + [&](const User *U) -> bool { + return U == UpdateV || isa(U); + }) && + std::all_of(UpdateV->user_begin(), UpdateV->user_end(), + [&](const User *U) -> bool { + return U == PN || isa(U); + })) { + VecValuesToIgnore.insert(PN); + VecValuesToIgnore.insert(UpdateV); + } + } + + // Ignore instructions that will not be vectorized. + // This is for when VF > 1. + for (auto bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be; + ++bb) { + for (auto &Inst : **bb) { + switch (Inst.getOpcode()) { + case Instruction::GetElementPtr: { + // Ignore GEP if its last operand is an induction variable so that it is + // a consecutive load/store and won't be vectorized as scatter/gather + // pattern. + + GetElementPtrInst *Gep = cast(&Inst); + unsigned NumOperands = Gep->getNumOperands(); + unsigned InductionOperand = getGEPInductionOperand(Gep); + bool GepToIgnore = true; + + // Check that all of the gep indices are uniform except for the + // induction operand. + for (unsigned i = 0; i != NumOperands; ++i) { + if (i != InductionOperand && + !PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), + TheLoop)) { + GepToIgnore = false; + break; + } + } + + if (GepToIgnore) + VecValuesToIgnore.insert(&Inst); + break; + } + } + } + } +} void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {