X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTransforms%2FVectorize%2FLoopVectorize.cpp;h=8be31dc86d2556fd01a44c06e136fff594da72a4;hb=d6964741f52d2553517209319fa214afe8b3812b;hp=c11c66f1aeb5f77b659c515e5852f920312cd0ce;hpb=bf8772ed2cc89a495e2692919331d7a03e76d791;p=oota-llvm.git diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index c11c66f1aeb..8be31dc86d2 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7,17 +7,38 @@ // //===----------------------------------------------------------------------===// // -// This is a simple loop vectorizer. We currently only support single block -// loops. We have a very simple and restrictive legality check: we need to read -// and write from disjoint memory locations. We still don't have a cost model. -// We do support integer reductions. +// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops +// and generates target-independent LLVM-IR. Legalization of the IR is done +// in the codegen. However, the vectorizes uses (will use) the codegen +// interfaces to generate IR that is likely to result in an optimal binary. +// +// The loop vectorizer combines consecutive loop iteration into a single +// 'wide' iteration. After this transformation the index is incremented +// by the SIMD vector width, and not by one. // // This pass has three parts: // 1. The main loop pass that drives the different parts. -// 2. LoopVectorizationLegality - A helper class that checks for the legality +// 2. LoopVectorizationLegality - A unit that checks for the legality // of the vectorization. -// 3. SingleBlockLoopVectorizer - A helper class that performs the actual +// 3. SingleBlockLoopVectorizer - A unit that performs the actual // widening of instructions. +// 4. LoopVectorizationCostModel - A unit that checks for the profitability +// of vectorization. It decides on the optimal vector width, which +// can be one, if vectorization is not profitable. +// +//===----------------------------------------------------------------------===// +// +// The reduction-variable vectorization is based on the paper: +// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. +// +// Variable uniformity checks are inspired by: +// Karrenberg, R. and Hack, S. Whole Function Vectorization. +// +// Other ideas/concepts are from: +// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. +// +// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of +// Vectorizing Compilers. // //===----------------------------------------------------------------------===// #define LV_NAME "loop-vectorize" @@ -37,13 +58,15 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/TargetTransformInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -53,44 +76,71 @@ using namespace llvm; static cl::opt -DefaultVectorizationFactor("default-loop-vectorize-width", - cl::init(4), cl::Hidden, - cl::desc("Set the default loop vectorization width")); +VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, + cl::desc("Set the default vectorization width. Zero is autoselect.")); + +/// We don't vectorize loops with a known constant trip count below this number. +const unsigned TinyTripCountThreshold = 16; + +/// When performing a runtime memory check, do not check more than this +/// number of pointers. Notice that the check is quadratic! +const unsigned RuntimeMemoryCheckThreshold = 2; + +/// This is the highest vector width that we try to generate. +const unsigned MaxVectorSize = 8; + namespace { -// Forward declaration. +// Forward declarations. class LoopVectorizationLegality; - -/// Vectorize a simple loop. This class performs the widening of simple single -/// basic block loops into vectors. It does not perform any -/// vectorization-legality checks, and just does it. It widens the vectors -/// to a given vectorization factor (VF). +class LoopVectorizationCostModel; + +/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic +/// block to a specified vectorization factor (VF). +/// This class performs the widening of scalars into vectors, or multiple +/// scalars. This class also implements the following features: +/// * It inserts an epilogue loop for handling loops that don't have iteration +/// counts that are known to be a multiple of the vectorization factor. +/// * It handles the code generation for reduction variables. +/// * Scalarization (implementation using scalars) of un-vectorizable +/// instructions. +/// SingleBlockLoopVectorizer does not perform any vectorization-legality +/// checks, and relies on the caller to check for the different legality +/// aspects. The SingleBlockLoopVectorizer relies on the +/// LoopVectorizationLegality class to provide information about the induction +/// and reduction variables that were found to a given vectorization factor. class SingleBlockLoopVectorizer { public: /// Ctor. - SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li, - LPPassManager *Lpm, unsigned VecWidth): - Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth), + SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, + DominatorTree *Dt, DataLayout *Dl, + unsigned VecWidth): + OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth), Builder(Se->getContext()), Induction(0), OldInduction(0) { } // Perform the actual loop widening (vectorization). void vectorize(LoopVectorizationLegality *Legal) { - ///Create a new empty loop. Unlink the old loop and connect the new one. + // Create a new empty loop. Unlink the old loop and connect the new one. createEmptyLoop(Legal); - /// Widen each instruction in the old loop to a new one in the new loop. - /// Use the Legality module to find the induction and reduction variables. - vectorizeLoop(Legal); - // register the new loop. - cleanup(); + // Widen each instruction in the old loop to a new one in the new loop. + // Use the Legality module to find the induction and reduction variables. + vectorizeLoop(Legal); + // Register the new loop and update the analysis passes. + updateAnalysis(); } private: + /// Add code that checks at runtime if the accessed arrays overlap. + /// Returns the comperator value or NULL if no check is needed. + Value *addRuntimeCheck(LoopVectorizationLegality *Legal, + Instruction *Loc); /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(LoopVectorizationLegality *Legal); /// Copy and widen the instructions from the old loop. void vectorizeLoop(LoopVectorizationLegality *Legal); - /// Insert the new loop to the loop hierarchy and pass manager. - void cleanup(); + /// Insert the new loop to the loop hierarchy and pass manager + /// and update the analysis passes. + void updateAnalysis(); /// This instruction is un-vectorizable. Implement it as a sequence /// of scalars. @@ -121,13 +171,15 @@ private: typedef DenseMap ValueMap; /// The original loop. - Loop *Orig; + Loop *OrigLoop; // Scev analysis to use. ScalarEvolution *SE; // Loop Info. LoopInfo *LI; - // Loop Pass Manager; - LPPassManager *LPM; + // Dominator Tree. + DominatorTree *DT; + // Data Layout. + DataLayout *DL; // The vectorization factor to use. unsigned VF; @@ -136,6 +188,10 @@ private: // --- Vectorization state --- + /// The vector-loop preheader. + BasicBlock *LoopVectorPreHeader; + /// The scalar-loop preheader. + BasicBlock *LoopScalarPreHeader; /// Middle Block between the vector and the scalar. BasicBlock *LoopMiddleBlock; ///The ExitBlock of the scalar loop. @@ -155,10 +211,19 @@ private: ValueMap WidenMap; }; -/// Perform the vectorization legality check. This class does not look at the -/// profitability of vectorization, only the legality. At the moment the checks -/// are very simple and focus on single basic block loops with a constant -/// iteration count and no reductions. +/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and +/// to what vectorization factor. +/// This class does not look at the profitability of vectorization, only the +/// legality. This class has two main kinds of checks: +/// * Memory checks - The code in canVectorizeMemory checks if vectorization +/// will change the order of memory accesses in a way that will change the +/// correctness of the program. +/// * Scalars checks - The code in canVectorizeBlock checks for a number +/// of different conditions, such as the availability of a single induction +/// variable, that all types are supported and vectorize-able, etc. +/// This code reflects the capabilities of SingleBlockLoopVectorizer. +/// This class is also used by SingleBlockLoopVectorizer for identifying +/// induction variable and the different reduction variables. class LoopVectorizationLegality { public: LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl): @@ -166,25 +231,80 @@ public: /// This represents the kinds of reductions that we support. enum ReductionKind { - IntegerAdd, /// Sum of numbers. + NoReduction, /// Not a reduction. + IntegerAdd, /// Sum of numbers. IntegerMult, /// Product of numbers. - NoReduction /// Not a reduction. + IntegerOr, /// Bitwise or logical OR of numbers. + IntegerAnd, /// Bitwise or logical AND of numbers. + IntegerXor /// Bitwise or logical XOR of numbers. + }; + + /// This POD struct holds information about reduction variables. + struct ReductionDescriptor { + // Default C'tor + ReductionDescriptor(): + StartValue(0), LoopExitInstr(0), Kind(NoReduction) {} + + // C'tor. + ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K): + StartValue(Start), LoopExitInstr(Exit), Kind(K) {} + + // The starting value of the reduction. + // It does not have to be zero! + Value *StartValue; + // The instruction who's value is used outside the loop. + Instruction *LoopExitInstr; + // The kind of the reduction. + ReductionKind Kind; + }; + + // This POD struct holds information about the memory runtime legality + // check that a group of pointers do not overlap. + struct RuntimePointerCheck { + RuntimePointerCheck(): Need(false) {} + + /// Reset the state of the pointer runtime information. + void reset() { + Need = false; + Pointers.clear(); + Starts.clear(); + Ends.clear(); + } + + /// Insert a pointer and calculate the start and end SCEVs. + void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr) { + const SCEV *Sc = SE->getSCEV(Ptr); + const SCEVAddRecExpr *AR = dyn_cast(Sc); + assert(AR && "Invalid addrec expression"); + const SCEV *Ex = SE->getExitCount(Lp, Lp->getHeader()); + const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); + Pointers.push_back(Ptr); + Starts.push_back(AR->getStart()); + Ends.push_back(ScEnd); + } + + /// This flag indicates if we need to add the runtime check. + bool Need; + /// Holds the pointers that we need to check. + SmallVector Pointers; + /// Holds the pointer value at the beginning of the loop. + SmallVector Starts; + /// Holds the pointer value at the end of the loop. + SmallVector Ends; }; - // Holds a pairing of reduction instruction and the reduction kind. - typedef std::pair ReductionPair; + /// ReductionList contains the reduction descriptors for all + /// of the reductions that were found in the loop. + typedef DenseMap ReductionList; - /// ReductionList contains the reduction variables - /// as well as a single EXIT (from the block) value and the kind of - /// reduction variable.. - /// Notice that the EXIT instruction can also be the PHI itself. - typedef DenseMap ReductionList; + /// InductionList saves induction variables and maps them to the initial + /// value entring the loop. + typedef DenseMap InductionList; - /// Returns the maximum vectorization factor that we *can* use to vectorize - /// this loop. This does not mean that it is profitable to vectorize this - /// loop, only that it is legal to do so. This may be a large number. We - /// can vectorize to any SIMD width below this number. - unsigned getLoopMaxVF(); + /// Returns true if it is legal to vectorize this loop. + /// This does not mean that it is profitable to vectorize this + /// loop, only that it is legal to do so. + bool canVectorize(); /// Returns the Induction variable. PHINode *getInduction() {return Induction;} @@ -192,10 +312,23 @@ public: /// Returns the reduction variables found in the loop. ReductionList *getReductionVars() { return &Reductions; } - /// Check that the GEP operands are all uniform except for the last index - /// which has to be the induction variable. - bool isConsecutiveGep(Value *Ptr); + /// Returns the induction variables found in the loop. + InductionList *getInductionVars() { return &Inductions; } + + /// Check if this pointer is consecutive when vectorizing. This happens + /// when the last index of the GEP is the induction variable, or that the + /// pointer itself is an induction variable. + /// This check allows us to vectorize A[idx] into a wide load/store. + bool isConsecutivePtr(Value *Ptr); + + /// Returns true if the value V is uniform within the loop. + bool isUniform(Value *V); + + /// Returns true if this instruction will remain scalar after vectorization. + bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);} + /// Returns the information that we collected about runtime memory check. + RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; } private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -208,21 +341,16 @@ private: /// Returns true if BB is vectorizable bool canVectorizeMemory(BasicBlock &BB); - // Check if a pointer value is known to be disjoint. - // Example: Alloca, Global, NoAlias. - bool isIdentifiedSafeObject(Value* Val); - /// Returns True, if 'Phi' is the kind of reduction variable for type /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. bool AddReductionVar(PHINode *Phi, ReductionKind Kind); - /// Checks if a constant matches the reduction kind. - /// Sums starts with zero. Products start at one. - bool isReductionConstant(Value *V, ReductionKind Kind); /// Returns true if the instruction I can be a reduction variable of type /// 'Kind'. bool isReductionInstr(Instruction *I, ReductionKind Kind); /// Returns True, if 'Phi' is an induction variable. bool isInductionVariable(PHINode *Phi); + /// Return true if can compute the address bounds of Ptr within the loop. + bool hasComputableBounds(Value *Ptr); /// The loop that we evaluate. Loop *TheLoop; @@ -233,13 +361,72 @@ private: // --- vectorization state --- // - /// Holds the induction variable. + /// Holds the integer induction variable. This is the counter of the + /// loop. PHINode *Induction; /// Holds the reduction variables. ReductionList Reductions; + /// Holds all of the induction variables that we found in the loop. + /// Notice that inductions don't need to start at zero and that induction + /// variables can be pointers. + InductionList Inductions; + /// Allowed outside users. This holds the reduction /// vars which can be accessed from outside the loop. SmallPtrSet AllowedExit; + /// This set holds the variables which are known to be uniform after + /// vectorization. + SmallPtrSet Uniforms; + /// We need to check that all of the pointers in this list are disjoint + /// at runtime. + RuntimePointerCheck PtrRtCheck; +}; + +/// LoopVectorizationCostModel - estimates the expected speedups due to +/// vectorization. +/// In many cases vectorization is not profitable. This can happen because +/// of a number of reasons. In this class we mainly attempt to predict +/// the expected speedup/slowdowns due to the supported instruction set. +/// We use the VectorTargetTransformInfo to query the different backends +/// for the cost of different operations. +class LoopVectorizationCostModel { +public: + /// C'tor. + LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, + LoopVectorizationLegality *Leg, + const VectorTargetTransformInfo *Vtti): + TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { } + + /// Returns the most profitable vectorization factor for the loop that is + /// smaller or equal to the VF argument. This method checks every power + /// of two up to VF. + unsigned findBestVectorizationFactor(unsigned VF = MaxVectorSize); + +private: + /// Returns the expected execution cost. The unit of the cost does + /// not matter because we use the 'cost' units to compare different + /// vector widths. The cost that is returned is *not* normalized by + /// the factor width. + unsigned expectedCost(unsigned VF); + + /// Returns the execution time cost of an instruction for a given vector + /// width. Vector width of one means scalar. + unsigned getInstructionCost(Instruction *I, unsigned VF); + + /// A helper function for converting Scalar types to vector types. + /// If the incoming type is void, we return void. If the VF is 1, we return + /// the scalar type. + static Type* ToVectorTy(Type *Scalar, unsigned VF); + + /// The loop that we evaluate. + Loop *TheLoop; + /// Scev analysis. + ScalarEvolution *SE; + + /// Vectorization legality. + LoopVectorizationLegality *Legal; + /// Vector target information. + const VectorTargetTransformInfo *VTTI; }; struct LoopVectorize : public LoopPass { @@ -252,34 +439,56 @@ struct LoopVectorize : public LoopPass { ScalarEvolution *SE; DataLayout *DL; LoopInfo *LI; + TargetTransformInfo *TTI; + DominatorTree *DT; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { - - // Only vectorize innermost loops. + // We only vectorize innermost loops. if (!L->empty()) return false; SE = &getAnalysis(); DL = getAnalysisIfAvailable(); LI = &getAnalysis(); + TTI = getAnalysisIfAvailable(); + DT = &getAnalysis(); DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); // Check if it is legal to vectorize the loop. LoopVectorizationLegality LVL(L, SE, DL); - unsigned MaxVF = LVL.getLoopMaxVF(); - - // Check that we can vectorize using the chosen vectorization width. - if (MaxVF < DefaultVectorizationFactor) { - DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n"); + if (!LVL.canVectorize()) { + DEBUG(dbgs() << "LV: Not vectorizing.\n"); return false; } - DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n"); + // Select the preffered vectorization factor. + unsigned VF = 1; + if (VectorizationFactor == 0) { + const VectorTargetTransformInfo *VTTI = 0; + if (TTI) + VTTI = TTI->getVectorTargetTransformInfo(); + // Use the cost model. + LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); + VF = CM.findBestVectorizationFactor(); + + if (VF == 1) { + DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); + return false; + } + + } else { + // Use the user command flag. + VF = VectorizationFactor; + } + + DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< + L->getHeader()->getParent()->getParent()->getModuleIdentifier()<< + "\n"); - // If we decided that is is *legal* to vectorizer the loop. Do it. - SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor); + // If we decided that it is *legal* to vectorizer the loop then do it. + SingleBlockLoopVectorizer LB(L, SE, LI, DT, DL, VF); LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); @@ -292,19 +501,29 @@ struct LoopVectorize : public LoopPass { AU.addRequiredID(LCSSAID); AU.addRequired(); AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); } }; Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { - // Instructions that access the old induction variable - // actually want to get the new one. - if (V == OldInduction) - V = Induction; // Create the types. LLVMContext &C = V->getContext(); Type *VTy = VectorType::get(V->getType(), VF); Type *I32 = IntegerType::getInt32Ty(C); + + // Save the current insertion location. + Instruction *Loc = Builder.GetInsertPoint(); + + // We need to place the broadcast of invariant variables outside the loop. + bool Invariant = (OrigLoop->isLoopInvariant(V) && V != Induction); + + // Place the code for broadcasting invariant variables in the new preheader. + if (Invariant) + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + Constant *Zero = ConstantInt::get(I32, 0); Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); Value *UndefVal = UndefValue::get(VTy); @@ -313,10 +532,11 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { // Broadcast the scalar into all locations in the vector. Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, "broadcast"); - // We are accessing the induction variable. Make sure to promote the - // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes. - if (V == Induction) - return getConsecutiveVector(Shuf); + + // Restore the builder insertion point. + if (Invariant) + Builder.SetInsertPoint(Loc); + return Shuf; } @@ -340,8 +560,15 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { return Builder.CreateAdd(Val, Cv, "induction"); } -bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) { - GetElementPtrInst *Gep = dyn_cast(Ptr); +bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { + assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); + + // If this pointer is an induction variable, return it. + PHINode *Phi = dyn_cast_or_null(Ptr); + if (Phi && getInductionVars()->count(Phi)) + return true; + + GetElementPtrInst *Gep = dyn_cast_or_null(Ptr); if (!Gep) return false; @@ -353,7 +580,7 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) { if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) return false; - // We can emit wide load/stores only of the last index is the induction + // We can emit wide load/stores only if the last index is the induction // variable. const SCEV *Last = SE->getSCEV(LastIndex); if (const SCEVAddRecExpr *AR = dyn_cast(Last)) { @@ -368,28 +595,27 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) { return false; } +bool LoopVectorizationLegality::isUniform(Value *V) { + return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); +} + Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { + assert(V != Induction && "The new induction variable should not be used."); assert(!V->getType()->isVectorTy() && "Can't widen a vector"); // If we saved a vectorized copy of V, use it. - ValueMap::iterator it = WidenMap.find(V); - if (it != WidenMap.end()) - return it->second; + Value *&MapEntry = WidenMap[V]; + if (MapEntry) + return MapEntry; // Broadcast V and save the value for future uses. Value *B = getBroadcastInstrs(V); - WidenMap[V] = B; + MapEntry = B; return B; } Constant* SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { - SmallVector Indices; - // Create a vector of consecutive numbers from zero to VF. - for (unsigned i = 0; i < VF; ++i) - Indices.push_back(ConstantInt::get(ScalarTy, Val)); - - // Add the consecutive indices to the vector value. - return ConstantVector::get(Indices); + return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true)); } void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { @@ -403,7 +629,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { // If we are accessing the old induction variable, use the new one. if (SrcOp == OldInduction) { - Params.push_back(getBroadcastInstrs(Induction)); + Params.push_back(getVectorValue(SrcOp)); continue; } @@ -434,7 +660,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { if (!IsVoidRetTy) VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF)); - // For each scalar that we create. + // For each scalar that we create: for (unsigned i = 0; i < VF; ++i) { Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) @@ -462,13 +688,75 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { WidenMap[Instr] = VecResults; } -void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { +Value* +SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, + Instruction *Loc) { + LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = + Legal->getRuntimePointerCheck(); + + if (!PtrRtCheck->Need) + return NULL; + + Value *MemoryRuntimeCheck = 0; + unsigned NumPointers = PtrRtCheck->Pointers.size(); + SmallVector Starts; + SmallVector Ends; + + SCEVExpander Exp(*SE, "induction"); + + // Use this type for pointer arithmetic. + Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType(); + + for (unsigned i = 0; i < NumPointers; ++i) { + Value *Ptr = PtrRtCheck->Pointers[i]; + const SCEV *Sc = SE->getSCEV(Ptr); + + if (SE->isLoopInvariant(Sc, OrigLoop)) { + DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << + *Ptr <<"\n"); + Starts.push_back(Ptr); + Ends.push_back(Ptr); + } else { + DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); + + Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], + PtrArithTy, Loc); + Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); + Starts.push_back(Start); + Ends.push_back(End); + } + } + + for (unsigned i = 0; i < NumPointers; ++i) { + for (unsigned j = i+1; j < NumPointers; ++j) { + Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, + Starts[i], Ends[j], "bound0", Loc); + Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, + Starts[j], Ends[i], "bound1", Loc); + Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, + "found.conflict", Loc); + if (MemoryRuntimeCheck) + MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, + MemoryRuntimeCheck, + IsConflict, + "conflict.rdx", Loc); + else + MemoryRuntimeCheck = IsConflict; + + } + } + + return MemoryRuntimeCheck; +} + +void +SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- vector loop bypass. + [ ] <-- vector loop bypass. / | / v | [ ] <-- vector pre header. @@ -492,41 +780,21 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal ... */ - // This is the original scalar-loop preheader. - BasicBlock *BypassBlock = Orig->getLoopPreheader(); - BasicBlock *ExitBlock = Orig->getExitBlock(); + BasicBlock *OldBasicBlock = OrigLoop->getHeader(); + BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); + BasicBlock *ExitBlock = OrigLoop->getExitBlock(); assert(ExitBlock && "Must have an exit block"); - assert(Orig->getNumBlocks() == 1 && "Invalid loop"); - assert(BypassBlock && "Invalid loop structure"); - - BasicBlock *VectorPH = - BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); - BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), - "vector.body"); - - BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), - "middle.block"); - BasicBlock *ScalarPH = - MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), - "scalar.preheader"); - // Find the induction variable. - BasicBlock *OldBasicBlock = Orig->getHeader(); + // Some loops have a single integer induction variable, while other loops + // don't. One example is c++ iterators that often have multiple pointer + // induction variables. In the code below we also support a case where we + // don't have a single induction variable. OldInduction = Legal->getInduction(); - assert(OldInduction && "We must have a single phi node."); - Type *IdxTy = OldInduction->getType(); - - // Use this IR builder to create the loop instructions (Phi, Br, Cmp) - // inside the loop. - Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); - - // Generate the induction variable. - Induction = Builder.CreatePHI(IdxTy, 2, "index"); - Constant *Zero = ConstantInt::get(IdxTy, 0); - Constant *Step = ConstantInt::get(IdxTy, VF); + Type *IdxTy = OldInduction ? OldInduction->getType() : + DL->getIntPtrType(SE->getContext()); // Find the loop boundaries. - const SCEV *ExitCount = SE->getExitCount(Orig, Orig->getHeader()); + const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); // Get the total trip count from the count by adding 1. @@ -536,35 +804,148 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal // Expand the trip count and place the new instructions in the preheader. // Notice that the pre-header does not change, only the loop body. SCEVExpander Exp(*SE, "induction"); + + // Count holds the overall loop count (N). + Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), + BypassBlock->getTerminator()); + + // The loop index does not have to start at Zero. Find the original start + // value from the induction PHI node. If we don't have an induction variable + // then we know that it starts at zero. + Value *StartIdx = OldInduction ? + OldInduction->getIncomingValueForBlock(BypassBlock): + ConstantInt::get(IdxTy, 0); + + assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); + assert(BypassBlock && "Invalid loop structure"); + + // Generate the code that checks in runtime if arrays overlap. + Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, + BypassBlock->getTerminator()); + + // Split the single block loop into the two loop structure described above. + BasicBlock *VectorPH = + BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); + BasicBlock *VecBody = + VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); + BasicBlock *MiddleBlock = + VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); + BasicBlock *ScalarPH = + MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); + + // This is the location in which we add all of the logic for bypassing + // the new vector loop. Instruction *Loc = BypassBlock->getTerminator(); + // Use this IR builder to create the loop instructions (Phi, Br, Cmp) + // inside the loop. + Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); + + // Generate the induction variable. + Induction = Builder.CreatePHI(IdxTy, 2, "index"); + Constant *Step = ConstantInt::get(IdxTy, VF); + // We may need to extend the index in case there is a type mismatch. // We know that the count starts at zero and does not overflow. - // We are using Zext because it should be less expensive. - if (ExitCount->getType() != Induction->getType()) - ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy); + if (Count->getType() != IdxTy) { + // The exit count can be of pointer type. Convert it to the correct + // integer type. + if (ExitCount->getType()->isPointerTy()) + Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc); + else + Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc); + } + + // Add the start index to the loop count to get the new end index. + Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); - // Count holds the overall loop count (N). - Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc); // Now we need to generate the expression for N - (N % VF), which is // the part that the vectorized body will execute. Constant *CIVF = ConstantInt::get(IdxTy, VF); Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc); Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); + Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, + "end.idx.rnd.down", Loc); - // Now, compare the new count to zero. If it is zero, jump to the scalar part. + // Now, compare the new count to zero. If it is zero skip the vector loop and + // jump to the scalar loop. Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, - CountRoundDown, ConstantInt::getNullValue(IdxTy), + IdxEndRoundDown, + StartIdx, "cmp.zero", Loc); + + // If we are using memory runtime checks, include them in. + if (MemoryRuntimeCheck) + Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck, + "CntOrMem", Loc); + BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); // Remove the old terminator. Loc->eraseFromParent(); + // We are going to resume the execution of the scalar loop. + // Go over all of the induction variables that we found and fix the + // PHIs that are left in the scalar version of the loop. + // The starting values of PHI nodes depend on the counter of the last + // iteration in the vectorized loop. + // If we come from a bypass edge then we need to start from the original start + // value. + + // This variable saves the new starting index for the scalar loop. + PHINode *ResumeIndex = 0; + LoopVectorizationLegality::InductionList::iterator I, E; + LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); + for (I = List->begin(), E = List->end(); I != E; ++I) { + PHINode *OrigPhi = I->first; + PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val", + MiddleBlock->getTerminator()); + Value *EndValue = 0; + if (OrigPhi->getType()->isIntegerTy()) { + // Handle the integer induction counter: + assert(OrigPhi == OldInduction && "Unknown integer PHI"); + // We know what the end value is. + EndValue = IdxEndRoundDown; + // We also know which PHI node holds it. + ResumeIndex = ResumeVal; + } else { + // For pointer induction variables, calculate the offset using + // the end index. + EndValue = GetElementPtrInst::Create(I->second, CountRoundDown, + "ptr.ind.end", + BypassBlock->getTerminator()); + } + + // The new PHI merges the original incoming value, in case of a bypass, + // or the value at the end of the vectorized loop. + ResumeVal->addIncoming(I->second, BypassBlock); + ResumeVal->addIncoming(EndValue, VecBody); + + // Fix the scalar body counter (PHI node). + unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); + OrigPhi->setIncomingValue(BlockIdx, ResumeVal); + } + + // If we are generating a new induction variable then we also need to + // generate the code that calculates the exit value. This value is not + // simply the end of the counter because we may skip the vectorized body + // in case of a runtime check. + if (!OldInduction){ + assert(!ResumeIndex && "Unexpected resume value found"); + ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", + MiddleBlock->getTerminator()); + ResumeIndex->addIncoming(StartIdx, BypassBlock); + ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); + } + + // Make sure that we found the index where scalar loop needs to continue. + assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && + "Invalid resume Index"); + // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. - Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, - CountRoundDown, "cmp.n", + Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, + ResumeIndex, "cmp.n", MiddleBlock->getTerminator()); BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); @@ -573,36 +954,37 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal // Create i+1 and fill the PHINode. Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); - Induction->addIncoming(Zero, VectorPH); + Induction->addIncoming(StartIdx, VectorPH); Induction->addIncoming(NextIdx, VecBody); // Create the compare. - Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown); + Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); // Now we have two terminators. Remove the old one from the block. VecBody->getTerminator()->eraseFromParent(); - // Fix the scalar body iteration count. - unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH); - OldInduction->setIncomingValue(BlockIdx, CountRoundDown); - // Get ready to start creating new instructions into the vectorized body. Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); - // Register the new loop. + // Create and register the new vector loop. Loop* Lp = new Loop(); - LPM->insertLoop(Lp, Orig->getParentLoop()); + Loop *ParentLoop = OrigLoop->getParentLoop(); - Lp->addBasicBlockToLoop(VecBody, LI->getBase()); - - Loop *ParentLoop = Orig->getParentLoop(); + // Insert the new loop into the loop nest and register the new basic blocks. if (ParentLoop) { + ParentLoop->addChildLoop(Lp); ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); + } else { + LI->addTopLevelLoop(Lp); } + Lp->addBasicBlockToLoop(VecBody, LI->getBase()); + // Save the state. + LoopVectorPreHeader = VectorPH; + LoopScalarPreHeader = ScalarPH; LoopMiddleBlock = MiddleBlock; LoopExitBlock = ExitBlock; LoopVectorBody = VecBody; @@ -610,20 +992,50 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal LoopBypassBlock = BypassBlock; } +/// This function returns the identity element (or neutral element) for +/// the operation K. +static unsigned +getReductionIdentity(LoopVectorizationLegality::ReductionKind K) { + switch (K) { + case LoopVectorizationLegality::IntegerXor: + case LoopVectorizationLegality::IntegerAdd: + case LoopVectorizationLegality::IntegerOr: + // Adding, Xoring, Oring zero to a number does not change it. + return 0; + case LoopVectorizationLegality::IntegerMult: + // Multiplying a number by 1 does not change it. + return 1; + case LoopVectorizationLegality::IntegerAnd: + // AND-ing a number with an all-1 value does not change it. + return -1; + default: + llvm_unreachable("Unknown reduction kind"); + } +} + void SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { + //===------------------------------------------------===// + // + // Notice: any optimization or new instruction that go + // into the code below should be also be implemented in + // the cost-model. + // + //===------------------------------------------------===// typedef SmallVector PhiVector; - BasicBlock &BB = *Orig->getHeader(); + BasicBlock &BB = *OrigLoop->getHeader(); + Constant *Zero = ConstantInt::get( + IntegerType::getInt32Ty(BB.getContext()), 0); // In order to support reduction variables we need to be able to vectorize // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two - // steages. First, we create a new vector PHI node with no incoming edges. + // stages. First, we create a new vector PHI node with no incoming edges. // We use this value when we vectorize all of the instructions that use the // PHI. Next, after all of the instructions in the block are complete we // add the new incoming edges to the PHI. At this point all of the // instructions in the basic block are vectorized, so we can use them to // construct the PHI. - PhiVector PHIsToFix; + PhiVector RdxPHIsToFix; // For each instruction in the old loop. for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { @@ -636,15 +1048,58 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { continue; case Instruction::PHI:{ PHINode* P = cast(Inst); - // Special handling for the induction var. - if (OldInduction == Inst) + // Handle reduction variables: + if (Legal->getReductionVars()->count(P)) { + // This is phase one of vectorizing PHIs. + Type *VecTy = VectorType::get(Inst->getType(), VF); + WidenMap[Inst] = PHINode::Create(VecTy, 2, "vec.phi", + LoopVectorBody->getFirstInsertionPt()); + RdxPHIsToFix.push_back(P); continue; - // This is phase one of vectorizing PHIs. - // This has to be a reduction variable. - assert(Legal->getReductionVars()->count(P) && "Not a Reduction"); - Type *VecTy = VectorType::get(Inst->getType(), VF); - WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi"); - PHIsToFix.push_back(P); + } + + // This PHINode must be an induction variable. + // Make sure that we know about it. + assert(Legal->getInductionVars()->count(P) && + "Not an induction variable"); + + if (P->getType()->isIntegerTy()) { + assert(P == OldInduction && "Unexpected PHI"); + Value *Broadcasted = getBroadcastInstrs(Induction); + // After broadcasting the induction variable we need to make the + // vector consecutive by adding 0, 1, 2 ... + Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted); + + WidenMap[OldInduction] = ConsecutiveInduction; + continue; + } + + // Handle pointer inductions. + assert(P->getType()->isPointerTy() && "Unexpected type."); + Value *StartIdx = OldInduction ? + Legal->getInductionVars()->lookup(OldInduction) : + ConstantInt::get(Induction->getType(), 0); + + // This is the pointer value coming into the loop. + Value *StartPtr = Legal->getInductionVars()->lookup(P); + + // This is the normalized GEP that starts counting at zero. + Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, + "normalized.idx"); + + // This is the vector of results. Notice that we don't generate vector + // geps because scalar geps result in better code. + Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); + for (unsigned int i = 0; i < VF; ++i) { + Constant *Idx = ConstantInt::get(Induction->getType(), i); + Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); + Value *SclrGep = Builder.CreateGEP(StartPtr, GlobalIdx, "next.gep"); + VecVal = Builder.CreateInsertElement(VecVal, SclrGep, + Builder.getInt32(i), + "insert.gep"); + } + + WidenMap[Inst] = VecVal; continue; } case Instruction::Add: @@ -669,18 +1124,39 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { BinaryOperator *BinOp = dyn_cast(Inst); Value *A = getVectorValue(Inst->getOperand(0)); Value *B = getVectorValue(Inst->getOperand(1)); + // Use this vector value for all users of the original instruction. - WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B); + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); + WidenMap[Inst] = V; + + // Update the NSW, NUW and Exact flags. + BinaryOperator *VecOp = cast(V); + if (isa(BinOp)) { + VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); + VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); + } + if (isa(VecOp)) + VecOp->setIsExact(BinOp->isExact()); break; } case Instruction::Select: { // Widen selects. - // TODO: If the selector is loop invariant we can issue a select - // instruction with a scalar condition. - Value *A = getVectorValue(Inst->getOperand(0)); - Value *B = getVectorValue(Inst->getOperand(1)); - Value *C = getVectorValue(Inst->getOperand(2)); - WidenMap[Inst] = Builder.CreateSelect(A, B, C); + // If the selector is loop invariant we can create a select + // instruction with a scalar condition. Otherwise, use vector-select. + Value *Cond = Inst->getOperand(0); + bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + Cond = getVectorValue(Cond); + if (InvariantCond) + Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); + + Value *Op0 = getVectorValue(Inst->getOperand(1)); + Value *Op1 = getVectorValue(Inst->getOperand(2)); + WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1); break; } @@ -704,18 +1180,34 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); Value *Ptr = SI->getPointerOperand(); unsigned Alignment = SI->getAlignment(); + + assert(!Legal->isUniform(Ptr) && + "We do not allow storing to uniform addresses"); + GetElementPtrInst *Gep = dyn_cast(Ptr); + // This store does not use GEPs. - if (!Legal->isConsecutiveGep(Gep)) { + if (!Legal->isConsecutivePtr(Ptr)) { scalarizeInstruction(Inst); break; } - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast(Gep->clone()); - unsigned NumOperands = Gep->getNumOperands(); - Gep2->setOperand(NumOperands - 1, Induction); - Ptr = Builder.Insert(Gep2); + if (Gep) { + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast(Gep->clone()); + Gep2->setOperand(NumOperands - 1, LastIndex); + Ptr = Builder.Insert(Gep2); + } else { + // Use the induction element ptr. + assert(isa(Ptr) && "Invalid induction ptr"); + Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); + } Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); Value *Val = getVectorValue(SI->getValueOperand()); Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); @@ -729,17 +1221,31 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { unsigned Alignment = LI->getAlignment(); GetElementPtrInst *Gep = dyn_cast(Ptr); - // We don't have a gep. Scalarize the load. - if (!Legal->isConsecutiveGep(Gep)) { + // If the pointer is loop invariant or if it is non consecutive, + // scalarize the load. + bool Con = Legal->isConsecutivePtr(Ptr); + if (Legal->isUniform(Ptr) || !Con) { scalarizeInstruction(Inst); break; } - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast(Gep->clone()); - unsigned NumOperands = Gep->getNumOperands(); - Gep2->setOperand(NumOperands - 1, Induction); - Ptr = Builder.Insert(Gep2); + if (Gep) { + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast(Gep->clone()); + Gep2->setOperand(NumOperands - 1, LastIndex); + Ptr = Builder.Insert(Gep2); + } else { + // Use the induction element ptr. + assert(isa(Ptr) && "Invalid induction ptr"); + Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); + } + Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); LI = Builder.CreateLoad(Ptr); LI->setAlignment(Alignment); @@ -783,35 +1289,46 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Create the 'reduced' values for each of the induction vars. // The reduced values are the vector values that we scalarize and combine // after the loop is finished. - for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end(); + for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); it != e; ++it) { PHINode *RdxPhi = *it; PHINode *VecRdxPhi = dyn_cast(WidenMap[RdxPhi]); assert(RdxPhi && "Unable to recover vectorized PHI"); - // Find the reduction variable. + // Find the reduction variable descriptor. assert(Legal->getReductionVars()->count(RdxPhi) && "Unable to find the reduction variable"); - LoopVectorizationLegality::ReductionPair ReductionVar = + LoopVectorizationLegality::ReductionDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; + // We need to generate a reduction vector from the incoming scalar. + // To do so, we need to generate the 'identity' vector and overide + // one of the elements with the incoming scalar reduction. We need + // to do it in the vector-loop preheader. + Builder.SetInsertPoint(LoopBypassBlock->getTerminator()); + // This is the vector-clone of the value that leaves the loop. - Value *VectorExit = getVectorValue(ReductionVar.first); + Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr); Type *VecTy = VectorExit->getType(); - // This is the kind of reduction. - LoopVectorizationLegality::ReductionKind RdxKind = ReductionVar.second; - // Find the reduction identity variable. - // Zero for addition. One for Multiplication. - unsigned IdentitySclr = - (RdxKind == LoopVectorizationLegality::IntegerAdd ? 0 : 1); - Constant *Identity = getUniformVector(IdentitySclr, VecTy->getScalarType()); + // Find the reduction identity variable. Zero for addition, or, xor, + // one for multiplication, -1 for And. + Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind), + VecTy->getScalarType()); + + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + Value *VectorStart = Builder.CreateInsertElement(Identity, + RdxDesc.StartValue, Zero); // Fix the vector-loop phi. // We created the induction variable so we know that the // preheader is the first entry. BasicBlock *VecPreheader = Induction->getIncomingBlock(0); - VecRdxPhi->addIncoming(Identity, VecPreheader); + + // Reductions do not have to start at zero. They can start with + // any loop invariant values. + VecRdxPhi->addIncoming(VectorStart, VecPreheader); unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx)); VecRdxPhi->addIncoming(Val, LoopVectorBody); @@ -823,22 +1340,36 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); // This PHINode contains the vectorized reduction variable, or - // the identity vector, if we bypass the vector loop. + // the initial value vector, if we bypass the vector loop. PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); - NewPhi->addIncoming(Identity, LoopBypassBlock); - NewPhi->addIncoming(getVectorValue(ReductionVar.first), LoopVectorBody); + NewPhi->addIncoming(VectorStart, LoopBypassBlock); + NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); // Extract the first scalar. Value *Scalar0 = Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); - // Extract and sum the remaining vector elements. + // Extract and reduce the remaining vector elements. for (unsigned i=1; i < VF; ++i) { Value *Scalar1 = Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); - if (RdxKind == LoopVectorizationLegality::IntegerAdd) { - Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); - } else { - Scalar0 = Builder.CreateMul(Scalar0, Scalar1); + switch (RdxDesc.Kind) { + case LoopVectorizationLegality::IntegerAdd: + Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerMult: + Scalar0 = Builder.CreateMul(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerOr: + Scalar0 = Builder.CreateOr(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerAnd: + Scalar0 = Builder.CreateAnd(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerXor: + Scalar0 = Builder.CreateXor(Scalar0, Scalar1); + break; + default: + llvm_unreachable("Unknown reduction operation"); } } @@ -851,11 +1382,13 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { PHINode *LCSSAPhi = dyn_cast(LEI); if (!LCSSAPhi) continue; - // All PHINodes need to have a single entry edge, or two if we already fixed them. + // All PHINodes need to have a single entry edge, or two if + // we already fixed them. assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); - // We found our reduction value exit-PHI. Update it with the incoming bypass edge. - if (LCSSAPhi->getIncomingValue(0) == ReductionVar.first) { + // We found our reduction value exit-PHI. Update it with the + // incoming bypass edge. + if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { // Add an edge coming from the bypass. LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); break; @@ -867,72 +1400,102 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block. (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); - (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, ReductionVar.first); + (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); }// end of for each redux variable. } -void SingleBlockLoopVectorizer::cleanup() { - // The original basic block. - SE->forgetLoop(Orig); +void SingleBlockLoopVectorizer::updateAnalysis() { + // Forget the original basic block. + SE->forgetLoop(OrigLoop); + + // Update the dominator tree information. + assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) && + "Entry does not dominate exit."); + + DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock); + DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); + DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock); + DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); + DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); + DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); + + DEBUG(DT->verifyAnalysis()); } -unsigned LoopVectorizationLegality::getLoopMaxVF() { - if (!TheLoop->getLoopPreheader()) { - assert(false && "No preheader!!"); - DEBUG(dbgs() << "LV: Loop not normalized." << "\n"); - return 1; - } +bool LoopVectorizationLegality::canVectorize() { + assert(TheLoop->getLoopPreheader() && "No preheader!!"); // We can only vectorize single basic block loops. unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1) { DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n"); - return 1; + return false; } // We need to have a loop header. BasicBlock *BB = TheLoop->getHeader(); DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n"); - // Go over each instruction and look at memory deps. - if (!canVectorizeBlock(*BB)) { - DEBUG(dbgs() << "LV: Can't vectorize this loop header\n"); - return 1; - } - // ScalarEvolution needs to be able to find the exit count. const SCEV *ExitCount = SE->getExitCount(TheLoop, BB); if (ExitCount == SE->getCouldNotCompute()) { DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); - return 1; + return false; + } + + // Do not loop-vectorize loops with a tiny trip count. + unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB); + if (TC > 0u && TC < TinyTripCountThreshold) { + DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << + "This loop is not worth vectorizing.\n"); + return false; + } + + // Go over each instruction and look at memory deps. + if (!canVectorizeBlock(*BB)) { + DEBUG(dbgs() << "LV: Can't vectorize this loop header\n"); + return false; } - DEBUG(dbgs() << "LV: We can vectorize this loop!\n"); + DEBUG(dbgs() << "LV: We can vectorize this loop" << + (PtrRtCheck.Need ? " (with a runtime bound check)" : "") + <<"!\n"); // Okay! We can vectorize. At this point we don't have any other mem analysis - // which may limit our maximum vectorization factor, so just return the - // maximum SIMD size. - return DefaultVectorizationFactor; + // which may limit our maximum vectorization factor, so just return true with + // no restrictions. + return true; } bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { + + BasicBlock *PreHeader = TheLoop->getLoopPreheader(); + // Scan the instructions in the block and look for hazards. for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { Instruction *I = it; - PHINode *Phi = dyn_cast(I); - if (Phi) { + if (PHINode *Phi = dyn_cast(I)) { // This should not happen because the loop should be normalized. if (Phi->getNumIncomingValues() != 2) { DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; } - // We only look at integer phi nodes. - if (!Phi->getType()->isIntegerTy()) { - DEBUG(dbgs() << "LV: Found an non-int PHI.\n"); + + // This is the value coming from the preheader. + Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); + + // We only look at integer and pointer phi nodes. + if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) { + DEBUG(dbgs() << "LV: Found a pointer induction variable.\n"); + Inductions[Phi] = StartValue; + continue; + } else if (!Phi->getType()->isIntegerTy()) { + DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); return false; } + // Handle integer PHIs: if (isInductionVariable(Phi)) { if (Induction) { DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); @@ -940,6 +1503,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { } DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n"); Induction = Phi; + Inductions[Phi] = StartValue; continue; } if (AddReductionVar(Phi, IntegerAdd)) { @@ -947,16 +1511,30 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { continue; } if (AddReductionVar(Phi, IntegerMult)) { - DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n"); + DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); continue; } + if (AddReductionVar(Phi, IntegerOr)) { + DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, IntegerAnd)) { + DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, IntegerXor)) { + DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); + continue; + } + + DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); + return false; }// end of PHI handling // We still don't handle functions. CallInst *CI = dyn_cast(I); if (CI) { - DEBUG(dbgs() << "LV: Found a call site:"<< - CI->getCalledFunction()->getName() << "\n"); + DEBUG(dbgs() << "LV: Found a call site.\n"); return false; } @@ -984,13 +1562,39 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { } // next instr. if (!Induction) { - DEBUG(dbgs() << "LV: Did not find an induction var.\n"); - return false; + DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); + assert(getInductionVars()->size() && "No induction variables"); + } + + // Don't vectorize if the memory dependencies do not allow vectorization. + if (!canVectorizeMemory(BB)) + return false; + + // We now know that the loop is vectorizable! + // Collect variables that will remain uniform after vectorization. + std::vector Worklist; + + // Start with the conditional branch and walk up the block. + Worklist.push_back(BB.getTerminator()->getOperand(0)); + + while (Worklist.size()) { + Instruction *I = dyn_cast(Worklist.back()); + Worklist.pop_back(); + + // Look at instructions inside this block. Stop when reaching PHI nodes. + if (!I || I->getParent() != &BB || isa(I)) + continue; + + // This is a known uniform. + Uniforms.insert(I); + + // Insert all operands. + for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) { + Worklist.push_back(I->getOperand(i)); + } } - // If the memory dependencies do not prevent us from - // vectorizing, then vectorize. - return canVectorizeMemory(BB); + return true; } bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { @@ -999,6 +1603,8 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { // Holds the Load and Store *instructions*. ValueVector Loads; ValueVector Stores; + PtrRtCheck.Pointers.clear(); + PtrRtCheck.Need = false; // Scan the BB and collect legal loads and stores. for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { @@ -1056,6 +1662,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { StoreInst *ST = dyn_cast(*I); assert(ST && "Bad StoreInst"); Value* Ptr = ST->getPointerOperand(); + + if (isUniform(Ptr)) { + DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); + return false; + } + // If we did *not* see this pointer before, insert it to // the read-write list. At this phase it is only a 'write' list. if (Seen.insert(Ptr)) @@ -1074,10 +1686,50 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { // If the address of i is unknown (for example A[B[i]]) then we may // read a few words, modify, and write a few words, and some of the // words may be written to the same address. - if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr)) + if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr)) Reads.push_back(Ptr); } + // If we write (or read-write) to a single destination and there are no + // other reads in this loop then is it safe to vectorize. + if (ReadWrites.size() == 1 && Reads.size() == 0) { + DEBUG(dbgs() << "LV: Found a write-only loop!\n"); + return true; + } + + // Find pointers with computable bounds. We are going to use this information + // to place a runtime bound check. + bool RT = true; + for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) + if (hasComputableBounds(*I)) { + PtrRtCheck.insert(SE, TheLoop, *I); + DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); + } else { + RT = false; + break; + } + for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) + if (hasComputableBounds(*I)) { + PtrRtCheck.insert(SE, TheLoop, *I); + DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); + } else { + RT = false; + break; + } + + // Check that we did not collect too many pointers or found a + // unsizeable pointer. + if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { + PtrRtCheck.reset(); + RT = false; + } + + PtrRtCheck.Need = RT; + + if (RT) { + DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); + } + // Now that the pointers are in two lists (Reads and ReadWrites), we // can check that there are no conflicts between each of the writes and // between the writes to the reads. @@ -1090,14 +1742,14 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { GetUnderlyingObjects(*I, TempObjects, DL); for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); it != e; ++it) { - if (!isIdentifiedSafeObject(*it)) { + if (!isIdentifiedObject(*it)) { DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); - return false; + return RT; } if (!WriteObjects.insert(*it)) { DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **it <<"\n"); - return false; + return RT; } } TempObjects.clear(); @@ -1108,39 +1760,27 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { GetUnderlyingObjects(*I, TempObjects, DL); for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); it != e; ++it) { - if (!isIdentifiedSafeObject(*it)) { + if (!isIdentifiedObject(*it)) { DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); - return false; + return RT; } if (WriteObjects.count(*it)) { DEBUG(dbgs() << "LV: Found a possible read/write reorder:" << **it <<"\n"); - return false; + return RT; } } TempObjects.clear(); } - // All is okay. + // It is safe to vectorize and we don't need any runtime checks. + DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n"); + PtrRtCheck.reset(); return true; } -/// Checks if the value is a Global variable or if it is an Arguments -/// marked with the NoAlias attribute. -bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) { - assert(Val && "Invalid value"); - if (dyn_cast(Val)) - return true; - if (dyn_cast(Val)) - return true; - Argument *A = dyn_cast(Val); - if (!A) - return false; - return A->hasNoAliasAttr(); -} - bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, - ReductionKind Kind) { + ReductionKind Kind) { if (Phi->getNumIncomingValues() != 2) return false; @@ -1150,10 +1790,6 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry. Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx); - // We must have a constant that starts the reduction. - if (!isReductionConstant(RdxStart, Kind)) - return false; - // ExitInstruction is the single value which is used outside the loop. // We only allow for a single reduction value to be used outside the loop. // This includes users of the reduction, variables (which form a cycle @@ -1175,6 +1811,12 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, bool FoundInBlockUser = false; // Did we reach the initial PHI node ? bool FoundStartPHI = false; + + // If the instruction has no users then this is a broken + // chain and can't be a reduction variable. + if (Iter->use_empty()) + return false; + // For each of the *users* of iter. for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); it != e; ++it) { @@ -1205,25 +1847,15 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, if (FoundStartPHI && ExitInstruction) { // This instruction is allowed to have out-of-loop users. AllowedExit.insert(ExitInstruction); - // Mark this as a reduction var. - Reductions[Phi] = std::make_pair(ExitInstruction, Kind); + + // Save the description of this reduction variable. + ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); + Reductions[Phi] = RD; return true; } } } -bool -LoopVectorizationLegality::isReductionConstant(Value *V, ReductionKind Kind) { - ConstantInt *CI = dyn_cast(V); - if (!CI) - return false; - if (Kind == IntegerMult && CI->isOne()) - return true; - if (Kind == IntegerAdd && CI->isZero()) - return true; - return false; -} - bool LoopVectorizationLegality::isReductionInstr(Instruction *I, ReductionKind Kind) { @@ -1237,13 +1869,22 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, case Instruction::Sub: return Kind == IntegerAdd; case Instruction::Mul: - case Instruction::UDiv: - case Instruction::SDiv: return Kind == IntegerMult; + case Instruction::And: + return Kind == IntegerAnd; + case Instruction::Or: + return Kind == IntegerOr; + case Instruction::Xor: + return Kind == IntegerXor; } } bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { + Type *PhiTy = Phi->getType(); + // We only handle integer and pointer inductions variables. + if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) + return false; + // Check that the PHI is consecutive and starts at zero. const SCEV *PhiScev = SE->getSCEV(Phi); const SCEVAddRecExpr *AR = dyn_cast(PhiScev); @@ -1252,13 +1893,234 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { return false; } const SCEV *Step = AR->getStepRecurrence(*SE); - const SCEV *Start = AR->getStart(); - if (!Step->isOne() || !Start->isZero()) { - DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n"); + // Integer inductions need to have a stride of one. + if (PhiTy->isIntegerTy()) + return Step->isOne(); + + // Calculate the pointer stride and check if it is consecutive. + const SCEVConstant *C = dyn_cast(Step); + if (!C) return false; + + assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); + uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType()); + return (C->getValue()->equalsInt(Size)); +} + +bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { + const SCEV *PhiScev = SE->getSCEV(Ptr); + const SCEVAddRecExpr *AR = dyn_cast(PhiScev); + if (!AR) return false; + + return AR->isAffine(); +} + +unsigned +LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) { + if (!VTTI) { + DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n"); + return 1; } - return true; + + float Cost = expectedCost(1); + unsigned Width = 1; + DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n"); + for (unsigned i=2; i <= VF; i*=2) { + // Notice that the vector loop needs to be executed less times, so + // we need to divide the cost of the vector loops by the width of + // the vector elements. + float VectorCost = expectedCost(i) / (float)i; + DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " << + (int)VectorCost << ".\n"); + if (VectorCost < Cost) { + Cost = VectorCost; + Width = i; + } + } + + DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); + return Width; +} + +unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { + // We can only estimate the cost of single basic block loops. + assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop"); + + BasicBlock *BB = TheLoop->getHeader(); + unsigned Cost = 0; + + // For each instruction in the old loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + Instruction *Inst = it; + unsigned C = getInstructionCost(Inst, VF); + Cost += C; + DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF << + " For instruction: "<< *Inst << "\n"); + } + + return Cost; +} + +unsigned +LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { + assert(VTTI && "Invalid vector target transformation info"); + + // If we know that this instruction will remain uniform, check the cost of + // the scalar version. + if (Legal->isUniformAfterVectorization(I)) + VF = 1; + + Type *RetTy = I->getType(); + Type *VectorTy = ToVectorTy(RetTy, VF); + + + // TODO: We need to estimate the cost of intrinsic calls. + switch (I->getOpcode()) { + case Instruction::GetElementPtr: + // We mark this instruction as zero-cost because scalar GEPs are usually + // lowered to the intruction addressing mode. At the moment we don't + // generate vector geps. + return 0; + case Instruction::Br: { + return VTTI->getCFInstrCost(I->getOpcode()); + } + case Instruction::PHI: + return 0; + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); + case Instruction::Select: { + SelectInst *SI = cast(I); + const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); + bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); + Type *CondTy = SI->getCondition()->getType(); + if (ScalarCond) + CondTy = VectorType::get(CondTy, VF); + + return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); + } + case Instruction::ICmp: + case Instruction::FCmp: { + Type *ValTy = I->getOperand(0)->getType(); + VectorTy = ToVectorTy(ValTy, VF); + return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); + } + case Instruction::Store: { + StoreInst *SI = cast(I); + Type *ValTy = SI->getValueOperand()->getType(); + VectorTy = ToVectorTy(ValTy, VF); + + if (VF == 1) + return VTTI->getMemoryOpCost(I->getOpcode(), ValTy, + SI->getAlignment(), SI->getPointerAddressSpace()); + + // Scalarized stores. + if (!Legal->isConsecutivePtr(SI->getPointerOperand())) { + unsigned Cost = 0; + unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, + ValTy); + // The cost of extracting from the value vector. + Cost += VF * (ExtCost); + // The cost of the scalar stores. + Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), + ValTy->getScalarType(), + SI->getAlignment(), + SI->getPointerAddressSpace()); + return Cost; + } + + // Wide stores. + return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(), + SI->getPointerAddressSpace()); + } + case Instruction::Load: { + LoadInst *LI = cast(I); + + if (VF == 1) + return VTTI->getMemoryOpCost(I->getOpcode(), RetTy, + LI->getAlignment(), + LI->getPointerAddressSpace()); + + // Scalarized loads. + if (!Legal->isConsecutivePtr(LI->getPointerOperand())) { + unsigned Cost = 0; + unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy); + // The cost of inserting the loaded value into the result vector. + Cost += VF * (InCost); + // The cost of the scalar stores. + Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), + RetTy->getScalarType(), + LI->getAlignment(), + LI->getPointerAddressSpace()); + return Cost; + } + + // Wide loads. + return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), + LI->getPointerAddressSpace()); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); + return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); + } + default: { + // We are scalarizing the instruction. Return the cost of the scalar + // instruction, plus the cost of insert and extract into vector + // elements, times the vector width. + unsigned Cost = 0; + + bool IsVoid = RetTy->isVoidTy(); + + unsigned InsCost = (IsVoid ? 0 : + VTTI->getInstrCost(Instruction::InsertElement, + VectorTy)); + + unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, + VectorTy); + + // The cost of inserting the results plus extracting each one of the + // operands. + Cost += VF * (InsCost + ExtCost * I->getNumOperands()); + + // The cost of executing VF copies of the scalar instruction. + Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy); + return Cost; + } + }// end of switch. +} + +Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { + if (Scalar->isVoidTy() || VF == 1) + return Scalar; + return VectorType::get(Scalar, VF); } } // namespace