// 4. LoopVectorizationCostModel - A unit that checks for the profitability
// of vectorization. It decides on the optimal vector width, which
// can be one, if vectorization is not profitable.
+//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
+// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
+// Vectorizing Compilers.
+//
//===----------------------------------------------------------------------===//
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/LoopInfo.h"
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
cl::desc("Set the default vectorization width. Zero is autoselect."));
+/// We don't vectorize loops with a known constant trip count below this number.
+const unsigned TinyTripCountThreshold = 16;
+
+/// When performing a runtime memory check, do not check more than this
+/// number of pointers. Notice that the check is quadratic!
+const unsigned RuntimeMemoryCheckThreshold = 2;
+
+/// This is the highest vector width that we try to generate.
+const unsigned MaxVectorSize = 8;
+
namespace {
// Forward declarations.
public:
/// Ctor.
SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
- LPPassManager *Lpm, unsigned VecWidth):
- OrigLoop(Orig), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
+ DominatorTree *Dt, DataLayout *Dl,
+ unsigned VecWidth):
+ OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth),
Builder(Se->getContext()), Induction(0), OldInduction(0) { }
// Perform the actual loop widening (vectorization).
void vectorize(LoopVectorizationLegality *Legal) {
- ///Create a new empty loop. Unlink the old loop and connect the new one.
+ // Create a new empty loop. Unlink the old loop and connect the new one.
createEmptyLoop(Legal);
- /// Widen each instruction in the old loop to a new one in the new loop.
- /// Use the Legality module to find the induction and reduction variables.
+ // Widen each instruction in the old loop to a new one in the new loop.
+ // Use the Legality module to find the induction and reduction variables.
vectorizeLoop(Legal);
- // register the new loop.
- cleanup();
+ // Register the new loop and update the analysis passes.
+ updateAnalysis();
}
private:
+ /// Add code that checks at runtime if the accessed arrays overlap.
+ /// Returns the comperator value or NULL if no check is needed.
+ Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
+ Instruction *Loc);
/// Create an empty loop, based on the loop ranges of the old loop.
void createEmptyLoop(LoopVectorizationLegality *Legal);
/// Copy and widen the instructions from the old loop.
void vectorizeLoop(LoopVectorizationLegality *Legal);
- /// Insert the new loop to the loop hierarchy and pass manager.
- void cleanup();
+ /// Insert the new loop to the loop hierarchy and pass manager
+ /// and update the analysis passes.
+ void updateAnalysis();
/// This instruction is un-vectorizable. Implement it as a sequence
/// of scalars.
ScalarEvolution *SE;
// Loop Info.
LoopInfo *LI;
- // Loop Pass Manager;
- LPPassManager *LPM;
+ // Dominator Tree.
+ DominatorTree *DT;
+ // Data Layout.
+ DataLayout *DL;
// The vectorization factor to use.
unsigned VF;
// --- Vectorization state ---
+ /// The vector-loop preheader.
+ BasicBlock *LoopVectorPreHeader;
+ /// The scalar-loop preheader.
+ BasicBlock *LoopScalarPreHeader;
/// Middle Block between the vector and the scalar.
BasicBlock *LoopMiddleBlock;
///The ExitBlock of the scalar loop.
TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
/// This represents the kinds of reductions that we support.
- /// We use the enum values to hold the 'identity' value for
- /// each operand. This value does not change the result if applied.
enum ReductionKind {
- NoReduction = -1, /// Not a reduction.
- IntegerAdd = 0, /// Sum of numbers.
- IntegerMult = 1, /// Product of numbers.
- IntegerOr = 2, /// Bitwise or logical OR of numbers.
- IntegerAnd = 3, /// Bitwise or logical AND of numbers.
- IntegerXor = 4 /// Bitwise or logical XOR of numbers.
+ NoReduction, /// Not a reduction.
+ IntegerAdd, /// Sum of numbers.
+ IntegerMult, /// Product of numbers.
+ IntegerOr, /// Bitwise or logical OR of numbers.
+ IntegerAnd, /// Bitwise or logical AND of numbers.
+ IntegerXor /// Bitwise or logical XOR of numbers.
};
/// This POD struct holds information about reduction variables.
ReductionKind Kind;
};
+ // This POD struct holds information about the memory runtime legality
+ // check that a group of pointers do not overlap.
+ struct RuntimePointerCheck {
+ RuntimePointerCheck(): Need(false) {}
+
+ /// Reset the state of the pointer runtime information.
+ void reset() {
+ Need = false;
+ Pointers.clear();
+ Starts.clear();
+ Ends.clear();
+ }
+
+ /// Insert a pointer and calculate the start and end SCEVs.
+ void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr) {
+ const SCEV *Sc = SE->getSCEV(Ptr);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
+ assert(AR && "Invalid addrec expression");
+ const SCEV *Ex = SE->getExitCount(Lp, Lp->getHeader());
+ const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
+ Pointers.push_back(Ptr);
+ Starts.push_back(AR->getStart());
+ Ends.push_back(ScEnd);
+ }
+
+ /// This flag indicates if we need to add the runtime check.
+ bool Need;
+ /// Holds the pointers that we need to check.
+ SmallVector<Value*, 2> Pointers;
+ /// Holds the pointer value at the beginning of the loop.
+ SmallVector<const SCEV*, 2> Starts;
+ /// Holds the pointer value at the end of the loop.
+ SmallVector<const SCEV*, 2> Ends;
+ };
+
/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
+ /// InductionList saves induction variables and maps them to the initial
+ /// value entring the loop.
+ typedef DenseMap<PHINode*, Value*> InductionList;
+
/// Returns true if it is legal to vectorize this loop.
/// This does not mean that it is profitable to vectorize this
/// loop, only that it is legal to do so.
/// Returns the reduction variables found in the loop.
ReductionList *getReductionVars() { return &Reductions; }
- /// Check if the pointer returned by this GEP is consecutive
- /// when the index is vectorized. This happens when the last
- /// index of the GEP is consecutive, like the induction variable.
+ /// Returns the induction variables found in the loop.
+ InductionList *getInductionVars() { return &Inductions; }
+
+ /// Check if this pointer is consecutive when vectorizing. This happens
+ /// when the last index of the GEP is the induction variable, or that the
+ /// pointer itself is an induction variable.
/// This check allows us to vectorize A[idx] into a wide load/store.
- bool isConsecutiveGep(Value *Ptr);
+ bool isConsecutivePtr(Value *Ptr);
+
+ /// Returns true if the value V is uniform within the loop.
+ bool isUniform(Value *V);
/// Returns true if this instruction will remain scalar after vectorization.
bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
+ /// Returns the information that we collected about runtime memory check.
+ RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; }
private:
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
bool isReductionInstr(Instruction *I, ReductionKind Kind);
/// Returns True, if 'Phi' is an induction variable.
bool isInductionVariable(PHINode *Phi);
+ /// Return true if can compute the address bounds of Ptr within the loop.
+ bool hasComputableBounds(Value *Ptr);
/// The loop that we evaluate.
Loop *TheLoop;
// --- vectorization state --- //
- /// Holds the induction variable.
+ /// Holds the integer induction variable. This is the counter of the
+ /// loop.
PHINode *Induction;
/// Holds the reduction variables.
ReductionList Reductions;
+ /// Holds all of the induction variables that we found in the loop.
+ /// Notice that inductions don't need to start at zero and that induction
+ /// variables can be pointers.
+ InductionList Inductions;
+
/// Allowed outside users. This holds the reduction
/// vars which can be accessed from outside the loop.
SmallPtrSet<Value*, 4> AllowedExit;
/// This set holds the variables which are known to be uniform after
/// vectorization.
SmallPtrSet<Instruction*, 4> Uniforms;
+ /// We need to check that all of the pointers in this list are disjoint
+ /// at runtime.
+ RuntimePointerCheck PtrRtCheck;
};
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// Returns the most profitable vectorization factor for the loop that is
/// smaller or equal to the VF argument. This method checks every power
/// of two up to VF.
- unsigned findBestVectorizationFactor(unsigned VF = 8);
+ unsigned findBestVectorizationFactor(unsigned VF = MaxVectorSize);
private:
/// Returns the expected execution cost. The unit of the cost does
DataLayout *DL;
LoopInfo *LI;
TargetTransformInfo *TTI;
+ DominatorTree *DT;
virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
// We only vectorize innermost loops.
DL = getAnalysisIfAvailable<DataLayout>();
LI = &getAnalysis<LoopInfo>();
TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+ DT = &getAnalysis<DominatorTree>();
DEBUG(dbgs() << "LV: Checking a loop in \"" <<
L->getHeader()->getParent()->getName() << "\"\n");
VF = VectorizationFactor;
}
- DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ").\n");
+ DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
+ L->getHeader()->getParent()->getParent()->getModuleIdentifier()<<
+ "\n");
// If we decided that it is *legal* to vectorizer the loop then do it.
- SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, VF);
+ SingleBlockLoopVectorizer LB(L, SE, LI, DT, DL, VF);
LB.vectorize(&LVL);
DEBUG(verifyFunction(*L->getHeader()->getParent()));
AU.addRequiredID(LCSSAID);
AU.addRequired<LoopInfo>();
AU.addRequired<ScalarEvolution>();
+ AU.addRequired<DominatorTree>();
+ AU.addPreserved<LoopInfo>();
+ AU.addPreserved<DominatorTree>();
}
};
Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
- // Instructions that access the old induction variable
- // actually want to get the new one.
- if (V == OldInduction)
- V = Induction;
// Create the types.
LLVMContext &C = V->getContext();
Type *VTy = VectorType::get(V->getType(), VF);
Type *I32 = IntegerType::getInt32Ty(C);
+
+ // Save the current insertion location.
+ Instruction *Loc = Builder.GetInsertPoint();
+
+ // We need to place the broadcast of invariant variables outside the loop.
+ bool Invariant = (OrigLoop->isLoopInvariant(V) && V != Induction);
+
+ // Place the code for broadcasting invariant variables in the new preheader.
+ if (Invariant)
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+
Constant *Zero = ConstantInt::get(I32, 0);
Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
Value *UndefVal = UndefValue::get(VTy);
// Broadcast the scalar into all locations in the vector.
Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
"broadcast");
- // We are accessing the induction variable. Make sure to promote the
- // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
- if (V == Induction)
- return getConsecutiveVector(Shuf);
+
+ // Restore the builder insertion point.
+ if (Invariant)
+ Builder.SetInsertPoint(Loc);
+
return Shuf;
}
return Builder.CreateAdd(Val, Cv, "induction");
}
-bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
+bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+ assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
+
+ // If this pointer is an induction variable, return it.
+ PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
+ if (Phi && getInductionVars()->count(Phi))
+ return true;
+
GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
if (!Gep)
return false;
if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
return false;
- // We can emit wide load/stores only of the last index is the induction
+ // We can emit wide load/stores only if the last index is the induction
// variable.
const SCEV *Last = SE->getSCEV(LastIndex);
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
return false;
}
+bool LoopVectorizationLegality::isUniform(Value *V) {
+ return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
+}
+
Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
+ assert(V != Induction && "The new induction variable should not be used.");
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
// If we saved a vectorized copy of V, use it.
Value *&MapEntry = WidenMap[V];
Constant*
SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
- SmallVector<Constant*, 8> Indices;
- // Create a vector of consecutive numbers from zero to VF.
- for (unsigned i = 0; i < VF; ++i)
- Indices.push_back(ConstantInt::get(ScalarTy, Val));
-
- // Add the consecutive indices to the vector value.
- return ConstantVector::get(Indices);
+ return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true));
}
void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
// If we are accessing the old induction variable, use the new one.
if (SrcOp == OldInduction) {
- Params.push_back(getBroadcastInstrs(Induction));
+ Params.push_back(getVectorValue(SrcOp));
continue;
}
WidenMap[Instr] = VecResults;
}
-void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
+Value*
+SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
+ Instruction *Loc) {
+ LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
+ Legal->getRuntimePointerCheck();
+
+ if (!PtrRtCheck->Need)
+ return NULL;
+
+ Value *MemoryRuntimeCheck = 0;
+ unsigned NumPointers = PtrRtCheck->Pointers.size();
+ SmallVector<Value* , 2> Starts;
+ SmallVector<Value* , 2> Ends;
+
+ SCEVExpander Exp(*SE, "induction");
+
+ // Use this type for pointer arithmetic.
+ Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();
+
+ for (unsigned i = 0; i < NumPointers; ++i) {
+ Value *Ptr = PtrRtCheck->Pointers[i];
+ const SCEV *Sc = SE->getSCEV(Ptr);
+
+ if (SE->isLoopInvariant(Sc, OrigLoop)) {
+ DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" <<
+ *Ptr <<"\n");
+ Starts.push_back(Ptr);
+ Ends.push_back(Ptr);
+ } else {
+ DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+
+ Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i],
+ PtrArithTy, Loc);
+ Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
+ Starts.push_back(Start);
+ Ends.push_back(End);
+ }
+ }
+
+ for (unsigned i = 0; i < NumPointers; ++i) {
+ for (unsigned j = i+1; j < NumPointers; ++j) {
+ Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+ Starts[i], Ends[j], "bound0", Loc);
+ Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+ Starts[j], Ends[i], "bound1", Loc);
+ Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
+ "found.conflict", Loc);
+ if (MemoryRuntimeCheck)
+ MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
+ MemoryRuntimeCheck,
+ IsConflict,
+ "conflict.rdx", Loc);
+ else
+ MemoryRuntimeCheck = IsConflict;
+
+ }
+ }
+
+ return MemoryRuntimeCheck;
+}
+
+void
+SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
...
*/
- // This is the original scalar-loop preheader.
+ BasicBlock *OldBasicBlock = OrigLoop->getHeader();
BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
BasicBlock *ExitBlock = OrigLoop->getExitBlock();
assert(ExitBlock && "Must have an exit block");
- assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
- assert(BypassBlock && "Invalid loop structure");
-
- BasicBlock *VectorPH =
- BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
- BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
- "vector.body");
-
- BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
- "middle.block");
- BasicBlock *ScalarPH =
- MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
- "scalar.preheader");
- // Find the induction variable.
- BasicBlock *OldBasicBlock = OrigLoop->getHeader();
+ // Some loops have a single integer induction variable, while other loops
+ // don't. One example is c++ iterators that often have multiple pointer
+ // induction variables. In the code below we also support a case where we
+ // don't have a single induction variable.
OldInduction = Legal->getInduction();
- assert(OldInduction && "We must have a single phi node.");
- Type *IdxTy = OldInduction->getType();
-
- // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
- // inside the loop.
- Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
-
- // Generate the induction variable.
- Induction = Builder.CreatePHI(IdxTy, 2, "index");
- Constant *Zero = ConstantInt::get(IdxTy, 0);
- Constant *Step = ConstantInt::get(IdxTy, VF);
+ Type *IdxTy = OldInduction ? OldInduction->getType() :
+ DL->getIntPtrType(SE->getContext());
// Find the loop boundaries.
const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
// Expand the trip count and place the new instructions in the preheader.
// Notice that the pre-header does not change, only the loop body.
SCEVExpander Exp(*SE, "induction");
+
+ // Count holds the overall loop count (N).
+ Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+ BypassBlock->getTerminator());
+
+ // The loop index does not have to start at Zero. Find the original start
+ // value from the induction PHI node. If we don't have an induction variable
+ // then we know that it starts at zero.
+ Value *StartIdx = OldInduction ?
+ OldInduction->getIncomingValueForBlock(BypassBlock):
+ ConstantInt::get(IdxTy, 0);
+
+ assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
+ assert(BypassBlock && "Invalid loop structure");
+
+ // Generate the code that checks in runtime if arrays overlap.
+ Value *MemoryRuntimeCheck = addRuntimeCheck(Legal,
+ BypassBlock->getTerminator());
+
+ // Split the single block loop into the two loop structure described above.
+ BasicBlock *VectorPH =
+ BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
+ BasicBlock *VecBody =
+ VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
+ BasicBlock *MiddleBlock =
+ VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
+ BasicBlock *ScalarPH =
+ MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+
+ // This is the location in which we add all of the logic for bypassing
+ // the new vector loop.
Instruction *Loc = BypassBlock->getTerminator();
+ // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
+ // inside the loop.
+ Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
+
+ // Generate the induction variable.
+ Induction = Builder.CreatePHI(IdxTy, 2, "index");
+ Constant *Step = ConstantInt::get(IdxTy, VF);
+
// We may need to extend the index in case there is a type mismatch.
// We know that the count starts at zero and does not overflow.
- // We are using Zext because it should be less expensive.
- if (ExitCount->getType() != Induction->getType())
- ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
+ if (Count->getType() != IdxTy) {
+ // The exit count can be of pointer type. Convert it to the correct
+ // integer type.
+ if (ExitCount->getType()->isPointerTy())
+ Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc);
+ else
+ Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc);
+ }
+
+ // Add the start index to the loop count to get the new end index.
+ Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);
- // Count holds the overall loop count (N).
- Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
// Now we need to generate the expression for N - (N % VF), which is
// the part that the vectorized body will execute.
Constant *CIVF = ConstantInt::get(IdxTy, VF);
Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
+ Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
+ "end.idx.rnd.down", Loc);
- // Now, compare the new count to zero. If it is zero, jump to the scalar part.
+ // Now, compare the new count to zero. If it is zero skip the vector loop and
+ // jump to the scalar loop.
Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
- CountRoundDown, ConstantInt::getNullValue(IdxTy),
+ IdxEndRoundDown,
+ StartIdx,
"cmp.zero", Loc);
+
+ // If we are using memory runtime checks, include them in.
+ if (MemoryRuntimeCheck)
+ Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
+ "CntOrMem", Loc);
+
BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
// Remove the old terminator.
Loc->eraseFromParent();
+ // We are going to resume the execution of the scalar loop.
+ // Go over all of the induction variables that we found and fix the
+ // PHIs that are left in the scalar version of the loop.
+ // The starting values of PHI nodes depend on the counter of the last
+ // iteration in the vectorized loop.
+ // If we come from a bypass edge then we need to start from the original start
+ // value.
+
+ // This variable saves the new starting index for the scalar loop.
+ PHINode *ResumeIndex = 0;
+ LoopVectorizationLegality::InductionList::iterator I, E;
+ LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
+ for (I = List->begin(), E = List->end(); I != E; ++I) {
+ PHINode *OrigPhi = I->first;
+ PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
+ MiddleBlock->getTerminator());
+ Value *EndValue = 0;
+ if (OrigPhi->getType()->isIntegerTy()) {
+ // Handle the integer induction counter:
+ assert(OrigPhi == OldInduction && "Unknown integer PHI");
+ // We know what the end value is.
+ EndValue = IdxEndRoundDown;
+ // We also know which PHI node holds it.
+ ResumeIndex = ResumeVal;
+ } else {
+ // For pointer induction variables, calculate the offset using
+ // the end index.
+ EndValue = GetElementPtrInst::Create(I->second, CountRoundDown,
+ "ptr.ind.end",
+ BypassBlock->getTerminator());
+ }
+
+ // The new PHI merges the original incoming value, in case of a bypass,
+ // or the value at the end of the vectorized loop.
+ ResumeVal->addIncoming(I->second, BypassBlock);
+ ResumeVal->addIncoming(EndValue, VecBody);
+
+ // Fix the scalar body counter (PHI node).
+ unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
+ OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
+ }
+
+ // If we are generating a new induction variable then we also need to
+ // generate the code that calculates the exit value. This value is not
+ // simply the end of the counter because we may skip the vectorized body
+ // in case of a runtime check.
+ if (!OldInduction){
+ assert(!ResumeIndex && "Unexpected resume value found");
+ ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
+ MiddleBlock->getTerminator());
+ ResumeIndex->addIncoming(StartIdx, BypassBlock);
+ ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
+ }
+
+ // Make sure that we found the index where scalar loop needs to continue.
+ assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
+ "Invalid resume Index");
+
// Add a check in the middle block to see if we have completed
// all of the iterations in the first vector loop.
// If (N - N%VF) == N, then we *don't* need to run the remainder.
- Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
- CountRoundDown, "cmp.n",
+ Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
+ ResumeIndex, "cmp.n",
MiddleBlock->getTerminator());
BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
// Create i+1 and fill the PHINode.
Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
- Induction->addIncoming(Zero, VectorPH);
+ Induction->addIncoming(StartIdx, VectorPH);
Induction->addIncoming(NextIdx, VecBody);
// Create the compare.
- Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown);
+ Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
// Now we have two terminators. Remove the old one from the block.
VecBody->getTerminator()->eraseFromParent();
- // Fix the scalar body iteration count.
- unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
- OldInduction->setIncomingValue(BlockIdx, CountRoundDown);
-
// Get ready to start creating new instructions into the vectorized body.
Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
- // Register the new loop.
+ // Create and register the new vector loop.
Loop* Lp = new Loop();
- LPM->insertLoop(Lp, OrigLoop->getParentLoop());
-
- Lp->addBasicBlockToLoop(VecBody, LI->getBase());
-
Loop *ParentLoop = OrigLoop->getParentLoop();
+
+ // Insert the new loop into the loop nest and register the new basic blocks.
if (ParentLoop) {
+ ParentLoop->addChildLoop(Lp);
ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
+ } else {
+ LI->addTopLevelLoop(Lp);
}
+ Lp->addBasicBlockToLoop(VecBody, LI->getBase());
+
// Save the state.
+ LoopVectorPreHeader = VectorPH;
+ LoopScalarPreHeader = ScalarPH;
LoopMiddleBlock = MiddleBlock;
LoopExitBlock = ExitBlock;
LoopVectorBody = VecBody;
LoopBypassBlock = BypassBlock;
}
+/// This function returns the identity element (or neutral element) for
+/// the operation K.
+static unsigned
+getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
+ switch (K) {
+ case LoopVectorizationLegality::IntegerXor:
+ case LoopVectorizationLegality::IntegerAdd:
+ case LoopVectorizationLegality::IntegerOr:
+ // Adding, Xoring, Oring zero to a number does not change it.
+ return 0;
+ case LoopVectorizationLegality::IntegerMult:
+ // Multiplying a number by 1 does not change it.
+ return 1;
+ case LoopVectorizationLegality::IntegerAnd:
+ // AND-ing a number with an all-1 value does not change it.
+ return -1;
+ default:
+ llvm_unreachable("Unknown reduction kind");
+ }
+}
+
void
SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
//===------------------------------------------------===//
// In order to support reduction variables we need to be able to vectorize
// Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
- // steages. First, we create a new vector PHI node with no incoming edges.
+ // stages. First, we create a new vector PHI node with no incoming edges.
// We use this value when we vectorize all of the instructions that use the
// PHI. Next, after all of the instructions in the block are complete we
// add the new incoming edges to the PHI. At this point all of the
// instructions in the basic block are vectorized, so we can use them to
// construct the PHI.
- PhiVector PHIsToFix;
+ PhiVector RdxPHIsToFix;
// For each instruction in the old loop.
for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
continue;
case Instruction::PHI:{
PHINode* P = cast<PHINode>(Inst);
- // Special handling for the induction var.
- if (OldInduction == Inst)
+ // Handle reduction variables:
+ if (Legal->getReductionVars()->count(P)) {
+ // This is phase one of vectorizing PHIs.
+ Type *VecTy = VectorType::get(Inst->getType(), VF);
+ WidenMap[Inst] = PHINode::Create(VecTy, 2, "vec.phi",
+ LoopVectorBody->getFirstInsertionPt());
+ RdxPHIsToFix.push_back(P);
+ continue;
+ }
+
+ // This PHINode must be an induction variable.
+ // Make sure that we know about it.
+ assert(Legal->getInductionVars()->count(P) &&
+ "Not an induction variable");
+
+ if (P->getType()->isIntegerTy()) {
+ assert(P == OldInduction && "Unexpected PHI");
+ Value *Broadcasted = getBroadcastInstrs(Induction);
+ // After broadcasting the induction variable we need to make the
+ // vector consecutive by adding 0, 1, 2 ...
+ Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted);
+
+ WidenMap[OldInduction] = ConsecutiveInduction;
continue;
- // This is phase one of vectorizing PHIs.
- // This has to be a reduction variable.
- assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
- Type *VecTy = VectorType::get(Inst->getType(), VF);
- WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
- PHIsToFix.push_back(P);
+ }
+
+ // Handle pointer inductions.
+ assert(P->getType()->isPointerTy() && "Unexpected type.");
+ Value *StartIdx = OldInduction ?
+ Legal->getInductionVars()->lookup(OldInduction) :
+ ConstantInt::get(Induction->getType(), 0);
+
+ // This is the pointer value coming into the loop.
+ Value *StartPtr = Legal->getInductionVars()->lookup(P);
+
+ // This is the normalized GEP that starts counting at zero.
+ Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
+ "normalized.idx");
+
+ // This is the vector of results. Notice that we don't generate vector
+ // geps because scalar geps result in better code.
+ Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+ for (unsigned int i = 0; i < VF; ++i) {
+ Constant *Idx = ConstantInt::get(Induction->getType(), i);
+ Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
+ Value *SclrGep = Builder.CreateGEP(StartPtr, GlobalIdx, "next.gep");
+ VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+ Builder.getInt32(i),
+ "insert.gep");
+ }
+
+ WidenMap[Inst] = VecVal;
continue;
}
case Instruction::Add:
BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
Value *A = getVectorValue(Inst->getOperand(0));
Value *B = getVectorValue(Inst->getOperand(1));
+
// Use this vector value for all users of the original instruction.
- WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+ Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+ WidenMap[Inst] = V;
+
+ // Update the NSW, NUW and Exact flags.
+ BinaryOperator *VecOp = cast<BinaryOperator>(V);
+ if (isa<OverflowingBinaryOperator>(BinOp)) {
+ VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
+ VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
+ }
+ if (isa<PossiblyExactOperator>(VecOp))
+ VecOp->setIsExact(BinOp->isExact());
break;
}
case Instruction::Select: {
Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
Value *Ptr = SI->getPointerOperand();
unsigned Alignment = SI->getAlignment();
+
+ assert(!Legal->isUniform(Ptr) &&
+ "We do not allow storing to uniform addresses");
+
GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
// This store does not use GEPs.
- if (!Legal->isConsecutiveGep(Gep)) {
+ if (!Legal->isConsecutivePtr(Ptr)) {
scalarizeInstruction(Inst);
break;
}
- // The last index does not have to be the induction. It can be
- // consecutive and be a function of the index. For example A[I+1];
- unsigned NumOperands = Gep->getNumOperands();
- Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
- LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
-
- // Create the new GEP with the new induction variable.
- GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
- Gep2->setOperand(NumOperands - 1, LastIndex);
- Ptr = Builder.Insert(Gep2);
+ if (Gep) {
+ // The last index does not have to be the induction. It can be
+ // consecutive and be a function of the index. For example A[I+1];
+ unsigned NumOperands = Gep->getNumOperands();
+ Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
+ LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+ // Create the new GEP with the new induction variable.
+ GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+ Gep2->setOperand(NumOperands - 1, LastIndex);
+ Ptr = Builder.Insert(Gep2);
+ } else {
+ // Use the induction element ptr.
+ assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+ Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
+ }
Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
Value *Val = getVectorValue(SI->getValueOperand());
Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
unsigned Alignment = LI->getAlignment();
GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
- // We don't have a gep. Scalarize the load.
- if (!Legal->isConsecutiveGep(Gep)) {
+ // If the pointer is loop invariant or if it is non consecutive,
+ // scalarize the load.
+ bool Con = Legal->isConsecutivePtr(Ptr);
+ if (Legal->isUniform(Ptr) || !Con) {
scalarizeInstruction(Inst);
break;
}
- // The last index does not have to be the induction. It can be
- // consecutive and be a function of the index. For example A[I+1];
- unsigned NumOperands = Gep->getNumOperands();
- Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
- LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
+ if (Gep) {
+ // The last index does not have to be the induction. It can be
+ // consecutive and be a function of the index. For example A[I+1];
+ unsigned NumOperands = Gep->getNumOperands();
+ Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
+ LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+ // Create the new GEP with the new induction variable.
+ GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+ Gep2->setOperand(NumOperands - 1, LastIndex);
+ Ptr = Builder.Insert(Gep2);
+ } else {
+ // Use the induction element ptr.
+ assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+ Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
+ }
- // Create the new GEP with the new induction variable.
- GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
- Gep2->setOperand(NumOperands - 1, LastIndex);
- Ptr = Builder.Insert(Gep2);
Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
LI = Builder.CreateLoad(Ptr);
LI->setAlignment(Alignment);
// Create the 'reduced' values for each of the induction vars.
// The reduced values are the vector values that we scalarize and combine
// after the loop is finished.
- for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
+ for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
it != e; ++it) {
PHINode *RdxPhi = *it;
PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
Type *VecTy = VectorExit->getType();
- // Find the reduction identity variable. The value of the enum is the
- // identity. Zero for addition. One for Multiplication.
- unsigned IdentitySclr = RdxDesc.Kind;
- Constant *Identity = getUniformVector(IdentitySclr,
+ // Find the reduction identity variable. Zero for addition, or, xor,
+ // one for multiplication, -1 for And.
+ Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
VecTy->getScalarType());
// This vector is the Identity vector where the first element is the
Value *VectorStart = Builder.CreateInsertElement(Identity,
RdxDesc.StartValue, Zero);
-
// Fix the vector-loop phi.
// We created the induction variable so we know that the
// preheader is the first entry.
}// end of for each redux variable.
}
-void SingleBlockLoopVectorizer::cleanup() {
- // The original basic block.
+void SingleBlockLoopVectorizer::updateAnalysis() {
+ // Forget the original basic block.
SE->forgetLoop(OrigLoop);
+
+ // Update the dominator tree information.
+ assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) &&
+ "Entry does not dominate exit.");
+
+ DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock);
+ DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
+ DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock);
+ DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
+ DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
+ DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
+
+ DEBUG(DT->verifyAnalysis());
}
bool LoopVectorizationLegality::canVectorize() {
if (!TheLoop->getLoopPreheader()) {
assert(false && "No preheader!!");
DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
- return false;
+ return false;
}
// We can only vectorize single basic block loops.
BasicBlock *BB = TheLoop->getHeader();
DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
- // Go over each instruction and look at memory deps.
- if (!canVectorizeBlock(*BB)) {
- DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
- return false;
- }
-
// ScalarEvolution needs to be able to find the exit count.
const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
if (ExitCount == SE->getCouldNotCompute()) {
return false;
}
- DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
+ // Do not loop-vectorize loops with a tiny trip count.
+ unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
+ if (TC > 0u && TC < TinyTripCountThreshold) {
+ DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
+ "This loop is not worth vectorizing.\n");
+ return false;
+ }
+
+ // Go over each instruction and look at memory deps.
+ if (!canVectorizeBlock(*BB)) {
+ DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
+ return false;
+ }
+
+ DEBUG(dbgs() << "LV: We can vectorize this loop" <<
+ (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
+ <<"!\n");
// Okay! We can vectorize. At this point we don't have any other mem analysis
// which may limit our maximum vectorization factor, so just return true with
}
bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
+
+ BasicBlock *PreHeader = TheLoop->getLoopPreheader();
+
// Scan the instructions in the block and look for hazards.
for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
Instruction *I = it;
- PHINode *Phi = dyn_cast<PHINode>(I);
- if (Phi) {
+ if (PHINode *Phi = dyn_cast<PHINode>(I)) {
// This should not happen because the loop should be normalized.
if (Phi->getNumIncomingValues() != 2) {
DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
return false;
}
- // We only look at integer phi nodes.
- if (!Phi->getType()->isIntegerTy()) {
- DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
+
+ // This is the value coming from the preheader.
+ Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
+
+ // We only look at integer and pointer phi nodes.
+ if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) {
+ DEBUG(dbgs() << "LV: Found a pointer induction variable.\n");
+ Inductions[Phi] = StartValue;
+ continue;
+ } else if (!Phi->getType()->isIntegerTy()) {
+ DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
return false;
}
+ // Handle integer PHIs:
if (isInductionVariable(Phi)) {
if (Induction) {
DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
}
DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
Induction = Phi;
+ Inductions[Phi] = StartValue;
continue;
}
if (AddReductionVar(Phi, IntegerAdd)) {
// We still don't handle functions.
CallInst *CI = dyn_cast<CallInst>(I);
if (CI) {
- DEBUG(dbgs() << "LV: Found a call site:"<<
- CI->getCalledFunction()->getName() << "\n");
+ DEBUG(dbgs() << "LV: Found a call site.\n");
return false;
}
} // next instr.
if (!Induction) {
- DEBUG(dbgs() << "LV: Did not find an induction var.\n");
- return false;
+ DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+ assert(getInductionVars()->size() && "No induction variables");
}
// Don't vectorize if the memory dependencies do not allow vectorization.
while (Worklist.size()) {
Instruction *I = dyn_cast<Instruction>(Worklist.back());
Worklist.pop_back();
- // Look at instructions inside this block.
- if (!I) continue;
- if (I->getParent() != &BB) continue;
- // Stop when reaching PHI nodes.
- if (isa<PHINode>(I)) {
- assert(I == Induction && "Found a uniform PHI that is not the induction");
- break;
- }
+ // Look at instructions inside this block. Stop when reaching PHI nodes.
+ if (!I || I->getParent() != &BB || isa<PHINode>(I))
+ continue;
// This is a known uniform.
Uniforms.insert(I);
// Insert all operands.
- for (int i=0, Op = I->getNumOperands(); i < Op; ++i) {
+ for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) {
Worklist.push_back(I->getOperand(i));
}
}
// Holds the Load and Store *instructions*.
ValueVector Loads;
ValueVector Stores;
+ PtrRtCheck.Pointers.clear();
+ PtrRtCheck.Need = false;
// Scan the BB and collect legal loads and stores.
for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
StoreInst *ST = dyn_cast<StoreInst>(*I);
assert(ST && "Bad StoreInst");
Value* Ptr = ST->getPointerOperand();
+
+ if (isUniform(Ptr)) {
+ DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+ return false;
+ }
+
// If we did *not* see this pointer before, insert it to
// the read-write list. At this phase it is only a 'write' list.
if (Seen.insert(Ptr))
// If the address of i is unknown (for example A[B[i]]) then we may
// read a few words, modify, and write a few words, and some of the
// words may be written to the same address.
- if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
+ if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr))
Reads.push_back(Ptr);
}
+ // If we write (or read-write) to a single destination and there are no
+ // other reads in this loop then is it safe to vectorize.
+ if (ReadWrites.size() == 1 && Reads.size() == 0) {
+ DEBUG(dbgs() << "LV: Found a write-only loop!\n");
+ return true;
+ }
+
+ // Find pointers with computable bounds. We are going to use this information
+ // to place a runtime bound check.
+ bool RT = true;
+ for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
+ if (hasComputableBounds(*I)) {
+ PtrRtCheck.insert(SE, TheLoop, *I);
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+ } else {
+ RT = false;
+ break;
+ }
+ for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
+ if (hasComputableBounds(*I)) {
+ PtrRtCheck.insert(SE, TheLoop, *I);
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+ } else {
+ RT = false;
+ break;
+ }
+
+ // Check that we did not collect too many pointers or found a
+ // unsizeable pointer.
+ if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
+ PtrRtCheck.reset();
+ RT = false;
+ }
+
+ PtrRtCheck.Need = RT;
+
+ if (RT) {
+ DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
+ }
+
// Now that the pointers are in two lists (Reads and ReadWrites), we
// can check that there are no conflicts between each of the writes and
// between the writes to the reads.
it != e; ++it) {
if (!isIdentifiedObject(*it)) {
DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
- return false;
+ return RT;
}
if (!WriteObjects.insert(*it)) {
DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
<< **it <<"\n");
- return false;
+ return RT;
}
}
TempObjects.clear();
it != e; ++it) {
if (!isIdentifiedObject(*it)) {
DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
- return false;
+ return RT;
}
if (WriteObjects.count(*it)) {
DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
<< **it <<"\n");
- return false;
+ return RT;
}
}
TempObjects.clear();
}
- // All is okay.
+ // It is safe to vectorize and we don't need any runtime checks.
+ DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
+ PtrRtCheck.reset();
return true;
}
case Instruction::Sub:
return Kind == IntegerAdd;
case Instruction::Mul:
- case Instruction::UDiv:
- case Instruction::SDiv:
return Kind == IntegerMult;
case Instruction::And:
return Kind == IntegerAnd;
}
bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
+ Type *PhiTy = Phi->getType();
+ // We only handle integer and pointer inductions variables.
+ if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
+ return false;
+
// Check that the PHI is consecutive and starts at zero.
const SCEV *PhiScev = SE->getSCEV(Phi);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
return false;
}
const SCEV *Step = AR->getStepRecurrence(*SE);
- const SCEV *Start = AR->getStart();
- if (!Step->isOne() || !Start->isZero()) {
- DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n");
+ // Integer inductions need to have a stride of one.
+ if (PhiTy->isIntegerTy())
+ return Step->isOne();
+
+ // Calculate the pointer stride and check if it is consecutive.
+ const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+ if (!C) return false;
+
+ assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
+ uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
+ return (C->getValue()->equalsInt(Size));
+}
+
+bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
+ const SCEV *PhiScev = SE->getSCEV(Ptr);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+ if (!AR)
return false;
- }
- return true;
+
+ return AR->isAffine();
}
unsigned
SI->getAlignment(), SI->getPointerAddressSpace());
// Scalarized stores.
- if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
+ if (!Legal->isConsecutivePtr(SI->getPointerOperand())) {
unsigned Cost = 0;
unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
ValTy);
LI->getPointerAddressSpace());
// Scalarized loads.
- if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
+ if (!Legal->isConsecutivePtr(LI->getPointerOperand())) {
unsigned Cost = 0;
unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
// The cost of inserting the loaded value into the result vector.