#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/PatternMatch.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
MRK_FloatMax
};
- /// This POD struct holds information about reduction variables.
+ /// This struct holds information about reduction variables.
struct ReductionDescriptor {
ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}
MinMaxReductionKind MinMaxKind;
};
- // This POD struct holds information about the memory runtime legality
- // check that a group of pointers do not overlap.
+ /// This struct holds information about the memory runtime legality
+ /// check that a group of pointers do not overlap.
struct RuntimePointerCheck {
RuntimePointerCheck() : Need(false) {}
Pointers.clear();
Starts.clear();
Ends.clear();
+ IsWritePtr.clear();
+ DependencySetId.clear();
}
/// Insert a pointer and calculate the start and end SCEVs.
SmallVector<unsigned, 2> DependencySetId;
};
- /// A POD for saving information about induction variables.
+ /// A struct for saving information about induction variables.
struct InductionInfo {
InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
/// pointer itself is an induction variable.
/// This check allows us to vectorize A[idx] into a wide load/store.
/// Returns:
- /// 0 - Stride is unknown or non consecutive.
+ /// 0 - Stride is unknown or non-consecutive.
/// 1 - Address is consecutive.
/// -1 - Address is consecutive, and decreasing.
int isConsecutivePtr(Value *Ptr);
unsigned Width;
/// Vectorization unroll factor.
unsigned Unroll;
+ /// Vectorization forced (-1 not selected, 0 force disabled, 1 force enabled)
+ int Force;
LoopVectorizeHints(const Loop *L, bool DisableUnrolling)
: Width(VectorizationFactor)
, Unroll(DisableUnrolling ? 1 : VectorizationUnroll)
+ , Force(-1)
, LoopID(L->getLoopID()) {
getHints(L);
// The command line options override any loop metadata except for when
Vals.push_back(LoopID->getOperand(i));
Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width));
+ Vals.push_back(createHint(Context, Twine(Prefix(), "unroll").str(), 1));
MDNode *NewLoopID = MDNode::get(Context, Vals);
// Set operand 0 to refer to the loop id itself.
if (isPowerOf2_32(Val) && Val <= MaxVectorWidth)
Width = Val;
else
- DEBUG(dbgs() << "LV: ignoring invalid width hint metadata");
+ DEBUG(dbgs() << "LV: ignoring invalid width hint metadata\n");
} else if (Hint == "unroll") {
if (isPowerOf2_32(Val) && Val <= MaxUnrollFactor)
Unroll = Val;
else
- DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata");
+ DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n");
+ } else if (Hint == "enable") {
+ if (C->getBitWidth() == 1)
+ Force = Val;
+ else
+ DEBUG(dbgs() << "LV: ignoring invalid enable hint metadata\n");
} else {
- DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint);
+ DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n');
}
}
};
/// Pass identification, replacement for typeid
static char ID;
- explicit LoopVectorize(bool NoUnrolling = false)
- : LoopPass(ID), DisableUnrolling(NoUnrolling) {
+ explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
+ : LoopPass(ID),
+ DisableUnrolling(NoUnrolling),
+ AlwaysVectorize(AlwaysVectorize) {
initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
}
DominatorTree *DT;
TargetLibraryInfo *TLI;
bool DisableUnrolling;
+ bool AlwaysVectorize;
virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
// We only vectorize innermost loops.
return false;
if (DL == NULL) {
- DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout");
+ DEBUG(dbgs() << "LV: Not vectorizing: Missing data layout\n");
return false;
}
LoopVectorizeHints Hints(L, DisableUnrolling);
+ if (Hints.Force == 0) {
+ DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+ return false;
+ }
+
+ if (!AlwaysVectorize && Hints.Force != 1) {
+ DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+ return false;
+ }
+
if (Hints.Width == 1 && Hints.Unroll == 1) {
- DEBUG(dbgs() << "LV: Not vectorizing.\n");
+ DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
return false;
}
// Check if it is legal to vectorize the loop.
LoopVectorizationLegality LVL(L, SE, DL, DT, TLI);
if (!LVL.canVectorize()) {
- DEBUG(dbgs() << "LV: Not vectorizing.\n");
+ DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
return false;
}
Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
unsigned FnIndex = AttributeSet::FunctionIndex;
- bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr);
+ bool OptForSize = Hints.Force != 1 &&
+ F->getAttributes().hasAttribute(FnIndex, SzAttr);
bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);
if (NoFloat) {
unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
VF.Cost);
- if (VF.Width == 1) {
- DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
- }
-
DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
- F->getParent()->getModuleIdentifier()<<"\n");
- DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
+ F->getParent()->getModuleIdentifier() << '\n');
+ DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
if (VF.Width == 1) {
+ DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
if (UF == 1)
return false;
+ DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n");
// We decided not to vectorize, but we may want to unroll.
InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
Unroller.vectorize(&LVL);
}
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
- // Save the current insertion location.
- Instruction *Loc = Builder.GetInsertPoint();
-
// We need to place the broadcast of invariant variables outside the loop.
Instruction *Instr = dyn_cast<Instruction>(V);
bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
// Place the code for broadcasting invariant variables in the new preheader.
+ IRBuilder<>::InsertPointGuard Guard(Builder);
if (Invariant)
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
// Broadcast the scalar into all locations in the vector.
Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
- // Restore the builder insertion point.
- if (Invariant)
- Builder.SetInsertPoint(Loc);
-
return Shuf;
}
return Builder.CreateAdd(Val, Cv, "induction");
}
+/// \brief Find the operand of the GEP that should be checked for consecutive
+/// stores. This ignores trailing indices that have no effect on the final
+/// pointer.
+static unsigned getGEPInductionOperand(DataLayout *DL,
+ const GetElementPtrInst *Gep) {
+ unsigned LastOperand = Gep->getNumOperands() - 1;
+ unsigned GEPAllocSize = DL->getTypeAllocSize(
+ cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
+
+ // Walk backwards and try to peel off zeros.
+ while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
+ // Find the type we're currently indexing into.
+ gep_type_iterator GEPTI = gep_type_begin(Gep);
+ std::advance(GEPTI, LastOperand - 1);
+
+ // If it's a type with the same allocation size as the result of the GEP we
+ // can peel off the zero index.
+ if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize)
+ break;
+ --LastOperand;
+ }
+
+ return LastOperand;
+}
+
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
- assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
+ assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
// Make sure that the pointer does not point to structs.
- if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType())
+ if (Ptr->getType()->getPointerElementType()->isAggregateType())
return 0;
// If this value is a pointer induction variable we know it is consecutive.
return 0;
unsigned NumOperands = Gep->getNumOperands();
- Value *LastIndex = Gep->getOperand(NumOperands - 1);
-
Value *GpPtr = Gep->getPointerOperand();
// If this GEP value is a consecutive pointer induction variable and all of
// the indices are constant then we know it is consecutive. We can
return -1;
}
- // Check that all of the gep indices are uniform except for the last.
- for (unsigned i = 0; i < NumOperands - 1; ++i)
- if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
+ unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
+
+ // Check that all of the gep indices are uniform except for our induction
+ // operand.
+ for (unsigned i = 0; i != NumOperands; ++i)
+ if (i != InductionOperand &&
+ !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
return 0;
- // We can emit wide load/stores only if the last index is the induction
- // variable.
- const SCEV *Last = SE->getSCEV(LastIndex);
+ // We can emit wide load/stores only if the last non-zero index is the
+ // induction variable.
+ const SCEV *Last = SE->getSCEV(Gep->getOperand(InductionOperand));
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
const SCEV *Step = AR->getStepRecurrence(*SE);
Type *DataTy = VectorType::get(ScalarDataTy, VF);
Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
+ // An alignment of 0 means target abi alignment. We need to use the scalar's
+ // target abi alignment in such a case.
+ if (!Alignment)
+ Alignment = DL->getABITypeAlignment(ScalarDataTy);
unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
if (ScalarAllocatedSize != VectorElementSize)
return scalarizeInstruction(Instr);
- // If the pointer is loop invariant or if it is non consecutive,
+ // If the pointer is loop invariant or if it is non-consecutive,
// scalarize the load.
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
bool Reverse = ConsecutiveStride < 0;
// The last index does not have to be the induction. It can be
// consecutive and be a function of the index. For example A[I+1];
unsigned NumOperands = Gep->getNumOperands();
- unsigned LastOperand = NumOperands - 1;
+ unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
// Create the new GEP with the new induction variable.
GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
// Update last index or loop invariant instruction anchored in loop.
- if (i == LastOperand ||
+ if (i == InductionOperand ||
(GepOperandInst && OrigLoop->contains(GepOperandInst))) {
- assert((i == LastOperand ||
+ assert((i == InductionOperand ||
SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
"Must be last index or loop invariant");
SmallVector<TrackingVH<Value> , 2> Starts;
SmallVector<TrackingVH<Value> , 2> Ends;
+ LLVMContext &Ctx = Loc->getContext();
SCEVExpander Exp(*SE, "induction");
- // Use this type for pointer arithmetic.
- Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);
-
for (unsigned i = 0; i < NumPointers; ++i) {
Value *Ptr = PtrRtCheck->Pointers[i];
const SCEV *Sc = SE->getSCEV(Ptr);
Starts.push_back(Ptr);
Ends.push_back(Ptr);
} else {
- DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+ DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n');
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+
+ // Use this type for pointer arithmetic.
+ Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
continue;
- Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
- Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
- Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc");
- Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy, "bc");
+ unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
+ unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();
+
+ assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&
+ (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&
+ "Trying to bounds check pointers with different address spaces");
+
+ Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
+ Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
+
+ Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc");
+ Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc");
+ Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc");
+ Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc");
Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
// We have to do this trickery because the IRBuilder might fold the check to a
// constant expression in which case there is no Instruction anchored in a
// the block.
- LLVMContext &Ctx = Loc->getContext();
- Instruction * Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
- ConstantInt::getTrue(Ctx));
+ Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
+ ConstantInt::getTrue(Ctx));
ChkBuilder.Insert(Check, "memcheck.conflict");
return Check;
}
const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
+ // The exit count might have the type of i64 while the phi is i32. This can
+ // happen if we have an induction variable that is sign extended before the
+ // compare. The only way that we get a backedge taken count is that the
+ // induction variable was signed and as such will not overflow. In such a case
+ // truncation is legal.
+ if (ExitCount->getType()->getPrimitiveSizeInBits() >
+ IdxTy->getPrimitiveSizeInBits())
+ ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
+
+ ExitCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
// Get the total trip count from the count by adding 1.
ExitCount = SE->getAddExpr(ExitCount,
SE->getConstant(ExitCount->getType(), 1));
LoopExitBlock = ExitBlock;
LoopVectorBody = VecBody;
LoopScalarBody = OldBasicBlock;
+
+ LoopVectorizeHints Hints(Lp, true);
+ Hints.setAlreadyVectorized(Lp);
}
/// This function returns the identity element (or neutral element) for
return Select;
}
+namespace {
+struct CSEDenseMapInfo {
+ static bool canHandle(Instruction *I) {
+ return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+ isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
+ }
+ static inline Instruction *getEmptyKey() {
+ return DenseMapInfo<Instruction *>::getEmptyKey();
+ }
+ static inline Instruction *getTombstoneKey() {
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+ static unsigned getHashValue(Instruction *I) {
+ assert(canHandle(I) && "Unknown instruction!");
+ return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
+ I->value_op_end()));
+ }
+ static bool isEqual(Instruction *LHS, Instruction *RHS) {
+ if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
+ LHS == getTombstoneKey() || RHS == getTombstoneKey())
+ return LHS == RHS;
+ return LHS->isIdenticalTo(RHS);
+ }
+};
+}
+
+///\brief Perform cse of induction variable instructions.
+static void cse(BasicBlock *BB) {
+ // Perform simple cse.
+ SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *In = I++;
+
+ if (!CSEDenseMapInfo::canHandle(In))
+ continue;
+
+ // Check if we can replace this instruction with any of the
+ // visited instructions.
+ if (Instruction *V = CSEMap.lookup(In)) {
+ In->replaceAllUsesWith(V);
+ In->eraseFromParent();
+ continue;
+ }
+
+ CSEMap[In] = In;
+ }
+}
+
void
InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
//===------------------------------------------------===//
(RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx);
(RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
}// end of for each redux variable.
-
+
fixLCSSAPHIs();
+
+ // Remove redundant induction instructions.
+ cse(LoopVectorBody);
}
void InnerLoopVectorizer::fixLCSSAPHIs() {
setDebugLocFromInst(Builder, P);
// Check for PHI nodes that are lowered to vector selects.
if (P->getParent() != OrigLoop->getHeader()) {
- // We know that all PHIs in non header blocks are converted into
+ // We know that all PHIs in non-header blocks are converted into
// selects, so we don't have to worry about the insertion order and we
// can just use the builder.
// At this point we generate the predication tree. There may be
DEBUG(DT->verifyAnalysis());
}
+/// \brief Check whether it is safe to if-convert this phi node.
+///
+/// Phi nodes with constant expressions that can trap are not safe to if
+/// convert.
+static bool canIfConvertPHINodes(BasicBlock *BB) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ PHINode *Phi = dyn_cast<PHINode>(I);
+ if (!Phi)
+ return true;
+ for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
+ if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
+ if (C->canTrap())
+ return false;
+ }
+ return true;
+}
+
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
if (!EnableIfConversion)
return false;
assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
- std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
// A list of pointers that we can safely read and write to.
SmallPtrSet<Value *, 8> SafePointes;
// Collect safe addresses.
- for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
- BasicBlock *BB = LoopBlocks[i];
+ for (Loop::block_iterator BI = TheLoop->block_begin(),
+ BE = TheLoop->block_end(); BI != BE; ++BI) {
+ BasicBlock *BB = *BI;
if (blockNeedsPredication(BB))
continue;
}
// Collect the blocks that need predication.
- for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
- BasicBlock *BB = LoopBlocks[i];
+ BasicBlock *Header = TheLoop->getHeader();
+ for (Loop::block_iterator BI = TheLoop->block_begin(),
+ BE = TheLoop->block_end(); BI != BE; ++BI) {
+ BasicBlock *BB = *BI;
// We don't support switch statements inside loops.
if (!isa<BranchInst>(BB->getTerminator()))
return false;
// We must be able to predicate all blocks that need to be predicated.
- if (blockNeedsPredication(BB) && !blockCanBePredicated(BB, SafePointes))
+ if (blockNeedsPredication(BB)) {
+ if (!blockCanBePredicated(BB, SafePointes))
+ return false;
+ } else if (BB != Header && !canIfConvertPHINodes(BB))
return false;
+
}
// We can if-convert this loop.
if (!TheLoop->getExitingBlock())
return false;
- unsigned NumBlocks = TheLoop->getNumBlocks();
+ // We need to have a loop header.
+ DEBUG(dbgs() << "LV: Found a loop: " <<
+ TheLoop->getHeader()->getName() << '\n');
- // Check if we can if-convert non single-bb loops.
+ // Check if we can if-convert non-single-bb loops.
+ unsigned NumBlocks = TheLoop->getNumBlocks();
if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
return false;
}
- // We need to have a loop header.
- BasicBlock *Latch = TheLoop->getLoopLatch();
- DEBUG(dbgs() << "LV: Found a loop: " <<
- TheLoop->getHeader()->getName() << "\n");
-
// ScalarEvolution needs to be able to find the exit count.
const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
if (ExitCount == SE->getCouldNotCompute()) {
}
// Do not loop-vectorize loops with a tiny trip count.
+ BasicBlock *Latch = TheLoop->getLoopLatch();
unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
if (TC > 0u && TC < TinyTripCountVectorThreshold) {
DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
if (Ty->isPointerTy())
return DL.getIntPtrType(Ty);
+ // It is possible that char's or short's overflow when we ask for the loop's
+ // trip count, work around this by changing the type size.
+ if (Ty->getScalarSizeInBits() < 32)
+ return Type::getInt32Ty(Ty->getContext());
+
return Ty;
}
Instruction *U = cast<Instruction>(*I);
// This user may be a reduction exit value.
if (!TheLoop->contains(U)) {
- DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+ DEBUG(dbgs() << "LV: Found an outside user for : " << *U << '\n');
return true;
}
}
}
// Check that the instruction return type is vectorizable.
- if (!VectorType::isValidElementType(it->getType()) &&
- !it->getType()->isVoidTy()) {
- DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
+ // Also, we can't vectorize extractelement instructions.
+ if ((!VectorType::isValidElementType(it->getType()) &&
+ !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
+ DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
return false;
}
/// non-intersection.
bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
unsigned &NumComparisons, ScalarEvolution *SE,
- Loop *TheLoop);
+ Loop *TheLoop, bool ShouldCheckStride = false);
/// \brief Goes over all memory accesses, checks whether a RT check is needed
/// and builds sets of dependent accesses.
bool isRTCheckNeeded() { return IsRTCheckNeeded; }
bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
+ void resetDepChecks() { CheckDeps.clear(); }
MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
return AR->isAffine();
}
+/// \brief Check the stride of the pointer and ensure that it does not wrap in
+/// the address space.
+static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+ const Loop *Lp);
+
bool AccessAnalysis::canCheckPtrAtRT(
LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
unsigned &NumComparisons, ScalarEvolution *SE,
- Loop *TheLoop) {
+ Loop *TheLoop, bool ShouldCheckStride) {
// Find pointers with computable bounds. We are going to use this information
// to place a runtime bound check.
unsigned NumReadPtrChecks = 0;
else
++NumReadPtrChecks;
- if (hasComputableBounds(SE, Ptr)) {
+ if (hasComputableBounds(SE, Ptr) &&
+ // When we run after a failing dependency check we have to make sure we
+ // don't have wrapping pointers.
+ (!ShouldCheckStride || isStridedPtr(SE, DL, Ptr, TheLoop) == 1)) {
// The id of the dependence set.
unsigned DepId;
RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId);
- DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr <<"\n");
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n');
} else {
CanDoRT = false;
}
if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
NumComparisons = 0; // Only one dependence set.
- else
+ else {
NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks +
NumWritePtrChecks - 1));
+ }
+
+ // If the pointers that we would use for the bounds comparison have different
+ // address spaces, assume the values aren't directly comparable, so we can't
+ // use them for the runtime check. We also have to assume they could
+ // overlap. In the future there should be metadata for whether address spaces
+ // are disjoint.
+ unsigned NumPointers = RtCheck.Pointers.size();
+ for (unsigned i = 0; i < NumPointers; ++i) {
+ for (unsigned j = i + 1; j < NumPointers; ++j) {
+ // Only need to check pointers between two different dependency sets.
+ if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
+ continue;
+
+ Value *PtrI = RtCheck.Pointers[i];
+ Value *PtrJ = RtCheck.Pointers[j];
+
+ unsigned ASi = PtrI->getType()->getPointerAddressSpace();
+ unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
+ if (ASi != ASj) {
+ DEBUG(dbgs() << "LV: Runtime check would require comparison between"
+ " different address spaces\n");
+ return false;
+ }
+ }
+ }
+
return CanDoRT;
}
!isa<Argument>(UnderlyingObj)) &&
!isIdentifiedObject(UnderlyingObj))) {
DEBUG(dbgs() << "LV: Found an unidentified " <<
- (IsWrite ? "write" : "read" ) << " ptr:" << *UnderlyingObj <<
+ (IsWrite ? "write" : "read" ) << " ptr: " << *UnderlyingObj <<
"\n");
IsRTCheckNeeded = (IsRTCheckNeeded ||
!isIdentifiedObject(UnderlyingObj) ||
typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
- MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) :
- SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0) {}
+ MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L)
+ : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
+ ShouldRetryWithRuntimeCheck(false) {}
/// \brief Register the location (instructions are given increasing numbers)
/// of a write access.
/// the accesses safely with.
unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
+ /// \brief In same cases when the dependency check fails we can still
+ /// vectorize the loop with a dynamic array access check.
+ bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
+
private:
ScalarEvolution *SE;
DataLayout *DL;
// We can access this many bytes in parallel safely.
unsigned MaxSafeDepDistBytes;
+ /// \brief If we see a non-constant dependence distance we can still try to
+ /// vectorize this loop with runtime checks.
+ bool ShouldRetryWithRuntimeCheck;
+
/// \brief Check whether there is a plausible dependence between the two
/// accesses.
///
static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
const Loop *Lp) {
const Type *Ty = Ptr->getType();
- assert(Ty->isPointerTy() && "Unexpected non ptr");
+ assert(Ty->isPointerTy() && "Unexpected non-ptr");
// Make sure that the pointer does not point to aggregate types.
const PointerType *PtrTy = cast<PointerType>(Ty);
const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
if (!C) {
- DEBUG(dbgs() << "LV: Dependence because of non constant distance\n");
+ DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n");
+ ShouldRetryWithRuntimeCheck = true;
return true;
}
if (Val == 0) {
if (ATy == BTy)
return false;
- DEBUG(dbgs() << "LV: Zero dependence difference but different types");
+ DEBUG(dbgs() << "LV: Zero dependence difference but different types\n");
return true;
}
// Positive distance bigger than max vectorization factor.
if (ATy != BTy) {
DEBUG(dbgs() <<
- "LV: ReadWrite-Write positive dependency with different types");
+ "LV: ReadWrite-Write positive dependency with different types\n");
return false;
}
2*TypeByteSize > MaxSafeDepDistBytes ||
Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
DEBUG(dbgs() << "LV: Failure because of Positive distance "
- << Val.getSExtValue() << "\n");
+ << Val.getSExtValue() << '\n');
return true;
}
return true;
DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<
- " with max VF=" << MaxSafeDepDistBytes/TypeByteSize << "\n");
+ " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n');
return false;
}
Stores.push_back(St);
DepChecker.addAccess(St);
}
- } // next instr.
- } // next block.
+ } // Next instr.
+ } // Next block.
// Now we have two lists that hold the loads and the stores.
// Next, we find the pointers that they use.
return true;
}
- SmallPtrSet<Value *, 16> ReadOnlyPtr;
for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
LoadInst *LD = cast<LoadInst>(*I);
Value* Ptr = LD->getPointerOperand();
if (NumComparisons == 0 && NeedRTCheck)
NeedRTCheck = false;
- // Check that we did not collect too many pointers or found a unsizeable
+ // Check that we did not collect too many pointers or found an unsizeable
// pointer.
if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
PtrRtCheck.reset();
CanVecMem = DepChecker.areDepsSafe(DependentAccesses,
Accesses.getDependenciesToCheck());
MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
+
+ if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
+ DEBUG(dbgs() << "LV: Retrying with memory checks\n");
+ NeedRTCheck = true;
+
+ // Clear the dependency checks. We assume they are not needed.
+ Accesses.resetDepChecks();
+
+ PtrRtCheck.reset();
+ PtrRtCheck.Need = true;
+
+ CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
+ TheLoop, true);
+ // Check that we did not collect too many pointers or found an unsizeable
+ // pointer.
+ if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
+ DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n");
+ PtrRtCheck.reset();
+ return false;
+ }
+
+ CanVecMem = true;
+ }
}
- DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
+ DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<
" need a runtime memory check.\n");
return CanVecMem;
// Check whether we found a reduction operator.
FoundReduxOp |= !IsAPhi;
- // Process users of current instruction. Push non PHI nodes after PHI nodes
+ // Process users of current instruction. Push non-PHI nodes after PHI nodes
// onto the stack. This way we are going to have seen all inputs to PHI
// nodes once we get to them.
SmallVector<Instruction *, 8> NonPHIs;
if (ExitInstruction != 0 || Cur == Phi)
return false;
+ // The instruction used by an outside user must be the last instruction
+ // before we feed back to the reduction phi. Otherwise, we loose VF-1
+ // operations on the value.
+ if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end())
+ return false;
+
ExitInstruction = Cur;
continue;
}
if (it->mayWriteToMemory() || it->mayThrow())
return false;
+ // Check that we don't have a constant expression that can trap as operand.
+ for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
+ OI != OE; ++OI) {
+ if (Constant *C = dyn_cast<Constant>(*OI))
+ if (C->canTrap())
+ return false;
+ }
+
// The instructions below can trap.
switch (it->getOpcode()) {
default: continue;
// Find the trip count.
unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
- DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n");
+ DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
unsigned WidestType = getWidestType();
unsigned WidestRegister = TTI.getRegisterBitWidth(true);
WidestRegister : MaxSafeDepDist);
unsigned MaxVectorSize = WidestRegister / WidestType;
DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
- DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n");
+ DEBUG(dbgs() << "LV: The Widest register is: "
+ << WidestRegister << " bits.\n");
if (MaxVectorSize == 0) {
DEBUG(dbgs() << "LV: The target has no vector registers.\n");
if (UserVF != 0) {
assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
- DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n");
+ DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
Factor.Width = UserVF;
return Factor;
float Cost = expectedCost(1);
unsigned Width = 1;
- DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
+ DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n");
for (unsigned i=2; i <= VF; i*=2) {
// Notice that the vector loop needs to be executed less times, so
// we need to divide the cost of the vector loops by the width of
// the vector elements.
float VectorCost = expectedCost(i) / (float)i;
- DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
+ DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
(int)VectorCost << ".\n");
if (VectorCost < Cost) {
Cost = VectorCost;
}
if (HasReductions) {
- DEBUG(dbgs() << "LV: Unrolling because of reductions. \n");
+ DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
return UF;
}
// We assume that the cost overhead is 1 and we use the cost model
// to estimate the cost of the loop and unroll until the cost of the
// loop overhead is about 5% of the cost of the loop.
- DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n");
+ DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
if (LoopCost < SmallLoopCost) {
- DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n");
+ DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
unsigned NewUF = SmallLoopCost / (LoopCost + 1);
return std::min(NewUF, UF);
}
- DEBUG(dbgs() << "LV: Not Unrolling. \n");
+ DEBUG(dbgs() << "LV: Not Unrolling.\n");
return 1;
}
MaxUsage = std::max(MaxUsage, OpenIntervals.size());
DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
- OpenIntervals.size() <<"\n");
+ OpenIntervals.size() << '\n');
// Add the current instruction to the list of open intervals.
OpenIntervals.insert(I);
}
unsigned Invariant = LoopInvariants.size();
- DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n");
- DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n");
- DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n");
+ DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
+ DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
+ DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
R.LoopInvariantRegs = Invariant;
R.MaxLocalUsers = MaxUsage;
unsigned C = getInstructionCost(it, VF);
BlockCost += C;
- DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
- VF << " For instruction: "<< *it << "\n");
+ DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
+ VF << " For instruction: " << *it << '\n');
}
// We assume that if-converted blocks have a 50% chance of being executed.
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
namespace llvm {
- Pass *createLoopVectorizePass(bool NoUnrolling) {
- return new LoopVectorize(NoUnrolling);
+ Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
+ return new LoopVectorize(NoUnrolling, AlwaysVectorize);
}
}