//
//===----------------------------------------------------------------------===//
-#define LV_NAME "loop-vectorize"
-#define DEBUG_TYPE LV_NAME
-
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/PatternMatch.h"
-#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/VectorUtils.h"
#include <algorithm>
#include <map>
using namespace llvm;
using namespace llvm::PatternMatch;
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
+STATISTIC(LoopsVectorized, "Number of loops vectorized");
+STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
+
static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
cl::desc("Sets the SIMD width. Zero is autoselect."));
/// Maximum simd width.
static const unsigned MaxVectorWidth = 64;
+static cl::opt<unsigned> ForceTargetNumScalarRegs(
+ "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's number of scalar registers."));
+
+static cl::opt<unsigned> ForceTargetNumVectorRegs(
+ "force-target-num-vector-regs", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's number of vector registers."));
+
/// Maximum vectorization unroll count.
static const unsigned MaxUnrollFactor = 16;
-/// The cost of a loop that is considered 'small' by the unroller.
-static const unsigned SmallLoopCost = 20;
+static cl::opt<unsigned> ForceTargetMaxScalarUnrollFactor(
+ "force-target-max-scalar-unroll", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's max unroll factor for scalar "
+ "loops."));
+
+static cl::opt<unsigned> ForceTargetMaxVectorUnrollFactor(
+ "force-target-max-vector-unroll", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's max unroll factor for "
+ "vectorized loops."));
+
+static cl::opt<unsigned> ForceTargetInstructionCost(
+ "force-target-instruction-cost", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's expected cost for "
+ "an instruction to a single constant value. Mostly "
+ "useful for getting consistent testing."));
+
+static cl::opt<unsigned> SmallLoopCost(
+ "small-loop-cost", cl::init(20), cl::Hidden,
+ cl::desc("The cost of a loop that is considered 'small' by the unroller."));
+
+static cl::opt<bool> LoopVectorizeWithBlockFrequency(
+ "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
+ cl::desc("Enable the use of the block frequency analysis to access PGO "
+ "heuristics minimizing code growth in cold regions and being more "
+ "aggressive in hot regions."));
+
+// Runtime unroll loops for load/store throughput.
+static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
+ "enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden,
+ cl::desc("Enable runtime unrolling until load/store ports are saturated"));
+
+/// The number of stores in a loop that are allowed to need predication.
+static cl::opt<unsigned> NumberOfStoresToPredicate(
+ "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
+ cl::desc("Max number of stores to be predicated behind an if."));
+
+static cl::opt<bool> EnableIndVarRegisterHeur(
+ "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
+ cl::desc("Count the induction variable only once when unrolling"));
+
+static cl::opt<bool> EnableCondStoresVectorization(
+ "enable-cond-stores-vec", cl::init(false), cl::Hidden,
+ cl::desc("Enable if predication of stores during vectorization."));
namespace {
class InnerLoopVectorizer {
public:
InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
- DominatorTree *DT, DataLayout *DL,
+ DominatorTree *DT, const DataLayout *DL,
const TargetLibraryInfo *TLI, unsigned VecWidth,
unsigned UnrollFactor)
: OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
- VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
- OldInduction(0), WidenMap(UnrollFactor), Legal(0) {}
+ VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
+ Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
+ Legal(nullptr) {}
// Perform the actual loop widening (vectorization).
void vectorize(LoopVectorizationLegality *L) {
void updateAnalysis();
/// This instruction is un-vectorizable. Implement it as a sequence
- /// of scalars.
- virtual void scalarizeInstruction(Instruction *Instr);
+ /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
+ /// scalarized instruction behind an if block predicated on the control
+ /// dependence of the instruction.
+ virtual void scalarizeInstruction(Instruction *Instr,
+ bool IfPredicateStore=false);
/// Vectorize Load and Store instructions,
virtual void vectorizeMemoryInstruction(Instruction *Instr);
/// Dominator Tree.
DominatorTree *DT;
/// Data Layout.
- DataLayout *DL;
+ const DataLayout *DL;
/// Target Library Info.
const TargetLibraryInfo *TLI;
///The ExitBlock of the scalar loop.
BasicBlock *LoopExitBlock;
///The vector loop body.
- BasicBlock *LoopVectorBody;
+ SmallVector<BasicBlock *, 4> LoopVectorBody;
///The scalar loop body.
BasicBlock *LoopScalarBody;
/// A list of all bypass blocks. The first block is the entry of the loop.
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
- DominatorTree *DT, DataLayout *DL,
+ DominatorTree *DT, const DataLayout *DL,
const TargetLibraryInfo *TLI, unsigned UnrollFactor) :
InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }
private:
- virtual void scalarizeInstruction(Instruction *Instr);
- virtual void vectorizeMemoryInstruction(Instruction *Instr);
- virtual Value *getBroadcastInstrs(Value *V);
- virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
- virtual Value *reverseVector(Value *Vec);
+ void scalarizeInstruction(Instruction *Instr,
+ bool IfPredicateStore = false) override;
+ void vectorizeMemoryInstruction(Instruction *Instr) override;
+ Value *getBroadcastInstrs(Value *V) override;
+ Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate) override;
+ Value *reverseVector(Value *Vec) override;
};
/// \brief Look for a meaningful debug location on the instruction or it's
B.SetCurrentDebugLocation(DebugLoc());
}
+#ifndef NDEBUG
+/// \return string containing a file name and a line # for the given
+/// instruction.
+static format_object3<const char *, const char *, unsigned>
+getDebugLocString(const Instruction *I) {
+ if (!I)
+ return format<const char *, const char *, unsigned>("", "", "", 0U);
+ MDNode *N = I->getMetadata("dbg");
+ if (!N) {
+ const StringRef ModuleName =
+ I->getParent()->getParent()->getParent()->getModuleIdentifier();
+ return format<const char *, const char *, unsigned>("%s", ModuleName.data(),
+ "", 0U);
+ }
+ const DILocation Loc(N);
+ const unsigned LineNo = Loc.getLineNumber();
+ const char *DirName = Loc.getDirectory().data();
+ const char *FileName = Loc.getFilename().data();
+ return format("%s/%s:%u", DirName, FileName, LineNo);
+}
+#endif
+
/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
- LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
+ unsigned NumLoads;
+ unsigned NumStores;
+ unsigned NumPredStores;
+
+ LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, const DataLayout *DL,
DominatorTree *DT, TargetLibraryInfo *TLI)
- : TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI),
- Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
- MaxSafeDepDistBytes(-1U) {}
+ : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
+ DT(DT), TLI(TLI), Induction(nullptr), WidestIndTy(nullptr),
+ HasFunNoNaNAttr(false), MaxSafeDepDistBytes(-1U) {}
/// This enum represents the kinds of reductions that we support.
enum ReductionKind {
/// This struct holds information about reduction variables.
struct ReductionDescriptor {
- ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
+ ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr),
Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}
ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
/// A struct for saving information about induction variables.
struct InductionInfo {
InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
- InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
+ InductionInfo() : StartValue(nullptr), IK(IK_NoInduction) {}
/// Start value.
TrackingVH<Value> StartValue;
/// Induction kind.
/// Scev analysis.
ScalarEvolution *SE;
/// DataLayout analysis.
- DataLayout *DL;
+ const DataLayout *DL;
/// Dominators.
DominatorTree *DT;
/// Target Library Info.
LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
- DataLayout *DL, const TargetLibraryInfo *TLI)
+ const DataLayout *DL, const TargetLibraryInfo *TLI)
: TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}
/// Information about vectorization costs
/// Vector target information.
const TargetTransformInfo &TTI;
/// Target data layout information.
- DataLayout *DL;
+ const DataLayout *DL;
/// Target Library Info.
const TargetLibraryInfo *TLI;
};
assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- const MDString *S = 0;
+ const MDString *S = nullptr;
SmallVector<Value*, 4> Args;
// The expected hint is either a MDString or a MDNode with the first
}
};
+static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
+ if (L.empty())
+ return V.push_back(&L);
+
+ for (Loop *InnerL : L)
+ addInnerLoop(*InnerL, V);
+}
+
/// The LoopVectorize Pass.
-struct LoopVectorize : public LoopPass {
+struct LoopVectorize : public FunctionPass {
/// Pass identification, replacement for typeid
static char ID;
explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
- : LoopPass(ID),
+ : FunctionPass(ID),
DisableUnrolling(NoUnrolling),
AlwaysVectorize(AlwaysVectorize) {
initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
}
ScalarEvolution *SE;
- DataLayout *DL;
+ const DataLayout *DL;
LoopInfo *LI;
TargetTransformInfo *TTI;
DominatorTree *DT;
+ BlockFrequencyInfo *BFI;
TargetLibraryInfo *TLI;
bool DisableUnrolling;
bool AlwaysVectorize;
- virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
- // We only vectorize innermost loops.
- if (!L->empty())
- return false;
+ BlockFrequency ColdEntryFreq;
+ bool runOnFunction(Function &F) override {
SE = &getAnalysis<ScalarEvolution>();
- DL = getAnalysisIfAvailable<DataLayout>();
+ DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+ DL = DLP ? &DLP->getDataLayout() : nullptr;
LI = &getAnalysis<LoopInfo>();
TTI = &getAnalysis<TargetTransformInfo>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ BFI = &getAnalysis<BlockFrequencyInfo>();
TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+ // Compute some weights outside of the loop over the loops. Compute this
+ // using a BranchProbability to re-use its scaling math.
+ const BranchProbability ColdProb(1, 5); // 20%
+ ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
+
// If the target claims to have no vector registers don't attempt
// vectorization.
if (!TTI->getNumberOfRegisters(true))
return false;
- if (DL == NULL) {
- DEBUG(dbgs() << "LV: Not vectorizing: Missing data layout\n");
+ if (!DL) {
+ DEBUG(dbgs() << "\nLV: Not vectorizing " << F.getName()
+ << ": Missing data layout\n");
return false;
}
- DEBUG(dbgs() << "LV: Checking a loop in \"" <<
- L->getHeader()->getParent()->getName() << "\"\n");
+ // Build up a worklist of inner-loops to vectorize. This is necessary as
+ // the act of vectorizing or partially unrolling a loop creates new loops
+ // and can invalidate iterators across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *L : *LI)
+ addInnerLoop(*L, Worklist);
+
+ LoopsAnalyzed += Worklist.size();
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ while (!Worklist.empty())
+ Changed |= processLoop(Worklist.pop_back_val());
+
+ // Process each loop nest in the function.
+ return Changed;
+ }
+
+ bool processLoop(Loop *L) {
+ assert(L->empty() && "Only process inner loops.");
+ DEBUG(dbgs() << "\nLV: Checking a loop in \""
+ << L->getHeader()->getParent()->getName() << "\" from "
+ << getDebugLocString(L->getHeader()->getFirstNonPHIOrDbg())
+ << "\n");
LoopVectorizeHints Hints(L, DisableUnrolling);
+ DEBUG(dbgs() << "LV: Loop hints:"
+ << " force=" << (Hints.Force == 0
+ ? "disabled"
+ : (Hints.Force == 1 ? "enabled" : "?"))
+ << " width=" << Hints.Width << " unroll=" << Hints.Unroll
+ << "\n");
+
if (Hints.Force == 0) {
DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
return false;
// Check the function attributes to find out if this function should be
// optimized for size.
Function *F = L->getHeader()->getParent();
- Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
- Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
- unsigned FnIndex = AttributeSet::FunctionIndex;
- bool OptForSize = Hints.Force != 1 &&
- F->getAttributes().hasAttribute(FnIndex, SzAttr);
- bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);
-
- if (NoFloat) {
+ bool OptForSize =
+ Hints.Force != 1 && F->hasFnAttribute(Attribute::OptimizeForSize);
+
+ // Compute the weighted frequency of this loop being executed and see if it
+ // is less than 20% of the function entry baseline frequency. Note that we
+ // always have a canonical loop here because we think we *can* vectoriez.
+ // FIXME: This is hidden behind a flag due to pervasive problems with
+ // exactly what block frequency models.
+ if (LoopVectorizeWithBlockFrequency) {
+ BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
+ if (Hints.Force != 1 && LoopEntryFreq < ColdEntryFreq)
+ OptForSize = true;
+ }
+
+ // Check the function attributes to see if implicit floats are allowed.a
+ // FIXME: This check doesn't seem possibly correct -- what if the loop is
+ // an integer loop and the vector instructions selected are purely integer
+ // vector instructions?
+ if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
"attribute is used.\n");
return false;
}
// Select the optimal vectorization factor.
- LoopVectorizationCostModel::VectorizationFactor VF;
- VF = CM.selectVectorizationFactor(OptForSize, Hints.Width);
+ const LoopVectorizationCostModel::VectorizationFactor VF =
+ CM.selectVectorizationFactor(OptForSize, Hints.Width);
// Select the unroll factor.
- unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
+ const unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
VF.Cost);
- DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
- F->getParent()->getModuleIdentifier() << '\n');
+ DEBUG(dbgs() << "LV: Found a vectorizable loop ("
+ << VF.Width << ") in "
+ << getDebugLocString(L->getHeader()->getFirstNonPHIOrDbg())
+ << '\n');
DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
if (VF.Width == 1) {
// If we decided that it is *legal* to vectorize the loop then do it.
InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
LB.vectorize(&LVL);
+ ++LoopsVectorized;
}
// Mark the loop as already vectorized to avoid vectorizing again.
return true;
}
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- LoopPass::getAnalysisUsage(AU);
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
+ AU.addRequired<BlockFrequencyInfo>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfo>();
AU.addRequired<ScalarEvolution>();
// LoopVectorizationCostModel.
//===----------------------------------------------------------------------===//
-static Value *stripCast(Value *V) {
+static Value *stripIntegerCast(Value *V) {
if (CastInst *CI = dyn_cast<CastInst>(V))
- return CI->getOperand(0);
+ if (CI->getOperand(0)->getType()->isIntegerTy())
+ return CI->getOperand(0);
return V;
}
/// \p Ptr.
static const SCEV *replaceSymbolicStrideSCEV(ScalarEvolution *SE,
ValueToValueMap &PtrToStride,
- Value *Ptr, Value *OrigPtr = 0) {
+ Value *Ptr, Value *OrigPtr = nullptr) {
const SCEV *OrigSCEV = SE->getSCEV(Ptr);
Value *StrideVal = SI->second;
// Strip casts.
- StrideVal = stripCast(StrideVal);
+ StrideVal = stripIntegerCast(StrideVal);
// Replace symbolic stride by one.
Value *One = ConstantInt::get(StrideVal->getType(), 1);
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
// We need to place the broadcast of invariant variables outside the loop.
Instruction *Instr = dyn_cast<Instruction>(V);
- bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
+ bool NewInstr =
+ (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
+ Instr->getParent()) != LoopVectorBody.end());
bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
// Place the code for broadcasting invariant variables in the new preheader.
/// \brief Find the operand of the GEP that should be checked for consecutive
/// stores. This ignores trailing indices that have no effect on the final
/// pointer.
-static unsigned getGEPInductionOperand(DataLayout *DL,
+static unsigned getGEPInductionOperand(const DataLayout *DL,
const GetElementPtrInst *Gep) {
unsigned LastOperand = Gep->getNumOperands() - 1;
unsigned GEPAllocSize = DL->getTypeAllocSize(
// We can emit wide load/stores only if the last non-zero index is the
// induction variable.
- const SCEV *Last = 0;
+ const SCEV *Last = nullptr;
if (!Strides.count(Gep))
Last = SE->getSCEV(Gep->getOperand(InductionOperand));
else {
unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
+ if (SI && Legal->blockNeedsPredication(SI->getParent()))
+ return scalarizeInstruction(Instr, true);
+
if (ScalarAllocatedSize != VectorElementSize)
return scalarizeInstruction(Instr);
}
}
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
// Holds vector parameters or scalars, in case of uniform vals.
SmallVector<VectorParts, 4> Params;
// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
- Value *UndefVec = IsVoidRetTy ? 0 :
+ Value *UndefVec = IsVoidRetTy ? nullptr :
UndefValue::get(VectorType::get(Instr->getType(), VF));
// Create a new entry in the WidenMap and initialize it to Undef or Null.
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
+ Instruction *InsertPt = Builder.GetInsertPoint();
+ BasicBlock *IfBlock = Builder.GetInsertBlock();
+ BasicBlock *CondBlock = nullptr;
+
+ VectorParts Cond;
+ Loop *VectorLp = nullptr;
+ if (IfPredicateStore) {
+ assert(Instr->getParent()->getSinglePredecessor() &&
+ "Only support single predecessor blocks");
+ Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
+ Instr->getParent());
+ VectorLp = LI->getLoopFor(IfBlock);
+ assert(VectorLp && "Must have a loop for this block");
+ }
+
// For each vector unroll 'part':
for (unsigned Part = 0; Part < UF; ++Part) {
// For each scalar that we create:
for (unsigned Width = 0; Width < VF; ++Width) {
+
+ // Start if-block.
+ Value *Cmp = nullptr;
+ if (IfPredicateStore) {
+ Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
+ Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
+ CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+ LoopVectorBody.push_back(CondBlock);
+ VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
+ // Update Builder with newly created basic block.
+ Builder.SetInsertPoint(InsertPt);
+ }
+
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
if (!IsVoidRetTy)
VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
Builder.getInt32(Width));
+ // End if-block.
+ if (IfPredicateStore) {
+ BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+ LoopVectorBody.push_back(NewIfBlock);
+ VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ IfBlock = NewIfBlock;
+ }
}
}
}
if (FirstInst)
return FirstInst;
if (Instruction *I = dyn_cast<Instruction>(V))
- return I->getParent() == Loc->getParent() ? I : 0;
- return 0;
+ return I->getParent() == Loc->getParent() ? I : nullptr;
+ return nullptr;
}
std::pair<Instruction *, Instruction *>
InnerLoopVectorizer::addStrideCheck(Instruction *Loc) {
- Instruction *tnullptr = 0;
+ Instruction *tnullptr = nullptr;
if (!Legal->mustCheckStrides())
return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
IRBuilder<> ChkBuilder(Loc);
// Emit checks.
- Value *Check = 0;
- Instruction *FirstInst = 0;
+ Value *Check = nullptr;
+ Instruction *FirstInst = nullptr;
for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(),
SE = Legal->strides_end();
SI != SE; ++SI) {
- Value *Ptr = stripCast(*SI);
+ Value *Ptr = stripIntegerCast(*SI);
Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1),
"stride.chk");
// Store the first instruction we create.
LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
Legal->getRuntimePointerCheck();
- Instruction *tnullptr = 0;
+ Instruction *tnullptr = nullptr;
if (!PtrRtCheck->Need)
return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
LLVMContext &Ctx = Loc->getContext();
SCEVExpander Exp(*SE, "induction");
- Instruction *FirstInst = 0;
+ Instruction *FirstInst = nullptr;
for (unsigned i = 0; i < NumPointers; ++i) {
Value *Ptr = PtrRtCheck->Pointers[i];
IRBuilder<> ChkBuilder(Loc);
// Our instructions might fold to a constant.
- Value *MemoryRuntimeCheck = 0;
+ Value *MemoryRuntimeCheck = nullptr;
for (unsigned i = 0; i < NumPointers; ++i) {
for (unsigned j = i+1; j < NumPointers; ++j) {
// No need to check if two readonly pointers intersect.
// sequence of instructions that form a check.
Instruction *StrideCheck;
Instruction *FirstCheckInst;
- tie(FirstCheckInst, StrideCheck) =
+ std::tie(FirstCheckInst, StrideCheck) =
addStrideCheck(BypassBlock->getTerminator());
if (StrideCheck) {
// Create a new block containing the stride check.
// checks into a separate block to make the more common case of few elements
// faster.
Instruction *MemRuntimeCheck;
- tie(FirstCheckInst, MemRuntimeCheck) =
+ std::tie(FirstCheckInst, MemRuntimeCheck) =
addRuntimeCheck(LastBypassBlock->getTerminator());
if (MemRuntimeCheck) {
// Create a new block containing the memory check.
// start value.
// This variable saves the new starting index for the scalar loop.
- PHINode *ResumeIndex = 0;
+ PHINode *ResumeIndex = nullptr;
LoopVectorizationLegality::InductionList::iterator I, E;
LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
// Set builder to point to last bypass block.
// truncated version for the scalar loop.
PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
- MiddleBlock->getTerminator()) : 0;
+ MiddleBlock->getTerminator()) : nullptr;
- Value *EndValue = 0;
+ Value *EndValue = nullptr;
switch (II.IK) {
case LoopVectorizationLegality::IK_NoInduction:
llvm_unreachable("Unknown induction");
LoopScalarPreHeader = ScalarPH;
LoopMiddleBlock = MiddleBlock;
LoopExitBlock = ExitBlock;
- LoopVectorBody = VecBody;
+ LoopVectorBody.push_back(VecBody);
LoopScalarBody = OldBasicBlock;
LoopVectorizeHints Hints(Lp, true);
getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
// If we have an intrinsic call, check if it is trivially vectorizable.
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
- switch (II->getIntrinsicID()) {
- case Intrinsic::sqrt:
- case Intrinsic::sin:
- case Intrinsic::cos:
- case Intrinsic::exp:
- case Intrinsic::exp2:
- case Intrinsic::log:
- case Intrinsic::log10:
- case Intrinsic::log2:
- case Intrinsic::fabs:
- case Intrinsic::copysign:
- case Intrinsic::floor:
- case Intrinsic::ceil:
- case Intrinsic::trunc:
- case Intrinsic::rint:
- case Intrinsic::nearbyint:
- case Intrinsic::round:
- case Intrinsic::pow:
- case Intrinsic::fma:
- case Intrinsic::fmuladd:
- case Intrinsic::lifetime_start:
- case Intrinsic::lifetime_end:
- return II->getIntrinsicID();
- default:
+ Intrinsic::ID ID = II->getIntrinsicID();
+ if (isTriviallyVectorizable(ID) || ID == Intrinsic::lifetime_start ||
+ ID == Intrinsic::lifetime_end)
+ return ID;
+ else
return Intrinsic::not_intrinsic;
- }
}
if (!TLI)
};
}
+/// \brief Check whether this block is a predicated block.
+/// Due to if predication of stores we might create a sequence of "if(pred) a[i]
+/// = ...; " blocks. We start with one vectorized basic block. For every
+/// conditional block we split this vectorized block. Therefore, every second
+/// block will be a predicated one.
+static bool isPredicatedBlock(unsigned BlockNum) {
+ return BlockNum % 2;
+}
+
///\brief Perform cse of induction variable instructions.
-static void cse(BasicBlock *BB) {
+static void cse(SmallVector<BasicBlock *, 4> &BBs) {
// Perform simple cse.
SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
- Instruction *In = I++;
+ for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
+ BasicBlock *BB = BBs[i];
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *In = I++;
- if (!CSEDenseMapInfo::canHandle(In))
- continue;
+ if (!CSEDenseMapInfo::canHandle(In))
+ continue;
- // Check if we can replace this instruction with any of the
- // visited instructions.
- if (Instruction *V = CSEMap.lookup(In)) {
- In->replaceAllUsesWith(V);
- In->eraseFromParent();
- continue;
+ // Check if we can replace this instruction with any of the
+ // visited instructions.
+ if (Instruction *V = CSEMap.lookup(In)) {
+ In->replaceAllUsesWith(V);
+ In->eraseFromParent();
+ continue;
+ }
+ // Ignore instructions in conditional blocks. We create "if (pred) a[i] =
+ // ...;" blocks for predicated stores. Every second block is a predicated
+ // block.
+ if (isPredicatedBlock(i))
+ continue;
+
+ CSEMap[In] = In;
}
+ }
+}
- CSEMap[In] = In;
+/// \brief Adds a 'fast' flag to floating point operations.
+static Value *addFastMathFlag(Value *V) {
+ if (isa<FPMathOperator>(V)){
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+ cast<Instruction>(V)->setFastMathFlags(Flags);
}
+ return V;
}
void InnerLoopVectorizer::vectorizeLoop() {
setDebugLocFromInst(Builder, RdxDesc.StartValue);
// We need to generate a reduction vector from the incoming scalar.
- // To do so, we need to generate the 'identity' vector and overide
+ // To do so, we need to generate the 'identity' vector and override
// one of the elements with the incoming scalar reduction. We need
// to do it in the vector-loop preheader.
Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
// first unroll part.
Value *StartVal = (part == 0) ? VectorStart : Identity;
cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
- cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
+ cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
+ LoopVectorBody.back());
}
// Before each round, move the insertion point right between
Value *StartVal = (part == 0) ? VectorStart : Identity;
for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
- NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
+ NewPhi->addIncoming(RdxExitVal[part],
+ LoopVectorBody.back());
RdxParts.push_back(NewPhi);
}
setDebugLocFromInst(Builder, ReducedPartRdx);
for (unsigned part = 1; part < UF; ++part) {
if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
- RdxParts[part], ReducedPartRdx,
- "bin.rdx");
+ // Floating point operations had to be 'fast' to enable the reduction.
+ ReducedPartRdx = addFastMathFlag(
+ Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
+ ReducedPartRdx, "bin.rdx"));
else
ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
ReducedPartRdx, RdxParts[part]);
assert(isPowerOf2_32(VF) &&
"Reduction emission only supported for pow2 vectors!");
Value *TmpVec = ReducedPartRdx;
- SmallVector<Constant*, 32> ShuffleMask(VF, 0);
+ SmallVector<Constant*, 32> ShuffleMask(VF, nullptr);
for (unsigned i = VF; i != 1; i >>= 1) {
// Move the upper half of the vector to the lower half.
for (unsigned j = 0; j != i/2; ++j)
"rdx.shuf");
if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
- "bin.rdx");
+ // Floating point operations had to be 'fast' to enable the reduction.
+ TmpVec = addFastMathFlag(Builder.CreateBinOp(
+ (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
else
TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
}
Type *VecTy = (VF == 1) ? PN->getType() :
VectorType::get(PN->getType(), VF);
Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
- LoopVectorBody-> getFirstInsertionPt());
+ LoopVectorBody.back()-> getFirstInsertionPt());
}
PV->push_back(P);
return;
if (VecOp && isa<PossiblyExactOperator>(VecOp))
VecOp->setIsExact(BinOp->isExact());
+ // Copy the fast-math flags.
+ if (VecOp && isa<FPMathOperator>(V))
+ VecOp->setFastMathFlags(it->getFastMathFlags());
+
Entry[Part] = V;
}
break;
VectorParts &A = getVectorValue(it->getOperand(0));
VectorParts &B = getVectorValue(it->getOperand(1));
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *C = 0;
+ Value *C = nullptr;
if (FCmp)
C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
else
for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
- DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
+
+ // Due to if predication of stores we might create a sequence of "if(pred)
+ // a[i] = ...; " blocks.
+ for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
+ if (i == 0)
+ DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
+ else if (isPredicatedBlock(i)) {
+ DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
+ } else {
+ DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
+ }
+ }
+
DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
return true;
}
-static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) {
+static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
if (Ty->isPointerTy())
return DL.getIntPtrType(Ty);
return Ty;
}
-static Type* getWiderType(DataLayout &DL, Type *Ty0, Type *Ty1) {
+static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
Ty0 = convertPointerToIntegerType(DL, Ty0);
Ty1 = convertPointerToIntegerType(DL, Ty1);
if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
// instructions must not have external users.
if (!Reductions.count(Inst))
//Check that all of the users of the loop are inside the BB.
- for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
- I != E; ++I) {
- Instruction *U = cast<Instruction>(*I);
+ for (User *U : Inst->users()) {
+ Instruction *UI = cast<Instruction>(U);
// This user may be a reduction exit value.
- if (!TheLoop->contains(U)) {
- DEBUG(dbgs() << "LV: Found an outside user for : " << *U << '\n');
+ if (!TheLoop->contains(UI)) {
+ DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
return true;
}
}
///\brief Remove GEPs whose indices but the last one are loop invariant and
/// return the induction operand of the gep pointer.
static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE,
- DataLayout *DL, Loop *Lp) {
+ const DataLayout *DL, Loop *Lp) {
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
if (!GEP)
return Ptr;
///\brief Look for a cast use of the passed value.
static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
- Value *UniqueCast = 0;
- for (Value::use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end(); UI != UE;
- ++UI) {
- CastInst *CI = dyn_cast<CastInst>(*UI);
+ Value *UniqueCast = nullptr;
+ for (User *U : Ptr->users()) {
+ CastInst *CI = dyn_cast<CastInst>(U);
if (CI && CI->getType() == Ty) {
if (!UniqueCast)
UniqueCast = CI;
else
- return 0;
+ return nullptr;
}
}
return UniqueCast;
/// Looks for symbolic strides "a[i*stride]". Returns the symbolic stride as a
/// pointer to the Value, or null otherwise.
static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
- DataLayout *DL, Loop *Lp) {
+ const DataLayout *DL, Loop *Lp) {
const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
if (!PtrTy || PtrTy->isAggregateType())
- return 0;
+ return nullptr;
// Try to remove a gep instruction to make the pointer (actually index at this
// point) easier analyzable. If OrigPtr is equal to Ptr we are analzying the
const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
if (!S)
- return 0;
+ return nullptr;
V = S->getStepRecurrence(*SE);
if (!V)
- return 0;
+ return nullptr;
// Strip off the size of access multiplication if we are still analyzing the
// pointer.
DL->getTypeAllocSize(PtrTy->getElementType());
if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
if (M->getOperand(0)->getSCEVType() != scConstant)
- return 0;
+ return nullptr;
const APInt &APStepVal =
cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue();
// Huge step value - give up.
if (APStepVal.getBitWidth() > 64)
- return 0;
+ return nullptr;
int64_t StepVal = APStepVal.getSExtValue();
if (PtrAccessSize != StepVal)
- return 0;
+ return nullptr;
V = M->getOperand(1);
}
}
// Strip off casts.
- Type *StripedOffRecurrenceCast = 0;
+ Type *StripedOffRecurrenceCast = nullptr;
if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) {
StripedOffRecurrenceCast = C->getType();
V = C->getOperand();
// Look for the loop invariant symbolic value.
const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
if (!U)
- return 0;
+ return nullptr;
Value *Stride = U->getValue();
if (!Lp->isLoopInvariant(Stride))
- return 0;
+ return nullptr;
// If we have stripped off the recurrence cast we have to make sure that we
// return the value that is used in this loop so that we can replace it later.
}
void LoopVectorizationLegality::collectStridedAcccess(Value *MemAccess) {
- Value *Ptr = 0;
+ Value *Ptr = nullptr;
if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
Ptr = LI->getPointerOperand();
else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
// Start with the conditional branch and walk up the block.
Worklist.push_back(Latch->getTerminator()->getOperand(0));
+ // Also add all consecutive pointer values; these values will be uniform
+ // after vectorization (and subsequent cleanup) and, until revectorization is
+ // supported, all dependencies must also be uniform.
+ for (Loop::block_iterator B = TheLoop->block_begin(),
+ BE = TheLoop->block_end(); B != BE; ++B)
+ for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end();
+ I != IE; ++I)
+ if (I->getType()->isPointerTy() && isConsecutivePtr(I))
+ Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
+
while (Worklist.size()) {
Instruction *I = dyn_cast<Instruction>(Worklist.back());
Worklist.pop_back();
/// \brief Set of potential dependent memory accesses.
typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
- AccessAnalysis(DataLayout *Dl, DepCandidates &DA) :
+ AccessAnalysis(const DataLayout *Dl, DepCandidates &DA) :
DL(Dl), DepCands(DA), AreAllWritesIdentified(true),
AreAllReadsIdentified(true), IsRTCheckNeeded(false) {}
/// Set of underlying objects already written to.
SmallPtrSet<Value*, 16> WriteObjects;
- DataLayout *DL;
+ const DataLayout *DL;
/// Sets of potentially dependent accesses - members of one set share an
/// underlying pointer. The set "CheckDeps" identfies which sets really need a
/// \brief Check the stride of the pointer and ensure that it does not wrap in
/// the address space.
-static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
const Loop *Lp, ValueToValueMap &StridesMap);
bool AccessAnalysis::canCheckPtrAtRT(
}
bool NeedDepCheck = false;
- // Check whether there is the possiblity of dependency because of underlying
- // objects being the same.
+ // Check whether there is the possibility of dependency because of
+ // underlying objects being the same.
typedef SmallVector<Value*, 16> ValueVector;
ValueVector TempObjects;
GetUnderlyingObjects(Ptr, TempObjects, DL);
typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
- MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L)
+ MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L)
: SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
ShouldRetryWithRuntimeCheck(false) {}
private:
ScalarEvolution *SE;
- DataLayout *DL;
+ const DataLayout *DL;
const Loop *InnermostLoop;
/// \brief Maps access locations (ptr, read/write) to program order.
}
/// \brief Check whether the access through \p Ptr has a constant stride.
-static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
const Loop *Lp, ValueToValueMap &StridesMap) {
const Type *Ty = Ptr->getType();
assert(Ty->isPointerTy() && "Unexpected non-ptr");
// Check every access pair.
while (AI != AE) {
CheckDeps.erase(*AI);
- EquivalenceClasses<MemAccessInfo>::member_iterator OI = llvm::next(AI);
+ EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
while (OI != AE) {
// Check every accessing instruction pair in program order.
for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
DEBUG(dbgs() << "LV: Found a non-simple load.\n");
return false;
}
+ NumLoads++;
Loads.push_back(Ld);
DepChecker.addAccess(Ld);
continue;
DEBUG(dbgs() << "LV: Found a non-simple store.\n");
return false;
}
+ NumStores++;
Stores.push_back(St);
DepChecker.addAccess(St);
}
// We only allow for a single reduction value to be used outside the loop.
// This includes users of the reduction, variables (which form a cycle
// which ends in the phi node).
- Instruction *ExitInstruction = 0;
+ Instruction *ExitInstruction = nullptr;
// Indicates that we found a reduction operation in our scan.
bool FoundReduxOp = false;
// the number of instruction we saw from the recognized min/max pattern,
// to make sure we only see exactly the two instructions.
unsigned NumCmpSelectPatternInst = 0;
- ReductionInstDesc ReduxDesc(false, 0);
+ ReductionInstDesc ReduxDesc(false, nullptr);
SmallPtrSet<Instruction *, 8> VisitedInsts;
SmallVector<Instruction *, 8> Worklist;
// nodes once we get to them.
SmallVector<Instruction *, 8> NonPHIs;
SmallVector<Instruction *, 8> PHIs;
- for (Value::use_iterator UI = Cur->use_begin(), E = Cur->use_end(); UI != E;
- ++UI) {
- Instruction *Usr = cast<Instruction>(*UI);
+ for (User *U : Cur->users()) {
+ Instruction *UI = cast<Instruction>(U);
// Check if we found the exit user.
- BasicBlock *Parent = Usr->getParent();
+ BasicBlock *Parent = UI->getParent();
if (!TheLoop->contains(Parent)) {
// Exit if you find multiple outside users or if the header phi node is
// being used. In this case the user uses the value of the previous
// iteration, in which case we would loose "VF-1" iterations of the
// reduction operation if we vectorize.
- if (ExitInstruction != 0 || Cur == Phi)
+ if (ExitInstruction != nullptr || Cur == Phi)
return false;
// The instruction used by an outside user must be the last instruction
continue;
}
- // Process instructions only once (termination).
- if (VisitedInsts.insert(Usr)) {
- if (isa<PHINode>(Usr))
- PHIs.push_back(Usr);
+ // Process instructions only once (termination). Each reduction cycle
+ // value must only be used once, except by phi nodes and min/max
+ // reductions which are represented as a cmp followed by a select.
+ ReductionInstDesc IgnoredVal(false, nullptr);
+ if (VisitedInsts.insert(UI)) {
+ if (isa<PHINode>(UI))
+ PHIs.push_back(UI);
else
- NonPHIs.push_back(Usr);
- }
+ NonPHIs.push_back(UI);
+ } else if (!isa<PHINode>(UI) &&
+ ((!isa<FCmpInst>(UI) &&
+ !isa<ICmpInst>(UI) &&
+ !isa<SelectInst>(UI)) ||
+ !isMinMaxSelectCmpPattern(UI, IgnoredVal).IsReduction))
+ return false;
+
// Remember that we completed the cycle.
- if (Usr == Phi)
+ if (UI == Phi)
FoundStartPHI = true;
}
Worklist.append(PHIs.begin(), PHIs.end());
assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
"Expect a select instruction");
- Instruction *Cmp = 0;
- SelectInst *Select = 0;
+ Instruction *Cmp = nullptr;
+ SelectInst *Select = nullptr;
// We must handle the select(cmp()) as a single instruction. Advance to the
// select.
if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) {
- if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin())))
+ if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->user_begin())))
return ReductionInstDesc(false, I);
return ReductionInstDesc(Select, Prev.MinMaxKind);
}
}
// We don't predicate stores at the moment.
- if (it->mayWriteToMemory() || it->mayThrow())
+ if (it->mayWriteToMemory()) {
+ StoreInst *SI = dyn_cast<StoreInst>(it);
+ // We only support predication of stores in basic blocks with one
+ // predecessor.
+ if (!SI || ++NumPredStores > NumberOfStoresToPredicate ||
+ !SafePtrs.count(SI->getPointerOperand()) ||
+ !SI->getParent()->getSinglePredecessor())
+ return false;
+ }
+ if (it->mayThrow())
return false;
// Check that we don't have a constant expression that can trap as operand.
return Factor;
}
+ if (!EnableCondStoresVectorization && Legal->NumPredStores) {
+ DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
+ return Factor;
+ }
+
// Find the trip count.
unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
}
}
- DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
+ DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
Factor.Width = Width;
Factor.Cost = Width * Cost;
return Factor;
if (TC > 1 && TC < TinyTripCountUnrollThreshold)
return 1;
- unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true);
- DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
- " vector registers\n");
+ unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
+ DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<
+ " registers\n");
+
+ if (VF == 1) {
+ if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
+ TargetNumRegisters = ForceTargetNumScalarRegs;
+ } else {
+ if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
+ TargetNumRegisters = ForceTargetNumVectorRegs;
+ }
LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
// We divide by these constants so assume that we have at least one
// registers. These registers are used by all of the unrolled instances.
// Next, divide the remaining registers by the number of registers that is
// required by the loop, in order to estimate how many parallel instances
- // fit without causing spills.
- unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;
+ // fit without causing spills. All of this is rounded down if necessary to be
+ // a power of two. We want power of two unroll factors to simplify any
+ // addressing operations or alignment considerations.
+ unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
+ R.MaxLocalUsers);
+
+ // Don't count the induction variable as unrolled.
+ if (EnableIndVarRegisterHeur)
+ UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
+ std::max(1U, (R.MaxLocalUsers - 1)));
// Clamp the unroll factor ranges to reasonable factors.
unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
+ // Check if the user has overridden the unroll max.
+ if (VF == 1) {
+ if (ForceTargetMaxScalarUnrollFactor.getNumOccurrences() > 0)
+ MaxUnrollSize = ForceTargetMaxScalarUnrollFactor;
+ } else {
+ if (ForceTargetMaxVectorUnrollFactor.getNumOccurrences() > 0)
+ MaxUnrollSize = ForceTargetMaxVectorUnrollFactor;
+ }
+
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0)
else if (UF < 1)
UF = 1;
- bool HasReductions = Legal->getReductionVars()->size();
-
- // Decide if we want to unroll if we decided that it is legal to vectorize
- // but not profitable.
- if (VF == 1) {
- if (TheLoop->getNumBlocks() > 1 || !HasReductions ||
- LoopCost > SmallLoopCost)
- return 1;
-
- return UF;
- }
-
- if (HasReductions) {
+ // Unroll if we vectorized this loop and there is a reduction that could
+ // benefit from unrolling.
+ if (VF > 1 && Legal->getReductionVars()->size()) {
DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
return UF;
}
- // We want to unroll tiny loops in order to reduce the loop overhead.
- // We assume that the cost overhead is 1 and we use the cost model
- // to estimate the cost of the loop and unroll until the cost of the
- // loop overhead is about 5% of the cost of the loop.
+ // Note that if we've already vectorized the loop we will have done the
+ // runtime check and so unrolling won't require further checks.
+ bool UnrollingRequiresRuntimePointerCheck =
+ (VF == 1 && Legal->getRuntimePointerCheck()->Need);
+
+ // We want to unroll small loops in order to reduce the loop overhead and
+ // potentially expose ILP opportunities.
DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
- if (LoopCost < SmallLoopCost) {
+ if (!UnrollingRequiresRuntimePointerCheck &&
+ LoopCost < SmallLoopCost) {
+ // We assume that the cost overhead is 1 and we use the cost model
+ // to estimate the cost of the loop and unroll until the cost of the
+ // loop overhead is about 5% of the cost of the loop.
+ unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
+
+ // Unroll until store/load ports (estimated by max unroll factor) are
+ // saturated.
+ unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1);
+ unsigned LoadsUF = UF / (Legal->NumLoads ? Legal->NumLoads : 1);
+
+ if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {
+ DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
+ return std::max(StoresUF, LoadsUF);
+ }
+
DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
- unsigned NewUF = SmallLoopCost / (LoopCost + 1);
- return std::min(NewUF, UF);
+ return SmallUF;
}
DEBUG(dbgs() << "LV: Not Unrolling.\n");
continue;
unsigned C = getInstructionCost(it, VF);
+
+ // Check if we should override the cost.
+ if (ForceTargetInstructionCost.getNumOccurrences() > 0)
+ C = ForceTargetInstructionCost;
+
BlockCost += C;
DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
VF << " For instruction: " << *it << '\n');
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_AnyValue;
+ Value *Op2 = I->getOperand(1);
- if (isa<ConstantInt>(I->getOperand(1)))
+ // Check for a splat of a constant or for a non uniform vector of constants.
+ if (isa<ConstantInt>(Op2))
Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+ else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
+ Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+ if (cast<Constant>(Op2)->getSplatValue() != nullptr)
+ Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+ }
return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
}
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
}
-void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
+void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
+ bool IfPredicateStore) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
// Holds vector parameters or scalars, in case of uniform vals.
SmallVector<VectorParts, 4> Params;
// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
- Value *UndefVec = IsVoidRetTy ? 0 :
+ Value *UndefVec = IsVoidRetTy ? nullptr :
UndefValue::get(Instr->getType());
// Create a new entry in the WidenMap and initialize it to Undef or Null.
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
+ Instruction *InsertPt = Builder.GetInsertPoint();
+ BasicBlock *IfBlock = Builder.GetInsertBlock();
+ BasicBlock *CondBlock = nullptr;
+
+ VectorParts Cond;
+ Loop *VectorLp = nullptr;
+ if (IfPredicateStore) {
+ assert(Instr->getParent()->getSinglePredecessor() &&
+ "Only support single predecessor blocks");
+ Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
+ Instr->getParent());
+ VectorLp = LI->getLoopFor(IfBlock);
+ assert(VectorLp && "Must have a loop for this block");
+ }
+
// For each vector unroll 'part':
for (unsigned Part = 0; Part < UF; ++Part) {
// For each scalar that we create:
+ // Start an "if (pred) a[i] = ..." block.
+ Value *Cmp = nullptr;
+ if (IfPredicateStore) {
+ if (Cond[Part]->getType()->isVectorTy())
+ Cond[Part] =
+ Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
+ Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
+ ConstantInt::get(Cond[Part]->getType(), 1));
+ CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+ LoopVectorBody.push_back(CondBlock);
+ VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
+ // Update Builder with newly created basic block.
+ Builder.SetInsertPoint(InsertPt);
+ }
+
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
// so that future users will be able to use it.
if (!IsVoidRetTy)
VecResults[Part] = Cloned;
+
+ // End if-block.
+ if (IfPredicateStore) {
+ BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+ LoopVectorBody.push_back(NewIfBlock);
+ VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ IfBlock = NewIfBlock;
+ }
}
}
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
- return scalarizeInstruction(Instr);
+ StoreInst *SI = dyn_cast<StoreInst>(Instr);
+ bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
+
+ return scalarizeInstruction(Instr, IfPredicateStore);
}
Value *InnerLoopUnroller::reverseVector(Value *Vec) {