//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
-// and generates target-independent LLVM-IR. Legalization of the IR is done
-// in the codegen. However, the vectorizer uses (will use) the codegen
-// interfaces to generate IR that is likely to result in an optimal binary.
+// and generates target-independent LLVM-IR.
+// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
+// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/PatternMatch.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include <map>
using namespace llvm;
+using namespace llvm::PatternMatch;
static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
/// We don't unroll loops with a known constant trip count below this number.
static const unsigned TinyTripCountUnrollThreshold = 128;
-/// When performing a runtime memory check, do not check more than this
-/// number of pointers. Notice that the check is quadratic!
-static const unsigned RuntimeMemoryCheckThreshold = 4;
+/// When performing memory disambiguation checks at runtime do not make more
+/// than this number of comparisons.
+static const unsigned RuntimeMemoryCheckThreshold = 8;
+
+/// Maximum simd width.
+static const unsigned MaxVectorWidth = 64;
+
+/// Maximum vectorization unroll count.
+static const unsigned MaxUnrollFactor = 16;
namespace {
class InnerLoopVectorizer {
public:
InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
- DominatorTree *DT, DataLayout *DL, unsigned VecWidth,
+ DominatorTree *DT, DataLayout *DL,
+ const TargetLibraryInfo *TLI, unsigned VecWidth,
unsigned UnrollFactor)
- : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), VF(VecWidth),
- UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
+ : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
+ VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
OldInduction(0), WidenMap(UnrollFactor) {}
// Perform the actual loop widening (vectorization).
/// This function adds 0, 1, 2 ... to each vector element, starting at zero.
/// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
/// The sequence starts at StartIndex.
- Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate);
+ Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
/// When we go over instructions in the basic block we rely on previous
/// values within the current basic block or on loop invariant values.
DominatorTree *DT;
/// Data Layout.
DataLayout *DL;
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
/// The vectorization SIMD factor to use. Each vector will have this many
/// vector elements.
unsigned VF;
PHINode *Induction;
/// The induction variable of the old basic block.
PHINode *OldInduction;
+ /// Holds the extended (to the widest induction type) start index.
+ Value *ExtendedIdx;
/// Maps scalars to widened vectors.
ValueMap WidenMap;
};
+/// \brief Check if conditionally executed loads are hoistable.
+///
+/// This class has two functions: isHoistableLoad and canHoistAllLoads.
+/// isHoistableLoad should be called on all load instructions that are executed
+/// conditionally. After all conditional loads are processed, the client should
+/// call canHoistAllLoads to determine if all of the conditional executed loads
+/// have an unconditional memory access to the same memory address in the loop.
+class LoadHoisting {
+ typedef SmallPtrSet<Value *, 8> MemorySet;
+
+ Loop *TheLoop;
+ DominatorTree *DT;
+ MemorySet CondLoadAddrSet;
+
+public:
+ LoadHoisting(Loop *L, DominatorTree *D) : TheLoop(L), DT(D) {}
+
+ /// \brief Check if the instruction is a load with a identifiable address.
+ bool isHoistableLoad(Instruction *L);
+
+ /// \brief Check if all of the conditional loads are hoistable because there
+ /// exists an unconditional memory access to the same address in the loop.
+ bool canHoistAllLoads();
+};
+
+bool LoadHoisting::isHoistableLoad(Instruction *L) {
+ LoadInst *LI = dyn_cast<LoadInst>(L);
+ if (!LI)
+ return false;
+
+ CondLoadAddrSet.insert(LI->getPointerOperand());
+ return true;
+}
+
+static void addMemAccesses(BasicBlock *BB, SmallPtrSet<Value *, 8> &Set) {
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(BI)) // Try a load.
+ Set.insert(LI->getPointerOperand());
+ else if (StoreInst *SI = dyn_cast<StoreInst>(BI)) // Try a store.
+ Set.insert(SI->getPointerOperand());
+ }
+}
+
+bool LoadHoisting::canHoistAllLoads() {
+ // No conditional loads.
+ if (CondLoadAddrSet.empty())
+ return true;
+
+ MemorySet UncondMemAccesses;
+ std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
+ BasicBlock *LoopLatch = TheLoop->getLoopLatch();
+
+ // Iterate over the unconditional blocks and collect memory access addresses.
+ for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
+ BasicBlock *BB = LoopBlocks[i];
+
+ // Ignore conditional blocks.
+ if (BB != LoopLatch && !DT->dominates(BB, LoopLatch))
+ continue;
+
+ addMemAccesses(BB, UncondMemAccesses);
+ }
+
+ // And make sure there is a matching unconditional access for every
+ // conditional load.
+ for (MemorySet::iterator MI = CondLoadAddrSet.begin(),
+ ME = CondLoadAddrSet.end(); MI != ME; ++MI)
+ if (!UncondMemAccesses.count(*MI))
+ return false;
+
+ return true;
+}
+
/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
class LoopVectorizationLegality {
public:
LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
- DominatorTree *DT)
- : TheLoop(L), SE(SE), DL(DL), DT(DT), Induction(0) {}
+ DominatorTree *DT, TargetTransformInfo* TTI,
+ AliasAnalysis *AA, TargetLibraryInfo *TLI)
+ : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
+ Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
+ LoadSpeculation(L, DT) {}
/// This enum represents the kinds of reductions that we support.
enum ReductionKind {
RK_IntegerOr, ///< Bitwise or logical OR of numbers.
RK_IntegerAnd, ///< Bitwise or logical AND of numbers.
RK_IntegerXor, ///< Bitwise or logical XOR of numbers.
+ RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
RK_FloatAdd, ///< Sum of floats.
- RK_FloatMult ///< Product of floats.
+ RK_FloatMult, ///< Product of floats.
+ RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()).
};
/// This enum represents the kinds of inductions that we support.
IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem).
};
+ // This enum represents the kind of minmax reduction.
+ enum MinMaxReductionKind {
+ MRK_Invalid,
+ MRK_UIntMin,
+ MRK_UIntMax,
+ MRK_SIntMin,
+ MRK_SIntMax,
+ MRK_FloatMin,
+ MRK_FloatMax
+ };
+
/// This POD struct holds information about reduction variables.
struct ReductionDescriptor {
ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
- Kind(RK_NoReduction) {}
+ Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}
- ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K)
- : StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
+ ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
+ MinMaxReductionKind MK)
+ : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {}
// The starting value of the reduction.
// It does not have to be zero!
- Value *StartValue;
+ TrackingVH<Value> StartValue;
// The instruction who's value is used outside the loop.
Instruction *LoopExitInstr;
// The kind of the reduction.
ReductionKind Kind;
+ // If this a min/max reduction the kind of reduction.
+ MinMaxReductionKind MinMaxKind;
+ };
+
+ /// This POD struct holds information about a potential reduction operation.
+ struct ReductionInstDesc {
+ ReductionInstDesc(bool IsRedux, Instruction *I) :
+ IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {}
+
+ ReductionInstDesc(Instruction *I, MinMaxReductionKind K) :
+ IsReduction(true), PatternLastInst(I), MinMaxKind(K) {}
+
+ // Is this instruction a reduction candidate.
+ bool IsReduction;
+ // The last instruction in a min/max pattern (select of the select(icmp())
+ // pattern), or the current reduction instruction otherwise.
+ Instruction *PatternLastInst;
+ // If this is a min/max pattern the comparison predicate.
+ MinMaxReductionKind MinMaxKind;
};
// This POD struct holds information about the memory runtime legality
}
/// Insert a pointer and calculate the start and end SCEVs.
- void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr);
+ void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr);
/// This flag indicates if we need to add the runtime check.
bool Need;
/// Holds the pointers that we need to check.
- SmallVector<Value*, 2> Pointers;
+ SmallVector<TrackingVH<Value>, 2> Pointers;
/// Holds the pointer value at the beginning of the loop.
SmallVector<const SCEV*, 2> Starts;
/// Holds the pointer value at the end of the loop.
SmallVector<const SCEV*, 2> Ends;
+ /// Holds the information if this pointer is used for writing to memory.
+ SmallVector<bool, 2> IsWritePtr;
};
/// A POD for saving information about induction variables.
InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
/// Start value.
- Value *StartValue;
+ TrackingVH<Value> StartValue;
/// Induction kind.
InductionKind IK;
};
/// induction descriptor.
typedef MapVector<PHINode*, InductionInfo> InductionList;
+ /// Alias(Multi)Map stores the values (GEPs or underlying objects and their
+ /// respective Store/Load instruction(s) to calculate aliasing.
+ typedef MapVector<Value*, Instruction* > AliasMap;
+ typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap;
+
/// Returns true if it is legal to vectorize this loop.
/// This does not mean that it is profitable to vectorize this
/// loop, only that it is legal to do so.
/// Returns the induction variables found in the loop.
InductionList *getInductionVars() { return &Inductions; }
+ /// Returns the widest induction type.
+ Type *getWidestInductionType() { return WidestIndTy; }
+
/// Returns True if V is an induction variable in this loop.
bool isInductionVariable(const Value *V);
/// Returns the information that we collected about runtime memory check.
RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }
+
+ /// This function returns the identity element (or neutral element) for
+ /// the operation K.
+ static Constant *getReductionIdentity(ReductionKind K, Type *Tp);
private:
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
/// Returns True, if 'Phi' is the kind of reduction variable for type
/// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
- /// Returns true if the instruction I can be a reduction variable of type
- /// 'Kind'.
- bool isReductionInstr(Instruction *I, ReductionKind Kind);
+ /// Returns a struct describing if the instruction 'I' can be a reduction
+ /// variable of type 'Kind'. If the reduction is a min/max pattern of
+ /// select(icmp()) this function advances the instruction pointer 'I' from the
+ /// compare instruction to the select instruction and stores this pointer in
+ /// 'PatternLastInst' member of the returned struct.
+ ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
+ ReductionInstDesc &Desc);
+ /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
+ /// pattern corresponding to a min(X, Y) or max(X, Y).
+ static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I,
+ ReductionInstDesc &Prev);
/// Returns the induction kind of Phi. This function may return NoInduction
/// if the PHI is not an induction variable.
InductionKind isInductionVariable(PHINode *Phi);
/// Return true if can compute the address bounds of Ptr within the loop.
bool hasComputableBounds(Value *Ptr);
+ /// Return true if there is the chance of write reorder.
+ bool hasPossibleGlobalWriteReorder(Value *Object,
+ Instruction *Inst,
+ AliasMultiMap &WriteObjects,
+ unsigned MaxByteWidth);
+ /// Return the AA location for a load or a store.
+ AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst);
+
/// The loop that we evaluate.
Loop *TheLoop;
ScalarEvolution *SE;
/// DataLayout analysis.
DataLayout *DL;
- // Dominators.
+ /// Dominators.
DominatorTree *DT;
+ /// Target Info.
+ TargetTransformInfo *TTI;
+ /// Alias Analysis.
+ AliasAnalysis *AA;
+ /// Target Library Info.
+ TargetLibraryInfo *TLI;
// --- vectorization state --- //
/// Notice that inductions don't need to start at zero and that induction
/// variables can be pointers.
InductionList Inductions;
+ /// Holds the widest induction type encountered.
+ Type *WidestIndTy;
/// Allowed outside users. This holds the reduction
/// vars which can be accessed from outside the loop.
/// We need to check that all of the pointers in this list are disjoint
/// at runtime.
RuntimePointerCheck PtrRtCheck;
+ /// Can we assume the absence of NaNs.
+ bool HasFunNoNaNAttr;
+
+ /// Utility to determine whether loads can be speculated.
+ LoadHoisting LoadSpeculation;
};
/// LoopVectorizationCostModel - estimates the expected speedups due to
LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
- DataLayout *DL)
- : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL) {}
+ DataLayout *DL, const TargetLibraryInfo *TLI)
+ : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}
/// Information about vectorization costs
struct VectorizationFactor {
const TargetTransformInfo &TTI;
/// Target data layout information.
DataLayout *DL;
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+};
+
+/// Utility class for getting and setting loop vectorizer hints in the form
+/// of loop metadata.
+struct LoopVectorizeHints {
+ /// Vectorization width.
+ unsigned Width;
+ /// Vectorization unroll factor.
+ unsigned Unroll;
+
+ LoopVectorizeHints(const Loop *L)
+ : Width(VectorizationFactor)
+ , Unroll(VectorizationUnroll)
+ , LoopID(L->getLoopID()) {
+ getHints(L);
+ // The command line options override any loop metadata except for when
+ // width == 1 which is used to indicate the loop is already vectorized.
+ if (VectorizationFactor.getNumOccurrences() > 0 && Width != 1)
+ Width = VectorizationFactor;
+ if (VectorizationUnroll.getNumOccurrences() > 0)
+ Unroll = VectorizationUnroll;
+ }
+
+ /// Return the loop vectorizer metadata prefix.
+ static StringRef Prefix() { return "llvm.vectorizer."; }
+
+ MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) {
+ SmallVector<Value*, 2> Vals;
+ Vals.push_back(MDString::get(Context, Name));
+ Vals.push_back(ConstantInt::get(Type::getInt32Ty(Context), V));
+ return MDNode::get(Context, Vals);
+ }
+
+ /// Mark the loop L as already vectorized by setting the width to 1.
+ void setAlreadyVectorized(Loop *L) {
+ LLVMContext &Context = L->getHeader()->getContext();
+
+ Width = 1;
+
+ // Create a new loop id with one more operand for the already_vectorized
+ // hint. If the loop already has a loop id then copy the existing operands.
+ SmallVector<Value*, 4> Vals(1);
+ if (LoopID)
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i)
+ Vals.push_back(LoopID->getOperand(i));
+
+ Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width));
+
+ MDNode *NewLoopID = MDNode::get(Context, Vals);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+
+ L->setLoopID(NewLoopID);
+ if (LoopID)
+ LoopID->replaceAllUsesWith(NewLoopID);
+
+ LoopID = NewLoopID;
+ }
+
+private:
+ MDNode *LoopID;
+
+ /// Find hints specified in the loop metadata.
+ void getHints(const Loop *L) {
+ if (!LoopID)
+ return;
+
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ const MDString *S = 0;
+ SmallVector<Value*, 4> Args;
+
+ // The expected hint is either a MDString or a MDNode with the first
+ // operand a MDString.
+ if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
+ if (!MD || MD->getNumOperands() == 0)
+ continue;
+ S = dyn_cast<MDString>(MD->getOperand(0));
+ for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
+ Args.push_back(MD->getOperand(i));
+ } else {
+ S = dyn_cast<MDString>(LoopID->getOperand(i));
+ assert(Args.size() == 0 && "too many arguments for MDString");
+ }
+
+ if (!S)
+ continue;
+
+ // Check if the hint starts with the vectorizer prefix.
+ StringRef Hint = S->getString();
+ if (!Hint.startswith(Prefix()))
+ continue;
+ // Remove the prefix.
+ Hint = Hint.substr(Prefix().size(), StringRef::npos);
+
+ if (Args.size() == 1)
+ getHint(Hint, Args[0]);
+ }
+ }
+
+ // Check string hint with one operand.
+ void getHint(StringRef Hint, Value *Arg) {
+ const ConstantInt *C = dyn_cast<ConstantInt>(Arg);
+ if (!C) return;
+ unsigned Val = C->getZExtValue();
+
+ if (Hint == "width") {
+ assert(isPowerOf2_32(Val) && Val <= MaxVectorWidth &&
+ "Invalid width metadata");
+ Width = Val;
+ } else if (Hint == "unroll") {
+ assert(isPowerOf2_32(Val) && Val <= MaxUnrollFactor &&
+ "Invalid unroll metadata");
+ Unroll = Val;
+ } else
+ DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint);
+ }
};
/// The LoopVectorize Pass.
LoopInfo *LI;
TargetTransformInfo *TTI;
DominatorTree *DT;
+ AliasAnalysis *AA;
+ TargetLibraryInfo *TLI;
virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
// We only vectorize innermost loops.
LI = &getAnalysis<LoopInfo>();
TTI = &getAnalysis<TargetTransformInfo>();
DT = &getAnalysis<DominatorTree>();
+ AA = getAnalysisIfAvailable<AliasAnalysis>();
+ TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+
+ if (DL == NULL) {
+ DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout");
+ return false;
+ }
DEBUG(dbgs() << "LV: Checking a loop in \"" <<
L->getHeader()->getParent()->getName() << "\"\n");
+ LoopVectorizeHints Hints(L);
+
+ if (Hints.Width == 1) {
+ DEBUG(dbgs() << "LV: Not vectorizing.\n");
+ return false;
+ }
+
// Check if it is legal to vectorize the loop.
- LoopVectorizationLegality LVL(L, SE, DL, DT);
+ LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing.\n");
return false;
}
// Use the cost model.
- LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL);
+ LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI);
// Check the function attributes to find out if this function should be
// optimized for size.
// Select the optimal vectorization factor.
LoopVectorizationCostModel::VectorizationFactor VF;
- VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
+ VF = CM.selectVectorizationFactor(OptForSize, Hints.Width);
// Select the unroll factor.
- unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll,
- VF.Width, VF.Cost);
+ unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
+ VF.Cost);
if (VF.Width == 1) {
DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
// If we decided that it is *legal* to vectorize the loop then do it.
- InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF.Width, UF);
+ InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
LB.vectorize(&LVL);
+ // Mark the loop as already vectorized to avoid vectorizing again.
+ Hints.setAlreadyVectorized(L);
+
DEBUG(verifyFunction(*L->getHeader()->getParent()));
return true;
}
void
LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
- Loop *Lp, Value *Ptr) {
+ Loop *Lp, Value *Ptr,
+ bool WritePtr) {
const SCEV *Sc = SE->getSCEV(Ptr);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
assert(AR && "Invalid addrec expression");
- const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch());
+ const SCEV *Ex = SE->getBackedgeTakenCount(Lp);
const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
Pointers.push_back(Ptr);
Starts.push_back(AR->getStart());
Ends.push_back(ScEnd);
+ IsWritePtr.push_back(WritePtr);
}
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
return Shuf;
}
-Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx,
+Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
bool Negate) {
assert(Val->getType()->isVectorTy() && "Must be a vector");
assert(Val->getType()->getScalarType()->isIntegerTy() &&
// Create a vector of consecutive numbers from zero to VF.
for (int i = 0; i < VLen; ++i) {
- int Idx = Negate ? (-i): i;
- Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx));
+ int64_t Idx = Negate ? (-i) : i;
+ Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate));
}
// Add the consecutive indices to the vector value.
Type *DataTy = VectorType::get(ScalarDataTy, VF);
Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
+ unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
+ unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
+ unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
+
+ if (ScalarAllocatedSize != VectorElementSize)
+ return scalarizeInstruction(Instr);
// If the pointer is loop invariant or if it is non consecutive,
// scalarize the load.
- int Stride = Legal->isConsecutivePtr(Ptr);
- bool Reverse = Stride < 0;
+ int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+ bool Reverse = ConsecutiveStride < 0;
bool UniformLoad = LI && Legal->isUniform(Ptr);
- if (Stride == 0 || UniformLoad)
+ if (!ConsecutiveStride || UniformLoad)
return scalarizeInstruction(Instr);
Constant *Zero = Builder.getInt32(0);
PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
}
- Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
+ Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment);
}
}
PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
}
- Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
+ Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
cast<LoadInst>(LI)->setAlignment(Alignment);
Entry[Part] = Reverse ? reverseVector(LI) : LI;
// Create a new entry in the WidenMap and initialize it to Undef or Null.
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
- // For each scalar that we create:
- for (unsigned Width = 0; Width < VF; ++Width) {
- // For each vector unroll 'part':
- for (unsigned Part = 0; Part < UF; ++Part) {
+ // For each vector unroll 'part':
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // For each scalar that we create:
+ for (unsigned Width = 0; Width < VF; ++Width) {
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
for (unsigned i = 0; i < NumPointers; ++i) {
for (unsigned j = i+1; j < NumPointers; ++j) {
+ // No need to check if two readonly pointers intersect.
+ if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
+ continue;
+
Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc");
// induction variables. In the code below we also support a case where we
// don't have a single induction variable.
OldInduction = Legal->getInduction();
- Type *IdxTy = OldInduction ? OldInduction->getType() :
- DL->getIntPtrType(SE->getContext());
+ Type *IdxTy = Legal->getWidestInductionType();
// Find the loop boundaries.
- const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch());
+ const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
// Get the total trip count from the count by adding 1.
// The loop index does not have to start at Zero. Find the original start
// value from the induction PHI node. If we don't have an induction variable
// then we know that it starts at zero.
- Value *StartIdx = OldInduction ?
- OldInduction->getIncomingValueForBlock(BypassBlock):
- ConstantInt::get(IdxTy, 0);
+ Builder.SetInsertPoint(BypassBlock->getTerminator());
+ Value *StartIdx = ExtendedIdx = OldInduction ?
+ Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),
+ IdxTy):
+ ConstantInt::get(IdxTy, 0);
assert(BypassBlock && "Invalid loop structure");
LoopBypassBlocks.push_back(BypassBlock);
PHINode *ResumeIndex = 0;
LoopVectorizationLegality::InductionList::iterator I, E;
LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
+ // Set builder to point to last bypass block.
+ BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
for (I = List->begin(), E = List->end(); I != E; ++I) {
PHINode *OrigPhi = I->first;
LoopVectorizationLegality::InductionInfo II = I->second;
- PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
+
+ Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();
+ PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",
MiddleBlock->getTerminator());
+ // We might have extended the type of the induction variable but we need a
+ // truncated version for the scalar loop.
+ PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
+ PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
+ MiddleBlock->getTerminator()) : 0;
+
Value *EndValue = 0;
switch (II.IK) {
case LoopVectorizationLegality::IK_NoInduction:
llvm_unreachable("Unknown induction");
case LoopVectorizationLegality::IK_IntInduction: {
- // Handle the integer induction counter:
+ // Handle the integer induction counter.
assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
- assert(OrigPhi == OldInduction && "Unknown integer PHI");
- // We know what the end value is.
- EndValue = IdxEndRoundDown;
- // We also know which PHI node holds it.
- ResumeIndex = ResumeVal;
+
+ // We have the canonical induction variable.
+ if (OrigPhi == OldInduction) {
+ // Create a truncated version of the resume value for the scalar loop,
+ // we might have promoted the type to a larger width.
+ EndValue =
+ BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
+ // The new PHI merges the original incoming value, in case of a bypass,
+ // or the value at the end of the vectorized loop.
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
+ TruncResumeVal->addIncoming(EndValue, VecBody);
+
+ // We know what the end value is.
+ EndValue = IdxEndRoundDown;
+ // We also know which PHI node holds it.
+ ResumeIndex = ResumeVal;
+ break;
+ }
+
+ // Not the canonical induction variable - add the vector loop count to the
+ // start value.
+ Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
+ II.StartValue->getType(),
+ "cast.crd");
+ EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end");
break;
}
case LoopVectorizationLegality::IK_ReverseIntInduction: {
// Convert the CountRoundDown variable to the PHI size.
- unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits();
- unsigned IISize = II.StartValue->getType()->getScalarSizeInBits();
- Value *CRD = CountRoundDown;
- if (CRDSize > IISize)
- CRD = CastInst::Create(Instruction::Trunc, CountRoundDown,
- II.StartValue->getType(), "tr.crd",
- LoopBypassBlocks.back()->getTerminator());
- else if (CRDSize < IISize)
- CRD = CastInst::Create(Instruction::SExt, CountRoundDown,
- II.StartValue->getType(),
- "sext.crd",
- LoopBypassBlocks.back()->getTerminator());
- // Handle reverse integer induction counter:
- EndValue =
- BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end",
- LoopBypassBlocks.back()->getTerminator());
+ Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
+ II.StartValue->getType(),
+ "cast.crd");
+ // Handle reverse integer induction counter.
+ EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end");
break;
}
case LoopVectorizationLegality::IK_PtrInduction: {
// For pointer induction variables, calculate the offset using
// the end index.
- EndValue =
- GetElementPtrInst::Create(II.StartValue, CountRoundDown, "ptr.ind.end",
- LoopBypassBlocks.back()->getTerminator());
+ EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown,
+ "ptr.ind.end");
break;
}
case LoopVectorizationLegality::IK_ReversePtrInduction: {
// The value at the end of the loop for the reverse pointer is calculated
// by creating a GEP with a negative index starting from the start value.
Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0);
- Value *NegIdx = BinaryOperator::CreateSub(Zero, CountRoundDown,
- "rev.ind.end",
- LoopBypassBlocks.back()->getTerminator());
- EndValue = GetElementPtrInst::Create(II.StartValue, NegIdx,
- "rev.ptr.ind.end",
- LoopBypassBlocks.back()->getTerminator());
+ Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown,
+ "rev.ind.end");
+ EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx,
+ "rev.ptr.ind.end");
break;
}
}// end of case
// The new PHI merges the original incoming value, in case of a bypass,
// or the value at the end of the vectorized loop.
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
- ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) {
+ if (OrigPhi == OldInduction)
+ ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
+ else
+ ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
+ }
ResumeVal->addIncoming(EndValue, VecBody);
// Fix the scalar body counter (PHI node).
unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
- OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
+ // The old inductions phi node in the scalar body needs the truncated value.
+ if (OrigPhi == OldInduction)
+ OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal);
+ else
+ OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
}
// If we are generating a new induction variable then we also need to
/// This function returns the identity element (or neutral element) for
/// the operation K.
-static Constant*
-getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) {
+Constant*
+LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) {
switch (K) {
- case LoopVectorizationLegality:: RK_IntegerXor:
- case LoopVectorizationLegality:: RK_IntegerAdd:
- case LoopVectorizationLegality:: RK_IntegerOr:
+ case RK_IntegerXor:
+ case RK_IntegerAdd:
+ case RK_IntegerOr:
// Adding, Xoring, Oring zero to a number does not change it.
return ConstantInt::get(Tp, 0);
- case LoopVectorizationLegality:: RK_IntegerMult:
+ case RK_IntegerMult:
// Multiplying a number by 1 does not change it.
return ConstantInt::get(Tp, 1);
- case LoopVectorizationLegality:: RK_IntegerAnd:
+ case RK_IntegerAnd:
// AND-ing a number with an all-1 value does not change it.
return ConstantInt::get(Tp, -1, true);
- case LoopVectorizationLegality:: RK_FloatMult:
+ case RK_FloatMult:
// Multiplying a number by 1 does not change it.
return ConstantFP::get(Tp, 1.0L);
- case LoopVectorizationLegality:: RK_FloatAdd:
+ case RK_FloatAdd:
// Adding zero to a number does not change it.
return ConstantFP::get(Tp, 0.0L);
default:
}
}
-static bool
-isTriviallyVectorizableIntrinsic(Instruction *Inst) {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
- if (!II)
- return false;
- switch (II->getIntrinsicID()) {
- case Intrinsic::sqrt:
- case Intrinsic::sin:
- case Intrinsic::cos:
- case Intrinsic::exp:
- case Intrinsic::exp2:
- case Intrinsic::log:
- case Intrinsic::log10:
- case Intrinsic::log2:
- case Intrinsic::fabs:
- case Intrinsic::floor:
- case Intrinsic::ceil:
- case Intrinsic::trunc:
- case Intrinsic::rint:
- case Intrinsic::nearbyint:
- case Intrinsic::pow:
- case Intrinsic::fma:
- case Intrinsic::fmuladd:
- return true;
+static Intrinsic::ID
+getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
+ // If we have an intrinsic call, check if it is trivially vectorizable.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::sqrt:
+ case Intrinsic::sin:
+ case Intrinsic::cos:
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ case Intrinsic::log:
+ case Intrinsic::log10:
+ case Intrinsic::log2:
+ case Intrinsic::fabs:
+ case Intrinsic::floor:
+ case Intrinsic::ceil:
+ case Intrinsic::trunc:
+ case Intrinsic::rint:
+ case Intrinsic::nearbyint:
+ case Intrinsic::pow:
+ case Intrinsic::fma:
+ case Intrinsic::fmuladd:
+ return II->getIntrinsicID();
+ default:
+ return Intrinsic::not_intrinsic;
+ }
+ }
+
+ if (!TLI)
+ return Intrinsic::not_intrinsic;
+
+ LibFunc::Func Func;
+ Function *F = CI->getCalledFunction();
+ // We're going to make assumptions on the semantics of the functions, check
+ // that the target knows that it's available in this environment.
+ if (!F || !TLI->getLibFunc(F->getName(), Func))
+ return Intrinsic::not_intrinsic;
+
+ // Otherwise check if we have a call to a function that can be turned into a
+ // vector intrinsic.
+ switch (Func) {
default:
- return false;
+ break;
+ case LibFunc::sin:
+ case LibFunc::sinf:
+ case LibFunc::sinl:
+ return Intrinsic::sin;
+ case LibFunc::cos:
+ case LibFunc::cosf:
+ case LibFunc::cosl:
+ return Intrinsic::cos;
+ case LibFunc::exp:
+ case LibFunc::expf:
+ case LibFunc::expl:
+ return Intrinsic::exp;
+ case LibFunc::exp2:
+ case LibFunc::exp2f:
+ case LibFunc::exp2l:
+ return Intrinsic::exp2;
+ case LibFunc::log:
+ case LibFunc::logf:
+ case LibFunc::logl:
+ return Intrinsic::log;
+ case LibFunc::log10:
+ case LibFunc::log10f:
+ case LibFunc::log10l:
+ return Intrinsic::log10;
+ case LibFunc::log2:
+ case LibFunc::log2f:
+ case LibFunc::log2l:
+ return Intrinsic::log2;
+ case LibFunc::fabs:
+ case LibFunc::fabsf:
+ case LibFunc::fabsl:
+ return Intrinsic::fabs;
+ case LibFunc::floor:
+ case LibFunc::floorf:
+ case LibFunc::floorl:
+ return Intrinsic::floor;
+ case LibFunc::ceil:
+ case LibFunc::ceilf:
+ case LibFunc::ceill:
+ return Intrinsic::ceil;
+ case LibFunc::trunc:
+ case LibFunc::truncf:
+ case LibFunc::truncl:
+ return Intrinsic::trunc;
+ case LibFunc::rint:
+ case LibFunc::rintf:
+ case LibFunc::rintl:
+ return Intrinsic::rint;
+ case LibFunc::nearbyint:
+ case LibFunc::nearbyintf:
+ case LibFunc::nearbyintl:
+ return Intrinsic::nearbyint;
+ case LibFunc::pow:
+ case LibFunc::powf:
+ case LibFunc::powl:
+ return Intrinsic::pow;
}
- return false;
+
+ return Intrinsic::not_intrinsic;
}
/// This function translates the reduction kind to an LLVM binary operator.
-static Instruction::BinaryOps
+static unsigned
getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
switch (Kind) {
case LoopVectorizationLegality::RK_IntegerAdd:
return Instruction::FMul;
case LoopVectorizationLegality::RK_FloatAdd:
return Instruction::FAdd;
+ case LoopVectorizationLegality::RK_IntegerMinMax:
+ return Instruction::ICmp;
+ case LoopVectorizationLegality::RK_FloatMinMax:
+ return Instruction::FCmp;
default:
llvm_unreachable("Unknown reduction operation");
}
}
+Value *createMinMaxOp(IRBuilder<> &Builder,
+ LoopVectorizationLegality::MinMaxReductionKind RK,
+ Value *Left,
+ Value *Right) {
+ CmpInst::Predicate P = CmpInst::ICMP_NE;
+ switch (RK) {
+ default:
+ llvm_unreachable("Unknown min/max reduction kind");
+ case LoopVectorizationLegality::MRK_UIntMin:
+ P = CmpInst::ICMP_ULT;
+ break;
+ case LoopVectorizationLegality::MRK_UIntMax:
+ P = CmpInst::ICMP_UGT;
+ break;
+ case LoopVectorizationLegality::MRK_SIntMin:
+ P = CmpInst::ICMP_SLT;
+ break;
+ case LoopVectorizationLegality::MRK_SIntMax:
+ P = CmpInst::ICMP_SGT;
+ break;
+ case LoopVectorizationLegality::MRK_FloatMin:
+ P = CmpInst::FCMP_OLT;
+ break;
+ case LoopVectorizationLegality::MRK_FloatMax:
+ P = CmpInst::FCMP_OGT;
+ break;
+ }
+
+ Value *Cmp;
+ if (RK == LoopVectorizationLegality::MRK_FloatMin || RK == LoopVectorizationLegality::MRK_FloatMax)
+ Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp");
+ else
+ Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
+
+ Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
+ return Select;
+}
+
void
InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
//===------------------------------------------------===//
// To do so, we need to generate the 'identity' vector and overide
// one of the elements with the incoming scalar reduction. We need
// to do it in the vector-loop preheader.
- Builder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
+ Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
// This is the vector-clone of the value that leaves the loop.
VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
// Find the reduction identity variable. Zero for addition, or, xor,
// one for multiplication, -1 for And.
- Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType());
- Constant *Identity = ConstantVector::getSplat(VF, Iden);
-
- // This vector is the Identity vector where the first element is the
- // incoming scalar reduction.
- Value *VectorStart = Builder.CreateInsertElement(Identity,
- RdxDesc.StartValue, Zero);
+ Value *Identity;
+ Value *VectorStart;
+ if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
+ RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
+ // MinMax reduction have the start value as their identify.
+ VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue,
+ "minmax.ident");
+ } else {
+ Constant *Iden =
+ LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
+ VecTy->getScalarType());
+ Identity = ConstantVector::getSplat(VF, Iden);
+
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ VectorStart = Builder.CreateInsertElement(Identity,
+ RdxDesc.StartValue, Zero);
+ }
// Fix the vector-loop phi.
// We created the induction variable so we know that the
Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
VectorParts &Val = getVectorValue(LoopVal);
for (unsigned part = 0; part < UF; ++part) {
- // Make sure to add the reduction stat value only to the
+ // Make sure to add the reduction stat value only to the
// first unroll part.
Value *StartVal = (part == 0) ? VectorStart : Identity;
cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
// Reduce all of the unrolled parts into a single vector.
Value *ReducedPartRdx = RdxParts[0];
+ unsigned Op = getReductionBinOp(RdxDesc.Kind);
for (unsigned part = 1; part < UF; ++part) {
- Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
- ReducedPartRdx = Builder.CreateBinOp(Op, RdxParts[part], ReducedPartRdx,
- "bin.rdx");
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
+ RdxParts[part], ReducedPartRdx,
+ "bin.rdx");
+ else
+ ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
+ ReducedPartRdx, RdxParts[part]);
}
// VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
ConstantVector::get(ShuffleMask),
"rdx.shuf");
- Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
- TmpVec = Builder.CreateBinOp(Op, TmpVec, Shuf, "bin.rdx");
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
+ "bin.rdx");
+ else
+ TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
}
// The result is in the first element of the vector.
// We know that all PHIs in non header blocks are converted into
// selects, so we don't have to worry about the insertion order and we
// can just use the builder.
-
// At this point we generate the predication tree. There may be
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.
- VectorParts Cond = createEdgeMask(P->getIncomingBlock(0),
- P->getParent());
- for (unsigned part = 0; part < UF; ++part) {
- VectorParts &In0 = getVectorValue(P->getIncomingValue(0));
- VectorParts &In1 = getVectorValue(P->getIncomingValue(1));
- Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In1[part],
- "predphi");
+ unsigned NumIncoming = P->getNumIncomingValues();
+
+ // Generate a sequence of selects of the form:
+ // SELECT(Mask3, In3,
+ // SELECT(Mask2, In2,
+ // ( ...)))
+ for (unsigned In = 0; In < NumIncoming; In++) {
+ VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
+ P->getParent());
+ VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
+
+ for (unsigned part = 0; part < UF; ++part) {
+ // We might have single edge PHIs (blocks) - use an identity
+ // 'select' for the first PHI operand.
+ if (In == 0)
+ Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
+ In0[part]);
+ else
+ // Select between the current value and the previous incoming edge
+ // based on the incoming mask.
+ Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
+ Entry[part], "predphi");
+ }
}
continue;
}
case LoopVectorizationLegality::IK_NoInduction:
llvm_unreachable("Unknown induction");
case LoopVectorizationLegality::IK_IntInduction: {
- assert(P == OldInduction && "Unexpected PHI");
- Value *Broadcasted = getBroadcastInstrs(Induction);
- // After broadcasting the induction variable we need to make the
- // vector consecutive by adding 0, 1, 2 ...
+ assert(P->getType() == II.StartValue->getType() && "Types must match");
+ Type *PhiTy = P->getType();
+ Value *Broadcasted;
+ if (P == OldInduction) {
+ // Handle the canonical induction variable. We might have had to
+ // extend the type.
+ Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
+ } else {
+ // Handle other induction variables that are now based on the
+ // canonical one.
+ Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
+ "normalized.idx");
+ NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
+ Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
+ "offset.idx");
+ }
+ Broadcasted = getBroadcastInstrs(Broadcasted);
+ // After broadcasting the induction variable we need to make the vector
+ // consecutive by adding 0, 1, 2, etc.
for (unsigned part = 0; part < UF; ++part)
Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
continue;
case LoopVectorizationLegality::IK_PtrInduction:
case LoopVectorizationLegality::IK_ReversePtrInduction:
// Handle reverse integer and pointer inductions.
- Value *StartIdx = 0;
- // If we have a single integer induction variable then use it.
- // Otherwise, start counting at zero.
- if (OldInduction) {
- LoopVectorizationLegality::InductionInfo OldII =
- Legal->getInductionVars()->lookup(OldInduction);
- StartIdx = OldII.StartValue;
- } else {
- StartIdx = ConstantInt::get(Induction->getType(), 0);
- }
+ Value *StartIdx = ExtendedIdx;
// This is the normalized GEP that starts counting at zero.
Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
"normalized.idx");
// After broadcasting the induction variable we need to make the
// vector consecutive by adding ... -3, -2, -1, 0.
for (unsigned part = 0; part < UF; ++part)
- Entry[part] = getConsecutiveVector(Broadcasted, -VF * part, true);
+ Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
+ true);
continue;
}
}
case Instruction::Call: {
- assert(isTriviallyVectorizableIntrinsic(it));
+ // Ignore dbg intrinsics.
+ if (isa<DbgInfoIntrinsic>(it))
+ break;
+
Module *M = BB->getParent()->getParent();
- IntrinsicInst *II = cast<IntrinsicInst>(it);
- Intrinsic::ID ID = II->getIntrinsicID();
+ CallInst *CI = cast<CallInst>(it);
+ Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ assert(ID && "Not an intrinsic call!");
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value*, 4> Args;
- for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) {
- VectorParts &Arg = getVectorValue(II->getArgOperand(i));
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+ VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
Args.push_back(Arg[Part]);
}
- Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) };
+ Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) };
Function *F = Intrinsic::getDeclaration(M, ID, Tys);
Entry[Part] = Builder.CreateCall(F, Args);
}
if (!isa<BranchInst>(BB->getTerminator()))
return false;
- // We must have at most two predecessors because we need to convert
- // all PHIs to selects.
- unsigned Preds = std::distance(pred_begin(BB), pred_end(BB));
- if (Preds > 2)
- return false;
-
// We must be able to predicate all blocks that need to be predicated.
if (blockNeedsPredication(BB) && !blockCanBePredicated(BB))
return false;
}
+ // Check that we can actually speculate the hoistable loads.
+ if (!LoadSpeculation.canHoistAllLoads())
+ return false;
+
// We can if-convert this loop.
return true;
}
bool LoopVectorizationLegality::canVectorize() {
- assert(TheLoop->getLoopPreheader() && "No preheader!!");
+ // We must have a loop in canonical form. Loops with indirectbr in them cannot
+ // be canonicalized.
+ if (!TheLoop->getLoopPreheader())
+ return false;
// We can only vectorize innermost loops.
if (TheLoop->getSubLoopsVector().size())
TheLoop->getHeader()->getName() << "\n");
// ScalarEvolution needs to be able to find the exit count.
- const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch);
+ const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
if (ExitCount == SE->getCouldNotCompute()) {
DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
return false;
return true;
}
+static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) {
+ if (Ty->isPointerTy())
+ return DL.getIntPtrType(Ty->getContext());
+ return Ty;
+}
+
+static Type* getWiderType(DataLayout &DL, Type *Ty0, Type *Ty1) {
+ Ty0 = convertPointerToIntegerType(DL, Ty0);
+ Ty1 = convertPointerToIntegerType(DL, Ty1);
+ if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
+ return Ty0;
+ return Ty1;
+}
+
+/// \brief Check that the instruction has outside loop users and is not an
+/// identified reduction variable.
+static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
+ SmallPtrSet<Value *, 4> &Reductions) {
+ // Reduction instructions are allowed to have exit users. All other
+ // instructions must not have external users.
+ if (!Reductions.count(Inst))
+ //Check that all of the users of the loop are inside the BB.
+ for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
+ I != E; ++I) {
+ Instruction *U = cast<Instruction>(*I);
+ // This user may be a reduction exit value.
+ if (!TheLoop->contains(U)) {
+ DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+ return true;
+ }
+ }
+ return false;
+}
+
bool LoopVectorizationLegality::canVectorizeInstrs() {
BasicBlock *PreHeader = TheLoop->getLoopPreheader();
BasicBlock *Header = TheLoop->getHeader();
+ // Look for the attribute signaling the absence of NaNs.
+ Function &F = *Header->getParent();
+ if (F.hasFnAttribute("no-nans-fp-math"))
+ HasFunNoNaNAttr = F.getAttributes().getAttribute(
+ AttributeSet::FunctionIndex,
+ "no-nans-fp-math").getValueAsString() == "true";
+
// For each block in the loop.
for (Loop::block_iterator bb = TheLoop->block_begin(),
be = TheLoop->block_end(); bb != be; ++bb) {
++it) {
if (PHINode *Phi = dyn_cast<PHINode>(it)) {
- // This should not happen because the loop should be normalized.
- if (Phi->getNumIncomingValues() != 2) {
- DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
- return false;
- }
-
+ Type *PhiTy = Phi->getType();
// Check that this PHI type is allowed.
- if (!Phi->getType()->isIntegerTy() &&
- !Phi->getType()->isFloatingPointTy() &&
- !Phi->getType()->isPointerTy()) {
+ if (!PhiTy->isIntegerTy() &&
+ !PhiTy->isFloatingPointTy() &&
+ !PhiTy->isPointerTy()) {
DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
return false;
}
// If this PHINode is not in the header block, then we know that we
// can convert it to select during if-conversion. No need to check if
// the PHIs in this block are induction or reduction variables.
- if (*bb != Header)
- continue;
+ if (*bb != Header) {
+ // Check that this instruction has no outside users or is an
+ // identified reduction value with an outside user.
+ if(!hasOutsideLoopUser(TheLoop, it, AllowedExit))
+ continue;
+ return false;
+ }
+
+ // We only allow if-converted PHIs with more than two incoming values.
+ if (Phi->getNumIncomingValues() != 2) {
+ DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+ return false;
+ }
// This is the value coming from the preheader.
Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
InductionKind IK = isInductionVariable(Phi);
if (IK_NoInduction != IK) {
+ // Get the widest type.
+ if (!WidestIndTy)
+ WidestIndTy = convertPointerToIntegerType(*DL, PhiTy);
+ else
+ WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy);
+
// Int inductions are special because we only allow one IV.
if (IK == IK_IntInduction) {
- if (Induction) {
- DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
- return false;
- }
- Induction = Phi;
+ // Use the phi node with the widest type as induction. Use the last
+ // one if there are multiple (no good reason for doing this other
+ // than it is expedient).
+ if (!Induction || PhiTy == WidestIndTy)
+ Induction = Phi;
}
DEBUG(dbgs() << "LV: Found an induction variable.\n");
DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
continue;
}
+ if (AddReductionVar(Phi, RK_IntegerMinMax)) {
+ DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
if (AddReductionVar(Phi, RK_FloatMult)) {
DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
continue;
DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
continue;
}
+ if (AddReductionVar(Phi, RK_FloatMinMax)) {
+ DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
return false;
}// end of PHI handling
- // We still don't handle functions.
+ // We still don't handle functions. However, we can ignore dbg intrinsic
+ // calls and we do handle certain intrinsic and libm functions.
CallInst *CI = dyn_cast<CallInst>(it);
- if (CI && !isTriviallyVectorizableIntrinsic(it)) {
+ if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
DEBUG(dbgs() << "LV: Found a call site.\n");
return false;
}
// Reduction instructions are allowed to have exit users.
// All other instructions must not have external users.
- if (!AllowedExit.count(it))
- //Check that all of the users of the loop are inside the BB.
- for (Value::use_iterator I = it->use_begin(), E = it->use_end();
- I != E; ++I) {
- Instruction *U = cast<Instruction>(*I);
- // This user may be a reduction exit value.
- if (!TheLoop->contains(U)) {
- DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
- return false;
- }
- }
+ if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+ return false;
+
} // next instr.
}
if (!Induction) {
DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
- assert(getInductionVars()->size() && "No induction variables");
+ if (Inductions.empty())
+ return false;
}
return true;
Uniforms.insert(I);
// Insert all operands.
- for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) {
- Worklist.push_back(I->getOperand(i));
+ Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
+ }
+}
+
+/// \brief Analyses memory accesses in a loop.
+///
+/// Checks whether run time pointer checks are needed and builds sets for data
+/// dependence checking.
+class AccessAnalysis {
+public:
+ /// \brief Read or write access location.
+ typedef std::pair<Value*, char> MemAccessInfo;
+
+ /// \brief Set of potential dependent memory accesses.
+ typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
+
+ AccessAnalysis(DataLayout *Dl, DepCandidates &DA) :
+ DL(Dl), DepCands(DA), AreAllWritesIdentified(true),
+ AreAllReadsIdentified(true), IsRTCheckNeeded(false) {}
+
+ /// \brief Register a load and whether it is only read from.
+ void addLoad(Value *Ptr, bool IsReadOnly) {
+ Accesses.insert(std::make_pair(Ptr, false));
+ if (IsReadOnly)
+ ReadOnlyPtr.insert(Ptr);
+ }
+
+ /// \brief Register a store.
+ void addStore(Value *Ptr) {
+ Accesses.insert(std::make_pair(Ptr, true));
+ }
+
+ /// \brief Check whether we can check the pointers at runtime for
+ /// non-intersection.
+ bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
+ unsigned &NumComparisons, ScalarEvolution *SE,
+ Loop *TheLoop);
+
+ /// \brief Goes over all memory accesses, checks whether a RT check is needed
+ /// and builds sets of dependent accesses.
+ void buildDependenceSets() {
+ // Process read-write pointers first.
+ processMemAccesses(false);
+ // Next, process read pointers.
+ processMemAccesses(true);
+ }
+
+ bool isRTCheckNeeded() { return IsRTCheckNeeded; }
+
+ bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
+
+ DenseSet<MemAccessInfo> &getDependenciesToCheck() { return CheckDeps; }
+
+private:
+ typedef SetVector<MemAccessInfo> PtrAccessSet;
+ typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;
+
+ /// \brief Go over all memory access or only the deferred ones if
+ /// \p UseDeferred is true and check whether runtime pointer checks are needed
+ /// and build sets of dependency check candidates.
+ void processMemAccesses(bool UseDeferred);
+
+ /// Set of all accesses.
+ PtrAccessSet Accesses;
+
+ /// Set of access to check after all writes have been processed.
+ PtrAccessSet DeferredAccesses;
+
+ /// Map of pointers to last access encountered.
+ UnderlyingObjToAccessMap ObjToLastAccess;
+
+ /// Set of accesses that need a further dependence check.
+ DenseSet<MemAccessInfo> CheckDeps;
+
+ /// Set of pointers that are read only.
+ SmallPtrSet<Value*, 16> ReadOnlyPtr;
+
+ /// Set of underlying objects already written to.
+ SmallPtrSet<Value*, 16> WriteObjects;
+
+ DataLayout *DL;
+
+ /// Sets of potentially dependent accesses - members of one set share an
+ /// underlying pointer. The set "CheckDeps" identfies which sets really need a
+ /// dependence check.
+ DepCandidates &DepCands;
+
+ bool AreAllWritesIdentified;
+ bool AreAllReadsIdentified;
+ bool IsRTCheckNeeded;
+};
+
+/// \brief Check whether a pointer can participate in a runtime bounds check.
+static bool hasComputableBounds(ScalarEvolution *SE, Value *Ptr) {
+ const SCEV *PtrScev = SE->getSCEV(Ptr);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
+ if (!AR)
+ return false;
+
+ return AR->isAffine();
+}
+
+bool AccessAnalysis::canCheckPtrAtRT(
+ LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
+ unsigned &NumComparisons, ScalarEvolution *SE,
+ Loop *TheLoop) {
+ // Find pointers with computable bounds. We are going to use this information
+ // to place a runtime bound check.
+ unsigned NumReadPtrChecks = 0;
+ unsigned NumWritePtrChecks = 0;
+ bool CanDoRT = true;
+
+ bool IsDepCheckNeeded = isDependencyCheckNeeded();
+ // We assign consecutive id to access from different dependence sets.
+ // Accesses within the same set don't need a runtime check.
+ unsigned RunningDepId = 1;
+ DenseMap<Value *, unsigned> DepSetId;
+
+ for (PtrAccessSet::iterator AI = Accesses.begin(), AE = Accesses.end();
+ AI != AE; ++AI) {
+ const MemAccessInfo &Access = *AI;
+ Value *Ptr = Access.first;
+ bool IsWrite = Access.second;
+
+ // Just add write checks if we have both.
+ if (!IsWrite && Accesses.count(std::make_pair(Ptr, true)))
+ continue;
+
+ if (IsWrite)
+ ++NumWritePtrChecks;
+ else
+ ++NumReadPtrChecks;
+
+ if (hasComputableBounds(SE, Ptr)) {
+ // The id of the dependence set.
+ unsigned DepId;
+
+ if (IsDepCheckNeeded) {
+ Value *Leader = DepCands.getLeaderValue(Access).first;
+ unsigned &LeaderId = DepSetId[Leader];
+ if (!LeaderId)
+ LeaderId = RunningDepId++;
+ DepId = LeaderId;
+ } else
+ // Each access has its own dependence set.
+ DepId = RunningDepId++;
+
+ //RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId);
+
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr <<"\n");
+ } else {
+ CanDoRT = false;
+ }
+ }
+
+ if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
+ NumComparisons = 0; // Only one dependence set.
+ else
+ NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks +
+ NumWritePtrChecks - 1));
+ return CanDoRT;
+}
+
+static bool isFunctionScopeIdentifiedObject(Value *Ptr) {
+ return isNoAliasArgument(Ptr) || isNoAliasCall(Ptr) || isa<AllocaInst>(Ptr);
+}
+
+void AccessAnalysis::processMemAccesses(bool UseDeferred) {
+ // We process the set twice: first we process read-write pointers, last we
+ // process read-only pointers. This allows us to skip dependence tests for
+ // read-only pointers.
+
+ PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
+ for (PtrAccessSet::iterator AI = S.begin(), AE = S.end(); AI != AE; ++AI) {
+ const MemAccessInfo &Access = *AI;
+ Value *Ptr = Access.first;
+ bool IsWrite = Access.second;
+
+ DepCands.insert(Access);
+
+ // Memorize read-only pointers for later processing and skip them in the
+ // first round (they need to be checked after we have seen all write
+ // pointers). Note: we also mark pointer that are not consecutive as
+ // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need the
+ // second check for "!IsWrite".
+ bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
+ if (!UseDeferred && IsReadOnlyPtr) {
+ DeferredAccesses.insert(Access);
+ continue;
}
+
+ bool NeedDepCheck = false;
+ // Check whether there is the possiblity of dependency because of underlying
+ // objects being the same.
+ typedef SmallVector<Value*, 16> ValueVector;
+ ValueVector TempObjects;
+ GetUnderlyingObjects(Ptr, TempObjects, DL);
+ for (ValueVector::iterator UI = TempObjects.begin(), UE = TempObjects.end();
+ UI != UE; ++UI) {
+ Value *UnderlyingObj = *UI;
+
+ // If this is a write then it needs to be an identified object. If this a
+ // read and all writes (so far) are identified function scope objects we
+ // don't need an identified underlying object but only an Argument (the
+ // next write is going to invalidate this assumption if it is
+ // unidentified).
+ // This is a micro-optimization for the case where all writes are
+ // identified and we have one argument pointer.
+ // Otherwise, we do need a runtime check.
+ if ((IsWrite && !isFunctionScopeIdentifiedObject(UnderlyingObj)) ||
+ (!IsWrite && (!AreAllWritesIdentified ||
+ !isa<Argument>(UnderlyingObj)) &&
+ !isIdentifiedObject(UnderlyingObj))) {
+ DEBUG(dbgs() << "LV: Found an unidentified " <<
+ (IsWrite ? "write" : "read" ) << " ptr:" << *UnderlyingObj <<
+ "\n");
+ IsRTCheckNeeded = (IsRTCheckNeeded ||
+ !isIdentifiedObject(UnderlyingObj) ||
+ !AreAllReadsIdentified);
+
+ if (IsWrite)
+ AreAllWritesIdentified = false;
+ if (!IsWrite)
+ AreAllReadsIdentified = false;
+ }
+
+ // If this is a write - check other reads and writes for conflicts. If
+ // this is a read only check other writes for conflicts (but only if there
+ // is no other write to the ptr - this is an optimization to catch "a[i] =
+ // a[i] + " without having to do a dependence check).
+ if ((IsWrite || IsReadOnlyPtr) && WriteObjects.count(UnderlyingObj))
+ NeedDepCheck = true;
+
+ if (IsWrite)
+ WriteObjects.insert(UnderlyingObj);
+
+ // Create sets of pointers connected by shared underlying objects.
+ UnderlyingObjToAccessMap::iterator Prev =
+ ObjToLastAccess.find(UnderlyingObj);
+ if (Prev != ObjToLastAccess.end())
+ DepCands.unionSets(Access, Prev->second);
+
+ ObjToLastAccess[UnderlyingObj] = Access;
+ }
+
+ if (NeedDepCheck)
+ CheckDeps.insert(Access);
}
}
+/// \brief Checks memory dependences among accesses to the same underlying
+/// object to determine whether there vectorization is legal or not (and at
+/// which vectorization factor).
+///
+/// This class works under the assumption that we already checked that memory
+/// locations with different underlying pointers are "must-not alias".
+/// We use the ScalarEvolution framework to symbolically evalutate access
+/// functions pairs. Since we currently don't restructure the loop we can rely
+/// on the program order of memory accesses to determine their safety.
+/// At the moment we will only deem accesses as safe for:
+/// * A negative constant distance assuming program order.
+///
+/// Safe: tmp = a[i + 1]; OR a[i + 1] = x;
+/// a[i] = tmp; y = a[i];
+///
+/// The latter case is safe because later checks guarantuee that there can't
+/// be a cycle through a phi node (that is, we check that "x" and "y" is not
+/// the same variable: a header phi can only be an induction or a reduction, a
+/// reduction can't have a memory sink, an induction can't have a memory
+/// source). This is important and must not be violated (or we have to
+/// resort to checking for cycles through memory).
+///
+/// * A positive constant distance assuming program order that is bigger
+/// than the biggest memory access.
+///
+/// tmp = a[i] OR b[i] = x
+/// a[i+2] = tmp y = b[i+2];
+///
+/// Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
+///
+/// * Zero distances and all accesses have the same size.
+///
+class MemoryDepChecker {
+public:
+ typedef std::pair<Value*, char> MemAccessInfo;
+
+ MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) :
+ SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0) {}
+
+ /// \brief Register the location (instructions are given increasing numbers)
+ /// of a write access.
+ void addAccess(StoreInst *SI) {
+ Value *Ptr = SI->getPointerOperand();
+ Accesses[std::make_pair(Ptr, true)].push_back(AccessIdx);
+ InstMap.push_back(SI);
+ ++AccessIdx;
+ }
+
+ /// \brief Register the location (instructions are given increasing numbers)
+ /// of a write access.
+ void addAccess(LoadInst *LI) {
+ Value *Ptr = LI->getPointerOperand();
+ Accesses[std::make_pair(Ptr, false)].push_back(AccessIdx);
+ InstMap.push_back(LI);
+ ++AccessIdx;
+ }
+
+ /// \brief Check whether the dependencies between the accesses are safe.
+ ///
+ /// Only checks sets with elements in \p CheckDeps.
+ bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
+ DenseSet<MemAccessInfo> &CheckDeps);
+
+ /// \brief The maximum number of bytes of a vector register we can vectorize
+ /// the accesses safely with.
+ unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
+
+private:
+ ScalarEvolution *SE;
+ DataLayout *DL;
+ const Loop *InnermostLoop;
+
+ /// \brief Maps access locations (ptr, read/write) to program order.
+ DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
+
+ /// \brief Memory access instructions in program order.
+ SmallVector<Instruction *, 16> InstMap;
+
+ /// \brief The program order index to be used for the next instruction.
+ unsigned AccessIdx;
+
+ // We can access this many bytes in parallel safely.
+ unsigned MaxSafeDepDistBytes;
+
+ /// \brief Check whether there is a plausible dependence between the two
+ /// accesses.
+ ///
+ /// Access \p A must happen before \p B in program order. The two indices
+ /// identify the index into the program order map.
+ ///
+ /// This function checks whether there is a plausible dependence (or the
+ /// absence of such can't be proved) between the two accesses. If there is a
+ /// plausible dependence but the dependence distance is bigger than one
+ /// element access it records this distance in \p MaxSafeDepDistBytes (if this
+ /// distance is smaller than any other distance encountered so far).
+ /// Otherwise, this function returns true signaling a possible dependence.
+ bool isDependent(const MemAccessInfo &A, unsigned AIdx,
+ const MemAccessInfo &B, unsigned BIdx);
+
+ /// \brief Check whether the data dependence could prevent store-load
+ /// forwarding.
+ bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
+};
+
+static bool isInBoundsGep(Value *Ptr) {
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
+ return GEP->isInBounds();
+ return false;
+}
+
+/// \brief Check whether the access through \p Ptr has a constant stride.
+static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+ const Loop *Lp) {
+ const Type *PtrTy = Ptr->getType();
+ assert(PtrTy->isPointerTy() && "Unexpected non ptr");
+
+ // Make sure that the pointer does not point to aggregate types.
+ if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType()) {
+ DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr
+ << "\n");
+ return 0;
+ }
+
+ const SCEV *PtrScev = SE->getSCEV(Ptr);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
+ if (!AR) {
+ DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer "
+ << *Ptr << " SCEV: " << *PtrScev << "\n");
+ return 0;
+ }
+
+ // The accesss function must stride over the innermost loop.
+ if (Lp != AR->getLoop()) {
+ DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " << *Ptr
+ << " SCEV: " << *PtrScev << "\n");
+ }
+
+ // The address calculation must not wrap. Otherwise, a dependence could be
+ // inverted. An inbounds getelementptr that is a AddRec with a unit stride
+ // cannot wrap per definition. The unit stride requirement is checked later.
+ bool IsInBoundsGEP = isInBoundsGep(Ptr);
+ bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask);
+ if (!IsNoWrapAddRec && !IsInBoundsGEP) {
+ DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space "
+ << *Ptr << " SCEV: " << *PtrScev << "\n");
+ return 0;
+ }
+
+ // Check the step is constant.
+ const SCEV *Step = AR->getStepRecurrence(*SE);
+
+ // Calculate the pointer stride and check if it is consecutive.
+ const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+ if (!C) {
+ DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr <<
+ " SCEV: " << *PtrScev << "\n");
+ return 0;
+ }
+
+ int64_t Size = DL->getTypeAllocSize(PtrTy->getPointerElementType());
+ const APInt &APStepVal = C->getValue()->getValue();
+
+ // Huge step value - give up.
+ if (APStepVal.getBitWidth() > 64)
+ return 0;
+
+ int64_t StepVal = APStepVal.getSExtValue();
+
+ // Strided access.
+ int64_t Stride = StepVal / Size;
+ int64_t Rem = StepVal % Size;
+ if (Rem)
+ return 0;
+
+ // If the SCEV could wrap but we have an inbounds gep with a unit stride we
+ // know we can't "wrap around the address space".
+ if (!IsNoWrapAddRec && IsInBoundsGEP && Stride != 1 && Stride != -1)
+ return 0;
+
+ return Stride;
+}
+
+bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
+ unsigned TypeByteSize) {
+ // If loads occur at a distance that is not a multiple of a feasible vector
+ // factor store-load forwarding does not take place.
+ // Positive dependences might cause troubles because vectorizing them might
+ // prevent store-load forwarding making vectorized code run a lot slower.
+ // a[i] = a[i-3] ^ a[i-8];
+ // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and
+ // hence on your typical architecture store-load forwarding does not take
+ // place. Vectorizing in such cases does not make sense.
+ // Store-load forwarding distance.
+ const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize;
+ // Maximum vector factor.
+ unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize;
+ if(MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
+ MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes;
+
+ for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues;
+ vf *= 2) {
+ if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) {
+ MaxVFWithoutSLForwardIssues = (vf >>=1);
+ break;
+ }
+ }
+
+ if (MaxVFWithoutSLForwardIssues< 2*TypeByteSize) {
+ DEBUG(dbgs() << "LV: Distance " << Distance <<
+ " that could cause a store-load forwarding conflict\n");
+ return true;
+ }
+
+ if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
+ MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize)
+ MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
+ return false;
+}
+
+bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
+ const MemAccessInfo &B, unsigned BIdx) {
+ assert (AIdx < BIdx && "Must pass arguments in program order");
+
+ Value *APtr = A.first;
+ Value *BPtr = B.first;
+ bool AIsWrite = A.second;
+ bool BIsWrite = B.second;
+
+ // Two reads are independent.
+ if (!AIsWrite && !BIsWrite)
+ return false;
+
+ const SCEV *AScev = SE->getSCEV(APtr);
+ const SCEV *BScev = SE->getSCEV(BPtr);
+
+ int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop);
+ int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop);
+
+ const SCEV *Src = AScev;
+ const SCEV *Sink = BScev;
+
+ // If the induction step is negative we have to invert source and sink of the
+ // dependence.
+ if (StrideAPtr < 0) {
+ //Src = BScev;
+ //Sink = AScev;
+ std::swap(APtr, BPtr);
+ std::swap(Src, Sink);
+ std::swap(AIsWrite, BIsWrite);
+ std::swap(AIdx, BIdx);
+ std::swap(StrideAPtr, StrideBPtr);
+ }
+
+ const SCEV *Dist = SE->getMinusSCEV(Sink, Src);
+
+ DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink
+ << "(Induction step: " << StrideAPtr << ")\n");
+ DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to "
+ << *InstMap[BIdx] << ": " << *Dist << "\n");
+
+ // Need consecutive accesses. We don't want to vectorize
+ // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
+ // the address space.
+ if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){
+ DEBUG(dbgs() << "Non-consecutive pointer access\n");
+ return true;
+ }
+
+ const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
+ if (!C) {
+ DEBUG(dbgs() << "LV: Dependence because of non constant distance\n");
+ return true;
+ }
+
+ Type *ATy = APtr->getType()->getPointerElementType();
+ Type *BTy = BPtr->getType()->getPointerElementType();
+ unsigned TypeByteSize = DL->getTypeAllocSize(ATy);
+
+ // Negative distances are not plausible dependencies.
+ const APInt &Val = C->getValue()->getValue();
+ if (Val.isNegative()) {
+ bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
+ if (IsTrueDataDependence &&
+ (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
+ ATy != BTy))
+ return true;
+
+ DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n");
+ return false;
+ }
+
+ // Write to the same location with the same size.
+ // Could be improved to assert type sizes are the same (i32 == float, etc).
+ if (Val == 0) {
+ if (ATy == BTy)
+ return false;
+ DEBUG(dbgs() << "LV: Zero dependence difference but different types");
+ return true;
+ }
+
+ assert(Val.isStrictlyPositive() && "Expect a positive value");
+
+ // Positive distance bigger than max vectorization factor.
+ if (ATy != BTy) {
+ DEBUG(dbgs() <<
+ "LV: ReadWrite-Write positive dependency with different types");
+ return false;
+ }
+
+ unsigned Distance = (unsigned) Val.getZExtValue();
+
+ // Bail out early if passed-in parameters make vectorization not feasible.
+ unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1;
+ unsigned ForcedUnroll = VectorizationUnroll ? VectorizationUnroll : 1;
+
+ // The distance must be bigger than the size needed for a vectorized version
+ // of the operation and the size of the vectorized operation must not be
+ // bigger than the currrent maximum size.
+ if (Distance < 2*TypeByteSize ||
+ 2*TypeByteSize > MaxSafeDepDistBytes ||
+ Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
+ DEBUG(dbgs() << "LV: Failure because of Positive distance "
+ << Val.getSExtValue() << "\n");
+ return true;
+ }
+
+ MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
+ Distance : MaxSafeDepDistBytes;
+
+ bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
+ if (IsTrueDataDependence &&
+ couldPreventStoreLoadForward(Distance, TypeByteSize))
+ return true;
+
+ DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<
+ " with max VF=" << MaxSafeDepDistBytes/TypeByteSize << "\n");
+
+ return false;
+}
+
+bool
+MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
+ DenseSet<MemAccessInfo> &CheckDeps) {
+
+ MaxSafeDepDistBytes = -1U;
+ while (!CheckDeps.empty()) {
+ MemAccessInfo CurAccess = *CheckDeps.begin();
+
+ // Get the relevant memory access set.
+ EquivalenceClasses<MemAccessInfo>::iterator I =
+ AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
+
+ // Check accesses within this set.
+ EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
+ AI = AccessSets.member_begin(I), AE = AccessSets.member_end();
+
+ // Check every access pair.
+ while (AI != AE) {
+ CheckDeps.erase(*AI);
+ EquivalenceClasses<MemAccessInfo>::member_iterator OI = llvm::next(AI);
+ while (OI != AE) {
+ // Check every accessing instruction pair in program order.
+ for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
+ I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
+ for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
+ I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
+ if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2))
+ return false;
+ if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1))
+ return false;
+ }
+ ++OI;
+ }
+ AI++;
+ }
+ }
+ return true;
+}
+
+AliasAnalysis::Location
+LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) {
+ if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
+ return AA->getLocation(Store);
+ else if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
+ return AA->getLocation(Load);
+
+ llvm_unreachable("Should be either load or store instruction");
+}
+
+bool
+LoopVectorizationLegality::hasPossibleGlobalWriteReorder(
+ Value *Object,
+ Instruction *Inst,
+ AliasMultiMap& WriteObjects,
+ unsigned MaxByteWidth) {
+
+ AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst);
+
+ std::vector<Instruction*>::iterator
+ it = WriteObjects[Object].begin(),
+ end = WriteObjects[Object].end();
+
+ for (; it != end; ++it) {
+ Instruction* I = *it;
+ if (I == Inst)
+ continue;
+
+ AliasAnalysis::Location ThatLoc = getLoadStoreLocation(I);
+ if (AA->alias(ThisLoc.getWithNewSize(MaxByteWidth),
+ ThatLoc.getWithNewSize(MaxByteWidth)))
+ return true;
+ }
+ return false;
+}
+
bool LoopVectorizationLegality::canVectorizeMemory() {
+
typedef SmallVector<Value*, 16> ValueVector;
typedef SmallPtrSet<Value*, 16> ValueSet;
// Holds the Load and Store *instructions*.
PtrRtCheck.Pointers.clear();
PtrRtCheck.Need = false;
+ const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
+
// For each block.
for (Loop::block_iterator bb = TheLoop->block_begin(),
be = TheLoop->block_end(); bb != be; ++bb) {
if (it->mayReadFromMemory()) {
LoadInst *Ld = dyn_cast<LoadInst>(it);
if (!Ld) return false;
- if (!Ld->isSimple()) {
+ if (!Ld->isSimple() && !IsAnnotatedParallel) {
DEBUG(dbgs() << "LV: Found a non-simple load.\n");
return false;
}
if (it->mayWriteToMemory()) {
StoreInst *St = dyn_cast<StoreInst>(it);
if (!St) return false;
- if (!St->isSimple()) {
+ if (!St->isSimple() && !IsAnnotatedParallel) {
DEBUG(dbgs() << "LV: Found a non-simple store.\n");
return false;
}
return true;
}
- // Holds the read and read-write *pointers* that we find.
- ValueVector Reads;
- ValueVector ReadWrites;
+ // Holds the read and read-write *pointers* that we find. These maps hold
+ // unique values for pointers (so no need for multi-map).
+ AliasMap Reads;
+ AliasMap ReadWrites;
// Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
// multiple times on the same object. If the ptr is accessed twice, once
// If we did *not* see this pointer before, insert it to
// the read-write list. At this phase it is only a 'write' list.
if (Seen.insert(Ptr))
- ReadWrites.push_back(Ptr);
+ ReadWrites.insert(std::make_pair(Ptr, ST));
+ }
+
+ if (IsAnnotatedParallel) {
+ DEBUG(dbgs()
+ << "LV: A loop annotated parallel, ignore memory dependency "
+ << "checks.\n");
+ return true;
}
for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
// read a few words, modify, and write a few words, and some of the
// words may be written to the same address.
if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr))
- Reads.push_back(Ptr);
+ Reads.insert(std::make_pair(Ptr, LD));
}
// If we write (or read-write) to a single destination and there are no
return true;
}
+ unsigned NumReadPtrs = 0;
+ unsigned NumWritePtrs = 0;
+
// Find pointers with computable bounds. We are going to use this information
// to place a runtime bound check.
bool CanDoRT = true;
- for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
- if (hasComputableBounds(*I)) {
- PtrRtCheck.insert(SE, TheLoop, *I);
- DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+ AliasMap::iterator MI, ME;
+ for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
+ Value *V = (*MI).first;
+ if (hasComputableBounds(V)) {
+ PtrRtCheck.insert(SE, TheLoop, V, true);
+ NumWritePtrs++;
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
} else {
CanDoRT = false;
break;
}
- for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
- if (hasComputableBounds(*I)) {
- PtrRtCheck.insert(SE, TheLoop, *I);
- DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+ }
+ for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
+ Value *V = (*MI).first;
+ if (hasComputableBounds(V)) {
+ PtrRtCheck.insert(SE, TheLoop, V, false);
+ NumReadPtrs++;
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
} else {
CanDoRT = false;
break;
}
+ }
// Check that we did not collect too many pointers or found a
// unsizeable pointer.
- if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
+ unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1));
+ DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n");
+ if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
PtrRtCheck.reset();
CanDoRT = false;
}
bool NeedRTCheck = false;
+ // Biggest vectorized access possible, vector width * unroll factor.
+ // TODO: We're being very pessimistic here, find a way to know the
+ // real access width before getting here.
+ unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) *
+ TTI->getMaximumUnrollFactor();
// Now that the pointers are in two lists (Reads and ReadWrites), we
// can check that there are no conflicts between each of the writes and
// between the writes to the reads.
- ValueSet WriteObjects;
+ // Note that WriteObjects duplicates the stores (indexed now by underlying
+ // objects) to avoid pointing to elements inside ReadWrites.
+ // TODO: Maybe create a new type where they can interact without duplication.
+ AliasMultiMap WriteObjects;
ValueVector TempObjects;
// Check that the read-writes do not conflict with other read-write
// pointers.
bool AllWritesIdentified = true;
- for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
- GetUnderlyingObjects(*I, TempObjects, DL);
- for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
- it != e; ++it) {
- if (!isIdentifiedObject(*it)) {
- DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
+ for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
+ Value *Val = (*MI).first;
+ Instruction *Inst = (*MI).second;
+
+ GetUnderlyingObjects(Val, TempObjects, DL);
+ for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
+ UI != UE; ++UI) {
+ if (!isIdentifiedObject(*UI)) {
+ DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n");
NeedRTCheck = true;
AllWritesIdentified = false;
}
- if (!WriteObjects.insert(*it)) {
+
+ // Never seen it before, can't alias.
+ if (WriteObjects[*UI].empty()) {
+ DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n");
+ WriteObjects[*UI].push_back(Inst);
+ continue;
+ }
+ // Direct alias found.
+ if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
- << **it <<"\n");
+ << **UI <<"\n");
return false;
}
+ DEBUG(dbgs() << "LV: Found a conflicting global value:"
+ << **UI <<"\n");
+ DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n");
+ DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
+
+ // If global alias, make sure they do alias.
+ if (hasPossibleGlobalWriteReorder(*UI,
+ Inst,
+ WriteObjects,
+ MaxByteWidth)) {
+ DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **UI
+ << "\n");
+ return false;
+ }
+
+ // Didn't alias, insert into map for further reference.
+ WriteObjects[*UI].push_back(Inst);
}
TempObjects.clear();
}
/// Check that the reads don't conflict with the read-writes.
- for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
- GetUnderlyingObjects(*I, TempObjects, DL);
- for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
- it != e; ++it) {
+ for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
+ Value *Val = (*MI).first;
+ GetUnderlyingObjects(Val, TempObjects, DL);
+ for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
+ UI != UE; ++UI) {
// If all of the writes are identified then we don't care if the read
// pointer is identified or not.
- if (!AllWritesIdentified && !isIdentifiedObject(*it)) {
- DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
+ if (!AllWritesIdentified && !isIdentifiedObject(*UI)) {
+ DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **UI <<"\n");
NeedRTCheck = true;
}
- if (WriteObjects.count(*it)) {
- DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
- << **it <<"\n");
+
+ // Never seen it before, can't alias.
+ if (WriteObjects[*UI].empty())
+ continue;
+ // Direct alias found.
+ if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
+ DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
+ << **UI <<"\n");
+ return false;
+ }
+ DEBUG(dbgs() << "LV: Found a global value: "
+ << **UI <<"\n");
+ Instruction *Inst = (*MI).second;
+ DEBUG(dbgs() << "LV: While examining load:" << *Inst <<"\n");
+ DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
+
+ // If global alias, make sure they do alias.
+ if (hasPossibleGlobalWriteReorder(*UI,
+ Inst,
+ WriteObjects,
+ MaxByteWidth)) {
+ DEBUG(dbgs() << "LV: Found a possible read-write reorder:" << **UI
+ << "\n");
return false;
}
}
return true;
}
+static bool hasMultipleUsesOf(Instruction *I,
+ SmallPtrSet<Instruction *, 8> &Insts) {
+ unsigned NumUses = 0;
+ for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) {
+ if (Insts.count(dyn_cast<Instruction>(*Use)))
+ ++NumUses;
+ if (NumUses > 1)
+ return true;
+ }
+
+ return false;
+}
+
+static bool areAllUsesIn(Instruction *I, SmallPtrSet<Instruction *, 8> &Set) {
+ for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
+ if (!Set.count(dyn_cast<Instruction>(*Use)))
+ return false;
+ return true;
+}
+
bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
ReductionKind Kind) {
if (Phi->getNumIncomingValues() != 2)
// This includes users of the reduction, variables (which form a cycle
// which ends in the phi node).
Instruction *ExitInstruction = 0;
- // Indicates that we found a binary operation in our scan.
- bool FoundBinOp = false;
-
- // Iter is our iterator. We start with the PHI node and scan for all of the
- // users of this instruction. All users must be instructions that can be
- // used as reduction variables (such as ADD). We may have a single
- // out-of-block user. The cycle must end with the original PHI.
- Instruction *Iter = Phi;
- while (true) {
- // If the instruction has no users then this is a broken
- // chain and can't be a reduction variable.
- if (Iter->use_empty())
+ // Indicates that we found a reduction operation in our scan.
+ bool FoundReduxOp = false;
+
+ // We start with the PHI node and scan for all of the users of this
+ // instruction. All users must be instructions that can be used as reduction
+ // variables (such as ADD). We must have a single out-of-block user. The cycle
+ // must include the original PHI.
+ bool FoundStartPHI = false;
+
+ // To recognize min/max patterns formed by a icmp select sequence, we store
+ // the number of instruction we saw from the recognized min/max pattern,
+ // to make sure we only see exactly the two instructions.
+ unsigned NumCmpSelectPatternInst = 0;
+ ReductionInstDesc ReduxDesc(false, 0);
+
+ SmallPtrSet<Instruction *, 8> VisitedInsts;
+ SmallVector<Instruction *, 8> Worklist;
+ Worklist.push_back(Phi);
+ VisitedInsts.insert(Phi);
+
+ // A value in the reduction can be used:
+ // - By the reduction:
+ // - Reduction operation:
+ // - One use of reduction value (safe).
+ // - Multiple use of reduction value (not safe).
+ // - PHI:
+ // - All uses of the PHI must be the reduction (safe).
+ // - Otherwise, not safe.
+ // - By one instruction outside of the loop (safe).
+ // - By further instructions outside of the loop (not safe).
+ // - By an instruction that is not part of the reduction (not safe).
+ // This is either:
+ // * An instruction type other than PHI or the reduction operation.
+ // * A PHI in the header other than the initial PHI.
+ while (!Worklist.empty()) {
+ Instruction *Cur = Worklist.back();
+ Worklist.pop_back();
+
+ // No Users.
+ // If the instruction has no users then this is a broken chain and can't be
+ // a reduction variable.
+ if (Cur->use_empty())
return false;
- // Did we find a user inside this loop already ?
- bool FoundInBlockUser = false;
- // Did we reach the initial PHI node already ?
- bool FoundStartPHI = false;
+ bool IsAPhi = isa<PHINode>(Cur);
- // Is this a bin op ?
- FoundBinOp |= !isa<PHINode>(Iter);
+ // A header PHI use other than the original PHI.
+ if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent())
+ return false;
- // For each of the *users* of iter.
- for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
- it != e; ++it) {
- Instruction *U = cast<Instruction>(*it);
- // We already know that the PHI is a user.
- if (U == Phi) {
- FoundStartPHI = true;
- continue;
- }
+ // Reductions of instructions such as Div, and Sub is only possible if the
+ // LHS is the reduction variable.
+ if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) &&
+ !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) &&
+ !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0))))
+ return false;
+
+ // Any reduction instruction must be of one of the allowed kinds.
+ ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc);
+ if (!ReduxDesc.IsReduction)
+ return false;
+
+ // A reduction operation must only have one use of the reduction value.
+ if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax &&
+ hasMultipleUsesOf(Cur, VisitedInsts))
+ return false;
+
+ // All inputs to a PHI node must be a reduction value.
+ if(IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
+ return false;
+
+ if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) ||
+ isa<SelectInst>(Cur)))
+ ++NumCmpSelectPatternInst;
+ if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) ||
+ isa<SelectInst>(Cur)))
+ ++NumCmpSelectPatternInst;
+
+ // Check whether we found a reduction operator.
+ FoundReduxOp |= !IsAPhi;
+
+ // Process users of current instruction. Push non PHI nodes after PHI nodes
+ // onto the stack. This way we are going to have seen all inputs to PHI
+ // nodes once we get to them.
+ SmallVector<Instruction *, 8> NonPHIs;
+ SmallVector<Instruction *, 8> PHIs;
+ for (Value::use_iterator UI = Cur->use_begin(), E = Cur->use_end(); UI != E;
+ ++UI) {
+ Instruction *Usr = cast<Instruction>(*UI);
// Check if we found the exit user.
- BasicBlock *Parent = U->getParent();
+ BasicBlock *Parent = Usr->getParent();
if (!TheLoop->contains(Parent)) {
// Exit if you find multiple outside users.
if (ExitInstruction != 0)
return false;
- ExitInstruction = Iter;
+ ExitInstruction = Cur;
+ continue;
}
- // We allow in-loop PHINodes which are not the original reduction PHI
- // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE
- // structure) then don't skip this PHI.
- if (isa<PHINode>(Iter) && isa<PHINode>(U) &&
- U->getParent() != TheLoop->getHeader() &&
- TheLoop->contains(U) &&
- Iter->getNumUses() > 1)
- continue;
+ // Process instructions only once (termination).
+ if (VisitedInsts.insert(Usr)) {
+ if (isa<PHINode>(Usr))
+ PHIs.push_back(Usr);
+ else
+ NonPHIs.push_back(Usr);
+ }
+ // Remember that we completed the cycle.
+ if (Usr == Phi)
+ FoundStartPHI = true;
+ }
+ Worklist.append(PHIs.begin(), PHIs.end());
+ Worklist.append(NonPHIs.begin(), NonPHIs.end());
+ }
- // We can't have multiple inside users.
- if (FoundInBlockUser)
- return false;
- FoundInBlockUser = true;
+ // This means we have seen one but not the other instruction of the
+ // pattern or more than just a select and cmp.
+ if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
+ NumCmpSelectPatternInst != 2)
+ return false;
- // Any reduction instr must be of one of the allowed kinds.
- if (!isReductionInstr(U, Kind))
- return false;
+ if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
+ return false;
- // Reductions of instructions such as Div, and Sub is only
- // possible if the LHS is the reduction variable.
- if (!U->isCommutative() && !isa<PHINode>(U) && U->getOperand(0) != Iter)
- return false;
+ // We found a reduction var if we have reached the original phi node and we
+ // only have a single instruction with out-of-loop users.
- Iter = U;
- }
+ // This instruction is allowed to have out-of-loop users.
+ AllowedExit.insert(ExitInstruction);
- // We found a reduction var if we have reached the original
- // phi node and we only have a single instruction with out-of-loop
- // users.
- if (FoundStartPHI) {
- // This instruction is allowed to have out-of-loop users.
- AllowedExit.insert(ExitInstruction);
-
- // Save the description of this reduction variable.
- ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
- Reductions[Phi] = RD;
- // We've ended the cycle. This is a reduction variable if we have an
- // outside user and it has a binary op.
- return FoundBinOp && ExitInstruction;
- }
+ // Save the description of this reduction variable.
+ ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
+ ReduxDesc.MinMaxKind);
+ Reductions[Phi] = RD;
+ // We've ended the cycle. This is a reduction variable if we have an
+ // outside user and it has a binary op.
+
+ return true;
+}
+
+/// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
+/// pattern corresponding to a min(X, Y) or max(X, Y).
+LoopVectorizationLegality::ReductionInstDesc
+LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I,
+ ReductionInstDesc &Prev) {
+
+ assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
+ "Expect a select instruction");
+ Instruction *Cmp = 0;
+ SelectInst *Select = 0;
+
+ // We must handle the select(cmp()) as a single instruction. Advance to the
+ // select.
+ if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) {
+ if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin())))
+ return ReductionInstDesc(false, I);
+ return ReductionInstDesc(Select, Prev.MinMaxKind);
}
+
+ // Only handle single use cases for now.
+ if (!(Select = dyn_cast<SelectInst>(I)))
+ return ReductionInstDesc(false, I);
+ if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) &&
+ !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0))))
+ return ReductionInstDesc(false, I);
+ if (!Cmp->hasOneUse())
+ return ReductionInstDesc(false, I);
+
+ Value *CmpLeft;
+ Value *CmpRight;
+
+ // Look for a min/max pattern.
+ if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return ReductionInstDesc(Select, MRK_UIntMin);
+ else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return ReductionInstDesc(Select, MRK_UIntMax);
+ else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return ReductionInstDesc(Select, MRK_SIntMax);
+ else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return ReductionInstDesc(Select, MRK_SIntMin);
+ else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return ReductionInstDesc(Select, MRK_FloatMin);
+ else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return ReductionInstDesc(Select, MRK_FloatMax);
+ else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return ReductionInstDesc(Select, MRK_FloatMin);
+ else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+ return ReductionInstDesc(Select, MRK_FloatMax);
+
+ return ReductionInstDesc(false, I);
}
-bool
+LoopVectorizationLegality::ReductionInstDesc
LoopVectorizationLegality::isReductionInstr(Instruction *I,
- ReductionKind Kind) {
+ ReductionKind Kind,
+ ReductionInstDesc &Prev) {
bool FP = I->getType()->isFloatingPointTy();
bool FastMath = (FP && I->isCommutative() && I->isAssociative());
-
switch (I->getOpcode()) {
default:
- return false;
+ return ReductionInstDesc(false, I);
case Instruction::PHI:
- if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd))
- return false;
- // possibly.
- return true;
+ if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd &&
+ Kind != RK_FloatMinMax))
+ return ReductionInstDesc(false, I);
+ return ReductionInstDesc(I, Prev.MinMaxKind);
case Instruction::Sub:
case Instruction::Add:
- return Kind == RK_IntegerAdd;
- case Instruction::SDiv:
- case Instruction::UDiv:
+ return ReductionInstDesc(Kind == RK_IntegerAdd, I);
case Instruction::Mul:
- return Kind == RK_IntegerMult;
+ return ReductionInstDesc(Kind == RK_IntegerMult, I);
case Instruction::And:
- return Kind == RK_IntegerAnd;
+ return ReductionInstDesc(Kind == RK_IntegerAnd, I);
case Instruction::Or:
- return Kind == RK_IntegerOr;
+ return ReductionInstDesc(Kind == RK_IntegerOr, I);
case Instruction::Xor:
- return Kind == RK_IntegerXor;
+ return ReductionInstDesc(Kind == RK_IntegerXor, I);
case Instruction::FMul:
- return Kind == RK_FloatMult && FastMath;
+ return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
case Instruction::FAdd:
- return Kind == RK_FloatAdd && FastMath;
- }
+ return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
+ case Instruction::FCmp:
+ case Instruction::ICmp:
+ case Instruction::Select:
+ if (Kind != RK_IntegerMinMax &&
+ (!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
+ return ReductionInstDesc(false, I);
+ return isMinMaxSelectCmpPattern(I, Prev);
+ }
}
LoopVectorizationLegality::InductionKind
bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- // We don't predicate loads/stores at the moment.
- if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow())
+ // We might be able to hoist the load.
+ if (it->mayReadFromMemory() && !LoadSpeculation.isHoistableLoad(it))
+ return false;
+
+ // We don't predicate stores at the moment.
+ if (it->mayWriteToMemory() || it->mayThrow())
return false;
// The instructions below can trap.
continue;
// Examine the stored values.
- StoreInst *ST = 0;
- if ((ST = dyn_cast<StoreInst>(it)))
+ if (StoreInst *ST = dyn_cast<StoreInst>(it))
T = ST->getValueOperand()->getType();
// Ignore loaded pointer types and stored pointer types that are not
// consecutive. However, we do want to take consecutive stores/loads of
// pointer vectors into account.
- if (T->isPointerTy() && isConsecutiveLoadOrStore(it))
- MaxWidth = std::max(MaxWidth, DL->getPointerSizeInBits());
- else
- MaxWidth = std::max(MaxWidth, T->getScalarSizeInBits());
+ if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
+ continue;
+
+ MaxWidth = std::max(MaxWidth,
+ (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
}
}
// For each instruction in the old loop.
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ // Skip dbg intrinsics.
+ if (isa<DbgInfoIntrinsic>(it))
+ continue;
+
unsigned C = getInstructionCost(it, VF);
Cost += C;
DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor:
- return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
+ case Instruction::Xor: {
+ // Certain instructions can be cheaper to vectorize if they have a constant
+ // second vector operand. One example of this are shifts on x86.
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_AnyValue;
+
+ if (isa<ConstantInt>(I->getOperand(1)))
+ Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+
+ return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
+ }
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
Type *CondTy = SI->getCondition()->getType();
- if (ScalarCond)
+ if (!ScalarCond)
CondTy = VectorType::get(CondTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
// Scalarized loads/stores.
- int Stride = Legal->isConsecutivePtr(Ptr);
- bool Reverse = Stride < 0;
- if (0 == Stride) {
+ int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+ bool Reverse = ConsecutiveStride < 0;
+ unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
+ unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
+ if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
unsigned Cost = 0;
// The cost of extracting from the value vector and pointer vector.
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
}
case Instruction::Call: {
- assert(isTriviallyVectorizableIntrinsic(I));
- IntrinsicInst *II = cast<IntrinsicInst>(I);
- Type *RetTy = ToVectorTy(II->getType(), VF);
+ CallInst *CI = cast<CallInst>(I);
+ Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ assert(ID && "Not an intrinsic call!");
+ Type *RetTy = ToVectorTy(CI->getType(), VF);
SmallVector<Type*, 4> Tys;
- for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i)
- Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF));
- return TTI.getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys);
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
+ Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
+ return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
}
default: {
// We are scalarizing the instruction. Return the cost of the scalar
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
// Check for a store.
- StoreInst *ST = dyn_cast<StoreInst>(Inst);
- if (ST)
+ if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
// Check for a load.
- LoadInst *LI = dyn_cast<LoadInst>(Inst);
- if (LI)
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
return false;