//
//===----------------------------------------------------------------------===//
//
-// This is a simple loop vectorizer. We currently only support single block
-// loops. We have a very simple and restrictive legality check: we need to read
-// and write from disjoint memory locations. We still don't have a cost model.
-// We do support integer reductions.
+// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
+// and generates target-independent LLVM-IR. Legalization of the IR is done
+// in the codegen. However, the vectorizes uses (will use) the codegen
+// interfaces to generate IR that is likely to result in an optimal binary.
+//
+// The loop vectorizer combines consecutive loop iteration into a single
+// 'wide' iteration. After this transformation the index is incremented
+// by the SIMD vector width, and not by one.
//
// This pass has three parts:
// 1. The main loop pass that drives the different parts.
-// 2. LoopVectorizationLegality - A helper class that checks for the legality
+// 2. LoopVectorizationLegality - A unit that checks for the legality
// of the vectorization.
-// 3. SingleBlockLoopVectorizer - A helper class that performs the actual
+// 3. SingleBlockLoopVectorizer - A unit that performs the actual
// widening of instructions.
+// 4. LoopVectorizationCostModel - A unit that checks for the profitability
+// of vectorization. It decides on the optimal vector width, which
+// can be one, if vectorization is not profitable.
+//===----------------------------------------------------------------------===//
+//
+// The reduction-variable vectorization is based on the paper:
+// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
+//
+// Variable uniformity checks are inspired by:
+// Karrenberg, R. and Hack, S. Whole Function Vectorization.
+//
+// Other ideas/concepts are from:
+// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//===----------------------------------------------------------------------===//
#define LV_NAME "loop-vectorize"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/TargetTransformInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
static cl::opt<unsigned>
-DefaultVectorizationFactor("default-loop-vectorize-width",
- cl::init(4), cl::Hidden,
- cl::desc("Set the default loop vectorization width"));
+VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
+ cl::desc("Set the default vectorization width. Zero is autoselect."));
+
namespace {
-// Forward declaration.
+// Forward declarations.
class LoopVectorizationLegality;
-
-/// Vectorize a simple loop. This class performs the widening of simple single
-/// basic block loops into vectors. It does not perform any
-/// vectorization-legality checks, and just does it. It widens the vectors
-/// to a given vectorization factor (VF).
+class LoopVectorizationCostModel;
+
+/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
+/// block to a specified vectorization factor (VF).
+/// This class performs the widening of scalars into vectors, or multiple
+/// scalars. This class also implements the following features:
+/// * It inserts an epilogue loop for handling loops that don't have iteration
+/// counts that are known to be a multiple of the vectorization factor.
+/// * It handles the code generation for reduction variables.
+/// * Scalarization (implementation using scalars) of un-vectorizable
+/// instructions.
+/// SingleBlockLoopVectorizer does not perform any vectorization-legality
+/// checks, and relies on the caller to check for the different legality
+/// aspects. The SingleBlockLoopVectorizer relies on the
+/// LoopVectorizationLegality class to provide information about the induction
+/// and reduction variables that were found to a given vectorization factor.
class SingleBlockLoopVectorizer {
public:
/// Ctor.
- SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li,
+ SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
LPPassManager *Lpm, unsigned VecWidth):
- Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
+ OrigLoop(Orig), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
Builder(Se->getContext()), Induction(0), OldInduction(0) { }
// Perform the actual loop widening (vectorization).
typedef DenseMap<Value*, Value*> ValueMap;
/// The original loop.
- Loop *Orig;
+ Loop *OrigLoop;
// Scev analysis to use.
ScalarEvolution *SE;
// Loop Info.
ValueMap WidenMap;
};
-/// Perform the vectorization legality check. This class does not look at the
-/// profitability of vectorization, only the legality. At the moment the checks
-/// are very simple and focus on single basic block loops with a constant
-/// iteration count and no reductions.
+/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
+/// to what vectorization factor.
+/// This class does not look at the profitability of vectorization, only the
+/// legality. This class has two main kinds of checks:
+/// * Memory checks - The code in canVectorizeMemory checks if vectorization
+/// will change the order of memory accesses in a way that will change the
+/// correctness of the program.
+/// * Scalars checks - The code in canVectorizeBlock checks for a number
+/// of different conditions, such as the availability of a single induction
+/// variable, that all types are supported and vectorize-able, etc.
+/// This code reflects the capabilities of SingleBlockLoopVectorizer.
+/// This class is also used by SingleBlockLoopVectorizer for identifying
+/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl):
TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
/// This represents the kinds of reductions that we support.
+ /// We use the enum values to hold the 'identity' value for
+ /// each operand. This value does not change the result if applied.
enum ReductionKind {
- IntegerAdd, /// Sum of numbers.
- IntegerMult, /// Product of numbers.
- NoReduction /// Not a reduction.
+ NoReduction = -1, /// Not a reduction.
+ IntegerAdd = 0, /// Sum of numbers.
+ IntegerMult = 1 /// Product of numbers.
};
- // Holds a pairing of reduction instruction and the reduction kind.
- typedef std::pair<Instruction*, ReductionKind> ReductionPair;
+ /// This POD struct holds information about reduction variables.
+ struct ReductionDescriptor {
+ // Default C'tor
+ ReductionDescriptor():
+ StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}
+
+ // C'tor.
+ ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
+ StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
+
+ // The starting value of the reduction.
+ // It does not have to be zero!
+ Value *StartValue;
+ // The instruction who's value is used outside the loop.
+ Instruction *LoopExitInstr;
+ // The kind of the reduction.
+ ReductionKind Kind;
+ };
- /// ReductionList contains the reduction variables
- /// as well as a single EXIT (from the block) value and the kind of
- /// reduction variable..
- /// Notice that the EXIT instruction can also be the PHI itself.
- typedef DenseMap<PHINode*, ReductionPair> ReductionList;
+ /// ReductionList contains the reduction descriptors for all
+ /// of the reductions that were found in the loop.
+ typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
- /// Returns the maximum vectorization factor that we *can* use to vectorize
- /// this loop. This does not mean that it is profitable to vectorize this
- /// loop, only that it is legal to do so. This may be a large number. We
- /// can vectorize to any SIMD width below this number.
- unsigned getLoopMaxVF();
+ /// Returns true if it is legal to vectorize this loop.
+ /// This does not mean that it is profitable to vectorize this
+ /// loop, only that it is legal to do so.
+ bool canVectorize();
/// Returns the Induction variable.
PHINode *getInduction() {return Induction;}
/// Returns the reduction variables found in the loop.
ReductionList *getReductionVars() { return &Reductions; }
- /// Check that the GEP operands are all uniform except for the last index
- /// which has to be the induction variable.
+ /// Check if the pointer returned by this GEP is consecutive
+ /// when the index is vectorized. This happens when the last
+ /// index of the GEP is consecutive, like the induction variable.
+ /// This check allows us to vectorize A[idx] into a wide load/store.
bool isConsecutiveGep(Value *Ptr);
private:
/// Returns true if BB is vectorizable
bool canVectorizeMemory(BasicBlock &BB);
- // Check if a pointer value is known to be disjoint.
- // Example: Alloca, Global, NoAlias.
- bool isIdentifiedSafeObject(Value* Val);
-
/// Returns True, if 'Phi' is the kind of reduction variable for type
/// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
- /// Checks if a constant matches the reduction kind.
- /// Sums starts with zero. Products start at one.
- bool isReductionConstant(Value *V, ReductionKind Kind);
/// Returns true if the instruction I can be a reduction variable of type
/// 'Kind'.
bool isReductionInstr(Instruction *I, ReductionKind Kind);
SmallPtrSet<Value*, 4> AllowedExit;
};
+/// LoopVectorizationCostModel - estimates the expected speedups due to
+/// vectorization.
+/// In many cases vectorization is not profitable. This can happen because
+/// of a number of reasons. In this class we mainly attempt to predict
+/// the expected speedup/slowdowns due to the supported instruction set.
+/// We use the VectorTargetTransformInfo to query the different backends
+/// for the cost of different operations.
+class LoopVectorizationCostModel {
+public:
+ /// C'tor.
+ LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl,
+ LoopVectorizationLegality *Leg,
+ const VectorTargetTransformInfo *Vtti):
+ TheLoop(Lp), SE(Se), DL(Dl), Legal(Leg), VTTI(Vtti) { }
+
+ /// Returns the most profitable vectorization factor for the loop that is
+ /// smaller or equal to the VF argument. This method checks every power
+ /// of two up to VF.
+ unsigned findBestVectorizationFactor(unsigned VF = 4);
+
+private:
+ /// Returns the expected execution cost. The unit of the cost does
+ /// not matter because we use the 'cost' units to compare different
+ /// vector widths. The cost that is returned is *not* normalized by
+ /// the factor width.
+ unsigned expectedCost(unsigned VF);
+
+ /// Returns the execution time cost of an instruction for a given vector
+ /// width. Vector width of one means scalar.
+ unsigned getInstructionCost(Instruction *I, unsigned VF);
+
+ /// The loop that we evaluate.
+ Loop *TheLoop;
+ /// Scev analysis.
+ ScalarEvolution *SE;
+ /// DataLayout analysis.
+ DataLayout *DL;
+ /// Vectorization legality.
+ LoopVectorizationLegality *Legal;
+ /// Vector target information.
+ const VectorTargetTransformInfo *VTTI;
+};
+
struct LoopVectorize : public LoopPass {
static char ID; // Pass identification, replacement for typeid
ScalarEvolution *SE;
DataLayout *DL;
LoopInfo *LI;
+ TargetTransformInfo *TTI;
virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
-
- // Only vectorize innermost loops.
+ // We only vectorize innermost loops.
if (!L->empty())
return false;
SE = &getAnalysis<ScalarEvolution>();
DL = getAnalysisIfAvailable<DataLayout>();
LI = &getAnalysis<LoopInfo>();
+ TTI = getAnalysisIfAvailable<TargetTransformInfo>();
DEBUG(dbgs() << "LV: Checking a loop in \"" <<
L->getHeader()->getParent()->getName() << "\"\n");
// Check if it is legal to vectorize the loop.
LoopVectorizationLegality LVL(L, SE, DL);
- unsigned MaxVF = LVL.getLoopMaxVF();
-
- // Check that we can vectorize using the chosen vectorization width.
- if (MaxVF < DefaultVectorizationFactor) {
- DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n");
+ if (!LVL.canVectorize()) {
+ DEBUG(dbgs() << "LV: Not vectorizing.\n");
return false;
}
- DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n");
+ // Select the preffered vectorization factor.
+ unsigned VF = 1;
+ if (VectorizationFactor == 0) {
+ const VectorTargetTransformInfo *VTTI = 0;
+ if (TTI)
+ VTTI = TTI->getVectorTargetTransformInfo();
+ // Use the cost model.
+ LoopVectorizationCostModel CM(L, SE, DL, &LVL, VTTI);
+ VF = CM.findBestVectorizationFactor();
+
+ if (VF == 1) {
+ DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+ return false;
+ }
+
+ } else {
+ // Use the user command flag.
+ VF = VectorizationFactor;
+ }
+
+ DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ").\n");
- // If we decided that is is *legal* to vectorizer the loop. Do it.
- SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor);
+ // If we decided that it is *legal* to vectorizer the loop then do it.
+ SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, VF);
LB.vectorize(&LVL);
DEBUG(verifyFunction(*L->getHeader()->getParent()));
}
bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
- GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+ GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
if (!Gep)
return false;
Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
// If we saved a vectorized copy of V, use it.
- ValueMap::iterator it = WidenMap.find(V);
- if (it != WidenMap.end())
- return it->second;
+ Value *&MapEntry = WidenMap[V];
+ if (MapEntry)
+ return MapEntry;
// Broadcast V and save the value for future uses.
Value *B = getBroadcastInstrs(V);
- WidenMap[V] = B;
+ MapEntry = B;
return B;
}
if (!IsVoidRetTy)
VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));
- // For each scalar that we create.
+ // For each scalar that we create:
for (unsigned i = 0; i < VF; ++i) {
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
the vectorized instructions while the old loop will continue to run the
scalar remainder.
- [ ] <-- vector loop bypass.
+ [ ] <-- vector loop bypass.
/ |
/ v
| [ ] <-- vector pre header.
*/
// This is the original scalar-loop preheader.
- BasicBlock *BypassBlock = Orig->getLoopPreheader();
- BasicBlock *ExitBlock = Orig->getExitBlock();
+ BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
+ BasicBlock *ExitBlock = OrigLoop->getExitBlock();
assert(ExitBlock && "Must have an exit block");
- assert(Orig->getNumBlocks() == 1 && "Invalid loop");
+ assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
assert(BypassBlock && "Invalid loop structure");
BasicBlock *VectorPH =
MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
"scalar.preheader");
// Find the induction variable.
- BasicBlock *OldBasicBlock = Orig->getHeader();
+ BasicBlock *OldBasicBlock = OrigLoop->getHeader();
OldInduction = Legal->getInduction();
assert(OldInduction && "We must have a single phi node.");
Type *IdxTy = OldInduction->getType();
Constant *Step = ConstantInt::get(IdxTy, VF);
// Find the loop boundaries.
- const SCEV *ExitCount = SE->getExitCount(Orig, Orig->getHeader());
+ const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
// Get the total trip count from the count by adding 1.
// Register the new loop.
Loop* Lp = new Loop();
- LPM->insertLoop(Lp, Orig->getParentLoop());
+ LPM->insertLoop(Lp, OrigLoop->getParentLoop());
Lp->addBasicBlockToLoop(VecBody, LI->getBase());
- Loop *ParentLoop = Orig->getParentLoop();
+ Loop *ParentLoop = OrigLoop->getParentLoop();
if (ParentLoop) {
ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
void
SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
+ //===------------------------------------------------===//
+ //
+ // Notice: any optimization or new instruction that go
+ // into the code below should be also be implemented in
+ // the cost-model.
+ //
+ //===------------------------------------------------===//
typedef SmallVector<PHINode*, 4> PhiVector;
- BasicBlock &BB = *Orig->getHeader();
+ BasicBlock &BB = *OrigLoop->getHeader();
+ Constant *Zero = ConstantInt::get(
+ IntegerType::getInt32Ty(BB.getContext()), 0);
// In order to support reduction variables we need to be able to vectorize
// Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
}
case Instruction::Select: {
// Widen selects.
- // TODO: If the selector is loop invariant we can issue a select
- // instruction with a scalar condition.
- Value *A = getVectorValue(Inst->getOperand(0));
- Value *B = getVectorValue(Inst->getOperand(1));
- Value *C = getVectorValue(Inst->getOperand(2));
- WidenMap[Inst] = Builder.CreateSelect(A, B, C);
+ // If the selector is loop invariant we can create a select
+ // instruction with a scalar condition. Otherwise, use vector-select.
+ Value *Cond = Inst->getOperand(0);
+ bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
+
+ // The condition can be loop invariant but still defined inside the
+ // loop. This means that we can't just use the original 'cond' value.
+ // We have to take the 'vectorized' value and pick the first lane.
+ // Instcombine will make this a no-op.
+ Cond = getVectorValue(Cond);
+ if (InvariantCond)
+ Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
+
+ Value *Op0 = getVectorValue(Inst->getOperand(1));
+ Value *Op1 = getVectorValue(Inst->getOperand(2));
+ WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1);
break;
}
break;
}
+ // The last index does not have to be the induction. It can be
+ // consecutive and be a function of the index. For example A[I+1];
+ unsigned NumOperands = Gep->getNumOperands();
+ Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
+ LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
+
// Create the new GEP with the new induction variable.
GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
- unsigned NumOperands = Gep->getNumOperands();
- Gep2->setOperand(NumOperands - 1, Induction);
+ Gep2->setOperand(NumOperands - 1, LastIndex);
Ptr = Builder.Insert(Gep2);
Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
Value *Val = getVectorValue(SI->getValueOperand());
break;
}
+ // The last index does not have to be the induction. It can be
+ // consecutive and be a function of the index. For example A[I+1];
+ unsigned NumOperands = Gep->getNumOperands();
+ Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
+ LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
+
// Create the new GEP with the new induction variable.
GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
- unsigned NumOperands = Gep->getNumOperands();
- Gep2->setOperand(NumOperands - 1, Induction);
+ Gep2->setOperand(NumOperands - 1, LastIndex);
Ptr = Builder.Insert(Gep2);
Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
LI = Builder.CreateLoad(Ptr);
PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
assert(RdxPhi && "Unable to recover vectorized PHI");
- // Find the reduction variable.
+ // Find the reduction variable descriptor.
assert(Legal->getReductionVars()->count(RdxPhi) &&
"Unable to find the reduction variable");
- LoopVectorizationLegality::ReductionPair ReductionVar =
+ LoopVectorizationLegality::ReductionDescriptor RdxDesc =
(*Legal->getReductionVars())[RdxPhi];
+ // We need to generate a reduction vector from the incoming scalar.
+ // To do so, we need to generate the 'identity' vector and overide
+ // one of the elements with the incoming scalar reduction. We need
+ // to do it in the vector-loop preheader.
+ Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
+
// This is the vector-clone of the value that leaves the loop.
- Value *VectorExit = getVectorValue(ReductionVar.first);
+ Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
Type *VecTy = VectorExit->getType();
- // This is the kind of reduction.
- LoopVectorizationLegality::ReductionKind RdxKind = ReductionVar.second;
- // Find the reduction identity variable.
- // Zero for addition. One for Multiplication.
- unsigned IdentitySclr =
- (RdxKind == LoopVectorizationLegality::IntegerAdd ? 0 : 1);
- Constant *Identity = getUniformVector(IdentitySclr, VecTy->getScalarType());
+ // Find the reduction identity variable. The value of the enum is the
+ // identity. Zero for addition. One for Multiplication.
+ unsigned IdentitySclr = RdxDesc.Kind;
+ Constant *Identity = getUniformVector(IdentitySclr,
+ VecTy->getScalarType());
+
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ Value *VectorStart = Builder.CreateInsertElement(Identity,
+ RdxDesc.StartValue, Zero);
+
// Fix the vector-loop phi.
// We created the induction variable so we know that the
// preheader is the first entry.
BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
- VecRdxPhi->addIncoming(Identity, VecPreheader);
+
+ // Reductions do not have to start at zero. They can start with
+ // any loop invariant values.
+ VecRdxPhi->addIncoming(VectorStart, VecPreheader);
unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
VecRdxPhi->addIncoming(Val, LoopVectorBody);
Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
// This PHINode contains the vectorized reduction variable, or
- // the identity vector, if we bypass the vector loop.
+ // the initial value vector, if we bypass the vector loop.
PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
- NewPhi->addIncoming(Identity, LoopBypassBlock);
- NewPhi->addIncoming(getVectorValue(ReductionVar.first), LoopVectorBody);
+ NewPhi->addIncoming(VectorStart, LoopBypassBlock);
+ NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);
// Extract the first scalar.
Value *Scalar0 =
for (unsigned i=1; i < VF; ++i) {
Value *Scalar1 =
Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
- if (RdxKind == LoopVectorizationLegality::IntegerAdd) {
+ if (RdxDesc.Kind == LoopVectorizationLegality::IntegerAdd) {
Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
} else {
Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
if (!LCSSAPhi) continue;
- // All PHINodes need to have a single entry edge, or two if we already fixed them.
+ // All PHINodes need to have a single entry edge, or two if
+ // we already fixed them.
assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
- // We found our reduction value exit-PHI. Update it with the incoming bypass edge.
- if (LCSSAPhi->getIncomingValue(0) == ReductionVar.first) {
+ // We found our reduction value exit-PHI. Update it with the
+ // incoming bypass edge.
+ if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
// Add an edge coming from the bypass.
LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
break;
int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
(RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
- (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, ReductionVar.first);
+ (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
}// end of for each redux variable.
}
void SingleBlockLoopVectorizer::cleanup() {
// The original basic block.
- SE->forgetLoop(Orig);
+ SE->forgetLoop(OrigLoop);
}
-unsigned LoopVectorizationLegality::getLoopMaxVF() {
+bool LoopVectorizationLegality::canVectorize() {
if (!TheLoop->getLoopPreheader()) {
assert(false && "No preheader!!");
DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
- return 1;
+ return false;
}
// We can only vectorize single basic block loops.
unsigned NumBlocks = TheLoop->getNumBlocks();
if (NumBlocks != 1) {
DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
- return 1;
+ return false;
}
// We need to have a loop header.
// Go over each instruction and look at memory deps.
if (!canVectorizeBlock(*BB)) {
DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
- return 1;
+ return false;
}
// ScalarEvolution needs to be able to find the exit count.
const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
if (ExitCount == SE->getCouldNotCompute()) {
DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
- return 1;
+ return false;
}
DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
// Okay! We can vectorize. At this point we don't have any other mem analysis
- // which may limit our maximum vectorization factor, so just return the
- // maximum SIMD size.
- return DefaultVectorizationFactor;
+ // which may limit our maximum vectorization factor, so just return true with
+ // no restrictions.
+ return true;
}
bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
GetUnderlyingObjects(*I, TempObjects, DL);
for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
it != e; ++it) {
- if (!isIdentifiedSafeObject(*it)) {
+ if (!isIdentifiedObject(*it)) {
DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
return false;
}
GetUnderlyingObjects(*I, TempObjects, DL);
for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
it != e; ++it) {
- if (!isIdentifiedSafeObject(*it)) {
+ if (!isIdentifiedObject(*it)) {
DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
return false;
}
return true;
}
-/// Checks if the value is a Global variable or if it is an Arguments
-/// marked with the NoAlias attribute.
-bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) {
- assert(Val && "Invalid value");
- if (dyn_cast<GlobalValue>(Val))
- return true;
- if (dyn_cast<AllocaInst>(Val))
- return true;
- Argument *A = dyn_cast<Argument>(Val);
- if (!A)
- return false;
- return A->hasNoAliasAttr();
-}
-
bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
- ReductionKind Kind) {
+ ReductionKind Kind) {
if (Phi->getNumIncomingValues() != 2)
return false;
int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);
- // We must have a constant that starts the reduction.
- if (!isReductionConstant(RdxStart, Kind))
- return false;
-
// ExitInstruction is the single value which is used outside the loop.
// We only allow for a single reduction value to be used outside the loop.
// This includes users of the reduction, variables (which form a cycle
// If the instruction has no users then this is a broken
// chain and can't be a reduction variable.
- if (Iter->use_begin() == Iter->use_end())
+ if (Iter->use_empty())
return false;
// For each of the *users* of iter.
if (FoundStartPHI && ExitInstruction) {
// This instruction is allowed to have out-of-loop users.
AllowedExit.insert(ExitInstruction);
- // Mark this as a reduction var.
- Reductions[Phi] = std::make_pair(ExitInstruction, Kind);
+
+ // Save the description of this reduction variable.
+ ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
+ Reductions[Phi] = RD;
return true;
}
}
}
-bool
-LoopVectorizationLegality::isReductionConstant(Value *V, ReductionKind Kind) {
- ConstantInt *CI = dyn_cast<ConstantInt>(V);
- if (!CI)
- return false;
- if (Kind == IntegerMult && CI->isOne())
- return true;
- if (Kind == IntegerAdd && CI->isZero())
- return true;
- return false;
-}
-
bool
LoopVectorizationLegality::isReductionInstr(Instruction *I,
ReductionKind Kind) {
return true;
}
+unsigned
+LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
+ if (!VTTI) {
+ DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n");
+ return 1;
+ }
+
+ float Cost = expectedCost(1);
+ unsigned Width = 1;
+ DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
+ for (unsigned i=2; i <= VF; i*=2) {
+ // Notice that the vector loop needs to be executed less times, so
+ // we need to divide the cost of the vector loops by the width of
+ // the vector elements.
+ float VectorCost = expectedCost(i) / (float)i;
+ DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
+ (int)VectorCost << ".\n");
+ if (VectorCost < Cost) {
+ Cost = VectorCost;
+ Width = i;
+ }
+ }
+
+ DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
+ return Width;
+}
+
+unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
+ // We can only estimate the cost of single basic block loops.
+ assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");
+
+ BasicBlock *BB = TheLoop->getHeader();
+ unsigned Cost = 0;
+
+ // For each instruction in the old loop.
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ Instruction *Inst = it;
+ Cost += getInstructionCost(Inst, VF);
+ }
+
+ // Return the cost divided by VF, because we will be executing
+ // less iterations of the vector form.
+ return Cost;
+}
+
+unsigned
+LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
+ assert(VTTI && "Invalid vector target transformation info");
+ switch (I->getOpcode()) {
+ case Instruction::Br: {
+ return VTTI->getInstrCost(I->getOpcode());
+ }
+ case Instruction::PHI:
+ // PHIs are handled the same as the binary instructions below.
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ Type *VTy = VectorType::get(I->getType(), VF);
+ return VTTI->getInstrCost(I->getOpcode(), VTy);
+ }
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ Type *VTy = VectorType::get(I->getType(), VF);
+ const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
+ bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
+ Type *CondTy = SI->getCondition()->getType();
+ if (ScalarCond)
+ CondTy = VectorType::get(CondTy, VF);
+
+ return VTTI->getInstrCost(I->getOpcode(), VTy, CondTy);
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ Type *VTy = VectorType::get(I->getOperand(0)->getType(), VF);
+ return VTTI->getInstrCost(I->getOpcode(), VTy);
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(I);
+ Type *VTy = VectorType::get(SI->getValueOperand()->getType(), VF);
+
+ // Scalarized stores.
+ if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
+ unsigned Cost = 0;
+ unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy);
+ // The cost of extracting from the vector value.
+ Cost += VF * ExtCost;
+ // The cost of the scalar stores.
+ Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType());
+ return Cost;
+ }
+
+ // Wide stores.
+ return VTTI->getMemoryOpCost(I->getOpcode(), VTy, SI->getAlignment(),
+ SI->getPointerAddressSpace());
+ }
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(I);
+ Type *VTy = VectorType::get(I->getType(), VF);
+
+ // Scalarized loads.
+ if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
+ unsigned Cost = 0;
+ unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
+ // The cost of inserting the loaded value into the result vector.
+ Cost += VF * InCost;
+ // The cost of the scalar stores.
+ Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType());
+ return Cost;
+ }
+
+ // Wide loads.
+ return VTTI->getMemoryOpCost(I->getOpcode(), VTy, LI->getAlignment(),
+ LI->getPointerAddressSpace());
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = VectorType::get(I->getOperand(0)->getType(), VF);
+ Type *DstTy = VectorType::get(I->getType(), VF);
+ return VTTI->getInstrCost(I->getOpcode(), DstTy, SrcTy);
+ }
+ default: {
+ // We are scalarizing the instruction. Return the cost of the scalar
+ // instruction, plus the cost of insert and extract into vector
+ // elements, times the vector width.
+ unsigned Cost = 0;
+ Type *Ty = I->getType();
+
+ if (!Ty->isVoidTy()) {
+ Type *VTy = VectorType::get(Ty, VF);
+ unsigned InsCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
+ unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy);
+ Cost += VF * (InsCost + ExtCost);
+ }
+
+ /// We don't have any information on the scalar instruction, but maybe
+ /// the target has.
+ /// TODO: This may be a target-specific intrinsic.
+ /// Need to add API for that.
+ Cost += VF * VTTI->getInstrCost(I->getOpcode(), Ty);
+
+ return Cost;
+ }
+ }// end of switch.
+}
+
+
} // namespace
char LoopVectorize::ID = 0;