1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This is a simple loop vectorizer. We currently only support single block
11 // loops. We have a very simple and restrictive legality check: we need to read
12 // and write from disjoint memory locations. We still don't have a cost model.
13 // We do support integer reductions.
15 // This pass has three parts:
16 // 1. The main loop pass that drives the different parts.
17 // 2. LoopVectorizationLegality - A helper class that checks for the legality
18 // of the vectorization.
19 // 3. SingleBlockLoopVectorizer - A helper class that performs the actual
20 // widening of instructions.
22 //===----------------------------------------------------------------------===//
23 #define LV_NAME "loop-vectorize"
24 #define DEBUG_TYPE LV_NAME
25 #include "llvm/Constants.h"
26 #include "llvm/DerivedTypes.h"
27 #include "llvm/Instructions.h"
28 #include "llvm/LLVMContext.h"
29 #include "llvm/Pass.h"
30 #include "llvm/Analysis/LoopPass.h"
31 #include "llvm/Value.h"
32 #include "llvm/Function.h"
33 #include "llvm/Analysis/Verifier.h"
34 #include "llvm/Module.h"
35 #include "llvm/Type.h"
36 #include "llvm/ADT/SmallVector.h"
37 #include "llvm/ADT/StringExtras.h"
38 #include "llvm/Analysis/AliasAnalysis.h"
39 #include "llvm/Analysis/AliasSetTracker.h"
40 #include "llvm/Transforms/Scalar.h"
41 #include "llvm/Analysis/ScalarEvolution.h"
42 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
43 #include "llvm/Analysis/ScalarEvolutionExpander.h"
44 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
45 #include "llvm/Analysis/ValueTracking.h"
46 #include "llvm/Analysis/LoopInfo.h"
47 #include "llvm/Support/CommandLine.h"
48 #include "llvm/Support/Debug.h"
49 #include "llvm/Support/raw_ostream.h"
50 #include "llvm/DataLayout.h"
51 #include "llvm/Transforms/Utils/Local.h"
55 static cl::opt<unsigned>
56 DefaultVectorizationFactor("default-loop-vectorize-width",
57 cl::init(4), cl::Hidden,
58 cl::desc("Set the default loop vectorization width"));
61 // Forward declaration.
62 class LoopVectorizationLegality;
64 /// Vectorize a simple loop. This class performs the widening of simple single
65 /// basic block loops into vectors. It does not perform any
66 /// vectorization-legality checks, and just does it. It widens the vectors
67 /// to a given vectorization factor (VF).
68 class SingleBlockLoopVectorizer {
71 SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li,
72 LPPassManager *Lpm, unsigned VecWidth):
73 Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
74 Builder(Se->getContext()), Induction(0), OldInduction(0) { }
76 // Perform the actual loop widening (vectorization).
77 void vectorize(LoopVectorizationLegality *Legal) {
78 ///Create a new empty loop. Unlink the old loop and connect the new one.
80 /// Widen each instruction in the old loop to a new one in the new loop.
81 /// Use the Legality module to find the induction and reduction variables.
83 // register the new loop.
88 /// Create an empty loop, based on the loop ranges of the old loop.
89 void createEmptyLoop();
90 /// Copy and widen the instructions from the old loop.
91 void vectorizeLoop(LoopVectorizationLegality *Legal);
92 /// Insert the new loop to the loop hierarchy and pass manager.
95 /// This instruction is un-vectorizable. Implement it as a sequence
97 void scalarizeInstruction(Instruction *Instr);
99 /// Create a broadcast instruction. This method generates a broadcast
100 /// instruction (shuffle) for loop invariant values and for the induction
101 /// value. If this is the induction variable then we extend it to N, N+1, ...
102 /// this is needed because each iteration in the loop corresponds to a SIMD
104 Value *getBroadcastInstrs(Value *V);
106 /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 ..
107 /// for each element in the vector. Starting from zero.
108 Value *getConsecutiveVector(Value* Val);
110 /// Check that the GEP operands are all uniform except for the last index
111 /// which has to be the induction variable.
112 bool isConsecutiveGep(GetElementPtrInst *Gep);
114 /// When we go over instructions in the basic block we rely on previous
115 /// values within the current basic block or on loop invariant values.
116 /// When we widen (vectorize) values we place them in the map. If the values
117 /// are not within the map, they have to be loop invariant, so we simply
118 /// broadcast them into a vector.
119 Value *getVectorValue(Value *V);
121 /// Get a uniform vector of constant integers. We use this to get
122 /// vectors of ones and zeros for the reduction code.
123 Constant* getUniformVector(unsigned Val, Type* ScalarTy);
125 typedef DenseMap<Value*, Value*> ValueMap;
127 /// The original loop.
129 // Scev analysis to use.
133 // Loop Pass Manager;
135 // The vectorization factor to use.
138 // The builder that we use
141 // --- Vectorization state ---
143 /// Middle Block between the vector and the scalar.
144 BasicBlock *LoopMiddleBlock;
145 ///The ExitBlock of the scalar loop.
146 BasicBlock *LoopExitBlock;
147 ///The vector loop body.
148 BasicBlock *LoopVectorBody;
149 ///The scalar loop body.
150 BasicBlock *LoopScalarBody;
151 ///The first bypass block.
152 BasicBlock *LoopBypassBlock;
154 /// The new Induction variable which was added to the new block.
156 /// The induction variable of the old basic block.
157 PHINode *OldInduction;
158 // Maps scalars to widened vectors.
162 /// Perform the vectorization legality check. This class does not look at the
163 /// profitability of vectorization, only the legality. At the moment the checks
164 /// are very simple and focus on single basic block loops with a constant
165 /// iteration count and no reductions.
166 class LoopVectorizationLegality {
168 LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl):
169 TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
171 /// This represents the kinds of reductions that we support.
173 IntegerAdd, /// Sum of numbers.
174 IntegerMult, /// Product of numbers.
175 NoReduction /// Not a reduction.
178 // Holds a pairing of reduction instruction and the reduction kind.
179 typedef std::pair<Instruction*, ReductionKind> ReductionPair;
181 /// ReductionList contains the reduction variables
182 /// as well as a single EXIT (from the block) value and the kind of
183 /// reduction variable..
184 /// Notice that the EXIT instruction can also be the PHI itself.
185 typedef DenseMap<PHINode*, ReductionPair> ReductionList;
187 /// Returns the maximum vectorization factor that we *can* use to vectorize
188 /// this loop. This does not mean that it is profitable to vectorize this
189 /// loop, only that it is legal to do so. This may be a large number. We
190 /// can vectorize to any SIMD width below this number.
191 unsigned getLoopMaxVF();
193 /// Returns the Induction variable.
194 PHINode *getInduction() {return Induction;}
196 /// Returns the reduction variables found in the loop.
197 ReductionList *getReductionVars() { return &Reductions; }
200 /// Check if a single basic block loop is vectorizable.
201 /// At this point we know that this is a loop with a constant trip count
202 /// and we only need to check individual instructions.
203 bool canVectorizeBlock(BasicBlock &BB);
205 // Check if a pointer value is known to be disjoint.
206 // Example: Alloca, Global, NoAlias.
207 bool isIdentifiedSafeObject(Value* Val);
209 /// Returns True, if 'Phi' is the kind of reduction variable for type
210 /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
211 bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
212 /// Checks if a constant matches the reduction kind.
213 /// Sums starts with zero. Products start at one.
214 bool isReductionConstant(Value *V, ReductionKind Kind);
215 /// Returns true if the instruction I can be a reduction variable of type
217 bool isReductionInstr(Instruction *I, ReductionKind Kind);
219 /// The loop that we evaluate.
223 /// DataLayout analysis.
226 // --- vectorization state --- //
228 /// Holds the induction variable.
230 /// Holds the reduction variables.
231 ReductionList Reductions;
232 /// Allowed outside users. This holds the reduction
233 /// vars which can be accessed from outside the loop.
234 SmallPtrSet<Value*, 4> AllowedExit;
237 struct LoopVectorize : public LoopPass {
238 static char ID; // Pass identification, replacement for typeid
240 LoopVectorize() : LoopPass(ID) {
241 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
248 virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
250 // Only vectorize innermost loops.
254 SE = &getAnalysis<ScalarEvolution>();
255 DL = getAnalysisIfAvailable<DataLayout>();
256 LI = &getAnalysis<LoopInfo>();
258 DEBUG(dbgs() << "LV: Checking a loop in \"" <<
259 L->getHeader()->getParent()->getName() << "\"\n");
261 // Check if it is legal to vectorize the loop.
262 LoopVectorizationLegality LVL(L, SE, DL);
263 unsigned MaxVF = LVL.getLoopMaxVF();
265 // Check that we can vectorize using the chosen vectorization width.
266 if (MaxVF < DefaultVectorizationFactor) {
267 DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n");
271 DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n");
273 // If we decided that is is *legal* to vectorizer the loop. Do it.
274 SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor);
277 DEBUG(verifyFunction(*L->getHeader()->getParent()));
281 virtual void getAnalysisUsage(AnalysisUsage &AU) const {
282 LoopPass::getAnalysisUsage(AU);
283 AU.addRequiredID(LoopSimplifyID);
284 AU.addRequiredID(LCSSAID);
285 AU.addRequired<LoopInfo>();
286 AU.addRequired<ScalarEvolution>();
291 Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
292 // Instructions that access the old induction variable
293 // actually want to get the new one.
294 if (V == OldInduction)
297 LLVMContext &C = V->getContext();
298 Type *VTy = VectorType::get(V->getType(), VF);
299 Type *I32 = IntegerType::getInt32Ty(C);
300 Constant *Zero = ConstantInt::get(I32, 0);
301 Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
302 Value *UndefVal = UndefValue::get(VTy);
303 // Insert the value into a new vector.
304 Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
305 // Broadcast the scalar into all locations in the vector.
306 Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
308 // We are accessing the induction variable. Make sure to promote the
309 // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
311 return getConsecutiveVector(Shuf);
315 Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
316 assert(Val->getType()->isVectorTy() && "Must be a vector");
317 assert(Val->getType()->getScalarType()->isIntegerTy() &&
318 "Elem must be an integer");
320 Type *ITy = Val->getType()->getScalarType();
321 VectorType *Ty = cast<VectorType>(Val->getType());
322 unsigned VLen = Ty->getNumElements();
323 SmallVector<Constant*, 8> Indices;
325 // Create a vector of consecutive numbers from zero to VF.
326 for (unsigned i = 0; i < VLen; ++i)
327 Indices.push_back(ConstantInt::get(ITy, i));
329 // Add the consecutive indices to the vector value.
330 Constant *Cv = ConstantVector::get(Indices);
331 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
332 return Builder.CreateAdd(Val, Cv, "induction");
336 bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) {
340 unsigned NumOperands = Gep->getNumOperands();
341 Value *LastIndex = Gep->getOperand(NumOperands - 1);
343 // Check that all of the gep indices are uniform except for the last.
344 for (unsigned i = 0; i < NumOperands - 1; ++i)
345 if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), Orig))
348 // We can emit wide load/stores only of the last index is the induction
350 const SCEV *Last = SE->getSCEV(LastIndex);
351 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
352 const SCEV *Step = AR->getStepRecurrence(*SE);
354 // The memory is consecutive because the last index is consecutive
355 // and all other indices are loop invariant.
363 Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
364 assert(!V->getType()->isVectorTy() && "Can't widen a vector");
365 // If we saved a vectorized copy of V, use it.
366 ValueMap::iterator it = WidenMap.find(V);
367 if (it != WidenMap.end())
370 // Broadcast V and save the value for future uses.
371 Value *B = getBroadcastInstrs(V);
377 SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
378 SmallVector<Constant*, 8> Indices;
379 // Create a vector of consecutive numbers from zero to VF.
380 for (unsigned i = 0; i < VF; ++i)
381 Indices.push_back(ConstantInt::get(ScalarTy, Val));
383 // Add the consecutive indices to the vector value.
384 return ConstantVector::get(Indices);
387 void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
388 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
389 // Holds vector parameters or scalars, in case of uniform vals.
390 SmallVector<Value*, 8> Params;
392 // Find all of the vectorized parameters.
393 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
394 Value *SrcOp = Instr->getOperand(op);
396 // If we are accessing the old induction variable, use the new one.
397 if (SrcOp == OldInduction) {
398 Params.push_back(getBroadcastInstrs(Induction));
402 // Try using previously calculated values.
403 Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
405 // If the src is an instruction that appeared earlier in the basic block
406 // then it should already be vectorized.
407 if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
408 assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
409 // The parameter is a vector value from earlier.
410 Params.push_back(WidenMap[SrcInst]);
412 // The parameter is a scalar from outside the loop. Maybe even a constant.
413 Params.push_back(SrcOp);
417 assert(Params.size() == Instr->getNumOperands() &&
418 "Invalid number of operands");
420 // Does this instruction return a value ?
421 bool IsVoidRetTy = Instr->getType()->isVoidTy();
422 Value *VecResults = 0;
424 // If we have a return value, create an empty vector. We place the scalarized
425 // instructions in this vector.
427 VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));
429 // For each scalar that we create.
430 for (unsigned i = 0; i < VF; ++i) {
431 Instruction *Cloned = Instr->clone();
433 Cloned->setName(Instr->getName() + ".cloned");
434 // Replace the operands of the cloned instrucions with extracted scalars.
435 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
436 Value *Op = Params[op];
437 // Param is a vector. Need to extract the right lane.
438 if (Op->getType()->isVectorTy())
439 Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
440 Cloned->setOperand(op, Op);
443 // Place the cloned scalar in the new loop.
444 Builder.Insert(Cloned);
446 // If the original scalar returns a value we need to place it in a vector
447 // so that future users will be able to use it.
449 VecResults = Builder.CreateInsertElement(VecResults, Cloned,
450 Builder.getInt32(i));
454 WidenMap[Instr] = VecResults;
457 void SingleBlockLoopVectorizer::createEmptyLoop() {
459 In this function we generate a new loop. The new loop will contain
460 the vectorized instructions while the old loop will continue to run the
463 [ ] <-- vector loop bypass.
466 | [ ] <-- vector pre header.
470 | [ ]_| <-- vector loop.
473 >[ ] <--- middle-block.
476 | [ ] <--- new preheader.
480 | [ ]_| <-- old scalar loop to handle remainder.
487 // This is the original scalar-loop preheader.
488 BasicBlock *BypassBlock = Orig->getLoopPreheader();
489 BasicBlock *ExitBlock = Orig->getExitBlock();
490 assert(ExitBlock && "Must have an exit block");
492 assert(Orig->getNumBlocks() == 1 && "Invalid loop");
493 assert(BypassBlock && "Invalid loop structure");
495 BasicBlock *VectorPH =
496 BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
497 BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
500 BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
502 BasicBlock *ScalarPH =
503 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
505 // Find the induction variable.
506 BasicBlock *OldBasicBlock = Orig->getHeader();
507 OldInduction = dyn_cast<PHINode>(OldBasicBlock->begin());
508 assert(OldInduction && "We must have a single phi node.");
509 Type *IdxTy = OldInduction->getType();
511 // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
513 Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
515 // Generate the induction variable.
516 Induction = Builder.CreatePHI(IdxTy, 2, "index");
517 Constant *Zero = ConstantInt::get(IdxTy, 0);
518 Constant *Step = ConstantInt::get(IdxTy, VF);
520 // Find the loop boundaries.
521 const SCEV *ExitCount = SE->getExitCount(Orig, Orig->getHeader());
522 assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
524 // Get the total trip count from the count by adding 1.
525 ExitCount = SE->getAddExpr(ExitCount,
526 SE->getConstant(ExitCount->getType(), 1));
528 // Expand the trip count and place the new instructions in the preheader.
529 // Notice that the pre-header does not change, only the loop body.
530 SCEVExpander Exp(*SE, "induction");
531 Instruction *Loc = BypassBlock->getTerminator();
533 // We may need to extend the index in case there is a type mismatch.
534 // We know that the count starts at zero and does not overflow.
535 // We are using Zext because it should be less expensive.
536 if (ExitCount->getType() != Induction->getType())
537 ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
539 // Count holds the overall loop count (N).
540 Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
541 // Now we need to generate the expression for N - (N % VF), which is
542 // the part that the vectorized body will execute.
543 Constant *CIVF = ConstantInt::get(IdxTy, VF);
544 Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
545 Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
547 // Now, compare the new count to zero. If it is zero, jump to the scalar part.
548 Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
549 CountRoundDown, ConstantInt::getNullValue(IdxTy),
551 BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
552 // Remove the old terminator.
553 Loc->eraseFromParent();
555 // Add a check in the middle block to see if we have completed
556 // all of the iterations in the first vector loop.
557 // If (N - N%VF) == N, then we *don't* need to run the remainder.
558 Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
559 CountRoundDown, "cmp.n",
560 MiddleBlock->getTerminator());
562 BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
563 // Remove the old terminator.
564 MiddleBlock->getTerminator()->eraseFromParent();
566 // Create i+1 and fill the PHINode.
567 Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
568 Induction->addIncoming(Zero, VectorPH);
569 Induction->addIncoming(NextIdx, VecBody);
570 // Create the compare.
571 Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown);
572 Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
574 // Now we have two terminators. Remove the old one from the block.
575 VecBody->getTerminator()->eraseFromParent();
577 // Fix the scalar body iteration count.
578 unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
579 OldInduction->setIncomingValue(BlockIdx, CountRoundDown);
581 // Get ready to start creating new instructions into the vectorized body.
582 Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
584 // Register the new loop.
585 Loop* Lp = new Loop();
586 LPM->insertLoop(Lp, Orig->getParentLoop());
588 Lp->addBasicBlockToLoop(VecBody, LI->getBase());
590 Loop *ParentLoop = Orig->getParentLoop();
592 ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
593 ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
594 ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
598 LoopMiddleBlock = MiddleBlock;
599 LoopExitBlock = ExitBlock;
600 LoopVectorBody = VecBody;
601 LoopScalarBody = OldBasicBlock;
602 LoopBypassBlock = BypassBlock;
606 SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
607 typedef SmallVector<PHINode*, 4> PhiVector;
608 BasicBlock &BB = *Orig->getHeader();
610 // In order to support reduction variables we need to be able to vectorize
611 // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
612 // steages. First, we create a new vector PHI node with no incoming edges.
613 // We use this value when we vectorize all of the instructions that use the
614 // PHI. Next, after all of the instructions in the block are complete we
615 // add the new incoming edges to the PHI. At this point all of the
616 // instructions in the basic block are vectorized, so we can use them to
617 // construct the PHI.
620 // For each instruction in the old loop.
621 for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
622 Instruction *Inst = it;
624 switch (Inst->getOpcode()) {
625 case Instruction::Br:
626 // Nothing to do for PHIs and BR, since we already took care of the
627 // loop control flow instructions.
629 case Instruction::PHI:{
630 PHINode* P = cast<PHINode>(Inst);
631 // Special handling for the induction var.
632 if (OldInduction == Inst)
634 // This is phase I of vectorizing PHIs.
635 // This has to be a reduction variable.
636 assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
637 Type *VecTy = VectorType::get(Inst->getType(), VF);
638 WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
639 PHIsToFix.push_back(P);
642 case Instruction::Add:
643 case Instruction::FAdd:
644 case Instruction::Sub:
645 case Instruction::FSub:
646 case Instruction::Mul:
647 case Instruction::FMul:
648 case Instruction::UDiv:
649 case Instruction::SDiv:
650 case Instruction::FDiv:
651 case Instruction::URem:
652 case Instruction::SRem:
653 case Instruction::FRem:
654 case Instruction::Shl:
655 case Instruction::LShr:
656 case Instruction::AShr:
657 case Instruction::And:
658 case Instruction::Or:
659 case Instruction::Xor: {
660 // Just widen binops.
661 BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
662 Value *A = getVectorValue(Inst->getOperand(0));
663 Value *B = getVectorValue(Inst->getOperand(1));
664 // Use this vector value for all users of the original instruction.
665 WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
668 case Instruction::Select: {
670 // TODO: If the selector is loop invariant we can issue a select
671 // instruction with a scalar condition.
672 Value *A = getVectorValue(Inst->getOperand(0));
673 Value *B = getVectorValue(Inst->getOperand(1));
674 Value *C = getVectorValue(Inst->getOperand(2));
675 WidenMap[Inst] = Builder.CreateSelect(A, B, C);
679 case Instruction::ICmp:
680 case Instruction::FCmp: {
681 // Widen compares. Generate vector compares.
682 bool FCmp = (Inst->getOpcode() == Instruction::FCmp);
683 CmpInst *Cmp = dyn_cast<CmpInst>(Inst);
684 Value *A = getVectorValue(Inst->getOperand(0));
685 Value *B = getVectorValue(Inst->getOperand(1));
687 WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
689 WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
693 case Instruction::Store: {
694 // Attempt to issue a wide store.
695 StoreInst *SI = dyn_cast<StoreInst>(Inst);
696 Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
697 Value *Ptr = SI->getPointerOperand();
698 unsigned Alignment = SI->getAlignment();
699 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
700 // This store does not use GEPs.
701 if (!isConsecutiveGep(Gep)) {
702 scalarizeInstruction(Inst);
706 // Create the new GEP with the new induction variable.
707 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
708 unsigned NumOperands = Gep->getNumOperands();
709 Gep2->setOperand(NumOperands - 1, Induction);
710 Ptr = Builder.Insert(Gep2);
711 Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
712 Value *Val = getVectorValue(SI->getValueOperand());
713 Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
716 case Instruction::Load: {
717 // Attempt to issue a wide load.
718 LoadInst *LI = dyn_cast<LoadInst>(Inst);
719 Type *RetTy = VectorType::get(LI->getType(), VF);
720 Value *Ptr = LI->getPointerOperand();
721 unsigned Alignment = LI->getAlignment();
722 GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
724 // We don't have a gep. Scalarize the load.
725 if (!isConsecutiveGep(Gep)) {
726 scalarizeInstruction(Inst);
730 // Create the new GEP with the new induction variable.
731 GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
732 unsigned NumOperands = Gep->getNumOperands();
733 Gep2->setOperand(NumOperands - 1, Induction);
734 Ptr = Builder.Insert(Gep2);
735 Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
736 LI = Builder.CreateLoad(Ptr);
737 LI->setAlignment(Alignment);
738 // Use this vector value for all users of the load.
742 case Instruction::ZExt:
743 case Instruction::SExt:
744 case Instruction::FPToUI:
745 case Instruction::FPToSI:
746 case Instruction::FPExt:
747 case Instruction::PtrToInt:
748 case Instruction::IntToPtr:
749 case Instruction::SIToFP:
750 case Instruction::UIToFP:
751 case Instruction::Trunc:
752 case Instruction::FPTrunc:
753 case Instruction::BitCast: {
754 /// Vectorize bitcasts.
755 CastInst *CI = dyn_cast<CastInst>(Inst);
756 Value *A = getVectorValue(Inst->getOperand(0));
757 Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
758 WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
763 /// All other instructions are unsupported. Scalarize them.
764 scalarizeInstruction(Inst);
767 }// end of for_each instr.
769 // At this point every instruction in the original loop is widended to
770 // a vector form. We are almost done. Now, we need to fix the PHI nodes
771 // that we vectorized. The PHI nodes are currently empty because we did
772 // not want to introduce cycles. Notice that the remaining PHI nodes
773 // that we need to fix are reduction variables.
775 // Create the 'reduced' values for each of the induction vars.
776 // The reduced values are the vector values that we scalarize and combine
777 // after the loop is finished.
778 for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
780 PHINode *RdxPhi = *it;
781 PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
782 assert(RdxPhi && "Unable to recover vectorized PHI");
784 // Find the reduction variable.
785 assert(Legal->getReductionVars()->count(RdxPhi) &&
786 "Unable to find the reduction variable");
787 LoopVectorizationLegality::ReductionPair ReductionVar =
788 (*Legal->getReductionVars())[RdxPhi];
790 // This is the vector-clone of the value that leaves the loop.
791 Value *VectorExit = getVectorValue(ReductionVar.first);
792 Type *VecTy = VectorExit->getType();
794 // This is the kind of reduction.
795 LoopVectorizationLegality::ReductionKind RdxKind = ReductionVar.second;
796 // Find the reduction identity variable.
797 // Zero for addition. One for Multiplication.
798 unsigned IdentitySclr =
799 (RdxKind == LoopVectorizationLegality::IntegerAdd ? 0 : 1);
800 Constant *Identity = getUniformVector(IdentitySclr, VecTy->getScalarType());
802 // Fix the vector-loop phi.
803 // We created the induction variable so we know that the
804 // preheader is the first entry.
805 BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
806 VecRdxPhi->addIncoming(Identity, VecPreheader);
807 unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
808 Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
809 VecRdxPhi->addIncoming(Val, LoopVectorBody);
811 // Before each round, move the insertion point right between
812 // the PHIs and the values we are going to write.
813 // This allows us to write both PHINodes and the extractelement
815 Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
817 // This PHINode contains the vectorized reduction variable, or
818 // the identity vector, if we bypass the vector loop.
819 PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
820 NewPhi->addIncoming(Identity, LoopBypassBlock);
821 NewPhi->addIncoming(getVectorValue(ReductionVar.first), LoopVectorBody);
823 // Extract the first scalar.
825 Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
826 // Extract and sum the remaining vector elements.
827 for (unsigned i=1; i < VF; ++i) {
829 Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
830 if (RdxKind == LoopVectorizationLegality::IntegerAdd) {
831 Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
833 Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
837 // Now, we need to fix the users of the reduction variable
838 // inside and outside of the scalar remainder loop.
839 // We know that the loop is in LCSSA form. We need to update the
840 // PHI nodes in the exit blocks.
841 for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
842 LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
843 PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
844 if (!LCSSAPhi) continue;
846 // All PHINodes need to have a single entry edge, or two if we already fixed them.
847 assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
849 // We found our reduction value exit-PHI. Update it with the incoming bypass edge.
850 if (LCSSAPhi->getIncomingValue(0) == ReductionVar.first) {
851 // Add an edge coming from the bypass.
852 LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
855 }// end of the LCSSA phi scan.
857 // Fix the scalar loop reduction variable with the incoming reduction sum
858 // from the vector body and from the backedge value.
859 int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
860 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
861 (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
862 (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, ReductionVar.first);
863 }// end of for each redux variable.
866 void SingleBlockLoopVectorizer::cleanup() {
867 // The original basic block.
868 SE->forgetLoop(Orig);
871 unsigned LoopVectorizationLegality::getLoopMaxVF() {
872 if (!TheLoop->getLoopPreheader()) {
873 assert(false && "No preheader!!");
874 DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
878 // We can only vectorize single basic block loops.
879 unsigned NumBlocks = TheLoop->getNumBlocks();
880 if (NumBlocks != 1) {
881 DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
885 // We need to have a loop header.
886 BasicBlock *BB = TheLoop->getHeader();
887 DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
889 // Go over each instruction and look at memory deps.
890 if (!canVectorizeBlock(*BB)) {
891 DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
895 // ScalarEvolution needs to be able to find the exit count.
896 const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
897 if (ExitCount == SE->getCouldNotCompute()) {
898 DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
902 DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
904 // Okay! We can vectorize. At this point we don't have any other mem analysis
905 // which may limit our maximum vectorization factor, so just return the
906 // maximum SIMD size.
907 return DefaultVectorizationFactor;
910 bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
911 // Holds the read and write pointers that we find.
912 typedef SmallVector<Value*, 10> ValueVector;
916 for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
919 PHINode *Phi = dyn_cast<PHINode>(I);
921 // This should not happen because the loop should be normalized.
922 if (Phi->getNumIncomingValues() != 2) {
923 DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
926 // We only look at integer phi nodes.
927 if (!Phi->getType()->isIntegerTy()) {
928 DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
931 if (AddReductionVar(Phi, IntegerAdd)) {
932 DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
935 if (AddReductionVar(Phi, IntegerMult)) {
936 DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n");
940 DEBUG(dbgs() << "LV: Found too many PHIs.\n");
943 // Found the induction variable.
946 // Check that the PHI is consecutive and starts at zero.
947 const SCEV *PhiScev = SE->getSCEV(Phi);
948 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
950 DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
954 const SCEV *Step = AR->getStepRecurrence(*SE);
955 const SCEV *Start = AR->getStart();
957 if (!Step->isOne() || !Start->isZero()) {
958 DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n");
961 }// end of PHI handling
963 // If this is a load, record its pointer. If it is not a load, abort.
964 // Notice that we don't handle function calls that read or write.
965 if (I->mayReadFromMemory()) {
966 LoadInst *Ld = dyn_cast<LoadInst>(I);
967 if (!Ld) return false;
968 if (!Ld->isSimple()) {
969 DEBUG(dbgs() << "LV: Found a non-simple load.\n");
973 Value* Ptr = Ld->getPointerOperand();
974 GetUnderlyingObjects(Ptr, Reads, DL);
977 // Record store pointers. Abort on all other instructions that write to
979 if (I->mayWriteToMemory()) {
980 StoreInst *St = dyn_cast<StoreInst>(I);
981 if (!St) return false;
982 if (!St->isSimple()) {
983 DEBUG(dbgs() << "LV: Found a non-simple store.\n");
987 Value* Ptr = St->getPointerOperand();
988 GetUnderlyingObjects(Ptr, Writes, DL);
991 // We still don't handle functions.
992 CallInst *CI = dyn_cast<CallInst>(I);
994 DEBUG(dbgs() << "LV: Found a call site:"<<
995 CI->getCalledFunction()->getName() << "\n");
999 // We do not re-vectorize vectors.
1000 if (!VectorType::isValidElementType(I->getType()) &&
1001 !I->getType()->isVoidTy()) {
1002 DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
1006 // Reduction instructions are allowed to have exit users.
1007 // All other instructions must not have external users.
1008 if (!AllowedExit.count(I))
1009 //Check that all of the users of the loop are inside the BB.
1010 for (Value::use_iterator it = I->use_begin(), e = I->use_end();
1012 Instruction *U = cast<Instruction>(*it);
1013 // This user may be a reduction exit value.
1014 BasicBlock *Parent = U->getParent();
1015 if (Parent != &BB) {
1016 DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
1023 DEBUG(dbgs() << "LV: Did not find an induction var.\n");
1027 // Check that the underlying objects of the reads and writes are either
1028 // disjoint memory locations, or that they are no-alias arguments.
1029 ValueVector::iterator r, re, w, we;
1030 for (r = Reads.begin(), re = Reads.end(); r != re; ++r) {
1031 if (!isIdentifiedSafeObject(*r)) {
1032 DEBUG(dbgs() << "LV: Found a bad read Ptr: "<< **r << "\n");
1037 for (w = Writes.begin(), we = Writes.end(); w != we; ++w) {
1038 if (!isIdentifiedSafeObject(*w)) {
1039 DEBUG(dbgs() << "LV: Found a bad write Ptr: "<< **w << "\n");
1044 // Check that there are no multiple write locations to the same pointer.
1045 SmallPtrSet<Value*, 8> WritePointerSet;
1046 for (w = Writes.begin(), we = Writes.end(); w != we; ++w) {
1047 if (!WritePointerSet.insert(*w)) {
1048 DEBUG(dbgs() << "LV: Multiple writes to the same index :"<< **w << "\n");
1053 // Check that the reads and the writes are disjoint.
1054 for (r = Reads.begin(), re = Reads.end(); r != re; ++r) {
1055 if (WritePointerSet.count(*r)) {
1056 DEBUG(dbgs() << "Vectorizer: Found a read/write ptr:"<< **r << "\n");
1065 /// Checks if the value is a Global variable or if it is an Arguments
1066 /// marked with the NoAlias attribute.
1067 bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) {
1068 assert(Val && "Invalid value");
1069 if (dyn_cast<GlobalValue>(Val))
1071 if (dyn_cast<AllocaInst>(Val))
1073 Argument *A = dyn_cast<Argument>(Val);
1076 return A->hasNoAliasAttr();
1079 bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
1080 ReductionKind Kind) {
1081 if (Phi->getNumIncomingValues() != 2)
1084 // Find the possible incoming reduction variable.
1085 BasicBlock *BB = Phi->getParent();
1086 int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
1087 int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
1088 Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);
1090 // We must have a constant that starts the reduction.
1091 if (!isReductionConstant(RdxStart, Kind))
1094 // ExitInstruction is the single value which is used outside the loop.
1095 // We only allow for a single reduction value to be used outside the loop.
1096 // This includes users of the reduction, variables (which form a cycle
1097 // which ends in the phi node).
1098 Instruction *ExitInstruction = 0;
1100 // Iter is our iterator. We start with the PHI node and scan for all of the
1101 // users of this instruction. All users must be instructions which can be
1102 // used as reduction variables (such as ADD). We may have a single
1103 // out-of-block user. They cycle must end with the original PHI.
1104 // Also, we can't have multiple block-local users.
1105 Instruction *Iter = Phi;
1107 // Any reduction instr must be of one of the allowed kinds.
1108 if (!isReductionInstr(Iter, Kind))
1111 // Did we found a user inside this block ?
1112 bool FoundInBlockUser = false;
1113 // Did we reach the initial PHI node ?
1114 bool FoundStartPHI = false;
1115 // For each of the *users* of iter.
1116 for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
1118 Instruction *U = cast<Instruction>(*it);
1119 // We already know that the PHI is a user.
1121 FoundStartPHI = true;
1124 // Check if we found the exit user.
1125 BasicBlock *Parent = U->getParent();
1127 // We must have a single exit instruction.
1128 if (ExitInstruction != 0)
1130 ExitInstruction = Iter;
1132 // We can't have multiple inside users.
1133 if (FoundInBlockUser)
1135 FoundInBlockUser = true;
1139 // We found a reduction var if we have reached the original
1140 // phi node and we only have a single instruction with out-of-loop
1142 if (FoundStartPHI && ExitInstruction) {
1143 // This instruction is allowed to have out-of-loop users.
1144 AllowedExit.insert(ExitInstruction);
1145 // Mark this as a reduction var.
1146 Reductions[Phi] = std::make_pair(ExitInstruction, Kind);
1153 LoopVectorizationLegality::isReductionConstant(Value *V, ReductionKind Kind) {
1154 ConstantInt *CI = dyn_cast<ConstantInt>(V);
1157 if (Kind == IntegerMult && CI->isOne())
1159 if (Kind == IntegerAdd && CI->isZero())
1165 LoopVectorizationLegality::isReductionInstr(Instruction *I,
1166 ReductionKind Kind) {
1167 switch (I->getOpcode()) {
1170 case Instruction::PHI:
1173 case Instruction::Add:
1174 case Instruction::Sub:
1175 return Kind == IntegerAdd;
1176 case Instruction::Mul:
1177 case Instruction::UDiv:
1178 case Instruction::SDiv:
1179 return Kind == IntegerMult;
1185 char LoopVectorize::ID = 0;
1186 static const char lv_name[] = "Loop Vectorization";
1187 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
1188 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
1189 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
1190 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
1191 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
1194 Pass *createLoopVectorizePass() {
1195 return new LoopVectorize();