lib/Transforms/Vectorize/LoopVectorize.cpp

   1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
  11 // and generates target-independent LLVM-IR.
  12 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
  13 // of instructions in order to estimate the profitability of vectorization.
  14 //
  15 // The loop vectorizer combines consecutive loop iterations into a single
  16 // 'wide' iteration. After this transformation the index is incremented
  17 // by the SIMD vector width, and not by one.
  18 //
  19 // This pass has three parts:
  20 // 1. The main loop pass that drives the different parts.
  21 // 2. LoopVectorizationLegality - A unit that checks for the legality
  22 //    of the vectorization.
  23 // 3. InnerLoopVectorizer - A unit that performs the actual
  24 //    widening of instructions.
  25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
  26 //    of vectorization. It decides on the optimal vector width, which
  27 //    can be one, if vectorization is not profitable.
  28 //
  29 //===----------------------------------------------------------------------===//
  30 //
  31 // The reduction-variable vectorization is based on the paper:
  32 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
  33 //
  34 // Variable uniformity checks are inspired by:
  35 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
  36 //
  37 // Other ideas/concepts are from:
  38 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
  39 //
  40 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
  41 //  Vectorizing Compilers.
  42 //
  43 //===----------------------------------------------------------------------===//
  44
  45 #define LV_NAME "loop-vectorize"
  46 #define DEBUG_TYPE LV_NAME
  47
  48 #include "llvm/Transforms/Vectorize.h"
  49 #include "llvm/ADT/DenseMap.h"
  50 #include "llvm/ADT/EquivalenceClasses.h"
  51 #include "llvm/ADT/MapVector.h"
  52 #include "llvm/ADT/SetVector.h"
  53 #include "llvm/ADT/SmallPtrSet.h"
  54 #include "llvm/ADT/SmallSet.h"
  55 #include "llvm/ADT/SmallVector.h"
  56 #include "llvm/ADT/StringExtras.h"
  57 #include "llvm/Analysis/AliasAnalysis.h"
  58 #include "llvm/Analysis/Dominators.h"
  59 #include "llvm/Analysis/LoopInfo.h"
  60 #include "llvm/Analysis/LoopIterator.h"
  61 #include "llvm/Analysis/LoopPass.h"
  62 #include "llvm/Analysis/ScalarEvolution.h"
  63 #include "llvm/Analysis/ScalarEvolutionExpander.h"
  64 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  65 #include "llvm/Analysis/TargetTransformInfo.h"
  66 #include "llvm/Analysis/ValueTracking.h"
  67 #include "llvm/Analysis/Verifier.h"
  68 #include "llvm/IR/Constants.h"
  69 #include "llvm/IR/DataLayout.h"
  70 #include "llvm/IR/DerivedTypes.h"
  71 #include "llvm/IR/Function.h"
  72 #include "llvm/IR/IRBuilder.h"
  73 #include "llvm/IR/Instructions.h"
  74 #include "llvm/IR/IntrinsicInst.h"
  75 #include "llvm/IR/LLVMContext.h"
  76 #include "llvm/IR/Module.h"
  77 #include "llvm/IR/Type.h"
  78 #include "llvm/IR/Value.h"
  79 #include "llvm/Pass.h"
  80 #include "llvm/Support/CommandLine.h"
  81 #include "llvm/Support/Debug.h"
  82 #include "llvm/Support/PatternMatch.h"
  83 #include "llvm/Support/raw_ostream.h"
  84 #include "llvm/Support/ValueHandle.h"
  85 #include "llvm/Target/TargetLibraryInfo.h"
  86 #include "llvm/Transforms/Scalar.h"
  87 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  88 #include "llvm/Transforms/Utils/Local.h"
  89 #include <algorithm>
  90 #include <map>
  91
  92 using namespace llvm;
  93 using namespace llvm::PatternMatch;
  94
  95 static cl::opt<unsigned>
  96 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
  97                     cl::desc("Sets the SIMD width. Zero is autoselect."));
  98
  99 static cl::opt<unsigned>
 100 VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden,
 101                     cl::desc("Sets the vectorization unroll count. "
 102                              "Zero is autoselect."));
 103
 104 static cl::opt<bool>
 105 EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
 106                    cl::desc("Enable if-conversion during vectorization."));
 107
 108 /// We don't vectorize loops with a known constant trip count below this number.
 109 static cl::opt<unsigned>
 110 TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
 111                              cl::Hidden,
 112                              cl::desc("Don't vectorize loops with a constant "
 113                                       "trip count that is smaller than this "
 114                                       "value."));
 115
 116 /// We don't unroll loops with a known constant trip count below this number.
 117 static const unsigned TinyTripCountUnrollThreshold = 128;
 118
 119 /// When performing memory disambiguation checks at runtime do not make more
 120 /// than this number of comparisons.
 121 static const unsigned RuntimeMemoryCheckThreshold = 8;
 122
 123 /// Maximum simd width.
 124 static const unsigned MaxVectorWidth = 64;
 125
 126 /// Maximum vectorization unroll count.
 127 static const unsigned MaxUnrollFactor = 16;
 128
 129 namespace {
 130
 131 // Forward declarations.
 132 class LoopVectorizationLegality;
 133 class LoopVectorizationCostModel;
 134
 135 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 136 /// block to a specified vectorization factor (VF).
 137 /// This class performs the widening of scalars into vectors, or multiple
 138 /// scalars. This class also implements the following features:
 139 /// * It inserts an epilogue loop for handling loops that don't have iteration
 140 ///   counts that are known to be a multiple of the vectorization factor.
 141 /// * It handles the code generation for reduction variables.
 142 /// * Scalarization (implementation using scalars) of un-vectorizable
 143 ///   instructions.
 144 /// InnerLoopVectorizer does not perform any vectorization-legality
 145 /// checks, and relies on the caller to check for the different legality
 146 /// aspects. The InnerLoopVectorizer relies on the
 147 /// LoopVectorizationLegality class to provide information about the induction
 148 /// and reduction variables that were found to a given vectorization factor.
 149 class InnerLoopVectorizer {
 150 public:
 151   InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
 152                       DominatorTree *DT, DataLayout *DL,
 153                       const TargetLibraryInfo *TLI, unsigned VecWidth,
 154                       unsigned UnrollFactor)
 155       : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
 156         VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
 157         OldInduction(0), WidenMap(UnrollFactor) {}
 158
 159   // Perform the actual loop widening (vectorization).
 160   void vectorize(LoopVectorizationLegality *Legal) {
 161     // Create a new empty loop. Unlink the old loop and connect the new one.
 162     createEmptyLoop(Legal);
 163     // Widen each instruction in the old loop to a new one in the new loop.
 164     // Use the Legality module to find the induction and reduction variables.
 165     vectorizeLoop(Legal);
 166     // Register the new loop and update the analysis passes.
 167     updateAnalysis();
 168   }
 169
 170 private:
 171   /// A small list of PHINodes.
 172   typedef SmallVector<PHINode*, 4> PhiVector;
 173   /// When we unroll loops we have multiple vector values for each scalar.
 174   /// This data structure holds the unrolled and vectorized values that
 175   /// originated from one scalar instruction.
 176   typedef SmallVector<Value*, 2> VectorParts;
 177
 178   // When we if-convert we need create edge masks. We have to cache values so
 179   // that we don't end up with exponential recursion/IR.
 180   typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,
 181                    VectorParts> EdgeMaskCache;
 182
 183   /// Add code that checks at runtime if the accessed arrays overlap.
 184   /// Returns the comparator value or NULL if no check is needed.
 185   Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal,
 186                                Instruction *Loc);
 187   /// Create an empty loop, based on the loop ranges of the old loop.
 188   void createEmptyLoop(LoopVectorizationLegality *Legal);
 189   /// Copy and widen the instructions from the old loop.
 190   void vectorizeLoop(LoopVectorizationLegality *Legal);
 191
 192   /// A helper function that computes the predicate of the block BB, assuming
 193   /// that the header block of the loop is set to True. It returns the *entry*
 194   /// mask for the block BB.
 195   VectorParts createBlockInMask(BasicBlock *BB);
 196   /// A helper function that computes the predicate of the edge between SRC
 197   /// and DST.
 198   VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
 199
 200   /// A helper function to vectorize a single BB within the innermost loop.
 201   void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
 202                             PhiVector *PV);
 203
 204   /// Insert the new loop to the loop hierarchy and pass manager
 205   /// and update the analysis passes.
 206   void updateAnalysis();
 207
 208   /// This instruction is un-vectorizable. Implement it as a sequence
 209   /// of scalars.
 210   void scalarizeInstruction(Instruction *Instr);
 211
 212   /// Vectorize Load and Store instructions,
 213   void vectorizeMemoryInstruction(Instruction *Instr,
 214                                   LoopVectorizationLegality *Legal);
 215
 216   /// Create a broadcast instruction. This method generates a broadcast
 217   /// instruction (shuffle) for loop invariant values and for the induction
 218   /// value. If this is the induction variable then we extend it to N, N+1, ...
 219   /// this is needed because each iteration in the loop corresponds to a SIMD
 220   /// element.
 221   Value *getBroadcastInstrs(Value *V);
 222
 223   /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
 224   /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
 225   /// The sequence starts at StartIndex.
 226   Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
 227
 228   /// When we go over instructions in the basic block we rely on previous
 229   /// values within the current basic block or on loop invariant values.
 230   /// When we widen (vectorize) values we place them in the map. If the values
 231   /// are not within the map, they have to be loop invariant, so we simply
 232   /// broadcast them into a vector.
 233   VectorParts &getVectorValue(Value *V);
 234
 235   /// Generate a shuffle sequence that will reverse the vector Vec.
 236   Value *reverseVector(Value *Vec);
 237
 238   /// This is a helper class that holds the vectorizer state. It maps scalar
 239   /// instructions to vector instructions. When the code is 'unrolled' then
 240   /// then a single scalar value is mapped to multiple vector parts. The parts
 241   /// are stored in the VectorPart type.
 242   struct ValueMap {
 243     /// C'tor.  UnrollFactor controls the number of vectors ('parts') that
 244     /// are mapped.
 245     ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
 246
 247     /// \return True if 'Key' is saved in the Value Map.
 248     bool has(Value *Key) const { return MapStorage.count(Key); }
 249
 250     /// Initializes a new entry in the map. Sets all of the vector parts to the
 251     /// save value in 'Val'.
 252     /// \return A reference to a vector with splat values.
 253     VectorParts &splat(Value *Key, Value *Val) {
 254       VectorParts &Entry = MapStorage[Key];
 255       Entry.assign(UF, Val);
 256       return Entry;
 257     }
 258
 259     ///\return A reference to the value that is stored at 'Key'.
 260     VectorParts &get(Value *Key) {
 261       VectorParts &Entry = MapStorage[Key];
 262       if (Entry.empty())
 263         Entry.resize(UF);
 264       assert(Entry.size() == UF);
 265       return Entry;
 266     }
 267
 268   private:
 269     /// The unroll factor. Each entry in the map stores this number of vector
 270     /// elements.
 271     unsigned UF;
 272
 273     /// Map storage. We use std::map and not DenseMap because insertions to a
 274     /// dense map invalidates its iterators.
 275     std::map<Value *, VectorParts> MapStorage;
 276   };
 277
 278   /// The original loop.
 279   Loop *OrigLoop;
 280   /// Scev analysis to use.
 281   ScalarEvolution *SE;
 282   /// Loop Info.
 283   LoopInfo *LI;
 284   /// Dominator Tree.
 285   DominatorTree *DT;
 286   /// Data Layout.
 287   DataLayout *DL;
 288   /// Target Library Info.
 289   const TargetLibraryInfo *TLI;
 290
 291   /// The vectorization SIMD factor to use. Each vector will have this many
 292   /// vector elements.
 293   unsigned VF;
 294   /// The vectorization unroll factor to use. Each scalar is vectorized to this
 295   /// many different vector instructions.
 296   unsigned UF;
 297
 298   /// The builder that we use
 299   IRBuilder<> Builder;
 300
 301   // --- Vectorization state ---
 302
 303   /// The vector-loop preheader.
 304   BasicBlock *LoopVectorPreHeader;
 305   /// The scalar-loop preheader.
 306   BasicBlock *LoopScalarPreHeader;
 307   /// Middle Block between the vector and the scalar.
 308   BasicBlock *LoopMiddleBlock;
 309   ///The ExitBlock of the scalar loop.
 310   BasicBlock *LoopExitBlock;
 311   ///The vector loop body.
 312   BasicBlock *LoopVectorBody;
 313   ///The scalar loop body.
 314   BasicBlock *LoopScalarBody;
 315   /// A list of all bypass blocks. The first block is the entry of the loop.
 316   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
 317
 318   /// The new Induction variable which was added to the new block.
 319   PHINode *Induction;
 320   /// The induction variable of the old basic block.
 321   PHINode *OldInduction;
 322   /// Holds the extended (to the widest induction type) start index.
 323   Value *ExtendedIdx;
 324   /// Maps scalars to widened vectors.
 325   ValueMap WidenMap;
 326   EdgeMaskCache MaskCache;
 327 };
 328
 329 /// \brief Look for a meaningful debug location on the instruction or it's
 330 /// operands.
 331 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
 332   if (!I)
 333     return I;
 334
 335   DebugLoc Empty;
 336   if (I->getDebugLoc() != Empty)
 337     return I;
 338
 339   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
 340     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
 341       if (OpInst->getDebugLoc() != Empty)
 342         return OpInst;
 343   }
 344
 345   return I;
 346 }
 347
 348 /// \brief Set the debug location in the builder using the debug location in the
 349 /// instruction.
 350 static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
 351   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
 352     B.SetCurrentDebugLocation(Inst->getDebugLoc());
 353   else
 354     B.SetCurrentDebugLocation(DebugLoc());
 355 }
 356
 357 /// \brief Check if conditionally executed loads are hoistable.
 358 ///
 359 /// This class has two functions: isHoistableLoad and canHoistAllLoads.
 360 /// isHoistableLoad should be called on all load instructions that are executed
 361 /// conditionally. After all conditional loads are processed, the client should
 362 /// call canHoistAllLoads to determine if all of the conditional executed loads
 363 /// have an unconditional memory access to the same memory address in the loop.
 364 class LoadHoisting {
 365   typedef SmallPtrSet<Value *, 8> MemorySet;
 366
 367   Loop *TheLoop;
 368   DominatorTree *DT;
 369   MemorySet CondLoadAddrSet;
 370
 371 public:
 372   LoadHoisting(Loop *L, DominatorTree *D) : TheLoop(L), DT(D) {}
 373
 374   /// \brief Check if the instruction is a load with a identifiable address.
 375   bool isHoistableLoad(Instruction *L);
 376
 377   /// \brief Check if all of the conditional loads are hoistable because there
 378   /// exists an unconditional memory access to the same address in the loop.
 379   bool canHoistAllLoads();
 380 };
 381
 382 bool LoadHoisting::isHoistableLoad(Instruction *L) {
 383   LoadInst *LI = dyn_cast<LoadInst>(L);
 384   if (!LI)
 385     return false;
 386
 387   CondLoadAddrSet.insert(LI->getPointerOperand());
 388   return true;
 389 }
 390
 391 static void addMemAccesses(BasicBlock *BB, SmallPtrSet<Value *, 8> &Set) {
 392   for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) {
 393     if (LoadInst *LI = dyn_cast<LoadInst>(BI)) // Try a load.
 394       Set.insert(LI->getPointerOperand());
 395     else if (StoreInst *SI = dyn_cast<StoreInst>(BI)) // Try a store.
 396       Set.insert(SI->getPointerOperand());
 397   }
 398 }
 399
 400 bool LoadHoisting::canHoistAllLoads() {
 401   // No conditional loads.
 402   if (CondLoadAddrSet.empty())
 403     return true;
 404
 405   MemorySet UncondMemAccesses;
 406   std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
 407   BasicBlock *LoopLatch = TheLoop->getLoopLatch();
 408
 409   // Iterate over the unconditional blocks and collect memory access addresses.
 410   for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
 411     BasicBlock *BB = LoopBlocks[i];
 412
 413     // Ignore conditional blocks.
 414     if (BB != LoopLatch && !DT->dominates(BB, LoopLatch))
 415       continue;
 416
 417     addMemAccesses(BB, UncondMemAccesses);
 418   }
 419
 420   // And make sure there is a matching unconditional access for every
 421   // conditional load.
 422   for (MemorySet::iterator MI = CondLoadAddrSet.begin(),
 423        ME = CondLoadAddrSet.end(); MI != ME; ++MI)
 424     if (!UncondMemAccesses.count(*MI))
 425       return false;
 426
 427   return true;
 428 }
 429
 430 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
 431 /// to what vectorization factor.
 432 /// This class does not look at the profitability of vectorization, only the
 433 /// legality. This class has two main kinds of checks:
 434 /// * Memory checks - The code in canVectorizeMemory checks if vectorization
 435 ///   will change the order of memory accesses in a way that will change the
 436 ///   correctness of the program.
 437 /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
 438 /// checks for a number of different conditions, such as the availability of a
 439 /// single induction variable, that all types are supported and vectorize-able,
 440 /// etc. This code reflects the capabilities of InnerLoopVectorizer.
 441 /// This class is also used by InnerLoopVectorizer for identifying
 442 /// induction variable and the different reduction variables.
 443 class LoopVectorizationLegality {
 444 public:
 445   LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
 446                             DominatorTree *DT, TargetLibraryInfo *TLI)
 447       : TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI),
 448         Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
 449         MaxSafeDepDistBytes(-1U), LoadSpeculation(L, DT) {}
 450
 451   /// This enum represents the kinds of reductions that we support.
 452   enum ReductionKind {
 453     RK_NoReduction, ///< Not a reduction.
 454     RK_IntegerAdd,  ///< Sum of integers.
 455     RK_IntegerMult, ///< Product of integers.
 456     RK_IntegerOr,   ///< Bitwise or logical OR of numbers.
 457     RK_IntegerAnd,  ///< Bitwise or logical AND of numbers.
 458     RK_IntegerXor,  ///< Bitwise or logical XOR of numbers.
 459     RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
 460     RK_FloatAdd,    ///< Sum of floats.
 461     RK_FloatMult,   ///< Product of floats.
 462     RK_FloatMinMax  ///< Min/max implemented in terms of select(cmp()).
 463   };
 464
 465   /// This enum represents the kinds of inductions that we support.
 466   enum InductionKind {
 467     IK_NoInduction,         ///< Not an induction variable.
 468     IK_IntInduction,        ///< Integer induction variable. Step = 1.
 469     IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
 470     IK_PtrInduction,        ///< Pointer induction var. Step = sizeof(elem).
 471     IK_ReversePtrInduction  ///< Reverse ptr indvar. Step = - sizeof(elem).
 472   };
 473
 474   // This enum represents the kind of minmax reduction.
 475   enum MinMaxReductionKind {
 476     MRK_Invalid,
 477     MRK_UIntMin,
 478     MRK_UIntMax,
 479     MRK_SIntMin,
 480     MRK_SIntMax,
 481     MRK_FloatMin,
 482     MRK_FloatMax
 483   };
 484
 485   /// This POD struct holds information about reduction variables.
 486   struct ReductionDescriptor {
 487     ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
 488       Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}
 489
 490     ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
 491                         MinMaxReductionKind MK)
 492         : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {}
 493
 494     // The starting value of the reduction.
 495     // It does not have to be zero!
 496     TrackingVH<Value> StartValue;
 497     // The instruction who's value is used outside the loop.
 498     Instruction *LoopExitInstr;
 499     // The kind of the reduction.
 500     ReductionKind Kind;
 501     // If this a min/max reduction the kind of reduction.
 502     MinMaxReductionKind MinMaxKind;
 503   };
 504
 505   /// This POD struct holds information about a potential reduction operation.
 506   struct ReductionInstDesc {
 507     ReductionInstDesc(bool IsRedux, Instruction *I) :
 508       IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {}
 509
 510     ReductionInstDesc(Instruction *I, MinMaxReductionKind K) :
 511       IsReduction(true), PatternLastInst(I), MinMaxKind(K) {}
 512
 513     // Is this instruction a reduction candidate.
 514     bool IsReduction;
 515     // The last instruction in a min/max pattern (select of the select(icmp())
 516     // pattern), or the current reduction instruction otherwise.
 517     Instruction *PatternLastInst;
 518     // If this is a min/max pattern the comparison predicate.
 519     MinMaxReductionKind MinMaxKind;
 520   };
 521
 522   // This POD struct holds information about the memory runtime legality
 523   // check that a group of pointers do not overlap.
 524   struct RuntimePointerCheck {
 525     RuntimePointerCheck() : Need(false) {}
 526
 527     /// Reset the state of the pointer runtime information.
 528     void reset() {
 529       Need = false;
 530       Pointers.clear();
 531       Starts.clear();
 532       Ends.clear();
 533     }
 534
 535     /// Insert a pointer and calculate the start and end SCEVs.
 536     void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr,
 537                 unsigned DepSetId);
 538
 539     /// This flag indicates if we need to add the runtime check.
 540     bool Need;
 541     /// Holds the pointers that we need to check.
 542     SmallVector<TrackingVH<Value>, 2> Pointers;
 543     /// Holds the pointer value at the beginning of the loop.
 544     SmallVector<const SCEV*, 2> Starts;
 545     /// Holds the pointer value at the end of the loop.
 546     SmallVector<const SCEV*, 2> Ends;
 547     /// Holds the information if this pointer is used for writing to memory.
 548     SmallVector<bool, 2> IsWritePtr;
 549     /// Holds the id of the set of pointers that could be dependent because of a
 550     /// shared underlying object.
 551     SmallVector<unsigned, 2> DependencySetId;
 552   };
 553
 554   /// A POD for saving information about induction variables.
 555   struct InductionInfo {
 556     InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
 557     InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
 558     /// Start value.
 559     TrackingVH<Value> StartValue;
 560     /// Induction kind.
 561     InductionKind IK;
 562   };
 563
 564   /// ReductionList contains the reduction descriptors for all
 565   /// of the reductions that were found in the loop.
 566   typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
 567
 568   /// InductionList saves induction variables and maps them to the
 569   /// induction descriptor.
 570   typedef MapVector<PHINode*, InductionInfo> InductionList;
 571
 572   /// Returns true if it is legal to vectorize this loop.
 573   /// This does not mean that it is profitable to vectorize this
 574   /// loop, only that it is legal to do so.
 575   bool canVectorize();
 576
 577   /// Returns the Induction variable.
 578   PHINode *getInduction() { return Induction; }
 579
 580   /// Returns the reduction variables found in the loop.
 581   ReductionList *getReductionVars() { return &Reductions; }
 582
 583   /// Returns the induction variables found in the loop.
 584   InductionList *getInductionVars() { return &Inductions; }
 585
 586   /// Returns the widest induction type.
 587   Type *getWidestInductionType() { return WidestIndTy; }
 588
 589   /// Returns True if V is an induction variable in this loop.
 590   bool isInductionVariable(const Value *V);
 591
 592   /// Return true if the block BB needs to be predicated in order for the loop
 593   /// to be vectorized.
 594   bool blockNeedsPredication(BasicBlock *BB);
 595
 596   /// Check if this  pointer is consecutive when vectorizing. This happens
 597   /// when the last index of the GEP is the induction variable, or that the
 598   /// pointer itself is an induction variable.
 599   /// This check allows us to vectorize A[idx] into a wide load/store.
 600   /// Returns:
 601   /// 0 - Stride is unknown or non consecutive.
 602   /// 1 - Address is consecutive.
 603   /// -1 - Address is consecutive, and decreasing.
 604   int isConsecutivePtr(Value *Ptr);
 605
 606   /// Returns true if the value V is uniform within the loop.
 607   bool isUniform(Value *V);
 608
 609   /// Returns true if this instruction will remain scalar after vectorization.
 610   bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
 611
 612   /// Returns the information that we collected about runtime memory check.
 613   RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }
 614
 615   /// This function returns the identity element (or neutral element) for
 616   /// the operation K.
 617   static Constant *getReductionIdentity(ReductionKind K, Type *Tp);
 618
 619   unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
 620
 621 private:
 622   /// Check if a single basic block loop is vectorizable.
 623   /// At this point we know that this is a loop with a constant trip count
 624   /// and we only need to check individual instructions.
 625   bool canVectorizeInstrs();
 626
 627   /// When we vectorize loops we may change the order in which
 628   /// we read and write from memory. This method checks if it is
 629   /// legal to vectorize the code, considering only memory constrains.
 630   /// Returns true if the loop is vectorizable
 631   bool canVectorizeMemory();
 632
 633   /// Return true if we can vectorize this loop using the IF-conversion
 634   /// transformation.
 635   bool canVectorizeWithIfConvert();
 636
 637   /// Collect the variables that need to stay uniform after vectorization.
 638   void collectLoopUniforms();
 639
 640   /// Return true if all of the instructions in the block can be speculatively
 641   /// executed.
 642   bool blockCanBePredicated(BasicBlock *BB);
 643
 644   /// Returns True, if 'Phi' is the kind of reduction variable for type
 645   /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
 646   bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
 647   /// Returns a struct describing if the instruction 'I' can be a reduction
 648   /// variable of type 'Kind'. If the reduction is a min/max pattern of
 649   /// select(icmp()) this function advances the instruction pointer 'I' from the
 650   /// compare instruction to the select instruction and stores this pointer in
 651   /// 'PatternLastInst' member of the returned struct.
 652   ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
 653                                      ReductionInstDesc &Desc);
 654   /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
 655   /// pattern corresponding to a min(X, Y) or max(X, Y).
 656   static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I,
 657                                                     ReductionInstDesc &Prev);
 658   /// Returns the induction kind of Phi. This function may return NoInduction
 659   /// if the PHI is not an induction variable.
 660   InductionKind isInductionVariable(PHINode *Phi);
 661
 662   /// The loop that we evaluate.
 663   Loop *TheLoop;
 664   /// Scev analysis.
 665   ScalarEvolution *SE;
 666   /// DataLayout analysis.
 667   DataLayout *DL;
 668   /// Dominators.
 669   DominatorTree *DT;
 670   /// Target Library Info.
 671   TargetLibraryInfo *TLI;
 672
 673   //  ---  vectorization state --- //
 674
 675   /// Holds the integer induction variable. This is the counter of the
 676   /// loop.
 677   PHINode *Induction;
 678   /// Holds the reduction variables.
 679   ReductionList Reductions;
 680   /// Holds all of the induction variables that we found in the loop.
 681   /// Notice that inductions don't need to start at zero and that induction
 682   /// variables can be pointers.
 683   InductionList Inductions;
 684   /// Holds the widest induction type encountered.
 685   Type *WidestIndTy;
 686
 687   /// Allowed outside users. This holds the reduction
 688   /// vars which can be accessed from outside the loop.
 689   SmallPtrSet<Value*, 4> AllowedExit;
 690   /// This set holds the variables which are known to be uniform after
 691   /// vectorization.
 692   SmallPtrSet<Instruction*, 4> Uniforms;
 693   /// We need to check that all of the pointers in this list are disjoint
 694   /// at runtime.
 695   RuntimePointerCheck PtrRtCheck;
 696   /// Can we assume the absence of NaNs.
 697   bool HasFunNoNaNAttr;
 698
 699   unsigned MaxSafeDepDistBytes;
 700
 701   /// Utility to determine whether loads can be speculated.
 702   LoadHoisting LoadSpeculation;
 703 };
 704
 705 /// LoopVectorizationCostModel - estimates the expected speedups due to
 706 /// vectorization.
 707 /// In many cases vectorization is not profitable. This can happen because of
 708 /// a number of reasons. In this class we mainly attempt to predict the
 709 /// expected speedup/slowdowns due to the supported instruction set. We use the
 710 /// TargetTransformInfo to query the different backends for the cost of
 711 /// different operations.
 712 class LoopVectorizationCostModel {
 713 public:
 714   LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
 715                              LoopVectorizationLegality *Legal,
 716                              const TargetTransformInfo &TTI,
 717                              DataLayout *DL, const TargetLibraryInfo *TLI)
 718       : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}
 719
 720   /// Information about vectorization costs
 721   struct VectorizationFactor {
 722     unsigned Width; // Vector width with best cost
 723     unsigned Cost; // Cost of the loop with that width
 724   };
 725   /// \return The most profitable vectorization factor and the cost of that VF.
 726   /// This method checks every power of two up to VF. If UserVF is not ZERO
 727   /// then this vectorization factor will be selected if vectorization is
 728   /// possible.
 729   VectorizationFactor selectVectorizationFactor(bool OptForSize,
 730                                                 unsigned UserVF);
 731
 732   /// \return The size (in bits) of the widest type in the code that
 733   /// needs to be vectorized. We ignore values that remain scalar such as
 734   /// 64 bit loop indices.
 735   unsigned getWidestType();
 736
 737   /// \return The most profitable unroll factor.
 738   /// If UserUF is non-zero then this method finds the best unroll-factor
 739   /// based on register pressure and other parameters.
 740   /// VF and LoopCost are the selected vectorization factor and the cost of the
 741   /// selected VF.
 742   unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF,
 743                               unsigned LoopCost);
 744
 745   /// \brief A struct that represents some properties of the register usage
 746   /// of a loop.
 747   struct RegisterUsage {
 748     /// Holds the number of loop invariant values that are used in the loop.
 749     unsigned LoopInvariantRegs;
 750     /// Holds the maximum number of concurrent live intervals in the loop.
 751     unsigned MaxLocalUsers;
 752     /// Holds the number of instructions in the loop.
 753     unsigned NumInstructions;
 754   };
 755
 756   /// \return  information about the register usage of the loop.
 757   RegisterUsage calculateRegisterUsage();
 758
 759 private:
 760   /// Returns the expected execution cost. The unit of the cost does
 761   /// not matter because we use the 'cost' units to compare different
 762   /// vector widths. The cost that is returned is *not* normalized by
 763   /// the factor width.
 764   unsigned expectedCost(unsigned VF);
 765
 766   /// Returns the execution time cost of an instruction for a given vector
 767   /// width. Vector width of one means scalar.
 768   unsigned getInstructionCost(Instruction *I, unsigned VF);
 769
 770   /// A helper function for converting Scalar types to vector types.
 771   /// If the incoming type is void, we return void. If the VF is 1, we return
 772   /// the scalar type.
 773   static Type* ToVectorTy(Type *Scalar, unsigned VF);
 774
 775   /// Returns whether the instruction is a load or store and will be a emitted
 776   /// as a vector operation.
 777   bool isConsecutiveLoadOrStore(Instruction *I);
 778
 779   /// The loop that we evaluate.
 780   Loop *TheLoop;
 781   /// Scev analysis.
 782   ScalarEvolution *SE;
 783   /// Loop Info analysis.
 784   LoopInfo *LI;
 785   /// Vectorization legality.
 786   LoopVectorizationLegality *Legal;
 787   /// Vector target information.
 788   const TargetTransformInfo &TTI;
 789   /// Target data layout information.
 790   DataLayout *DL;
 791   /// Target Library Info.
 792   const TargetLibraryInfo *TLI;
 793 };
 794
 795 /// Utility class for getting and setting loop vectorizer hints in the form
 796 /// of loop metadata.
 797 struct LoopVectorizeHints {
 798   /// Vectorization width.
 799   unsigned Width;
 800   /// Vectorization unroll factor.
 801   unsigned Unroll;
 802
 803   LoopVectorizeHints(const Loop *L)
 804   : Width(VectorizationFactor)
 805   , Unroll(VectorizationUnroll)
 806   , LoopID(L->getLoopID()) {
 807     getHints(L);
 808     // The command line options override any loop metadata except for when
 809     // width == 1 which is used to indicate the loop is already vectorized.
 810     if (VectorizationFactor.getNumOccurrences() > 0 && Width != 1)
 811       Width = VectorizationFactor;
 812     if (VectorizationUnroll.getNumOccurrences() > 0)
 813       Unroll = VectorizationUnroll;
 814   }
 815
 816   /// Return the loop vectorizer metadata prefix.
 817   static StringRef Prefix() { return "llvm.vectorizer."; }
 818
 819   MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) {
 820     SmallVector<Value*, 2> Vals;
 821     Vals.push_back(MDString::get(Context, Name));
 822     Vals.push_back(ConstantInt::get(Type::getInt32Ty(Context), V));
 823     return MDNode::get(Context, Vals);
 824   }
 825
 826   /// Mark the loop L as already vectorized by setting the width to 1.
 827   void setAlreadyVectorized(Loop *L) {
 828     LLVMContext &Context = L->getHeader()->getContext();
 829
 830     Width = 1;
 831
 832     // Create a new loop id with one more operand for the already_vectorized
 833     // hint. If the loop already has a loop id then copy the existing operands.
 834     SmallVector<Value*, 4> Vals(1);
 835     if (LoopID)
 836       for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i)
 837         Vals.push_back(LoopID->getOperand(i));
 838
 839     Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width));
 840
 841     MDNode *NewLoopID = MDNode::get(Context, Vals);
 842     // Set operand 0 to refer to the loop id itself.
 843     NewLoopID->replaceOperandWith(0, NewLoopID);
 844
 845     L->setLoopID(NewLoopID);
 846     if (LoopID)
 847       LoopID->replaceAllUsesWith(NewLoopID);
 848
 849     LoopID = NewLoopID;
 850   }
 851
 852 private:
 853   MDNode *LoopID;
 854
 855   /// Find hints specified in the loop metadata.
 856   void getHints(const Loop *L) {
 857     if (!LoopID)
 858       return;
 859
 860     // First operand should refer to the loop id itself.
 861     assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
 862     assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
 863
 864     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
 865       const MDString *S = 0;
 866       SmallVector<Value*, 4> Args;
 867
 868       // The expected hint is either a MDString or a MDNode with the first
 869       // operand a MDString.
 870       if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
 871         if (!MD || MD->getNumOperands() == 0)
 872           continue;
 873         S = dyn_cast<MDString>(MD->getOperand(0));
 874         for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
 875           Args.push_back(MD->getOperand(i));
 876       } else {
 877         S = dyn_cast<MDString>(LoopID->getOperand(i));
 878         assert(Args.size() == 0 && "too many arguments for MDString");
 879       }
 880
 881       if (!S)
 882         continue;
 883
 884       // Check if the hint starts with the vectorizer prefix.
 885       StringRef Hint = S->getString();
 886       if (!Hint.startswith(Prefix()))
 887         continue;
 888       // Remove the prefix.
 889       Hint = Hint.substr(Prefix().size(), StringRef::npos);
 890
 891       if (Args.size() == 1)
 892         getHint(Hint, Args[0]);
 893     }
 894   }
 895
 896   // Check string hint with one operand.
 897   void getHint(StringRef Hint, Value *Arg) {
 898     const ConstantInt *C = dyn_cast<ConstantInt>(Arg);
 899     if (!C) return;
 900     unsigned Val = C->getZExtValue();
 901
 902     if (Hint == "width") {
 903       assert(isPowerOf2_32(Val) && Val <= MaxVectorWidth &&
 904              "Invalid width metadata");
 905       Width = Val;
 906     } else if (Hint == "unroll") {
 907       assert(isPowerOf2_32(Val) && Val <= MaxUnrollFactor &&
 908              "Invalid unroll metadata");
 909       Unroll = Val;
 910     } else
 911       DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint);
 912   }
 913 };
 914
 915 /// The LoopVectorize Pass.
 916 struct LoopVectorize : public LoopPass {
 917   /// Pass identification, replacement for typeid
 918   static char ID;
 919
 920   explicit LoopVectorize() : LoopPass(ID) {
 921     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
 922   }
 923
 924   ScalarEvolution *SE;
 925   DataLayout *DL;
 926   LoopInfo *LI;
 927   TargetTransformInfo *TTI;
 928   DominatorTree *DT;
 929   TargetLibraryInfo *TLI;
 930
 931   virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
 932     // We only vectorize innermost loops.
 933     if (!L->empty())
 934       return false;
 935
 936     SE = &getAnalysis<ScalarEvolution>();
 937     DL = getAnalysisIfAvailable<DataLayout>();
 938     LI = &getAnalysis<LoopInfo>();
 939     TTI = &getAnalysis<TargetTransformInfo>();
 940     DT = &getAnalysis<DominatorTree>();
 941     TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
 942
 943     if (DL == NULL) {
 944       DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout");
 945       return false;
 946     }
 947
 948     DEBUG(dbgs() << "LV: Checking a loop in \"" <<
 949           L->getHeader()->getParent()->getName() << "\"\n");
 950
 951     LoopVectorizeHints Hints(L);
 952
 953     if (Hints.Width == 1) {
 954       DEBUG(dbgs() << "LV: Not vectorizing.\n");
 955       return false;
 956     }
 957
 958     // Check if it is legal to vectorize the loop.
 959     LoopVectorizationLegality LVL(L, SE, DL, DT, TLI);
 960     if (!LVL.canVectorize()) {
 961       DEBUG(dbgs() << "LV: Not vectorizing.\n");
 962       return false;
 963     }
 964
 965     // Use the cost model.
 966     LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI);
 967
 968     // Check the function attributes to find out if this function should be
 969     // optimized for size.
 970     Function *F = L->getHeader()->getParent();
 971     Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
 972     Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
 973     unsigned FnIndex = AttributeSet::FunctionIndex;
 974     bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr);
 975     bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);
 976
 977     if (NoFloat) {
 978       DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
 979             "attribute is used.\n");
 980       return false;
 981     }
 982
 983     // Select the optimal vectorization factor.
 984     LoopVectorizationCostModel::VectorizationFactor VF;
 985     VF = CM.selectVectorizationFactor(OptForSize, Hints.Width);
 986     // Select the unroll factor.
 987     unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
 988                                         VF.Cost);
 989
 990     if (VF.Width == 1) {
 991       DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
 992       return false;
 993     }
 994
 995     DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
 996           F->getParent()->getModuleIdentifier()<<"\n");
 997     DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
 998
 999     // If we decided that it is *legal* to vectorize the loop then do it.
1000     InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
1001     LB.vectorize(&LVL);
1002
1003     // Mark the loop as already vectorized to avoid vectorizing again.
1004     Hints.setAlreadyVectorized(L);
1005
1006     DEBUG(verifyFunction(*L->getHeader()->getParent()));
1007     return true;
1008   }
1009
1010   virtual void getAnalysisUsage(AnalysisUsage &AU) const {
1011     LoopPass::getAnalysisUsage(AU);
1012     AU.addRequiredID(LoopSimplifyID);
1013     AU.addRequiredID(LCSSAID);
1014     AU.addRequired<DominatorTree>();
1015     AU.addRequired<LoopInfo>();
1016     AU.addRequired<ScalarEvolution>();
1017     AU.addRequired<TargetTransformInfo>();
1018     AU.addPreserved<LoopInfo>();
1019     AU.addPreserved<DominatorTree>();
1020   }
1021
1022 };
1023
1024 } // end anonymous namespace
1025
1026 //===----------------------------------------------------------------------===//
1027 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1028 // LoopVectorizationCostModel.
1029 //===----------------------------------------------------------------------===//
1030
1031 void
1032 LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
1033                                                        Loop *Lp, Value *Ptr,
1034                                                        bool WritePtr,
1035                                                        unsigned DepSetId) {
1036   const SCEV *Sc = SE->getSCEV(Ptr);
1037   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
1038   assert(AR && "Invalid addrec expression");
1039   const SCEV *Ex = SE->getBackedgeTakenCount(Lp);
1040   const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
1041   Pointers.push_back(Ptr);
1042   Starts.push_back(AR->getStart());
1043   Ends.push_back(ScEnd);
1044   IsWritePtr.push_back(WritePtr);
1045   DependencySetId.push_back(DepSetId);
1046 }
1047
1048 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1049   // Save the current insertion location.
1050   Instruction *Loc = Builder.GetInsertPoint();
1051
1052   // We need to place the broadcast of invariant variables outside the loop.
1053   Instruction *Instr = dyn_cast<Instruction>(V);
1054   bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
1055   bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
1056
1057   // Place the code for broadcasting invariant variables in the new preheader.
1058   if (Invariant)
1059     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1060
1061   // Broadcast the scalar into all locations in the vector.
1062   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1063
1064   // Restore the builder insertion point.
1065   if (Invariant)
1066     Builder.SetInsertPoint(Loc);
1067
1068   return Shuf;
1069 }
1070
1071 Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
1072                                                  bool Negate) {
1073   assert(Val->getType()->isVectorTy() && "Must be a vector");
1074   assert(Val->getType()->getScalarType()->isIntegerTy() &&
1075          "Elem must be an integer");
1076   // Create the types.
1077   Type *ITy = Val->getType()->getScalarType();
1078   VectorType *Ty = cast<VectorType>(Val->getType());
1079   int VLen = Ty->getNumElements();
1080   SmallVector<Constant*, 8> Indices;
1081
1082   // Create a vector of consecutive numbers from zero to VF.
1083   for (int i = 0; i < VLen; ++i) {
1084     int64_t Idx = Negate ? (-i) : i;
1085     Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate));
1086   }
1087
1088   // Add the consecutive indices to the vector value.
1089   Constant *Cv = ConstantVector::get(Indices);
1090   assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1091   return Builder.CreateAdd(Val, Cv, "induction");
1092 }
1093
1094 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
1095   assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
1096   // Make sure that the pointer does not point to structs.
1097   if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType())
1098     return 0;
1099
1100   // If this value is a pointer induction variable we know it is consecutive.
1101   PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
1102   if (Phi && Inductions.count(Phi)) {
1103     InductionInfo II = Inductions[Phi];
1104     if (IK_PtrInduction == II.IK)
1105       return 1;
1106     else if (IK_ReversePtrInduction == II.IK)
1107       return -1;
1108   }
1109
1110   GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
1111   if (!Gep)
1112     return 0;
1113
1114   unsigned NumOperands = Gep->getNumOperands();
1115   Value *LastIndex = Gep->getOperand(NumOperands - 1);
1116
1117   Value *GpPtr = Gep->getPointerOperand();
1118   // If this GEP value is a consecutive pointer induction variable and all of
1119   // the indices are constant then we know it is consecutive. We can
1120   Phi = dyn_cast<PHINode>(GpPtr);
1121   if (Phi && Inductions.count(Phi)) {
1122
1123     // Make sure that the pointer does not point to structs.
1124     PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
1125     if (GepPtrType->getElementType()->isAggregateType())
1126       return 0;
1127
1128     // Make sure that all of the index operands are loop invariant.
1129     for (unsigned i = 1; i < NumOperands; ++i)
1130       if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
1131         return 0;
1132
1133     InductionInfo II = Inductions[Phi];
1134     if (IK_PtrInduction == II.IK)
1135       return 1;
1136     else if (IK_ReversePtrInduction == II.IK)
1137       return -1;
1138   }
1139
1140   // Check that all of the gep indices are uniform except for the last.
1141   for (unsigned i = 0; i < NumOperands - 1; ++i)
1142     if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
1143       return 0;
1144
1145   // We can emit wide load/stores only if the last index is the induction
1146   // variable.
1147   const SCEV *Last = SE->getSCEV(LastIndex);
1148   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
1149     const SCEV *Step = AR->getStepRecurrence(*SE);
1150
1151     // The memory is consecutive because the last index is consecutive
1152     // and all other indices are loop invariant.
1153     if (Step->isOne())
1154       return 1;
1155     if (Step->isAllOnesValue())
1156       return -1;
1157   }
1158
1159   return 0;
1160 }
1161
1162 bool LoopVectorizationLegality::isUniform(Value *V) {
1163   return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
1164 }
1165
1166 InnerLoopVectorizer::VectorParts&
1167 InnerLoopVectorizer::getVectorValue(Value *V) {
1168   assert(V != Induction && "The new induction variable should not be used.");
1169   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1170
1171   // If we have this scalar in the map, return it.
1172   if (WidenMap.has(V))
1173     return WidenMap.get(V);
1174
1175   // If this scalar is unknown, assume that it is a constant or that it is
1176   // loop invariant. Broadcast V and save the value for future uses.
1177   Value *B = getBroadcastInstrs(V);
1178   return WidenMap.splat(V, B);
1179 }
1180
1181 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
1182   assert(Vec->getType()->isVectorTy() && "Invalid type");
1183   SmallVector<Constant*, 8> ShuffleMask;
1184   for (unsigned i = 0; i < VF; ++i)
1185     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
1186
1187   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
1188                                      ConstantVector::get(ShuffleMask),
1189                                      "reverse");
1190 }
1191
1192
1193 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
1194                                              LoopVectorizationLegality *Legal) {
1195   // Attempt to issue a wide load.
1196   LoadInst *LI = dyn_cast<LoadInst>(Instr);
1197   StoreInst *SI = dyn_cast<StoreInst>(Instr);
1198
1199   assert((LI || SI) && "Invalid Load/Store instruction");
1200
1201   Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
1202   Type *DataTy = VectorType::get(ScalarDataTy, VF);
1203   Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
1204   unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
1205   unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
1206   unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
1207   unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
1208
1209   if (ScalarAllocatedSize != VectorElementSize)
1210     return scalarizeInstruction(Instr);
1211
1212   // If the pointer is loop invariant or if it is non consecutive,
1213   // scalarize the load.
1214   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
1215   bool Reverse = ConsecutiveStride < 0;
1216   bool UniformLoad = LI && Legal->isUniform(Ptr);
1217   if (!ConsecutiveStride || UniformLoad)
1218     return scalarizeInstruction(Instr);
1219
1220   Constant *Zero = Builder.getInt32(0);
1221   VectorParts &Entry = WidenMap.get(Instr);
1222
1223   // Handle consecutive loads/stores.
1224   GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
1225   if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
1226     setDebugLocFromInst(Builder, Gep);
1227     Value *PtrOperand = Gep->getPointerOperand();
1228     Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
1229     FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
1230
1231     // Create the new GEP with the new induction variable.
1232     GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
1233     Gep2->setOperand(0, FirstBasePtr);
1234     Gep2->setName("gep.indvar.base");
1235     Ptr = Builder.Insert(Gep2);
1236   } else if (Gep) {
1237     setDebugLocFromInst(Builder, Gep);
1238     assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
1239                                OrigLoop) && "Base ptr must be invariant");
1240
1241     // The last index does not have to be the induction. It can be
1242     // consecutive and be a function of the index. For example A[I+1];
1243     unsigned NumOperands = Gep->getNumOperands();
1244     unsigned LastOperand = NumOperands - 1;
1245     // Create the new GEP with the new induction variable.
1246     GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
1247
1248     for (unsigned i = 0; i < NumOperands; ++i) {
1249       Value *GepOperand = Gep->getOperand(i);
1250       Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
1251
1252       // Update last index or loop invariant instruction anchored in loop.
1253       if (i == LastOperand ||
1254           (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
1255         assert((i == LastOperand ||
1256                SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
1257                "Must be last index or loop invariant");
1258
1259         VectorParts &GEPParts = getVectorValue(GepOperand);
1260         Value *Index = GEPParts[0];
1261         Index = Builder.CreateExtractElement(Index, Zero);
1262         Gep2->setOperand(i, Index);
1263         Gep2->setName("gep.indvar.idx");
1264       }
1265     }
1266     Ptr = Builder.Insert(Gep2);
1267   } else {
1268     // Use the induction element ptr.
1269     assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
1270     setDebugLocFromInst(Builder, Ptr);
1271     VectorParts &PtrVal = getVectorValue(Ptr);
1272     Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
1273   }
1274
1275   // Handle Stores:
1276   if (SI) {
1277     assert(!Legal->isUniform(SI->getPointerOperand()) &&
1278            "We do not allow storing to uniform addresses");
1279     setDebugLocFromInst(Builder, SI);
1280     // We don't want to update the value in the map as it might be used in
1281     // another expression. So don't use a reference type for "StoredVal".
1282     VectorParts StoredVal = getVectorValue(SI->getValueOperand());
1283
1284     for (unsigned Part = 0; Part < UF; ++Part) {
1285       // Calculate the pointer for the specific unroll-part.
1286       Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
1287
1288       if (Reverse) {
1289         // If we store to reverse consecutive memory locations then we need
1290         // to reverse the order of elements in the stored value.
1291         StoredVal[Part] = reverseVector(StoredVal[Part]);
1292         // If the address is consecutive but reversed, then the
1293         // wide store needs to start at the last vector element.
1294         PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
1295         PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
1296       }
1297
1298       Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
1299       Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment);
1300     }
1301     return;
1302   }
1303
1304   // Handle loads.
1305   assert(LI && "Must have a load instruction");
1306   setDebugLocFromInst(Builder, LI);
1307   for (unsigned Part = 0; Part < UF; ++Part) {
1308     // Calculate the pointer for the specific unroll-part.
1309     Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
1310
1311     if (Reverse) {
1312       // If the address is consecutive but reversed, then the
1313       // wide store needs to start at the last vector element.
1314       PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
1315       PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
1316     }
1317
1318     Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
1319     Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
1320     cast<LoadInst>(LI)->setAlignment(Alignment);
1321     Entry[Part] = Reverse ? reverseVector(LI) :  LI;
1322   }
1323 }
1324
1325 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
1326   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
1327   // Holds vector parameters or scalars, in case of uniform vals.
1328   SmallVector<VectorParts, 4> Params;
1329
1330   setDebugLocFromInst(Builder, Instr);
1331
1332   // Find all of the vectorized parameters.
1333   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
1334     Value *SrcOp = Instr->getOperand(op);
1335
1336     // If we are accessing the old induction variable, use the new one.
1337     if (SrcOp == OldInduction) {
1338       Params.push_back(getVectorValue(SrcOp));
1339       continue;
1340     }
1341
1342     // Try using previously calculated values.
1343     Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
1344
1345     // If the src is an instruction that appeared earlier in the basic block
1346     // then it should already be vectorized.
1347     if (SrcInst && OrigLoop->contains(SrcInst)) {
1348       assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
1349       // The parameter is a vector value from earlier.
1350       Params.push_back(WidenMap.get(SrcInst));
1351     } else {
1352       // The parameter is a scalar from outside the loop. Maybe even a constant.
1353       VectorParts Scalars;
1354       Scalars.append(UF, SrcOp);
1355       Params.push_back(Scalars);
1356     }
1357   }
1358
1359   assert(Params.size() == Instr->getNumOperands() &&
1360          "Invalid number of operands");
1361
1362   // Does this instruction return a value ?
1363   bool IsVoidRetTy = Instr->getType()->isVoidTy();
1364
1365   Value *UndefVec = IsVoidRetTy ? 0 :
1366     UndefValue::get(VectorType::get(Instr->getType(), VF));
1367   // Create a new entry in the WidenMap and initialize it to Undef or Null.
1368   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
1369
1370   // For each vector unroll 'part':
1371   for (unsigned Part = 0; Part < UF; ++Part) {
1372     // For each scalar that we create:
1373     for (unsigned Width = 0; Width < VF; ++Width) {
1374       Instruction *Cloned = Instr->clone();
1375       if (!IsVoidRetTy)
1376         Cloned->setName(Instr->getName() + ".cloned");
1377       // Replace the operands of the cloned instrucions with extracted scalars.
1378       for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
1379         Value *Op = Params[op][Part];
1380         // Param is a vector. Need to extract the right lane.
1381         if (Op->getType()->isVectorTy())
1382           Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
1383         Cloned->setOperand(op, Op);
1384       }
1385
1386       // Place the cloned scalar in the new loop.
1387       Builder.Insert(Cloned);
1388
1389       // If the original scalar returns a value we need to place it in a vector
1390       // so that future users will be able to use it.
1391       if (!IsVoidRetTy)
1392         VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
1393                                                        Builder.getInt32(Width));
1394     }
1395   }
1396 }
1397
1398 Instruction *
1399 InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
1400                                      Instruction *Loc) {
1401   LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
1402   Legal->getRuntimePointerCheck();
1403
1404   if (!PtrRtCheck->Need)
1405     return NULL;
1406
1407   unsigned NumPointers = PtrRtCheck->Pointers.size();
1408   SmallVector<TrackingVH<Value> , 2> Starts;
1409   SmallVector<TrackingVH<Value> , 2> Ends;
1410
1411   SCEVExpander Exp(*SE, "induction");
1412
1413   // Use this type for pointer arithmetic.
1414   Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);
1415
1416   for (unsigned i = 0; i < NumPointers; ++i) {
1417     Value *Ptr = PtrRtCheck->Pointers[i];
1418     const SCEV *Sc = SE->getSCEV(Ptr);
1419
1420     if (SE->isLoopInvariant(Sc, OrigLoop)) {
1421       DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
1422             *Ptr <<"\n");
1423       Starts.push_back(Ptr);
1424       Ends.push_back(Ptr);
1425     } else {
1426       DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
1427
1428       Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
1429       Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
1430       Starts.push_back(Start);
1431       Ends.push_back(End);
1432     }
1433   }
1434
1435   IRBuilder<> ChkBuilder(Loc);
1436   // Our instructions might fold to a constant.
1437   Value *MemoryRuntimeCheck = 0;
1438   for (unsigned i = 0; i < NumPointers; ++i) {
1439     for (unsigned j = i+1; j < NumPointers; ++j) {
1440       // No need to check if two readonly pointers intersect.
1441       if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
1442         continue;
1443
1444       // Only need to check pointers between two different dependency sets.
1445       if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
1446        continue;
1447
1448       Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
1449       Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
1450       Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy, "bc");
1451       Value *End1 =   ChkBuilder.CreateBitCast(Ends[j],   PtrArithTy, "bc");
1452
1453       Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
1454       Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
1455       Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
1456       if (MemoryRuntimeCheck)
1457         IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
1458                                          "conflict.rdx");
1459       MemoryRuntimeCheck = IsConflict;
1460     }
1461   }
1462
1463   // We have to do this trickery because the IRBuilder might fold the check to a
1464   // constant expression in which case there is no Instruction anchored in a
1465   // the block.
1466   LLVMContext &Ctx = Loc->getContext();
1467   Instruction * Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
1468                                                   ConstantInt::getTrue(Ctx));
1469   ChkBuilder.Insert(Check, "memcheck.conflict");
1470   return Check;
1471 }
1472
1473 void
1474 InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
1475   /*
1476    In this function we generate a new loop. The new loop will contain
1477    the vectorized instructions while the old loop will continue to run the
1478    scalar remainder.
1479
1480        [ ] <-- vector loop bypass (may consist of multiple blocks).
1481      /  |
1482     /   v
1483    |   [ ]     <-- vector pre header.
1484    |    |
1485    |    v
1486    |   [  ] \
1487    |   [  ]_|   <-- vector loop.
1488    |    |
1489     \   v
1490       >[ ]   <--- middle-block.
1491      /  |
1492     /   v
1493    |   [ ]     <--- new preheader.
1494    |    |
1495    |    v
1496    |   [ ] \
1497    |   [ ]_|   <-- old scalar loop to handle remainder.
1498     \   |
1499      \  v
1500       >[ ]     <-- exit block.
1501    ...
1502    */
1503
1504   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
1505   BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
1506   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
1507   assert(ExitBlock && "Must have an exit block");
1508
1509   // Some loops have a single integer induction variable, while other loops
1510   // don't. One example is c++ iterators that often have multiple pointer
1511   // induction variables. In the code below we also support a case where we
1512   // don't have a single induction variable.
1513   OldInduction = Legal->getInduction();
1514   Type *IdxTy = Legal->getWidestInductionType();
1515
1516   // Find the loop boundaries.
1517   const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
1518   assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
1519
1520   // Get the total trip count from the count by adding 1.
1521   ExitCount = SE->getAddExpr(ExitCount,
1522                              SE->getConstant(ExitCount->getType(), 1));
1523
1524   // Expand the trip count and place the new instructions in the preheader.
1525   // Notice that the pre-header does not change, only the loop body.
1526   SCEVExpander Exp(*SE, "induction");
1527
1528   // Count holds the overall loop count (N).
1529   Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
1530                                    BypassBlock->getTerminator());
1531
1532   // The loop index does not have to start at Zero. Find the original start
1533   // value from the induction PHI node. If we don't have an induction variable
1534   // then we know that it starts at zero.
1535   Builder.SetInsertPoint(BypassBlock->getTerminator());
1536   Value *StartIdx = ExtendedIdx = OldInduction ?
1537     Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),
1538                        IdxTy):
1539     ConstantInt::get(IdxTy, 0);
1540
1541   assert(BypassBlock && "Invalid loop structure");
1542   LoopBypassBlocks.push_back(BypassBlock);
1543
1544   // Split the single block loop into the two loop structure described above.
1545   BasicBlock *VectorPH =
1546   BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
1547   BasicBlock *VecBody =
1548   VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
1549   BasicBlock *MiddleBlock =
1550   VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
1551   BasicBlock *ScalarPH =
1552   MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
1553
1554   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
1555   // inside the loop.
1556   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
1557
1558   // Generate the induction variable.
1559   setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
1560   Induction = Builder.CreatePHI(IdxTy, 2, "index");
1561   // The loop step is equal to the vectorization factor (num of SIMD elements)
1562   // times the unroll factor (num of SIMD instructions).
1563   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
1564
1565   // This is the IR builder that we use to add all of the logic for bypassing
1566   // the new vector loop.
1567   IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
1568   setDebugLocFromInst(BypassBuilder,
1569                       getDebugLocFromInstOrOperands(OldInduction));
1570
1571   // We may need to extend the index in case there is a type mismatch.
1572   // We know that the count starts at zero and does not overflow.
1573   if (Count->getType() != IdxTy) {
1574     // The exit count can be of pointer type. Convert it to the correct
1575     // integer type.
1576     if (ExitCount->getType()->isPointerTy())
1577       Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
1578     else
1579       Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
1580   }
1581
1582   // Add the start index to the loop count to get the new end index.
1583   Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
1584
1585   // Now we need to generate the expression for N - (N % VF), which is
1586   // the part that the vectorized body will execute.
1587   Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
1588   Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
1589   Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
1590                                                      "end.idx.rnd.down");
1591
1592   // Now, compare the new count to zero. If it is zero skip the vector loop and
1593   // jump to the scalar loop.
1594   Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx,
1595                                           "cmp.zero");
1596
1597   BasicBlock *LastBypassBlock = BypassBlock;
1598
1599   // Generate the code that checks in runtime if arrays overlap. We put the
1600   // checks into a separate block to make the more common case of few elements
1601   // faster.
1602   Instruction *MemRuntimeCheck = addRuntimeCheck(Legal,
1603                                                  BypassBlock->getTerminator());
1604   if (MemRuntimeCheck) {
1605     // Create a new block containing the memory check.
1606     BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck,
1607                                                           "vector.memcheck");
1608     LoopBypassBlocks.push_back(CheckBlock);
1609
1610     // Replace the branch into the memory check block with a conditional branch
1611     // for the "few elements case".
1612     Instruction *OldTerm = BypassBlock->getTerminator();
1613     BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
1614     OldTerm->eraseFromParent();
1615
1616     Cmp = MemRuntimeCheck;
1617     LastBypassBlock = CheckBlock;
1618   }
1619
1620   LastBypassBlock->getTerminator()->eraseFromParent();
1621   BranchInst::Create(MiddleBlock, VectorPH, Cmp,
1622                      LastBypassBlock);
1623
1624   // We are going to resume the execution of the scalar loop.
1625   // Go over all of the induction variables that we found and fix the
1626   // PHIs that are left in the scalar version of the loop.
1627   // The starting values of PHI nodes depend on the counter of the last
1628   // iteration in the vectorized loop.
1629   // If we come from a bypass edge then we need to start from the original
1630   // start value.
1631
1632   // This variable saves the new starting index for the scalar loop.
1633   PHINode *ResumeIndex = 0;
1634   LoopVectorizationLegality::InductionList::iterator I, E;
1635   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
1636   // Set builder to point to last bypass block.
1637   BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
1638   for (I = List->begin(), E = List->end(); I != E; ++I) {
1639     PHINode *OrigPhi = I->first;
1640     LoopVectorizationLegality::InductionInfo II = I->second;
1641
1642     Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();
1643     PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",
1644                                          MiddleBlock->getTerminator());
1645     // We might have extended the type of the induction variable but we need a
1646     // truncated version for the scalar loop.
1647     PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
1648       PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
1649                       MiddleBlock->getTerminator()) : 0;
1650
1651     Value *EndValue = 0;
1652     switch (II.IK) {
1653     case LoopVectorizationLegality::IK_NoInduction:
1654       llvm_unreachable("Unknown induction");
1655     case LoopVectorizationLegality::IK_IntInduction: {
1656       // Handle the integer induction counter.
1657       assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
1658
1659       // We have the canonical induction variable.
1660       if (OrigPhi == OldInduction) {
1661         // Create a truncated version of the resume value for the scalar loop,
1662         // we might have promoted the type to a larger width.
1663         EndValue =
1664           BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
1665         // The new PHI merges the original incoming value, in case of a bypass,
1666         // or the value at the end of the vectorized loop.
1667         for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
1668           TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
1669         TruncResumeVal->addIncoming(EndValue, VecBody);
1670
1671         // We know what the end value is.
1672         EndValue = IdxEndRoundDown;
1673         // We also know which PHI node holds it.
1674         ResumeIndex = ResumeVal;
1675         break;
1676       }
1677
1678       // Not the canonical induction variable - add the vector loop count to the
1679       // start value.
1680       Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
1681                                                    II.StartValue->getType(),
1682                                                    "cast.crd");
1683       EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end");
1684       break;
1685     }
1686     case LoopVectorizationLegality::IK_ReverseIntInduction: {
1687       // Convert the CountRoundDown variable to the PHI size.
1688       Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
1689                                                    II.StartValue->getType(),
1690                                                    "cast.crd");
1691       // Handle reverse integer induction counter.
1692       EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end");
1693       break;
1694     }
1695     case LoopVectorizationLegality::IK_PtrInduction: {
1696       // For pointer induction variables, calculate the offset using
1697       // the end index.
1698       EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown,
1699                                          "ptr.ind.end");
1700       break;
1701     }
1702     case LoopVectorizationLegality::IK_ReversePtrInduction: {
1703       // The value at the end of the loop for the reverse pointer is calculated
1704       // by creating a GEP with a negative index starting from the start value.
1705       Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0);
1706       Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown,
1707                                               "rev.ind.end");
1708       EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx,
1709                                          "rev.ptr.ind.end");
1710       break;
1711     }
1712     }// end of case
1713
1714     // The new PHI merges the original incoming value, in case of a bypass,
1715     // or the value at the end of the vectorized loop.
1716     for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) {
1717       if (OrigPhi == OldInduction)
1718         ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
1719       else
1720         ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
1721     }
1722     ResumeVal->addIncoming(EndValue, VecBody);
1723
1724     // Fix the scalar body counter (PHI node).
1725     unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
1726     // The old inductions phi node in the scalar body needs the truncated value.
1727     if (OrigPhi == OldInduction)
1728       OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal);
1729     else
1730       OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
1731   }
1732
1733   // If we are generating a new induction variable then we also need to
1734   // generate the code that calculates the exit value. This value is not
1735   // simply the end of the counter because we may skip the vectorized body
1736   // in case of a runtime check.
1737   if (!OldInduction){
1738     assert(!ResumeIndex && "Unexpected resume value found");
1739     ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
1740                                   MiddleBlock->getTerminator());
1741     for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
1742       ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
1743     ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
1744   }
1745
1746   // Make sure that we found the index where scalar loop needs to continue.
1747   assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
1748          "Invalid resume Index");
1749
1750   // Add a check in the middle block to see if we have completed
1751   // all of the iterations in the first vector loop.
1752   // If (N - N%VF) == N, then we *don't* need to run the remainder.
1753   Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
1754                                 ResumeIndex, "cmp.n",
1755                                 MiddleBlock->getTerminator());
1756
1757   BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
1758   // Remove the old terminator.
1759   MiddleBlock->getTerminator()->eraseFromParent();
1760
1761   // Create i+1 and fill the PHINode.
1762   Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
1763   Induction->addIncoming(StartIdx, VectorPH);
1764   Induction->addIncoming(NextIdx, VecBody);
1765   // Create the compare.
1766   Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
1767   Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
1768
1769   // Now we have two terminators. Remove the old one from the block.
1770   VecBody->getTerminator()->eraseFromParent();
1771
1772   // Get ready to start creating new instructions into the vectorized body.
1773   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
1774
1775   // Create and register the new vector loop.
1776   Loop* Lp = new Loop();
1777   Loop *ParentLoop = OrigLoop->getParentLoop();
1778
1779   // Insert the new loop into the loop nest and register the new basic blocks.
1780   if (ParentLoop) {
1781     ParentLoop->addChildLoop(Lp);
1782     for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
1783       ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase());
1784     ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
1785     ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
1786     ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
1787   } else {
1788     LI->addTopLevelLoop(Lp);
1789   }
1790
1791   Lp->addBasicBlockToLoop(VecBody, LI->getBase());
1792
1793   // Save the state.
1794   LoopVectorPreHeader = VectorPH;
1795   LoopScalarPreHeader = ScalarPH;
1796   LoopMiddleBlock = MiddleBlock;
1797   LoopExitBlock = ExitBlock;
1798   LoopVectorBody = VecBody;
1799   LoopScalarBody = OldBasicBlock;
1800 }
1801
1802 /// This function returns the identity element (or neutral element) for
1803 /// the operation K.
1804 Constant*
1805 LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) {
1806   switch (K) {
1807   case RK_IntegerXor:
1808   case RK_IntegerAdd:
1809   case RK_IntegerOr:
1810     // Adding, Xoring, Oring zero to a number does not change it.
1811     return ConstantInt::get(Tp, 0);
1812   case RK_IntegerMult:
1813     // Multiplying a number by 1 does not change it.
1814     return ConstantInt::get(Tp, 1);
1815   case RK_IntegerAnd:
1816     // AND-ing a number with an all-1 value does not change it.
1817     return ConstantInt::get(Tp, -1, true);
1818   case  RK_FloatMult:
1819     // Multiplying a number by 1 does not change it.
1820     return ConstantFP::get(Tp, 1.0L);
1821   case  RK_FloatAdd:
1822     // Adding zero to a number does not change it.
1823     return ConstantFP::get(Tp, 0.0L);
1824   default:
1825     llvm_unreachable("Unknown reduction kind");
1826   }
1827 }
1828
1829 static Intrinsic::ID
1830 getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
1831   // If we have an intrinsic call, check if it is trivially vectorizable.
1832   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
1833     switch (II->getIntrinsicID()) {
1834     case Intrinsic::sqrt:
1835     case Intrinsic::sin:
1836     case Intrinsic::cos:
1837     case Intrinsic::exp:
1838     case Intrinsic::exp2:
1839     case Intrinsic::log:
1840     case Intrinsic::log10:
1841     case Intrinsic::log2:
1842     case Intrinsic::fabs:
1843     case Intrinsic::floor:
1844     case Intrinsic::ceil:
1845     case Intrinsic::trunc:
1846     case Intrinsic::rint:
1847     case Intrinsic::nearbyint:
1848     case Intrinsic::pow:
1849     case Intrinsic::fma:
1850     case Intrinsic::fmuladd:
1851       return II->getIntrinsicID();
1852     default:
1853       return Intrinsic::not_intrinsic;
1854     }
1855   }
1856
1857   if (!TLI)
1858     return Intrinsic::not_intrinsic;
1859
1860   LibFunc::Func Func;
1861   Function *F = CI->getCalledFunction();
1862   // We're going to make assumptions on the semantics of the functions, check
1863   // that the target knows that it's available in this environment.
1864   if (!F || !TLI->getLibFunc(F->getName(), Func))
1865     return Intrinsic::not_intrinsic;
1866
1867   // Otherwise check if we have a call to a function that can be turned into a
1868   // vector intrinsic.
1869   switch (Func) {
1870   default:
1871     break;
1872   case LibFunc::sin:
1873   case LibFunc::sinf:
1874   case LibFunc::sinl:
1875     return Intrinsic::sin;
1876   case LibFunc::cos:
1877   case LibFunc::cosf:
1878   case LibFunc::cosl:
1879     return Intrinsic::cos;
1880   case LibFunc::exp:
1881   case LibFunc::expf:
1882   case LibFunc::expl:
1883     return Intrinsic::exp;
1884   case LibFunc::exp2:
1885   case LibFunc::exp2f:
1886   case LibFunc::exp2l:
1887     return Intrinsic::exp2;
1888   case LibFunc::log:
1889   case LibFunc::logf:
1890   case LibFunc::logl:
1891     return Intrinsic::log;
1892   case LibFunc::log10:
1893   case LibFunc::log10f:
1894   case LibFunc::log10l:
1895     return Intrinsic::log10;
1896   case LibFunc::log2:
1897   case LibFunc::log2f:
1898   case LibFunc::log2l:
1899     return Intrinsic::log2;
1900   case LibFunc::fabs:
1901   case LibFunc::fabsf:
1902   case LibFunc::fabsl:
1903     return Intrinsic::fabs;
1904   case LibFunc::floor:
1905   case LibFunc::floorf:
1906   case LibFunc::floorl:
1907     return Intrinsic::floor;
1908   case LibFunc::ceil:
1909   case LibFunc::ceilf:
1910   case LibFunc::ceill:
1911     return Intrinsic::ceil;
1912   case LibFunc::trunc:
1913   case LibFunc::truncf:
1914   case LibFunc::truncl:
1915     return Intrinsic::trunc;
1916   case LibFunc::rint:
1917   case LibFunc::rintf:
1918   case LibFunc::rintl:
1919     return Intrinsic::rint;
1920   case LibFunc::nearbyint:
1921   case LibFunc::nearbyintf:
1922   case LibFunc::nearbyintl:
1923     return Intrinsic::nearbyint;
1924   case LibFunc::pow:
1925   case LibFunc::powf:
1926   case LibFunc::powl:
1927     return Intrinsic::pow;
1928   }
1929
1930   return Intrinsic::not_intrinsic;
1931 }
1932
1933 /// This function translates the reduction kind to an LLVM binary operator.
1934 static unsigned
1935 getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
1936   switch (Kind) {
1937     case LoopVectorizationLegality::RK_IntegerAdd:
1938       return Instruction::Add;
1939     case LoopVectorizationLegality::RK_IntegerMult:
1940       return Instruction::Mul;
1941     case LoopVectorizationLegality::RK_IntegerOr:
1942       return Instruction::Or;
1943     case LoopVectorizationLegality::RK_IntegerAnd:
1944       return Instruction::And;
1945     case LoopVectorizationLegality::RK_IntegerXor:
1946       return Instruction::Xor;
1947     case LoopVectorizationLegality::RK_FloatMult:
1948       return Instruction::FMul;
1949     case LoopVectorizationLegality::RK_FloatAdd:
1950       return Instruction::FAdd;
1951     case LoopVectorizationLegality::RK_IntegerMinMax:
1952       return Instruction::ICmp;
1953     case LoopVectorizationLegality::RK_FloatMinMax:
1954       return Instruction::FCmp;
1955     default:
1956       llvm_unreachable("Unknown reduction operation");
1957   }
1958 }
1959
1960 Value *createMinMaxOp(IRBuilder<> &Builder,
1961                       LoopVectorizationLegality::MinMaxReductionKind RK,
1962                       Value *Left,
1963                       Value *Right) {
1964   CmpInst::Predicate P = CmpInst::ICMP_NE;
1965   switch (RK) {
1966   default:
1967     llvm_unreachable("Unknown min/max reduction kind");
1968   case LoopVectorizationLegality::MRK_UIntMin:
1969     P = CmpInst::ICMP_ULT;
1970     break;
1971   case LoopVectorizationLegality::MRK_UIntMax:
1972     P = CmpInst::ICMP_UGT;
1973     break;
1974   case LoopVectorizationLegality::MRK_SIntMin:
1975     P = CmpInst::ICMP_SLT;
1976     break;
1977   case LoopVectorizationLegality::MRK_SIntMax:
1978     P = CmpInst::ICMP_SGT;
1979     break;
1980   case LoopVectorizationLegality::MRK_FloatMin:
1981     P = CmpInst::FCMP_OLT;
1982     break;
1983   case LoopVectorizationLegality::MRK_FloatMax:
1984     P = CmpInst::FCMP_OGT;
1985     break;
1986   }
1987
1988   Value *Cmp;
1989   if (RK == LoopVectorizationLegality::MRK_FloatMin || RK == LoopVectorizationLegality::MRK_FloatMax)
1990     Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp");
1991   else
1992     Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
1993
1994   Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
1995   return Select;
1996 }
1997
1998 void
1999 InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
2000   //===------------------------------------------------===//
2001   //
2002   // Notice: any optimization or new instruction that go
2003   // into the code below should be also be implemented in
2004   // the cost-model.
2005   //
2006   //===------------------------------------------------===//
2007   Constant *Zero = Builder.getInt32(0);
2008
2009   // In order to support reduction variables we need to be able to vectorize
2010   // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
2011   // stages. First, we create a new vector PHI node with no incoming edges.
2012   // We use this value when we vectorize all of the instructions that use the
2013   // PHI. Next, after all of the instructions in the block are complete we
2014   // add the new incoming edges to the PHI. At this point all of the
2015   // instructions in the basic block are vectorized, so we can use them to
2016   // construct the PHI.
2017   PhiVector RdxPHIsToFix;
2018
2019   // Scan the loop in a topological order to ensure that defs are vectorized
2020   // before users.
2021   LoopBlocksDFS DFS(OrigLoop);
2022   DFS.perform(LI);
2023
2024   // Vectorize all of the blocks in the original loop.
2025   for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
2026        be = DFS.endRPO(); bb != be; ++bb)
2027     vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix);
2028
2029   // At this point every instruction in the original loop is widened to
2030   // a vector form. We are almost done. Now, we need to fix the PHI nodes
2031   // that we vectorized. The PHI nodes are currently empty because we did
2032   // not want to introduce cycles. Notice that the remaining PHI nodes
2033   // that we need to fix are reduction variables.
2034
2035   // Create the 'reduced' values for each of the induction vars.
2036   // The reduced values are the vector values that we scalarize and combine
2037   // after the loop is finished.
2038   for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
2039        it != e; ++it) {
2040     PHINode *RdxPhi = *it;
2041     assert(RdxPhi && "Unable to recover vectorized PHI");
2042
2043     // Find the reduction variable descriptor.
2044     assert(Legal->getReductionVars()->count(RdxPhi) &&
2045            "Unable to find the reduction variable");
2046     LoopVectorizationLegality::ReductionDescriptor RdxDesc =
2047     (*Legal->getReductionVars())[RdxPhi];
2048
2049     setDebugLocFromInst(Builder, RdxDesc.StartValue);
2050
2051     // We need to generate a reduction vector from the incoming scalar.
2052     // To do so, we need to generate the 'identity' vector and overide
2053     // one of the elements with the incoming scalar reduction. We need
2054     // to do it in the vector-loop preheader.
2055     Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
2056
2057     // This is the vector-clone of the value that leaves the loop.
2058     VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
2059     Type *VecTy = VectorExit[0]->getType();
2060
2061     // Find the reduction identity variable. Zero for addition, or, xor,
2062     // one for multiplication, -1 for And.
2063     Value *Identity;
2064     Value *VectorStart;
2065     if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
2066         RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
2067       // MinMax reduction have the start value as their identify.
2068       VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue,
2069                                                          "minmax.ident");
2070     } else {
2071       Constant *Iden =
2072         LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
2073                                                         VecTy->getScalarType());
2074       Identity = ConstantVector::getSplat(VF, Iden);
2075
2076       // This vector is the Identity vector where the first element is the
2077       // incoming scalar reduction.
2078       VectorStart = Builder.CreateInsertElement(Identity,
2079                                                 RdxDesc.StartValue, Zero);
2080     }
2081
2082     // Fix the vector-loop phi.
2083     // We created the induction variable so we know that the
2084     // preheader is the first entry.
2085     BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
2086
2087     // Reductions do not have to start at zero. They can start with
2088     // any loop invariant values.
2089     VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
2090     BasicBlock *Latch = OrigLoop->getLoopLatch();
2091     Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
2092     VectorParts &Val = getVectorValue(LoopVal);
2093     for (unsigned part = 0; part < UF; ++part) {
2094       // Make sure to add the reduction stat value only to the
2095       // first unroll part.
2096       Value *StartVal = (part == 0) ? VectorStart : Identity;
2097       cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
2098       cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
2099     }
2100
2101     // Before each round, move the insertion point right between
2102     // the PHIs and the values we are going to write.
2103     // This allows us to write both PHINodes and the extractelement
2104     // instructions.
2105     Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
2106
2107     VectorParts RdxParts;
2108     setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr);
2109     for (unsigned part = 0; part < UF; ++part) {
2110       // This PHINode contains the vectorized reduction variable, or
2111       // the initial value vector, if we bypass the vector loop.
2112       VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
2113       PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
2114       Value *StartVal = (part == 0) ? VectorStart : Identity;
2115       for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
2116         NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
2117       NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
2118       RdxParts.push_back(NewPhi);
2119     }
2120
2121     // Reduce all of the unrolled parts into a single vector.
2122     Value *ReducedPartRdx = RdxParts[0];
2123     unsigned Op = getReductionBinOp(RdxDesc.Kind);
2124     setDebugLocFromInst(Builder, ReducedPartRdx);
2125     for (unsigned part = 1; part < UF; ++part) {
2126       if (Op != Instruction::ICmp && Op != Instruction::FCmp)
2127         ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
2128                                              RdxParts[part], ReducedPartRdx,
2129                                              "bin.rdx");
2130       else
2131         ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
2132                                         ReducedPartRdx, RdxParts[part]);
2133     }
2134
2135     // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
2136     // and vector ops, reducing the set of values being computed by half each
2137     // round.
2138     assert(isPowerOf2_32(VF) &&
2139            "Reduction emission only supported for pow2 vectors!");
2140     Value *TmpVec = ReducedPartRdx;
2141     SmallVector<Constant*, 32> ShuffleMask(VF, 0);
2142     for (unsigned i = VF; i != 1; i >>= 1) {
2143       // Move the upper half of the vector to the lower half.
2144       for (unsigned j = 0; j != i/2; ++j)
2145         ShuffleMask[j] = Builder.getInt32(i/2 + j);
2146
2147       // Fill the rest of the mask with undef.
2148       std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
2149                 UndefValue::get(Builder.getInt32Ty()));
2150
2151       Value *Shuf =
2152         Builder.CreateShuffleVector(TmpVec,
2153                                     UndefValue::get(TmpVec->getType()),
2154                                     ConstantVector::get(ShuffleMask),
2155                                     "rdx.shuf");
2156
2157       if (Op != Instruction::ICmp && Op != Instruction::FCmp)
2158         TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
2159                                      "bin.rdx");
2160       else
2161         TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
2162     }
2163
2164     // The result is in the first element of the vector.
2165     Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
2166
2167     // Now, we need to fix the users of the reduction variable
2168     // inside and outside of the scalar remainder loop.
2169     // We know that the loop is in LCSSA form. We need to update the
2170     // PHI nodes in the exit blocks.
2171     for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
2172          LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
2173       PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
2174       if (!LCSSAPhi) continue;
2175
2176       // All PHINodes need to have a single entry edge, or two if
2177       // we already fixed them.
2178       assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
2179
2180       // We found our reduction value exit-PHI. Update it with the
2181       // incoming bypass edge.
2182       if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
2183         // Add an edge coming from the bypass.
2184         LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
2185         break;
2186       }
2187     }// end of the LCSSA phi scan.
2188
2189     // Fix the scalar loop reduction variable with the incoming reduction sum
2190     // from the vector body and from the backedge value.
2191     int IncomingEdgeBlockIdx =
2192     (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
2193     assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
2194     // Pick the other block.
2195     int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
2196     (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
2197     (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
2198   }// end of for each redux variable.
2199
2200   // The Loop exit block may have single value PHI nodes where the incoming
2201   // value is 'undef'. While vectorizing we only handled real values that
2202   // were defined inside the loop. Here we handle the 'undef case'.
2203   // See PR14725.
2204   for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
2205        LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
2206     PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
2207     if (!LCSSAPhi) continue;
2208     if (LCSSAPhi->getNumIncomingValues() == 1)
2209       LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
2210                             LoopMiddleBlock);
2211   }
2212 }
2213
2214 InnerLoopVectorizer::VectorParts
2215 InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
2216   assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
2217          "Invalid edge");
2218
2219   // Look for cached value.
2220   std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst);
2221   EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
2222   if (ECEntryIt != MaskCache.end())
2223     return ECEntryIt->second;
2224
2225   VectorParts SrcMask = createBlockInMask(Src);
2226
2227   // The terminator has to be a branch inst!
2228   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
2229   assert(BI && "Unexpected terminator found");
2230
2231   if (BI->isConditional()) {
2232     VectorParts EdgeMask = getVectorValue(BI->getCondition());
2233
2234     if (BI->getSuccessor(0) != Dst)
2235       for (unsigned part = 0; part < UF; ++part)
2236         EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
2237
2238     for (unsigned part = 0; part < UF; ++part)
2239       EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
2240
2241     MaskCache[Edge] = EdgeMask;
2242     return EdgeMask;
2243   }
2244
2245   MaskCache[Edge] = SrcMask;
2246   return SrcMask;
2247 }
2248
2249 InnerLoopVectorizer::VectorParts
2250 InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
2251   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
2252
2253   // Loop incoming mask is all-one.
2254   if (OrigLoop->getHeader() == BB) {
2255     Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
2256     return getVectorValue(C);
2257   }
2258
2259   // This is the block mask. We OR all incoming edges, and with zero.
2260   Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
2261   VectorParts BlockMask = getVectorValue(Zero);
2262
2263   // For each pred:
2264   for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
2265     VectorParts EM = createEdgeMask(*it, BB);
2266     for (unsigned part = 0; part < UF; ++part)
2267       BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
2268   }
2269
2270   return BlockMask;
2271 }
2272
2273 void
2274 InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
2275                                           BasicBlock *BB, PhiVector *PV) {
2276   // For each instruction in the old loop.
2277   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
2278     VectorParts &Entry = WidenMap.get(it);
2279     switch (it->getOpcode()) {
2280     case Instruction::Br:
2281       // Nothing to do for PHIs and BR, since we already took care of the
2282       // loop control flow instructions.
2283       continue;
2284     case Instruction::PHI:{
2285       PHINode* P = cast<PHINode>(it);
2286       // Handle reduction variables:
2287       if (Legal->getReductionVars()->count(P)) {
2288         for (unsigned part = 0; part < UF; ++part) {
2289           // This is phase one of vectorizing PHIs.
2290           Type *VecTy = VectorType::get(it->getType(), VF);
2291           Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
2292                                         LoopVectorBody-> getFirstInsertionPt());
2293         }
2294         PV->push_back(P);
2295         continue;
2296       }
2297
2298       setDebugLocFromInst(Builder, P);
2299       // Check for PHI nodes that are lowered to vector selects.
2300       if (P->getParent() != OrigLoop->getHeader()) {
2301         // We know that all PHIs in non header blocks are converted into
2302         // selects, so we don't have to worry about the insertion order and we
2303         // can just use the builder.
2304         // At this point we generate the predication tree. There may be
2305         // duplications since this is a simple recursive scan, but future
2306         // optimizations will clean it up.
2307
2308         unsigned NumIncoming = P->getNumIncomingValues();
2309
2310         // Generate a sequence of selects of the form:
2311         // SELECT(Mask3, In3,
2312         //      SELECT(Mask2, In2,
2313         //                   ( ...)))
2314         for (unsigned In = 0; In < NumIncoming; In++) {
2315           VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
2316                                             P->getParent());
2317           VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
2318
2319           for (unsigned part = 0; part < UF; ++part) {
2320             // We might have single edge PHIs (blocks) - use an identity
2321             // 'select' for the first PHI operand.
2322             if (In == 0)
2323               Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
2324                                                  In0[part]);
2325             else
2326               // Select between the current value and the previous incoming edge
2327               // based on the incoming mask.
2328               Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
2329                                                  Entry[part], "predphi");
2330           }
2331         }
2332         continue;
2333       }
2334
2335       // This PHINode must be an induction variable.
2336       // Make sure that we know about it.
2337       assert(Legal->getInductionVars()->count(P) &&
2338              "Not an induction variable");
2339
2340       LoopVectorizationLegality::InductionInfo II =
2341         Legal->getInductionVars()->lookup(P);
2342
2343       switch (II.IK) {
2344       case LoopVectorizationLegality::IK_NoInduction:
2345         llvm_unreachable("Unknown induction");
2346       case LoopVectorizationLegality::IK_IntInduction: {
2347         assert(P->getType() == II.StartValue->getType() && "Types must match");
2348         Type *PhiTy = P->getType();
2349         Value *Broadcasted;
2350         if (P == OldInduction) {
2351           // Handle the canonical induction variable. We might have had to
2352           // extend the type.
2353           Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
2354         } else {
2355           // Handle other induction variables that are now based on the
2356           // canonical one.
2357           Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
2358                                                    "normalized.idx");
2359           NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
2360           Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
2361                                           "offset.idx");
2362         }
2363         Broadcasted = getBroadcastInstrs(Broadcasted);
2364         // After broadcasting the induction variable we need to make the vector
2365         // consecutive by adding 0, 1, 2, etc.
2366         for (unsigned part = 0; part < UF; ++part)
2367           Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
2368         continue;
2369       }
2370       case LoopVectorizationLegality::IK_ReverseIntInduction:
2371       case LoopVectorizationLegality::IK_PtrInduction:
2372       case LoopVectorizationLegality::IK_ReversePtrInduction:
2373         // Handle reverse integer and pointer inductions.
2374         Value *StartIdx = ExtendedIdx;
2375         // This is the normalized GEP that starts counting at zero.
2376         Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
2377                                                  "normalized.idx");
2378
2379         // Handle the reverse integer induction variable case.
2380         if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
2381           IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
2382           Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
2383                                                  "resize.norm.idx");
2384           Value *ReverseInd  = Builder.CreateSub(II.StartValue, CNI,
2385                                                  "reverse.idx");
2386
2387           // This is a new value so do not hoist it out.
2388           Value *Broadcasted = getBroadcastInstrs(ReverseInd);
2389           // After broadcasting the induction variable we need to make the
2390           // vector consecutive by adding  ... -3, -2, -1, 0.
2391           for (unsigned part = 0; part < UF; ++part)
2392             Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
2393                                                true);
2394           continue;
2395         }
2396
2397         // Handle the pointer induction variable case.
2398         assert(P->getType()->isPointerTy() && "Unexpected type.");
2399
2400         // Is this a reverse induction ptr or a consecutive induction ptr.
2401         bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
2402                         II.IK);
2403
2404         // This is the vector of results. Notice that we don't generate
2405         // vector geps because scalar geps result in better code.
2406         for (unsigned part = 0; part < UF; ++part) {
2407           Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
2408           for (unsigned int i = 0; i < VF; ++i) {
2409             int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
2410             Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
2411             Value *GlobalIdx;
2412             if (!Reverse)
2413               GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
2414             else
2415               GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
2416
2417             Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
2418                                                "next.gep");
2419             VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
2420                                                  Builder.getInt32(i),
2421                                                  "insert.gep");
2422           }
2423           Entry[part] = VecVal;
2424         }
2425         continue;
2426       }
2427
2428     }// End of PHI.
2429
2430     case Instruction::Add:
2431     case Instruction::FAdd:
2432     case Instruction::Sub:
2433     case Instruction::FSub:
2434     case Instruction::Mul:
2435     case Instruction::FMul:
2436     case Instruction::UDiv:
2437     case Instruction::SDiv:
2438     case Instruction::FDiv:
2439     case Instruction::URem:
2440     case Instruction::SRem:
2441     case Instruction::FRem:
2442     case Instruction::Shl:
2443     case Instruction::LShr:
2444     case Instruction::AShr:
2445     case Instruction::And:
2446     case Instruction::Or:
2447     case Instruction::Xor: {
2448       // Just widen binops.
2449       BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
2450       setDebugLocFromInst(Builder, BinOp);
2451       VectorParts &A = getVectorValue(it->getOperand(0));
2452       VectorParts &B = getVectorValue(it->getOperand(1));
2453
2454       // Use this vector value for all users of the original instruction.
2455       for (unsigned Part = 0; Part < UF; ++Part) {
2456         Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
2457
2458         // Update the NSW, NUW and Exact flags. Notice: V can be an Undef.
2459         BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V);
2460         if (VecOp && isa<OverflowingBinaryOperator>(BinOp)) {
2461           VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
2462           VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
2463         }
2464         if (VecOp && isa<PossiblyExactOperator>(VecOp))
2465           VecOp->setIsExact(BinOp->isExact());
2466
2467         Entry[Part] = V;
2468       }
2469       break;
2470     }
2471     case Instruction::Select: {
2472       // Widen selects.
2473       // If the selector is loop invariant we can create a select
2474       // instruction with a scalar condition. Otherwise, use vector-select.
2475       bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
2476                                                OrigLoop);
2477       setDebugLocFromInst(Builder, it);
2478
2479       // The condition can be loop invariant  but still defined inside the
2480       // loop. This means that we can't just use the original 'cond' value.
2481       // We have to take the 'vectorized' value and pick the first lane.
2482       // Instcombine will make this a no-op.
2483       VectorParts &Cond = getVectorValue(it->getOperand(0));
2484       VectorParts &Op0  = getVectorValue(it->getOperand(1));
2485       VectorParts &Op1  = getVectorValue(it->getOperand(2));
2486       Value *ScalarCond = Builder.CreateExtractElement(Cond[0],
2487                                                        Builder.getInt32(0));
2488       for (unsigned Part = 0; Part < UF; ++Part) {
2489         Entry[Part] = Builder.CreateSelect(
2490           InvariantCond ? ScalarCond : Cond[Part],
2491           Op0[Part],
2492           Op1[Part]);
2493       }
2494       break;
2495     }
2496
2497     case Instruction::ICmp:
2498     case Instruction::FCmp: {
2499       // Widen compares. Generate vector compares.
2500       bool FCmp = (it->getOpcode() == Instruction::FCmp);
2501       CmpInst *Cmp = dyn_cast<CmpInst>(it);
2502       setDebugLocFromInst(Builder, it);
2503       VectorParts &A = getVectorValue(it->getOperand(0));
2504       VectorParts &B = getVectorValue(it->getOperand(1));
2505       for (unsigned Part = 0; Part < UF; ++Part) {
2506         Value *C = 0;
2507         if (FCmp)
2508           C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
2509         else
2510           C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
2511         Entry[Part] = C;
2512       }
2513       break;
2514     }
2515
2516     case Instruction::Store:
2517     case Instruction::Load:
2518         vectorizeMemoryInstruction(it, Legal);
2519         break;
2520     case Instruction::ZExt:
2521     case Instruction::SExt:
2522     case Instruction::FPToUI:
2523     case Instruction::FPToSI:
2524     case Instruction::FPExt:
2525     case Instruction::PtrToInt:
2526     case Instruction::IntToPtr:
2527     case Instruction::SIToFP:
2528     case Instruction::UIToFP:
2529     case Instruction::Trunc:
2530     case Instruction::FPTrunc:
2531     case Instruction::BitCast: {
2532       CastInst *CI = dyn_cast<CastInst>(it);
2533       setDebugLocFromInst(Builder, it);
2534       /// Optimize the special case where the source is the induction
2535       /// variable. Notice that we can only optimize the 'trunc' case
2536       /// because: a. FP conversions lose precision, b. sext/zext may wrap,
2537       /// c. other casts depend on pointer size.
2538       if (CI->getOperand(0) == OldInduction &&
2539           it->getOpcode() == Instruction::Trunc) {
2540         Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
2541                                                CI->getType());
2542         Value *Broadcasted = getBroadcastInstrs(ScalarCast);
2543         for (unsigned Part = 0; Part < UF; ++Part)
2544           Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false);
2545         break;
2546       }
2547       /// Vectorize casts.
2548       Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
2549
2550       VectorParts &A = getVectorValue(it->getOperand(0));
2551       for (unsigned Part = 0; Part < UF; ++Part)
2552         Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
2553       break;
2554     }
2555
2556     case Instruction::Call: {
2557       // Ignore dbg intrinsics.
2558       if (isa<DbgInfoIntrinsic>(it))
2559         break;
2560       setDebugLocFromInst(Builder, it);
2561
2562       Module *M = BB->getParent()->getParent();
2563       CallInst *CI = cast<CallInst>(it);
2564       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
2565       assert(ID && "Not an intrinsic call!");
2566       for (unsigned Part = 0; Part < UF; ++Part) {
2567         SmallVector<Value*, 4> Args;
2568         for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
2569           VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
2570           Args.push_back(Arg[Part]);
2571         }
2572         Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) };
2573         Function *F = Intrinsic::getDeclaration(M, ID, Tys);
2574         Entry[Part] = Builder.CreateCall(F, Args);
2575       }
2576       break;
2577     }
2578
2579     default:
2580       // All other instructions are unsupported. Scalarize them.
2581       scalarizeInstruction(it);
2582       break;
2583     }// end of switch.
2584   }// end of for_each instr.
2585 }
2586
2587 void InnerLoopVectorizer::updateAnalysis() {
2588   // Forget the original basic block.
2589   SE->forgetLoop(OrigLoop);
2590
2591   // Update the dominator tree information.
2592   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
2593          "Entry does not dominate exit.");
2594
2595   for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
2596     DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
2597   DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
2598   DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
2599   DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
2600   DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
2601   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
2602   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2603
2604   DEBUG(DT->verifyAnalysis());
2605 }
2606
2607 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
2608   if (!EnableIfConversion)
2609     return false;
2610
2611   assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
2612   std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
2613
2614   // Collect the blocks that need predication.
2615   for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
2616     BasicBlock *BB = LoopBlocks[i];
2617
2618     // We don't support switch statements inside loops.
2619     if (!isa<BranchInst>(BB->getTerminator()))
2620       return false;
2621
2622     // We must be able to predicate all blocks that need to be predicated.
2623     if (blockNeedsPredication(BB) && !blockCanBePredicated(BB))
2624       return false;
2625   }
2626
2627   // Check that we can actually speculate the hoistable loads.
2628   if (!LoadSpeculation.canHoistAllLoads())
2629     return false;
2630
2631   // We can if-convert this loop.
2632   return true;
2633 }
2634
2635 bool LoopVectorizationLegality::canVectorize() {
2636   // We must have a loop in canonical form. Loops with indirectbr in them cannot
2637   // be canonicalized.
2638   if (!TheLoop->getLoopPreheader())
2639     return false;
2640
2641   // We can only vectorize innermost loops.
2642   if (TheLoop->getSubLoopsVector().size())
2643     return false;
2644
2645   // We must have a single backedge.
2646   if (TheLoop->getNumBackEdges() != 1)
2647     return false;
2648
2649   // We must have a single exiting block.
2650   if (!TheLoop->getExitingBlock())
2651     return false;
2652
2653   unsigned NumBlocks = TheLoop->getNumBlocks();
2654
2655   // Check if we can if-convert non single-bb loops.
2656   if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
2657     DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
2658     return false;
2659   }
2660
2661   // We need to have a loop header.
2662   BasicBlock *Latch = TheLoop->getLoopLatch();
2663   DEBUG(dbgs() << "LV: Found a loop: " <<
2664         TheLoop->getHeader()->getName() << "\n");
2665
2666   // ScalarEvolution needs to be able to find the exit count.
2667   const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
2668   if (ExitCount == SE->getCouldNotCompute()) {
2669     DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
2670     return false;
2671   }
2672
2673   // Do not loop-vectorize loops with a tiny trip count.
2674   unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
2675   if (TC > 0u && TC < TinyTripCountVectorThreshold) {
2676     DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
2677           "This loop is not worth vectorizing.\n");
2678     return false;
2679   }
2680
2681   // Check if we can vectorize the instructions and CFG in this loop.
2682   if (!canVectorizeInstrs()) {
2683     DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
2684     return false;
2685   }
2686
2687   // Go over each instruction and look at memory deps.
2688   if (!canVectorizeMemory()) {
2689     DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
2690     return false;
2691   }
2692
2693   // Collect all of the variables that remain uniform after vectorization.
2694   collectLoopUniforms();
2695
2696   DEBUG(dbgs() << "LV: We can vectorize this loop" <<
2697         (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
2698         <<"!\n");
2699
2700   // Okay! We can vectorize. At this point we don't have any other mem analysis
2701   // which may limit our maximum vectorization factor, so just return true with
2702   // no restrictions.
2703   return true;
2704 }
2705
2706 static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) {
2707   if (Ty->isPointerTy())
2708     return DL.getIntPtrType(Ty->getContext());
2709   return Ty;
2710 }
2711
2712 static Type* getWiderType(DataLayout &DL, Type *Ty0, Type *Ty1) {
2713   Ty0 = convertPointerToIntegerType(DL, Ty0);
2714   Ty1 = convertPointerToIntegerType(DL, Ty1);
2715   if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
2716     return Ty0;
2717   return Ty1;
2718 }
2719
2720 /// \brief Check that the instruction has outside loop users and is not an
2721 /// identified reduction variable.
2722 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
2723                                SmallPtrSet<Value *, 4> &Reductions) {
2724   // Reduction instructions are allowed to have exit users. All other
2725   // instructions must not have external users.
2726   if (!Reductions.count(Inst))
2727     //Check that all of the users of the loop are inside the BB.
2728     for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
2729          I != E; ++I) {
2730       Instruction *U = cast<Instruction>(*I);
2731       // This user may be a reduction exit value.
2732       if (!TheLoop->contains(U)) {
2733         DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
2734         return true;
2735       }
2736     }
2737   return false;
2738 }
2739
2740 bool LoopVectorizationLegality::canVectorizeInstrs() {
2741   BasicBlock *PreHeader = TheLoop->getLoopPreheader();
2742   BasicBlock *Header = TheLoop->getHeader();
2743
2744   // Look for the attribute signaling the absence of NaNs.
2745   Function &F = *Header->getParent();
2746   if (F.hasFnAttribute("no-nans-fp-math"))
2747     HasFunNoNaNAttr = F.getAttributes().getAttribute(
2748       AttributeSet::FunctionIndex,
2749       "no-nans-fp-math").getValueAsString() == "true";
2750
2751   // For each block in the loop.
2752   for (Loop::block_iterator bb = TheLoop->block_begin(),
2753        be = TheLoop->block_end(); bb != be; ++bb) {
2754
2755     // Scan the instructions in the block and look for hazards.
2756     for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
2757          ++it) {
2758
2759       if (PHINode *Phi = dyn_cast<PHINode>(it)) {
2760         Type *PhiTy = Phi->getType();
2761         // Check that this PHI type is allowed.
2762         if (!PhiTy->isIntegerTy() &&
2763             !PhiTy->isFloatingPointTy() &&
2764             !PhiTy->isPointerTy()) {
2765           DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
2766           return false;
2767         }
2768
2769         // If this PHINode is not in the header block, then we know that we
2770         // can convert it to select during if-conversion. No need to check if
2771         // the PHIs in this block are induction or reduction variables.
2772         if (*bb != Header) {
2773           // Check that this instruction has no outside users or is an
2774           // identified reduction value with an outside user.
2775           if(!hasOutsideLoopUser(TheLoop, it, AllowedExit))
2776             continue;
2777           return false;
2778         }
2779
2780         // We only allow if-converted PHIs with more than two incoming values.
2781         if (Phi->getNumIncomingValues() != 2) {
2782           DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
2783           return false;
2784         }
2785
2786         // This is the value coming from the preheader.
2787         Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
2788         // Check if this is an induction variable.
2789         InductionKind IK = isInductionVariable(Phi);
2790
2791         if (IK_NoInduction != IK) {
2792           // Get the widest type.
2793           if (!WidestIndTy)
2794             WidestIndTy = convertPointerToIntegerType(*DL, PhiTy);
2795           else
2796             WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy);
2797
2798           // Int inductions are special because we only allow one IV.
2799           if (IK == IK_IntInduction) {
2800             // Use the phi node with the widest type as induction. Use the last
2801             // one if there are multiple (no good reason for doing this other
2802             // than it is expedient).
2803             if (!Induction || PhiTy == WidestIndTy)
2804               Induction = Phi;
2805           }
2806
2807           DEBUG(dbgs() << "LV: Found an induction variable.\n");
2808           Inductions[Phi] = InductionInfo(StartValue, IK);
2809           continue;
2810         }
2811
2812         if (AddReductionVar(Phi, RK_IntegerAdd)) {
2813           DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
2814           continue;
2815         }
2816         if (AddReductionVar(Phi, RK_IntegerMult)) {
2817           DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
2818           continue;
2819         }
2820         if (AddReductionVar(Phi, RK_IntegerOr)) {
2821           DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
2822           continue;
2823         }
2824         if (AddReductionVar(Phi, RK_IntegerAnd)) {
2825           DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
2826           continue;
2827         }
2828         if (AddReductionVar(Phi, RK_IntegerXor)) {
2829           DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
2830           continue;
2831         }
2832         if (AddReductionVar(Phi, RK_IntegerMinMax)) {
2833           DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n");
2834           continue;
2835         }
2836         if (AddReductionVar(Phi, RK_FloatMult)) {
2837           DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
2838           continue;
2839         }
2840         if (AddReductionVar(Phi, RK_FloatAdd)) {
2841           DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
2842           continue;
2843         }
2844         if (AddReductionVar(Phi, RK_FloatMinMax)) {
2845           DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi <<"\n");
2846           continue;
2847         }
2848
2849         DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
2850         return false;
2851       }// end of PHI handling
2852
2853       // We still don't handle functions. However, we can ignore dbg intrinsic
2854       // calls and we do handle certain intrinsic and libm functions.
2855       CallInst *CI = dyn_cast<CallInst>(it);
2856       if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
2857         DEBUG(dbgs() << "LV: Found a call site.\n");
2858         return false;
2859       }
2860
2861       // Check that the instruction return type is vectorizable.
2862       if (!VectorType::isValidElementType(it->getType()) &&
2863           !it->getType()->isVoidTy()) {
2864         DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
2865         return false;
2866       }
2867
2868       // Check that the stored type is vectorizable.
2869       if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
2870         Type *T = ST->getValueOperand()->getType();
2871         if (!VectorType::isValidElementType(T))
2872           return false;
2873       }
2874
2875       // Reduction instructions are allowed to have exit users.
2876       // All other instructions must not have external users.
2877       if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
2878         return false;
2879
2880     } // next instr.
2881
2882   }
2883
2884   if (!Induction) {
2885     DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
2886     if (Inductions.empty())
2887       return false;
2888   }
2889
2890   return true;
2891 }
2892
2893 void LoopVectorizationLegality::collectLoopUniforms() {
2894   // We now know that the loop is vectorizable!
2895   // Collect variables that will remain uniform after vectorization.
2896   std::vector<Value*> Worklist;
2897   BasicBlock *Latch = TheLoop->getLoopLatch();
2898
2899   // Start with the conditional branch and walk up the block.
2900   Worklist.push_back(Latch->getTerminator()->getOperand(0));
2901
2902   while (Worklist.size()) {
2903     Instruction *I = dyn_cast<Instruction>(Worklist.back());
2904     Worklist.pop_back();
2905
2906     // Look at instructions inside this loop.
2907     // Stop when reaching PHI nodes.
2908     // TODO: we need to follow values all over the loop, not only in this block.
2909     if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
2910       continue;
2911
2912     // This is a known uniform.
2913     Uniforms.insert(I);
2914
2915     // Insert all operands.
2916     Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
2917   }
2918 }
2919
2920 /// \brief Analyses memory accesses in a loop.
2921 ///
2922 /// Checks whether run time pointer checks are needed and builds sets for data
2923 /// dependence checking.
2924 class AccessAnalysis {
2925 public:
2926   /// \brief Read or write access location.
2927   typedef std::pair<Value*, char> MemAccessInfo;
2928
2929   /// \brief Set of potential dependent memory accesses.
2930   typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
2931
2932   AccessAnalysis(DataLayout *Dl, DepCandidates &DA) :
2933     DL(Dl), DepCands(DA), AreAllWritesIdentified(true),
2934     AreAllReadsIdentified(true), IsRTCheckNeeded(false) {}
2935
2936   /// \brief Register a load  and whether it is only read from.
2937   void addLoad(Value *Ptr, bool IsReadOnly) {
2938     Accesses.insert(std::make_pair(Ptr, false));
2939     if (IsReadOnly)
2940       ReadOnlyPtr.insert(Ptr);
2941   }
2942
2943   /// \brief Register a store.
2944   void addStore(Value *Ptr) {
2945     Accesses.insert(std::make_pair(Ptr, true));
2946   }
2947
2948   /// \brief Check whether we can check the pointers at runtime for
2949   /// non-intersection.
2950   bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
2951                        unsigned &NumComparisons, ScalarEvolution *SE,
2952                        Loop *TheLoop);
2953
2954   /// \brief Goes over all memory accesses, checks whether a RT check is needed
2955   /// and builds sets of dependent accesses.
2956   void buildDependenceSets() {
2957     // Process read-write pointers first.
2958     processMemAccesses(false);
2959     // Next, process read pointers.
2960     processMemAccesses(true);
2961   }
2962
2963   bool isRTCheckNeeded() { return IsRTCheckNeeded; }
2964
2965   bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
2966
2967   DenseSet<MemAccessInfo> &getDependenciesToCheck() { return CheckDeps; }
2968
2969 private:
2970   typedef SetVector<MemAccessInfo> PtrAccessSet;
2971   typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;
2972
2973   /// \brief Go over all memory access or only the deferred ones if
2974   /// \p UseDeferred is true and check whether runtime pointer checks are needed
2975   /// and build sets of dependency check candidates.
2976   void processMemAccesses(bool UseDeferred);
2977
2978   /// Set of all accesses.
2979   PtrAccessSet Accesses;
2980
2981   /// Set of access to check after all writes have been processed.
2982   PtrAccessSet DeferredAccesses;
2983
2984   /// Map of pointers to last access encountered.
2985   UnderlyingObjToAccessMap ObjToLastAccess;
2986
2987   /// Set of accesses that need a further dependence check.
2988   DenseSet<MemAccessInfo> CheckDeps;
2989
2990   /// Set of pointers that are read only.
2991   SmallPtrSet<Value*, 16> ReadOnlyPtr;
2992
2993   /// Set of underlying objects already written to.
2994   SmallPtrSet<Value*, 16> WriteObjects;
2995
2996   DataLayout *DL;
2997
2998   /// Sets of potentially dependent accesses - members of one set share an
2999   /// underlying pointer. The set "CheckDeps" identfies which sets really need a
3000   /// dependence check.
3001   DepCandidates &DepCands;
3002
3003   bool AreAllWritesIdentified;
3004   bool AreAllReadsIdentified;
3005   bool IsRTCheckNeeded;
3006 };
3007
3008 /// \brief Check whether a pointer can participate in a runtime bounds check.
3009 static bool hasComputableBounds(ScalarEvolution *SE, Value *Ptr) {
3010   const SCEV *PtrScev = SE->getSCEV(Ptr);
3011   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
3012   if (!AR)
3013     return false;
3014
3015   return AR->isAffine();
3016 }
3017
3018 bool AccessAnalysis::canCheckPtrAtRT(
3019                        LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
3020                         unsigned &NumComparisons, ScalarEvolution *SE,
3021                         Loop *TheLoop) {
3022   // Find pointers with computable bounds. We are going to use this information
3023   // to place a runtime bound check.
3024   unsigned NumReadPtrChecks = 0;
3025   unsigned NumWritePtrChecks = 0;
3026   bool CanDoRT = true;
3027
3028   bool IsDepCheckNeeded = isDependencyCheckNeeded();
3029   // We assign consecutive id to access from different dependence sets.
3030   // Accesses within the same set don't need a runtime check.
3031   unsigned RunningDepId = 1;
3032   DenseMap<Value *, unsigned> DepSetId;
3033
3034   for (PtrAccessSet::iterator AI = Accesses.begin(), AE = Accesses.end();
3035        AI != AE; ++AI) {
3036     const MemAccessInfo &Access = *AI;
3037     Value *Ptr = Access.first;
3038     bool IsWrite = Access.second;
3039
3040     // Just add write checks if we have both.
3041     if (!IsWrite && Accesses.count(std::make_pair(Ptr, true)))
3042       continue;
3043
3044     if (IsWrite)
3045       ++NumWritePtrChecks;
3046     else
3047       ++NumReadPtrChecks;
3048
3049     if (hasComputableBounds(SE, Ptr)) {
3050       // The id of the dependence set.
3051       unsigned DepId;
3052
3053       if (IsDepCheckNeeded) {
3054         Value *Leader = DepCands.getLeaderValue(Access).first;
3055         unsigned &LeaderId = DepSetId[Leader];
3056         if (!LeaderId)
3057           LeaderId = RunningDepId++;
3058         DepId = LeaderId;
3059       } else
3060         // Each access has its own dependence set.
3061         DepId = RunningDepId++;
3062
3063       RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId);
3064
3065       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr <<"\n");
3066     } else {
3067       CanDoRT = false;
3068     }
3069   }
3070
3071   if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
3072     NumComparisons = 0; // Only one dependence set.
3073   else
3074     NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks +
3075                                            NumWritePtrChecks - 1));
3076   return CanDoRT;
3077 }
3078
3079 static bool isFunctionScopeIdentifiedObject(Value *Ptr) {
3080   return isNoAliasArgument(Ptr) || isNoAliasCall(Ptr) || isa<AllocaInst>(Ptr);
3081 }
3082
3083 void AccessAnalysis::processMemAccesses(bool UseDeferred) {
3084   // We process the set twice: first we process read-write pointers, last we
3085   // process read-only pointers. This allows us to skip dependence tests for
3086   // read-only pointers.
3087
3088   PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
3089   for (PtrAccessSet::iterator AI = S.begin(), AE = S.end(); AI != AE; ++AI) {
3090     const MemAccessInfo &Access = *AI;
3091     Value *Ptr = Access.first;
3092     bool IsWrite = Access.second;
3093
3094     DepCands.insert(Access);
3095
3096     // Memorize read-only pointers for later processing and skip them in the
3097     // first round (they need to be checked after we have seen all write
3098     // pointers). Note: we also mark pointer that are not consecutive as
3099     // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need the
3100     // second check for "!IsWrite".
3101     bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
3102     if (!UseDeferred && IsReadOnlyPtr) {
3103       DeferredAccesses.insert(Access);
3104       continue;
3105     }
3106
3107     bool NeedDepCheck = false;
3108     // Check whether there is the possiblity of dependency because of underlying
3109     // objects being the same.
3110     typedef SmallVector<Value*, 16> ValueVector;
3111     ValueVector TempObjects;
3112     GetUnderlyingObjects(Ptr, TempObjects, DL);
3113     for (ValueVector::iterator UI = TempObjects.begin(), UE = TempObjects.end();
3114          UI != UE; ++UI) {
3115       Value *UnderlyingObj = *UI;
3116
3117       // If this is a write then it needs to be an identified object.  If this a
3118       // read and all writes (so far) are identified function scope objects we
3119       // don't need an identified underlying object but only an Argument (the
3120       // next write is going to invalidate this assumption if it is
3121       // unidentified).
3122       // This is a micro-optimization for the case where all writes are
3123       // identified and we have one argument pointer.
3124       // Otherwise, we do need a runtime check.
3125       if ((IsWrite && !isFunctionScopeIdentifiedObject(UnderlyingObj)) ||
3126           (!IsWrite && (!AreAllWritesIdentified ||
3127                         !isa<Argument>(UnderlyingObj)) &&
3128            !isIdentifiedObject(UnderlyingObj))) {
3129         DEBUG(dbgs() << "LV: Found an unidentified " <<
3130               (IsWrite ?  "write" : "read" ) << " ptr:" << *UnderlyingObj <<
3131               "\n");
3132         IsRTCheckNeeded = (IsRTCheckNeeded ||
3133                            !isIdentifiedObject(UnderlyingObj) ||
3134                            !AreAllReadsIdentified);
3135
3136         if (IsWrite)
3137           AreAllWritesIdentified = false;
3138         if (!IsWrite)
3139           AreAllReadsIdentified = false;
3140       }
3141
3142       // If this is a write - check other reads and writes for conflicts.  If
3143       // this is a read only check other writes for conflicts (but only if there
3144       // is no other write to the ptr - this is an optimization to catch "a[i] =
3145       // a[i] + " without having to do a dependence check).
3146       if ((IsWrite || IsReadOnlyPtr) && WriteObjects.count(UnderlyingObj))
3147         NeedDepCheck = true;
3148
3149       if (IsWrite)
3150         WriteObjects.insert(UnderlyingObj);
3151
3152       // Create sets of pointers connected by shared underlying objects.
3153       UnderlyingObjToAccessMap::iterator Prev =
3154         ObjToLastAccess.find(UnderlyingObj);
3155       if (Prev != ObjToLastAccess.end())
3156         DepCands.unionSets(Access, Prev->second);
3157
3158       ObjToLastAccess[UnderlyingObj] = Access;
3159     }
3160
3161     if (NeedDepCheck)
3162       CheckDeps.insert(Access);
3163   }
3164 }
3165
3166 /// \brief Checks memory dependences among accesses to the same underlying
3167 /// object to determine whether there vectorization is legal or not (and at
3168 /// which vectorization factor).
3169 ///
3170 /// This class works under the assumption that we already checked that memory
3171 /// locations with different underlying pointers are "must-not alias".
3172 /// We use the ScalarEvolution framework to symbolically evalutate access
3173 /// functions pairs. Since we currently don't restructure the loop we can rely
3174 /// on the program order of memory accesses to determine their safety.
3175 /// At the moment we will only deem accesses as safe for:
3176 ///  * A negative constant distance assuming program order.
3177 ///
3178 ///      Safe: tmp = a[i + 1];     OR     a[i + 1] = x;
3179 ///            a[i] = tmp;                y = a[i];
3180 ///
3181 ///   The latter case is safe because later checks guarantuee that there can't
3182 ///   be a cycle through a phi node (that is, we check that "x" and "y" is not
3183 ///   the same variable: a header phi can only be an induction or a reduction, a
3184 ///   reduction can't have a memory sink, an induction can't have a memory
3185 ///   source). This is important and must not be violated (or we have to
3186 ///   resort to checking for cycles through memory).
3187 ///
3188 ///  * A positive constant distance assuming program order that is bigger
3189 ///    than the biggest memory access.
3190 ///
3191 ///     tmp = a[i]        OR              b[i] = x
3192 ///     a[i+2] = tmp                      y = b[i+2];
3193 ///
3194 ///     Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
3195 ///
3196 ///  * Zero distances and all accesses have the same size.
3197 ///
3198 class MemoryDepChecker {
3199 public:
3200   typedef std::pair<Value*, char> MemAccessInfo;
3201
3202   MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) :
3203     SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0) {}
3204
3205   /// \brief Register the location (instructions are given increasing numbers)
3206   /// of a write access.
3207   void addAccess(StoreInst *SI) {
3208     Value *Ptr = SI->getPointerOperand();
3209     Accesses[std::make_pair(Ptr, true)].push_back(AccessIdx);
3210     InstMap.push_back(SI);
3211     ++AccessIdx;
3212   }
3213
3214   /// \brief Register the location (instructions are given increasing numbers)
3215   /// of a write access.
3216   void addAccess(LoadInst *LI) {
3217     Value *Ptr = LI->getPointerOperand();
3218     Accesses[std::make_pair(Ptr, false)].push_back(AccessIdx);
3219     InstMap.push_back(LI);
3220     ++AccessIdx;
3221   }
3222
3223   /// \brief Check whether the dependencies between the accesses are safe.
3224   ///
3225   /// Only checks sets with elements in \p CheckDeps.
3226   bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
3227                    DenseSet<MemAccessInfo> &CheckDeps);
3228
3229   /// \brief The maximum number of bytes of a vector register we can vectorize
3230   /// the accesses safely with.
3231   unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
3232
3233 private:
3234   ScalarEvolution *SE;
3235   DataLayout *DL;
3236   const Loop *InnermostLoop;
3237
3238   /// \brief Maps access locations (ptr, read/write) to program order.
3239   DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
3240
3241   /// \brief Memory access instructions in program order.
3242   SmallVector<Instruction *, 16> InstMap;
3243
3244   /// \brief The program order index to be used for the next instruction.
3245   unsigned AccessIdx;
3246
3247   // We can access this many bytes in parallel safely.
3248   unsigned MaxSafeDepDistBytes;
3249
3250   /// \brief Check whether there is a plausible dependence between the two
3251   /// accesses.
3252   ///
3253   /// Access \p A must happen before \p B in program order. The two indices
3254   /// identify the index into the program order map.
3255   ///
3256   /// This function checks  whether there is a plausible dependence (or the
3257   /// absence of such can't be proved) between the two accesses. If there is a
3258   /// plausible dependence but the dependence distance is bigger than one
3259   /// element access it records this distance in \p MaxSafeDepDistBytes (if this
3260   /// distance is smaller than any other distance encountered so far).
3261   /// Otherwise, this function returns true signaling a possible dependence.
3262   bool isDependent(const MemAccessInfo &A, unsigned AIdx,
3263                    const MemAccessInfo &B, unsigned BIdx);
3264
3265   /// \brief Check whether the data dependence could prevent store-load
3266   /// forwarding.
3267   bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
3268 };
3269
3270 static bool isInBoundsGep(Value *Ptr) {
3271   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
3272     return GEP->isInBounds();
3273   return false;
3274 }
3275
3276 /// \brief Check whether the access through \p Ptr has a constant stride.
3277 static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
3278                         const Loop *Lp) {
3279   const Type *PtrTy = Ptr->getType();
3280   assert(PtrTy->isPointerTy() && "Unexpected non ptr");
3281
3282   // Make sure that the pointer does not point to aggregate types.
3283   if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType()) {
3284     DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr
3285           << "\n");
3286     return 0;
3287   }
3288
3289   const SCEV *PtrScev = SE->getSCEV(Ptr);
3290   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
3291   if (!AR) {
3292     DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer "
3293           << *Ptr << " SCEV: " << *PtrScev << "\n");
3294     return 0;
3295   }
3296
3297   // The accesss function must stride over the innermost loop.
3298   if (Lp != AR->getLoop()) {
3299     DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " << *Ptr
3300           << " SCEV: " << *PtrScev << "\n");
3301   }
3302
3303   // The address calculation must not wrap. Otherwise, a dependence could be
3304   // inverted. An inbounds getelementptr that is a AddRec with a unit stride
3305   // cannot wrap per definition. The unit stride requirement is checked later.
3306   bool IsInBoundsGEP = isInBoundsGep(Ptr);
3307   bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask);
3308   if (!IsNoWrapAddRec && !IsInBoundsGEP) {
3309     DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space "
3310           << *Ptr << " SCEV: " << *PtrScev << "\n");
3311     return 0;
3312   }
3313
3314   // Check the step is constant.
3315   const SCEV *Step = AR->getStepRecurrence(*SE);
3316
3317   // Calculate the pointer stride and check if it is consecutive.
3318   const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
3319   if (!C) {
3320     DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr <<
3321           " SCEV: " << *PtrScev << "\n");
3322     return 0;
3323   }
3324
3325   int64_t Size = DL->getTypeAllocSize(PtrTy->getPointerElementType());
3326   const APInt &APStepVal = C->getValue()->getValue();
3327
3328   // Huge step value - give up.
3329   if (APStepVal.getBitWidth() > 64)
3330     return 0;
3331
3332   int64_t StepVal = APStepVal.getSExtValue();
3333
3334   // Strided access.
3335   int64_t Stride = StepVal / Size;
3336   int64_t Rem = StepVal % Size;
3337   if (Rem)
3338     return 0;
3339
3340   // If the SCEV could wrap but we have an inbounds gep with a unit stride we
3341   // know we can't "wrap around the address space".
3342   if (!IsNoWrapAddRec && IsInBoundsGEP && Stride != 1 && Stride != -1)
3343     return 0;
3344
3345   return Stride;
3346 }
3347
3348 bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
3349                                                     unsigned TypeByteSize) {
3350   // If loads occur at a distance that is not a multiple of a feasible vector
3351   // factor store-load forwarding does not take place.
3352   // Positive dependences might cause troubles because vectorizing them might
3353   // prevent store-load forwarding making vectorized code run a lot slower.
3354   //   a[i] = a[i-3] ^ a[i-8];
3355   //   The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and
3356   //   hence on your typical architecture store-load forwarding does not take
3357   //   place. Vectorizing in such cases does not make sense.
3358   // Store-load forwarding distance.
3359   const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize;
3360   // Maximum vector factor.
3361   unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize;
3362   if(MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
3363     MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes;
3364
3365   for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues;
3366        vf *= 2) {
3367     if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) {
3368       MaxVFWithoutSLForwardIssues = (vf >>=1);
3369       break;
3370     }
3371   }
3372
3373   if (MaxVFWithoutSLForwardIssues< 2*TypeByteSize) {
3374     DEBUG(dbgs() << "LV: Distance " << Distance <<
3375           " that could cause a store-load forwarding conflict\n");
3376     return true;
3377   }
3378
3379   if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
3380       MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize)
3381     MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
3382   return false;
3383 }
3384
3385 bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
3386                                    const MemAccessInfo &B, unsigned BIdx) {
3387   assert (AIdx < BIdx && "Must pass arguments in program order");
3388
3389   Value *APtr = A.first;
3390   Value *BPtr = B.first;
3391   bool AIsWrite = A.second;
3392   bool BIsWrite = B.second;
3393
3394   // Two reads are independent.
3395   if (!AIsWrite && !BIsWrite)
3396     return false;
3397
3398   const SCEV *AScev = SE->getSCEV(APtr);
3399   const SCEV *BScev = SE->getSCEV(BPtr);
3400
3401   int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop);
3402   int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop);
3403
3404   const SCEV *Src = AScev;
3405   const SCEV *Sink = BScev;
3406
3407   // If the induction step is negative we have to invert source and sink of the
3408   // dependence.
3409   if (StrideAPtr < 0) {
3410     //Src = BScev;
3411     //Sink = AScev;
3412     std::swap(APtr, BPtr);
3413     std::swap(Src, Sink);
3414     std::swap(AIsWrite, BIsWrite);
3415     std::swap(AIdx, BIdx);
3416     std::swap(StrideAPtr, StrideBPtr);
3417   }
3418
3419   const SCEV *Dist = SE->getMinusSCEV(Sink, Src);
3420
3421   DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink
3422         << "(Induction step: " << StrideAPtr <<  ")\n");
3423   DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to "
3424         << *InstMap[BIdx] << ": " << *Dist << "\n");
3425
3426   // Need consecutive accesses. We don't want to vectorize
3427   // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
3428   // the address space.
3429   if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){
3430     DEBUG(dbgs() << "Non-consecutive pointer access\n");
3431     return true;
3432   }
3433
3434   const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
3435   if (!C) {
3436     DEBUG(dbgs() << "LV: Dependence because of non constant distance\n");
3437     return true;
3438   }
3439
3440   Type *ATy = APtr->getType()->getPointerElementType();
3441   Type *BTy = BPtr->getType()->getPointerElementType();
3442   unsigned TypeByteSize = DL->getTypeAllocSize(ATy);
3443
3444   // Negative distances are not plausible dependencies.
3445   const APInt &Val = C->getValue()->getValue();
3446   if (Val.isNegative()) {
3447     bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
3448     if (IsTrueDataDependence &&
3449         (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
3450          ATy != BTy))
3451       return true;
3452
3453     DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n");
3454     return false;
3455   }
3456
3457   // Write to the same location with the same size.
3458   // Could be improved to assert type sizes are the same (i32 == float, etc).
3459   if (Val == 0) {
3460     if (ATy == BTy)
3461       return false;
3462     DEBUG(dbgs() << "LV: Zero dependence difference but different types");
3463     return true;
3464   }
3465
3466   assert(Val.isStrictlyPositive() && "Expect a positive value");
3467
3468   // Positive distance bigger than max vectorization factor.
3469   if (ATy != BTy) {
3470     DEBUG(dbgs() <<
3471           "LV: ReadWrite-Write positive dependency with different types");
3472     return false;
3473   }
3474
3475   unsigned Distance = (unsigned) Val.getZExtValue();
3476
3477   // Bail out early if passed-in parameters make vectorization not feasible.
3478   unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1;
3479   unsigned ForcedUnroll = VectorizationUnroll ? VectorizationUnroll : 1;
3480
3481   // The distance must be bigger than the size needed for a vectorized version
3482   // of the operation and the size of the vectorized operation must not be
3483   // bigger than the currrent maximum size.
3484   if (Distance < 2*TypeByteSize ||
3485       2*TypeByteSize > MaxSafeDepDistBytes ||
3486       Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
3487     DEBUG(dbgs() << "LV: Failure because of Positive distance "
3488         << Val.getSExtValue() << "\n");
3489     return true;
3490   }
3491
3492   MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
3493     Distance : MaxSafeDepDistBytes;
3494
3495   bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
3496   if (IsTrueDataDependence &&
3497       couldPreventStoreLoadForward(Distance, TypeByteSize))
3498      return true;
3499
3500   DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<
3501         " with max VF=" << MaxSafeDepDistBytes/TypeByteSize << "\n");
3502
3503   return false;
3504 }
3505
3506 bool
3507 MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
3508                               DenseSet<MemAccessInfo> &CheckDeps) {
3509
3510   MaxSafeDepDistBytes = -1U;
3511   while (!CheckDeps.empty()) {
3512     MemAccessInfo CurAccess = *CheckDeps.begin();
3513
3514     // Get the relevant memory access set.
3515     EquivalenceClasses<MemAccessInfo>::iterator I =
3516       AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
3517
3518     // Check accesses within this set.
3519     EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
3520     AI = AccessSets.member_begin(I), AE = AccessSets.member_end();
3521
3522     // Check every access pair.
3523     while (AI != AE) {
3524       CheckDeps.erase(*AI);
3525       EquivalenceClasses<MemAccessInfo>::member_iterator OI = llvm::next(AI);
3526       while (OI != AE) {
3527         // Check every accessing instruction pair in program order.
3528         for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
3529              I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
3530           for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
3531                I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
3532             if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2))
3533               return false;
3534             if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1))
3535               return false;
3536           }
3537         ++OI;
3538       }
3539       AI++;
3540     }
3541   }
3542   return true;
3543 }
3544
3545 bool LoopVectorizationLegality::canVectorizeMemory() {
3546
3547   typedef SmallVector<Value*, 16> ValueVector;
3548   typedef SmallPtrSet<Value*, 16> ValueSet;
3549
3550   // Stores a pair of memory access location and whether the access is a store
3551   // (true) or a load (false).
3552   typedef std::pair<Value*, char> MemAccessInfo;
3553   typedef DenseSet<MemAccessInfo> PtrAccessSet;
3554
3555   // Holds the Load and Store *instructions*.
3556   ValueVector Loads;
3557   ValueVector Stores;
3558
3559   // Holds all the different accesses in the loop.
3560   unsigned NumReads = 0;
3561   unsigned NumReadWrites = 0;
3562
3563   PtrRtCheck.Pointers.clear();
3564   PtrRtCheck.Need = false;
3565
3566   const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
3567   MemoryDepChecker DepChecker(SE, DL, TheLoop);
3568
3569   // For each block.
3570   for (Loop::block_iterator bb = TheLoop->block_begin(),
3571        be = TheLoop->block_end(); bb != be; ++bb) {
3572
3573     // Scan the BB and collect legal loads and stores.
3574     for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
3575          ++it) {
3576
3577       // If this is a load, save it. If this instruction can read from memory
3578       // but is not a load, then we quit. Notice that we don't handle function
3579       // calls that read or write.
3580       if (it->mayReadFromMemory()) {
3581         LoadInst *Ld = dyn_cast<LoadInst>(it);
3582         if (!Ld) return false;
3583         if (!Ld->isSimple() && !IsAnnotatedParallel) {
3584           DEBUG(dbgs() << "LV: Found a non-simple load.\n");
3585           return false;
3586         }
3587         Loads.push_back(Ld);
3588         DepChecker.addAccess(Ld);
3589         continue;
3590       }
3591
3592       // Save 'store' instructions. Abort if other instructions write to memory.
3593       if (it->mayWriteToMemory()) {
3594         StoreInst *St = dyn_cast<StoreInst>(it);
3595         if (!St) return false;
3596         if (!St->isSimple() && !IsAnnotatedParallel) {
3597           DEBUG(dbgs() << "LV: Found a non-simple store.\n");
3598           return false;
3599         }
3600         Stores.push_back(St);
3601         DepChecker.addAccess(St);
3602       }
3603     } // next instr.
3604   } // next block.
3605
3606   // Now we have two lists that hold the loads and the stores.
3607   // Next, we find the pointers that they use.
3608
3609   // Check if we see any stores. If there are no stores, then we don't
3610   // care if the pointers are *restrict*.
3611   if (!Stores.size()) {
3612     DEBUG(dbgs() << "LV: Found a read-only loop!\n");
3613     return true;
3614   }
3615
3616   AccessAnalysis::DepCandidates DependentAccesses;
3617   AccessAnalysis Accesses(DL, DependentAccesses);
3618
3619   // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
3620   // multiple times on the same object. If the ptr is accessed twice, once
3621   // for read and once for write, it will only appear once (on the write
3622   // list). This is okay, since we are going to check for conflicts between
3623   // writes and between reads and writes, but not between reads and reads.
3624   ValueSet Seen;
3625
3626   ValueVector::iterator I, IE;
3627   for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
3628     StoreInst *ST = cast<StoreInst>(*I);
3629     Value* Ptr = ST->getPointerOperand();
3630
3631     if (isUniform(Ptr)) {
3632       DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
3633       return false;
3634     }
3635
3636     // If we did *not* see this pointer before, insert it to  the read-write
3637     // list. At this phase it is only a 'write' list.
3638     if (Seen.insert(Ptr)) {
3639       ++NumReadWrites;
3640       Accesses.addStore(Ptr);
3641     }
3642   }
3643
3644   if (IsAnnotatedParallel) {
3645     DEBUG(dbgs()
3646           << "LV: A loop annotated parallel, ignore memory dependency "
3647           << "checks.\n");
3648     return true;
3649   }
3650
3651   SmallPtrSet<Value *, 16> ReadOnlyPtr;
3652   for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
3653     LoadInst *LD = cast<LoadInst>(*I);
3654     Value* Ptr = LD->getPointerOperand();
3655     // If we did *not* see this pointer before, insert it to the
3656     // read list. If we *did* see it before, then it is already in
3657     // the read-write list. This allows us to vectorize expressions
3658     // such as A[i] += x;  Because the address of A[i] is a read-write
3659     // pointer. This only works if the index of A[i] is consecutive.
3660     // If the address of i is unknown (for example A[B[i]]) then we may
3661     // read a few words, modify, and write a few words, and some of the
3662     // words may be written to the same address.
3663     bool IsReadOnlyPtr = false;
3664     if (Seen.insert(Ptr) || !isStridedPtr(SE, DL, Ptr, TheLoop)) {
3665       ++NumReads;
3666       IsReadOnlyPtr = true;
3667     }
3668     Accesses.addLoad(Ptr, IsReadOnlyPtr);
3669   }
3670
3671   // If we write (or read-write) to a single destination and there are no
3672   // other reads in this loop then is it safe to vectorize.
3673   if (NumReadWrites == 1 && NumReads == 0) {
3674     DEBUG(dbgs() << "LV: Found a write-only loop!\n");
3675     return true;
3676   }
3677
3678   // Build dependence sets and check whether we need a runtime pointer bounds
3679   // check.
3680   Accesses.buildDependenceSets();
3681   bool NeedRTCheck = Accesses.isRTCheckNeeded();
3682
3683   // Find pointers with computable bounds. We are going to use this information
3684   // to place a runtime bound check.
3685   unsigned NumComparisons = 0;
3686   bool CanDoRT = false;
3687   if (NeedRTCheck)
3688     CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop);
3689
3690
3691   DEBUG(dbgs() << "LV: We need to do " << NumComparisons <<
3692         " pointer comparisons.\n");
3693
3694   // If we only have one set of dependences to check pointers among we don't
3695   // need a runtime check.
3696   if (NumComparisons == 0 && NeedRTCheck)
3697     NeedRTCheck = false;
3698
3699   // Check that we did not collect too many pointers or found a unsizeable
3700   // pointer.
3701   if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
3702     PtrRtCheck.reset();
3703     CanDoRT = false;
3704   }
3705
3706   if (CanDoRT) {
3707     DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
3708   }
3709
3710   if (NeedRTCheck && !CanDoRT) {
3711     DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
3712           "the array bounds.\n");
3713     PtrRtCheck.reset();
3714     return false;
3715   }
3716
3717   PtrRtCheck.Need = NeedRTCheck;
3718
3719   bool CanVecMem = true;
3720   if (Accesses.isDependencyCheckNeeded()) {
3721     DEBUG(dbgs() << "LV: Checking memory dependencies\n");
3722     CanVecMem = DepChecker.areDepsSafe(DependentAccesses,
3723                                        Accesses.getDependenciesToCheck());
3724     MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
3725   }
3726
3727   DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
3728         " need a runtime memory check.\n");
3729
3730   return CanVecMem;
3731 }
3732
3733 static bool hasMultipleUsesOf(Instruction *I,
3734                               SmallPtrSet<Instruction *, 8> &Insts) {
3735   unsigned NumUses = 0;
3736   for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) {
3737     if (Insts.count(dyn_cast<Instruction>(*Use)))
3738       ++NumUses;
3739     if (NumUses > 1)
3740       return true;
3741   }
3742
3743   return false;
3744 }
3745
3746 static bool areAllUsesIn(Instruction *I, SmallPtrSet<Instruction *, 8> &Set) {
3747   for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
3748     if (!Set.count(dyn_cast<Instruction>(*Use)))
3749       return false;
3750   return true;
3751 }
3752
3753 bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
3754                                                 ReductionKind Kind) {
3755   if (Phi->getNumIncomingValues() != 2)
3756     return false;
3757
3758   // Reduction variables are only found in the loop header block.
3759   if (Phi->getParent() != TheLoop->getHeader())
3760     return false;
3761
3762   // Obtain the reduction start value from the value that comes from the loop
3763   // preheader.
3764   Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
3765
3766   // ExitInstruction is the single value which is used outside the loop.
3767   // We only allow for a single reduction value to be used outside the loop.
3768   // This includes users of the reduction, variables (which form a cycle
3769   // which ends in the phi node).
3770   Instruction *ExitInstruction = 0;
3771   // Indicates that we found a reduction operation in our scan.
3772   bool FoundReduxOp = false;
3773
3774   // We start with the PHI node and scan for all of the users of this
3775   // instruction. All users must be instructions that can be used as reduction
3776   // variables (such as ADD). We must have a single out-of-block user. The cycle
3777   // must include the original PHI.
3778   bool FoundStartPHI = false;
3779
3780   // To recognize min/max patterns formed by a icmp select sequence, we store
3781   // the number of instruction we saw from the recognized min/max pattern,
3782   //  to make sure we only see exactly the two instructions.
3783   unsigned NumCmpSelectPatternInst = 0;
3784   ReductionInstDesc ReduxDesc(false, 0);
3785
3786   SmallPtrSet<Instruction *, 8> VisitedInsts;
3787   SmallVector<Instruction *, 8> Worklist;
3788   Worklist.push_back(Phi);
3789   VisitedInsts.insert(Phi);
3790
3791   // A value in the reduction can be used:
3792   //  - By the reduction:
3793   //      - Reduction operation:
3794   //        - One use of reduction value (safe).
3795   //        - Multiple use of reduction value (not safe).
3796   //      - PHI:
3797   //        - All uses of the PHI must be the reduction (safe).
3798   //        - Otherwise, not safe.
3799   //  - By one instruction outside of the loop (safe).
3800   //  - By further instructions outside of the loop (not safe).
3801   //  - By an instruction that is not part of the reduction (not safe).
3802   //    This is either:
3803   //      * An instruction type other than PHI or the reduction operation.
3804   //      * A PHI in the header other than the initial PHI.
3805   while (!Worklist.empty()) {
3806     Instruction *Cur = Worklist.back();
3807     Worklist.pop_back();
3808
3809     // No Users.
3810     // If the instruction has no users then this is a broken chain and can't be
3811     // a reduction variable.
3812     if (Cur->use_empty())
3813       return false;
3814
3815     bool IsAPhi = isa<PHINode>(Cur);
3816
3817     // A header PHI use other than the original PHI.
3818     if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent())
3819       return false;
3820
3821     // Reductions of instructions such as Div, and Sub is only possible if the
3822     // LHS is the reduction variable.
3823     if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) &&
3824         !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) &&
3825         !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0))))
3826       return false;
3827
3828     // Any reduction instruction must be of one of the allowed kinds.
3829     ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc);
3830     if (!ReduxDesc.IsReduction)
3831       return false;
3832
3833     // A reduction operation must only have one use of the reduction value.
3834     if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax &&
3835         hasMultipleUsesOf(Cur, VisitedInsts))
3836       return false;
3837
3838     // All inputs to a PHI node must be a reduction value.
3839     if(IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
3840       return false;
3841
3842     if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) ||
3843                                      isa<SelectInst>(Cur)))
3844       ++NumCmpSelectPatternInst;
3845     if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) ||
3846                                    isa<SelectInst>(Cur)))
3847       ++NumCmpSelectPatternInst;
3848
3849     // Check  whether we found a reduction operator.
3850     FoundReduxOp |= !IsAPhi;
3851
3852     // Process users of current instruction. Push non PHI nodes after PHI nodes
3853     // onto the stack. This way we are going to have seen all inputs to PHI
3854     // nodes once we get to them.
3855     SmallVector<Instruction *, 8> NonPHIs;
3856     SmallVector<Instruction *, 8> PHIs;
3857     for (Value::use_iterator UI = Cur->use_begin(), E = Cur->use_end(); UI != E;
3858          ++UI) {
3859       Instruction *Usr = cast<Instruction>(*UI);
3860
3861       // Check if we found the exit user.
3862       BasicBlock *Parent = Usr->getParent();
3863       if (!TheLoop->contains(Parent)) {
3864         // Exit if you find multiple outside users.
3865         if (ExitInstruction != 0)
3866           return false;
3867         ExitInstruction = Cur;
3868         continue;
3869       }
3870
3871       // Process instructions only once (termination).
3872       if (VisitedInsts.insert(Usr)) {
3873         if (isa<PHINode>(Usr))
3874           PHIs.push_back(Usr);
3875         else
3876           NonPHIs.push_back(Usr);
3877       }
3878       // Remember that we completed the cycle.
3879       if (Usr == Phi)
3880         FoundStartPHI = true;
3881     }
3882     Worklist.append(PHIs.begin(), PHIs.end());
3883     Worklist.append(NonPHIs.begin(), NonPHIs.end());
3884   }
3885
3886   // This means we have seen one but not the other instruction of the
3887   // pattern or more than just a select and cmp.
3888   if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
3889       NumCmpSelectPatternInst != 2)
3890     return false;
3891
3892   if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
3893     return false;
3894
3895   // We found a reduction var if we have reached the original phi node and we
3896   // only have a single instruction with out-of-loop users.
3897
3898   // This instruction is allowed to have out-of-loop users.
3899   AllowedExit.insert(ExitInstruction);
3900
3901   // Save the description of this reduction variable.
3902   ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
3903                          ReduxDesc.MinMaxKind);
3904   Reductions[Phi] = RD;
3905   // We've ended the cycle. This is a reduction variable if we have an
3906   // outside user and it has a binary op.
3907
3908   return true;
3909 }
3910
3911 /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
3912 /// pattern corresponding to a min(X, Y) or max(X, Y).
3913 LoopVectorizationLegality::ReductionInstDesc
3914 LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I,
3915                                                     ReductionInstDesc &Prev) {
3916
3917   assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
3918          "Expect a select instruction");
3919   Instruction *Cmp = 0;
3920   SelectInst *Select = 0;
3921
3922   // We must handle the select(cmp()) as a single instruction. Advance to the
3923   // select.
3924   if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) {
3925     if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin())))
3926       return ReductionInstDesc(false, I);
3927     return ReductionInstDesc(Select, Prev.MinMaxKind);
3928   }
3929
3930   // Only handle single use cases for now.
3931   if (!(Select = dyn_cast<SelectInst>(I)))
3932     return ReductionInstDesc(false, I);
3933   if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) &&
3934       !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0))))
3935     return ReductionInstDesc(false, I);
3936   if (!Cmp->hasOneUse())
3937     return ReductionInstDesc(false, I);
3938
3939   Value *CmpLeft;
3940   Value *CmpRight;
3941
3942   // Look for a min/max pattern.
3943   if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
3944     return ReductionInstDesc(Select, MRK_UIntMin);
3945   else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
3946     return ReductionInstDesc(Select, MRK_UIntMax);
3947   else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
3948     return ReductionInstDesc(Select, MRK_SIntMax);
3949   else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
3950     return ReductionInstDesc(Select, MRK_SIntMin);
3951   else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
3952     return ReductionInstDesc(Select, MRK_FloatMin);
3953   else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
3954     return ReductionInstDesc(Select, MRK_FloatMax);
3955   else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
3956     return ReductionInstDesc(Select, MRK_FloatMin);
3957   else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
3958     return ReductionInstDesc(Select, MRK_FloatMax);
3959
3960   return ReductionInstDesc(false, I);
3961 }
3962
3963 LoopVectorizationLegality::ReductionInstDesc
3964 LoopVectorizationLegality::isReductionInstr(Instruction *I,
3965                                             ReductionKind Kind,
3966                                             ReductionInstDesc &Prev) {
3967   bool FP = I->getType()->isFloatingPointTy();
3968   bool FastMath = (FP && I->isCommutative() && I->isAssociative());
3969   switch (I->getOpcode()) {
3970   default:
3971     return ReductionInstDesc(false, I);
3972   case Instruction::PHI:
3973       if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd &&
3974                  Kind != RK_FloatMinMax))
3975         return ReductionInstDesc(false, I);
3976     return ReductionInstDesc(I, Prev.MinMaxKind);
3977   case Instruction::Sub:
3978   case Instruction::Add:
3979     return ReductionInstDesc(Kind == RK_IntegerAdd, I);
3980   case Instruction::Mul:
3981     return ReductionInstDesc(Kind == RK_IntegerMult, I);
3982   case Instruction::And:
3983     return ReductionInstDesc(Kind == RK_IntegerAnd, I);
3984   case Instruction::Or:
3985     return ReductionInstDesc(Kind == RK_IntegerOr, I);
3986   case Instruction::Xor:
3987     return ReductionInstDesc(Kind == RK_IntegerXor, I);
3988   case Instruction::FMul:
3989     return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
3990   case Instruction::FAdd:
3991     return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
3992   case Instruction::FCmp:
3993   case Instruction::ICmp:
3994   case Instruction::Select:
3995     if (Kind != RK_IntegerMinMax &&
3996         (!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
3997       return ReductionInstDesc(false, I);
3998     return isMinMaxSelectCmpPattern(I, Prev);
3999   }
4000 }
4001
4002 LoopVectorizationLegality::InductionKind
4003 LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
4004   Type *PhiTy = Phi->getType();
4005   // We only handle integer and pointer inductions variables.
4006   if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
4007     return IK_NoInduction;
4008
4009   // Check that the PHI is consecutive.
4010   const SCEV *PhiScev = SE->getSCEV(Phi);
4011   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
4012   if (!AR) {
4013     DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
4014     return IK_NoInduction;
4015   }
4016   const SCEV *Step = AR->getStepRecurrence(*SE);
4017
4018   // Integer inductions need to have a stride of one.
4019   if (PhiTy->isIntegerTy()) {
4020     if (Step->isOne())
4021       return IK_IntInduction;
4022     if (Step->isAllOnesValue())
4023       return IK_ReverseIntInduction;
4024     return IK_NoInduction;
4025   }
4026
4027   // Calculate the pointer stride and check if it is consecutive.
4028   const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
4029   if (!C)
4030     return IK_NoInduction;
4031
4032   assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
4033   uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
4034   if (C->getValue()->equalsInt(Size))
4035     return IK_PtrInduction;
4036   else if (C->getValue()->equalsInt(0 - Size))
4037     return IK_ReversePtrInduction;
4038
4039   return IK_NoInduction;
4040 }
4041
4042 bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
4043   Value *In0 = const_cast<Value*>(V);
4044   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
4045   if (!PN)
4046     return false;
4047
4048   return Inductions.count(PN);
4049 }
4050
4051 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB)  {
4052   assert(TheLoop->contains(BB) && "Unknown block used");
4053
4054   // Blocks that do not dominate the latch need predication.
4055   BasicBlock* Latch = TheLoop->getLoopLatch();
4056   return !DT->dominates(BB, Latch);
4057 }
4058
4059 bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
4060   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
4061     // We might be able to hoist the load.
4062     if (it->mayReadFromMemory() && !LoadSpeculation.isHoistableLoad(it))
4063       return false;
4064
4065     // We don't predicate stores at the moment.
4066     if (it->mayWriteToMemory() || it->mayThrow())
4067       return false;
4068
4069     // The instructions below can trap.
4070     switch (it->getOpcode()) {
4071     default: continue;
4072     case Instruction::UDiv:
4073     case Instruction::SDiv:
4074     case Instruction::URem:
4075     case Instruction::SRem:
4076              return false;
4077     }
4078   }
4079
4080   return true;
4081 }
4082
4083 LoopVectorizationCostModel::VectorizationFactor
4084 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
4085                                                       unsigned UserVF) {
4086   // Width 1 means no vectorize
4087   VectorizationFactor Factor = { 1U, 0U };
4088   if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
4089     DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
4090     return Factor;
4091   }
4092
4093   // Find the trip count.
4094   unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
4095   DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n");
4096
4097   unsigned WidestType = getWidestType();
4098   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4099   unsigned MaxSafeDepDist = -1U;
4100   if (Legal->getMaxSafeDepDistBytes() != -1U)
4101     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
4102   WidestRegister = WidestRegister < MaxSafeDepDist ?  WidestRegister : MaxSafeDepDist;
4103   unsigned MaxVectorSize = WidestRegister / WidestType;
4104   DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
4105   DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n");
4106
4107   if (MaxVectorSize == 0) {
4108     DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4109     MaxVectorSize = 1;
4110   }
4111
4112   assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
4113          " into one vector!");
4114
4115   unsigned VF = MaxVectorSize;
4116
4117   // If we optimize the program for size, avoid creating the tail loop.
4118   if (OptForSize) {
4119     // If we are unable to calculate the trip count then don't try to vectorize.
4120     if (TC < 2) {
4121       DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
4122       return Factor;
4123     }
4124
4125     // Find the maximum SIMD width that can fit within the trip count.
4126     VF = TC % MaxVectorSize;
4127
4128     if (VF == 0)
4129       VF = MaxVectorSize;
4130
4131     // If the trip count that we found modulo the vectorization factor is not
4132     // zero then we require a tail.
4133     if (VF < 2) {
4134       DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
4135       return Factor;
4136     }
4137   }
4138
4139   if (UserVF != 0) {
4140     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
4141     DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n");
4142
4143     Factor.Width = UserVF;
4144     return Factor;
4145   }
4146
4147   float Cost = expectedCost(1);
4148   unsigned Width = 1;
4149   DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
4150   for (unsigned i=2; i <= VF; i*=2) {
4151     // Notice that the vector loop needs to be executed less times, so
4152     // we need to divide the cost of the vector loops by the width of
4153     // the vector elements.
4154     float VectorCost = expectedCost(i) / (float)i;
4155     DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
4156           (int)VectorCost << ".\n");
4157     if (VectorCost < Cost) {
4158       Cost = VectorCost;
4159       Width = i;
4160     }
4161   }
4162
4163   DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
4164   Factor.Width = Width;
4165   Factor.Cost = Width * Cost;
4166   return Factor;
4167 }
4168
4169 unsigned LoopVectorizationCostModel::getWidestType() {
4170   unsigned MaxWidth = 8;
4171
4172   // For each block.
4173   for (Loop::block_iterator bb = TheLoop->block_begin(),
4174        be = TheLoop->block_end(); bb != be; ++bb) {
4175     BasicBlock *BB = *bb;
4176
4177     // For each instruction in the loop.
4178     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
4179       Type *T = it->getType();
4180
4181       // Only examine Loads, Stores and PHINodes.
4182       if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
4183         continue;
4184
4185       // Examine PHI nodes that are reduction variables.
4186       if (PHINode *PN = dyn_cast<PHINode>(it))
4187         if (!Legal->getReductionVars()->count(PN))
4188           continue;
4189
4190       // Examine the stored values.
4191       if (StoreInst *ST = dyn_cast<StoreInst>(it))
4192         T = ST->getValueOperand()->getType();
4193
4194       // Ignore loaded pointer types and stored pointer types that are not
4195       // consecutive. However, we do want to take consecutive stores/loads of
4196       // pointer vectors into account.
4197       if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
4198         continue;
4199
4200       MaxWidth = std::max(MaxWidth,
4201                           (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
4202     }
4203   }
4204
4205   return MaxWidth;
4206 }
4207
4208 unsigned
4209 LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
4210                                                unsigned UserUF,
4211                                                unsigned VF,
4212                                                unsigned LoopCost) {
4213
4214   // -- The unroll heuristics --
4215   // We unroll the loop in order to expose ILP and reduce the loop overhead.
4216   // There are many micro-architectural considerations that we can't predict
4217   // at this level. For example frontend pressure (on decode or fetch) due to
4218   // code size, or the number and capabilities of the execution ports.
4219   //
4220   // We use the following heuristics to select the unroll factor:
4221   // 1. If the code has reductions the we unroll in order to break the cross
4222   // iteration dependency.
4223   // 2. If the loop is really small then we unroll in order to reduce the loop
4224   // overhead.
4225   // 3. We don't unroll if we think that we will spill registers to memory due
4226   // to the increased register pressure.
4227
4228   // Use the user preference, unless 'auto' is selected.
4229   if (UserUF != 0)
4230     return UserUF;
4231
4232   // When we optimize for size we don't unroll.
4233   if (OptForSize)
4234     return 1;
4235
4236   // We used the distance for the unroll factor.
4237   if (Legal->getMaxSafeDepDistBytes() != -1U)
4238     return 1;
4239
4240   // Do not unroll loops with a relatively small trip count.
4241   unsigned TC = SE->getSmallConstantTripCount(TheLoop,
4242                                               TheLoop->getLoopLatch());
4243   if (TC > 1 && TC < TinyTripCountUnrollThreshold)
4244     return 1;
4245
4246   unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true);
4247   DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
4248         " vector registers\n");
4249
4250   LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
4251   // We divide by these constants so assume that we have at least one
4252   // instruction that uses at least one register.
4253   R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
4254   R.NumInstructions = std::max(R.NumInstructions, 1U);
4255
4256   // We calculate the unroll factor using the following formula.
4257   // Subtract the number of loop invariants from the number of available
4258   // registers. These registers are used by all of the unrolled instances.
4259   // Next, divide the remaining registers by the number of registers that is
4260   // required by the loop, in order to estimate how many parallel instances
4261   // fit without causing spills.
4262   unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;
4263
4264   // Clamp the unroll factor ranges to reasonable factors.
4265   unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
4266
4267   // If we did not calculate the cost for VF (because the user selected the VF)
4268   // then we calculate the cost of VF here.
4269   if (LoopCost == 0)
4270     LoopCost = expectedCost(VF);
4271
4272   // Clamp the calculated UF to be between the 1 and the max unroll factor
4273   // that the target allows.
4274   if (UF > MaxUnrollSize)
4275     UF = MaxUnrollSize;
4276   else if (UF < 1)
4277     UF = 1;
4278
4279   if (Legal->getReductionVars()->size()) {
4280     DEBUG(dbgs() << "LV: Unrolling because of reductions. \n");
4281     return UF;
4282   }
4283
4284   // We want to unroll tiny loops in order to reduce the loop overhead.
4285   // We assume that the cost overhead is 1 and we use the cost model
4286   // to estimate the cost of the loop and unroll until the cost of the
4287   // loop overhead is about 5% of the cost of the loop.
4288   DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n");
4289   if (LoopCost < 20) {
4290     DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n");
4291     unsigned NewUF = 20/LoopCost + 1;
4292     return std::min(NewUF, UF);
4293   }
4294
4295   DEBUG(dbgs() << "LV: Not Unrolling. \n");
4296   return 1;
4297 }
4298
4299 LoopVectorizationCostModel::RegisterUsage
4300 LoopVectorizationCostModel::calculateRegisterUsage() {
4301   // This function calculates the register usage by measuring the highest number
4302   // of values that are alive at a single location. Obviously, this is a very
4303   // rough estimation. We scan the loop in a topological order in order and
4304   // assign a number to each instruction. We use RPO to ensure that defs are
4305   // met before their users. We assume that each instruction that has in-loop
4306   // users starts an interval. We record every time that an in-loop value is
4307   // used, so we have a list of the first and last occurrences of each
4308   // instruction. Next, we transpose this data structure into a multi map that
4309   // holds the list of intervals that *end* at a specific location. This multi
4310   // map allows us to perform a linear search. We scan the instructions linearly
4311   // and record each time that a new interval starts, by placing it in a set.
4312   // If we find this value in the multi-map then we remove it from the set.
4313   // The max register usage is the maximum size of the set.
4314   // We also search for instructions that are defined outside the loop, but are
4315   // used inside the loop. We need this number separately from the max-interval
4316   // usage number because when we unroll, loop-invariant values do not take
4317   // more register.
4318   LoopBlocksDFS DFS(TheLoop);
4319   DFS.perform(LI);
4320
4321   RegisterUsage R;
4322   R.NumInstructions = 0;
4323
4324   // Each 'key' in the map opens a new interval. The values
4325   // of the map are the index of the 'last seen' usage of the
4326   // instruction that is the key.
4327   typedef DenseMap<Instruction*, unsigned> IntervalMap;
4328   // Maps instruction to its index.
4329   DenseMap<unsigned, Instruction*> IdxToInstr;
4330   // Marks the end of each interval.
4331   IntervalMap EndPoint;
4332   // Saves the list of instruction indices that are used in the loop.
4333   SmallSet<Instruction*, 8> Ends;
4334   // Saves the list of values that are used in the loop but are
4335   // defined outside the loop, such as arguments and constants.
4336   SmallPtrSet<Value*, 8> LoopInvariants;
4337
4338   unsigned Index = 0;
4339   for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
4340        be = DFS.endRPO(); bb != be; ++bb) {
4341     R.NumInstructions += (*bb)->size();
4342     for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
4343          ++it) {
4344       Instruction *I = it;
4345       IdxToInstr[Index++] = I;
4346
4347       // Save the end location of each USE.
4348       for (unsigned i = 0; i < I->getNumOperands(); ++i) {
4349         Value *U = I->getOperand(i);
4350         Instruction *Instr = dyn_cast<Instruction>(U);
4351
4352         // Ignore non-instruction values such as arguments, constants, etc.
4353         if (!Instr) continue;
4354
4355         // If this instruction is outside the loop then record it and continue.
4356         if (!TheLoop->contains(Instr)) {
4357           LoopInvariants.insert(Instr);
4358           continue;
4359         }
4360
4361         // Overwrite previous end points.
4362         EndPoint[Instr] = Index;
4363         Ends.insert(Instr);
4364       }
4365     }
4366   }
4367
4368   // Saves the list of intervals that end with the index in 'key'.
4369   typedef SmallVector<Instruction*, 2> InstrList;
4370   DenseMap<unsigned, InstrList> TransposeEnds;
4371
4372   // Transpose the EndPoints to a list of values that end at each index.
4373   for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
4374        it != e; ++it)
4375     TransposeEnds[it->second].push_back(it->first);
4376
4377   SmallSet<Instruction*, 8> OpenIntervals;
4378   unsigned MaxUsage = 0;
4379
4380
4381   DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
4382   for (unsigned int i = 0; i < Index; ++i) {
4383     Instruction *I = IdxToInstr[i];
4384     // Ignore instructions that are never used within the loop.
4385     if (!Ends.count(I)) continue;
4386
4387     // Remove all of the instructions that end at this location.
4388     InstrList &List = TransposeEnds[i];
4389     for (unsigned int j=0, e = List.size(); j < e; ++j)
4390       OpenIntervals.erase(List[j]);
4391
4392     // Count the number of live interals.
4393     MaxUsage = std::max(MaxUsage, OpenIntervals.size());
4394
4395     DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
4396           OpenIntervals.size() <<"\n");
4397
4398     // Add the current instruction to the list of open intervals.
4399     OpenIntervals.insert(I);
4400   }
4401
4402   unsigned Invariant = LoopInvariants.size();
4403   DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n");
4404   DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n");
4405   DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n");
4406
4407   R.LoopInvariantRegs = Invariant;
4408   R.MaxLocalUsers = MaxUsage;
4409   return R;
4410 }
4411
4412 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
4413   unsigned Cost = 0;
4414
4415   // For each block.
4416   for (Loop::block_iterator bb = TheLoop->block_begin(),
4417        be = TheLoop->block_end(); bb != be; ++bb) {
4418     unsigned BlockCost = 0;
4419     BasicBlock *BB = *bb;
4420
4421     // For each instruction in the old loop.
4422     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
4423       // Skip dbg intrinsics.
4424       if (isa<DbgInfoIntrinsic>(it))
4425         continue;
4426
4427       unsigned C = getInstructionCost(it, VF);
4428       Cost += C;
4429       DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
4430             VF << " For instruction: "<< *it << "\n");
4431     }
4432
4433     // We assume that if-converted blocks have a 50% chance of being executed.
4434     // When the code is scalar then some of the blocks are avoided due to CF.
4435     // When the code is vectorized we execute all code paths.
4436     if (Legal->blockNeedsPredication(*bb) && VF == 1)
4437       BlockCost /= 2;
4438
4439     Cost += BlockCost;
4440   }
4441
4442   return Cost;
4443 }
4444
4445 unsigned
4446 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
4447   // If we know that this instruction will remain uniform, check the cost of
4448   // the scalar version.
4449   if (Legal->isUniformAfterVectorization(I))
4450     VF = 1;
4451
4452   Type *RetTy = I->getType();
4453   Type *VectorTy = ToVectorTy(RetTy, VF);
4454
4455   // TODO: We need to estimate the cost of intrinsic calls.
4456   switch (I->getOpcode()) {
4457   case Instruction::GetElementPtr:
4458     // We mark this instruction as zero-cost because the cost of GEPs in
4459     // vectorized code depends on whether the corresponding memory instruction
4460     // is scalarized or not. Therefore, we handle GEPs with the memory
4461     // instruction cost.
4462     return 0;
4463   case Instruction::Br: {
4464     return TTI.getCFInstrCost(I->getOpcode());
4465   }
4466   case Instruction::PHI:
4467     //TODO: IF-converted IFs become selects.
4468     return 0;
4469   case Instruction::Add:
4470   case Instruction::FAdd:
4471   case Instruction::Sub:
4472   case Instruction::FSub:
4473   case Instruction::Mul:
4474   case Instruction::FMul:
4475   case Instruction::UDiv:
4476   case Instruction::SDiv:
4477   case Instruction::FDiv:
4478   case Instruction::URem:
4479   case Instruction::SRem:
4480   case Instruction::FRem:
4481   case Instruction::Shl:
4482   case Instruction::LShr:
4483   case Instruction::AShr:
4484   case Instruction::And:
4485   case Instruction::Or:
4486   case Instruction::Xor: {
4487     // Certain instructions can be cheaper to vectorize if they have a constant
4488     // second vector operand. One example of this are shifts on x86.
4489     TargetTransformInfo::OperandValueKind Op1VK =
4490       TargetTransformInfo::OK_AnyValue;
4491     TargetTransformInfo::OperandValueKind Op2VK =
4492       TargetTransformInfo::OK_AnyValue;
4493
4494     if (isa<ConstantInt>(I->getOperand(1)))
4495       Op2VK = TargetTransformInfo::OK_UniformConstantValue;
4496
4497     return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
4498   }
4499   case Instruction::Select: {
4500     SelectInst *SI = cast<SelectInst>(I);
4501     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
4502     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
4503     Type *CondTy = SI->getCondition()->getType();
4504     if (!ScalarCond)
4505       CondTy = VectorType::get(CondTy, VF);
4506
4507     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
4508   }
4509   case Instruction::ICmp:
4510   case Instruction::FCmp: {
4511     Type *ValTy = I->getOperand(0)->getType();
4512     VectorTy = ToVectorTy(ValTy, VF);
4513     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
4514   }
4515   case Instruction::Store:
4516   case Instruction::Load: {
4517     StoreInst *SI = dyn_cast<StoreInst>(I);
4518     LoadInst *LI = dyn_cast<LoadInst>(I);
4519     Type *ValTy = (SI ? SI->getValueOperand()->getType() :
4520                    LI->getType());
4521     VectorTy = ToVectorTy(ValTy, VF);
4522
4523     unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
4524     unsigned AS = SI ? SI->getPointerAddressSpace() :
4525       LI->getPointerAddressSpace();
4526     Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
4527     // We add the cost of address computation here instead of with the gep
4528     // instruction because only here we know whether the operation is
4529     // scalarized.
4530     if (VF == 1)
4531       return TTI.getAddressComputationCost(VectorTy) +
4532         TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
4533
4534     // Scalarized loads/stores.
4535     int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
4536     bool Reverse = ConsecutiveStride < 0;
4537     unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
4538     unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
4539     if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
4540       unsigned Cost = 0;
4541       // The cost of extracting from the value vector and pointer vector.
4542       Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
4543       for (unsigned i = 0; i < VF; ++i) {
4544         //  The cost of extracting the pointer operand.
4545         Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
4546         // In case of STORE, the cost of ExtractElement from the vector.
4547         // In case of LOAD, the cost of InsertElement into the returned
4548         // vector.
4549         Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
4550                                             Instruction::InsertElement,
4551                                             VectorTy, i);
4552       }
4553
4554       // The cost of the scalar loads/stores.
4555       Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType());
4556       Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
4557                                        Alignment, AS);
4558       return Cost;
4559     }
4560
4561     // Wide load/stores.
4562     unsigned Cost = TTI.getAddressComputationCost(VectorTy);
4563     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
4564
4565     if (Reverse)
4566       Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
4567                                   VectorTy, 0);
4568     return Cost;
4569   }
4570   case Instruction::ZExt:
4571   case Instruction::SExt:
4572   case Instruction::FPToUI:
4573   case Instruction::FPToSI:
4574   case Instruction::FPExt:
4575   case Instruction::PtrToInt:
4576   case Instruction::IntToPtr:
4577   case Instruction::SIToFP:
4578   case Instruction::UIToFP:
4579   case Instruction::Trunc:
4580   case Instruction::FPTrunc:
4581   case Instruction::BitCast: {
4582     // We optimize the truncation of induction variable.
4583     // The cost of these is the same as the scalar operation.
4584     if (I->getOpcode() == Instruction::Trunc &&
4585         Legal->isInductionVariable(I->getOperand(0)))
4586       return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
4587                                   I->getOperand(0)->getType());
4588
4589     Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
4590     return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
4591   }
4592   case Instruction::Call: {
4593     CallInst *CI = cast<CallInst>(I);
4594     Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
4595     assert(ID && "Not an intrinsic call!");
4596     Type *RetTy = ToVectorTy(CI->getType(), VF);
4597     SmallVector<Type*, 4> Tys;
4598     for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
4599       Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
4600     return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
4601   }
4602   default: {
4603     // We are scalarizing the instruction. Return the cost of the scalar
4604     // instruction, plus the cost of insert and extract into vector
4605     // elements, times the vector width.
4606     unsigned Cost = 0;
4607
4608     if (!RetTy->isVoidTy() && VF != 1) {
4609       unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
4610                                                 VectorTy);
4611       unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
4612                                                 VectorTy);
4613
4614       // The cost of inserting the results plus extracting each one of the
4615       // operands.
4616       Cost += VF * (InsCost + ExtCost * I->getNumOperands());
4617     }
4618
4619     // The cost of executing VF copies of the scalar instruction. This opcode
4620     // is unknown. Assume that it is the same as 'mul'.
4621     Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
4622     return Cost;
4623   }
4624   }// end of switch.
4625 }
4626
4627 Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
4628   if (Scalar->isVoidTy() || VF == 1)
4629     return Scalar;
4630   return VectorType::get(Scalar, VF);
4631 }
4632
4633 char LoopVectorize::ID = 0;
4634 static const char lv_name[] = "Loop Vectorization";
4635 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
4636 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
4637 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
4638 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
4639 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
4640
4641 namespace llvm {
4642   Pass *createLoopVectorizePass() {
4643     return new LoopVectorize();
4644   }
4645 }
4646
4647 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
4648   // Check for a store.
4649   if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
4650     return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
4651
4652   // Check for a load.
4653   if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
4654     return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
4655
4656   return false;
4657 }