lib/Transforms/Vectorize/SLPVectorizer.cpp

   1 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
  10 // stores that can be put together into vector-stores. Next, it attempts to
  11 // construct vectorizable tree using the use-def chains. If a profitable tree
  12 // was found, the SLP vectorizer performs vectorization on the tree.
  13 //
  14 // The pass is inspired by the work described in the paper:
  15 //  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
  16 //
  17 //===----------------------------------------------------------------------===//
  18 #include "llvm/Transforms/Vectorize.h"
  19 #include "llvm/ADT/MapVector.h"
  20 #include "llvm/ADT/PostOrderIterator.h"
  21 #include "llvm/ADT/SetVector.h"
  22 #include "llvm/ADT/Statistic.h"
  23 #include "llvm/Analysis/AliasAnalysis.h"
  24 #include "llvm/Analysis/AssumptionTracker.h"
  25 #include "llvm/Analysis/CodeMetrics.h"
  26 #include "llvm/Analysis/LoopInfo.h"
  27 #include "llvm/Analysis/ScalarEvolution.h"
  28 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  29 #include "llvm/Analysis/TargetTransformInfo.h"
  30 #include "llvm/Analysis/ValueTracking.h"
  31 #include "llvm/IR/DataLayout.h"
  32 #include "llvm/IR/Dominators.h"
  33 #include "llvm/IR/IRBuilder.h"
  34 #include "llvm/IR/Instructions.h"
  35 #include "llvm/IR/IntrinsicInst.h"
  36 #include "llvm/IR/Module.h"
  37 #include "llvm/IR/NoFolder.h"
  38 #include "llvm/IR/Type.h"
  39 #include "llvm/IR/Value.h"
  40 #include "llvm/IR/Verifier.h"
  41 #include "llvm/Pass.h"
  42 #include "llvm/Support/CommandLine.h"
  43 #include "llvm/Support/Debug.h"
  44 #include "llvm/Support/raw_ostream.h"
  45 #include "llvm/Transforms/Utils/VectorUtils.h"
  46 #include <algorithm>
  47 #include <map>
  48 #include <memory>
  49
  50 using namespace llvm;
  51
  52 #define SV_NAME "slp-vectorizer"
  53 #define DEBUG_TYPE "SLP"
  54
  55 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
  56
  57 static cl::opt<int>
  58     SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
  59                      cl::desc("Only vectorize if you gain more than this "
  60                               "number "));
  61
  62 static cl::opt<bool>
  63 ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
  64                    cl::desc("Attempt to vectorize horizontal reductions"));
  65
  66 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
  67     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
  68     cl::desc(
  69         "Attempt to vectorize horizontal reductions feeding into a store"));
  70
  71 namespace {
  72
  73 static const unsigned MinVecRegSize = 128;
  74
  75 static const unsigned RecursionMaxDepth = 12;
  76
  77 /// \returns the parent basic block if all of the instructions in \p VL
  78 /// are in the same block or null otherwise.
  79 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
  80   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  81   if (!I0)
  82     return nullptr;
  83   BasicBlock *BB = I0->getParent();
  84   for (int i = 1, e = VL.size(); i < e; i++) {
  85     Instruction *I = dyn_cast<Instruction>(VL[i]);
  86     if (!I)
  87       return nullptr;
  88
  89     if (BB != I->getParent())
  90       return nullptr;
  91   }
  92   return BB;
  93 }
  94
  95 /// \returns True if all of the values in \p VL are constants.
  96 static bool allConstant(ArrayRef<Value *> VL) {
  97   for (unsigned i = 0, e = VL.size(); i < e; ++i)
  98     if (!isa<Constant>(VL[i]))
  99       return false;
 100   return true;
 101 }
 102
 103 /// \returns True if all of the values in \p VL are identical.
 104 static bool isSplat(ArrayRef<Value *> VL) {
 105   for (unsigned i = 1, e = VL.size(); i < e; ++i)
 106     if (VL[i] != VL[0])
 107       return false;
 108   return true;
 109 }
 110
 111 ///\returns Opcode that can be clubbed with \p Op to create an alternate
 112 /// sequence which can later be merged as a ShuffleVector instruction.
 113 static unsigned getAltOpcode(unsigned Op) {
 114   switch (Op) {
 115   case Instruction::FAdd:
 116     return Instruction::FSub;
 117   case Instruction::FSub:
 118     return Instruction::FAdd;
 119   case Instruction::Add:
 120     return Instruction::Sub;
 121   case Instruction::Sub:
 122     return Instruction::Add;
 123   default:
 124     return 0;
 125   }
 126 }
 127
 128 ///\returns bool representing if Opcode \p Op can be part
 129 /// of an alternate sequence which can later be merged as
 130 /// a ShuffleVector instruction.
 131 static bool canCombineAsAltInst(unsigned Op) {
 132   if (Op == Instruction::FAdd || Op == Instruction::FSub ||
 133       Op == Instruction::Sub || Op == Instruction::Add)
 134     return true;
 135   return false;
 136 }
 137
 138 /// \returns ShuffleVector instruction if intructions in \p VL have
 139 ///  alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
 140 /// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
 141 static unsigned isAltInst(ArrayRef<Value *> VL) {
 142   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
 143   unsigned Opcode = I0->getOpcode();
 144   unsigned AltOpcode = getAltOpcode(Opcode);
 145   for (int i = 1, e = VL.size(); i < e; i++) {
 146     Instruction *I = dyn_cast<Instruction>(VL[i]);
 147     if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
 148       return 0;
 149   }
 150   return Instruction::ShuffleVector;
 151 }
 152
 153 /// \returns The opcode if all of the Instructions in \p VL have the same
 154 /// opcode, or zero.
 155 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
 156   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
 157   if (!I0)
 158     return 0;
 159   unsigned Opcode = I0->getOpcode();
 160   for (int i = 1, e = VL.size(); i < e; i++) {
 161     Instruction *I = dyn_cast<Instruction>(VL[i]);
 162     if (!I || Opcode != I->getOpcode()) {
 163       if (canCombineAsAltInst(Opcode) && i == 1)
 164         return isAltInst(VL);
 165       return 0;
 166     }
 167   }
 168   return Opcode;
 169 }
 170
 171 /// Get the intersection (logical and) of all of the potential IR flags
 172 /// of each scalar operation (VL) that will be converted into a vector (I).
 173 /// Flag set: NSW, NUW, exact, and all of fast-math.
 174 static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
 175   if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
 176     if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
 177       // Intersection is initialized to the 0th scalar,
 178       // so start counting from index '1'.
 179       for (int i = 1, e = VL.size(); i < e; ++i) {
 180         if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
 181           Intersection->andIRFlags(Scalar);
 182       }
 183       VecOp->copyIRFlags(Intersection);
 184     }
 185   }
 186 }
 187
 188 /// \returns \p I after propagating metadata from \p VL.
 189 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
 190   Instruction *I0 = cast<Instruction>(VL[0]);
 191   SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
 192   I0->getAllMetadataOtherThanDebugLoc(Metadata);
 193
 194   for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
 195     unsigned Kind = Metadata[i].first;
 196     MDNode *MD = Metadata[i].second;
 197
 198     for (int i = 1, e = VL.size(); MD && i != e; i++) {
 199       Instruction *I = cast<Instruction>(VL[i]);
 200       MDNode *IMD = I->getMetadata(Kind);
 201
 202       switch (Kind) {
 203       default:
 204         MD = nullptr; // Remove unknown metadata
 205         break;
 206       case LLVMContext::MD_tbaa:
 207         MD = MDNode::getMostGenericTBAA(MD, IMD);
 208         break;
 209       case LLVMContext::MD_alias_scope:
 210       case LLVMContext::MD_noalias:
 211         MD = MDNode::intersect(MD, IMD);
 212         break;
 213       case LLVMContext::MD_fpmath:
 214         MD = MDNode::getMostGenericFPMath(MD, IMD);
 215         break;
 216       }
 217     }
 218     I->setMetadata(Kind, MD);
 219   }
 220   return I;
 221 }
 222
 223 /// \returns The type that all of the values in \p VL have or null if there
 224 /// are different types.
 225 static Type* getSameType(ArrayRef<Value *> VL) {
 226   Type *Ty = VL[0]->getType();
 227   for (int i = 1, e = VL.size(); i < e; i++)
 228     if (VL[i]->getType() != Ty)
 229       return nullptr;
 230
 231   return Ty;
 232 }
 233
 234 /// \returns True if the ExtractElement instructions in VL can be vectorized
 235 /// to use the original vector.
 236 static bool CanReuseExtract(ArrayRef<Value *> VL) {
 237   assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
 238   // Check if all of the extracts come from the same vector and from the
 239   // correct offset.
 240   Value *VL0 = VL[0];
 241   ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
 242   Value *Vec = E0->getOperand(0);
 243
 244   // We have to extract from the same vector type.
 245   unsigned NElts = Vec->getType()->getVectorNumElements();
 246
 247   if (NElts != VL.size())
 248     return false;
 249
 250   // Check that all of the indices extract from the correct offset.
 251   ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
 252   if (!CI || CI->getZExtValue())
 253     return false;
 254
 255   for (unsigned i = 1, e = VL.size(); i < e; ++i) {
 256     ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
 257     ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
 258
 259     if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
 260       return false;
 261   }
 262
 263   return true;
 264 }
 265
 266 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
 267                                            SmallVectorImpl<Value *> &Left,
 268                                            SmallVectorImpl<Value *> &Right) {
 269
 270   SmallVector<Value *, 16> OrigLeft, OrigRight;
 271
 272   bool AllSameOpcodeLeft = true;
 273   bool AllSameOpcodeRight = true;
 274   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
 275     Instruction *I = cast<Instruction>(VL[i]);
 276     Value *V0 = I->getOperand(0);
 277     Value *V1 = I->getOperand(1);
 278
 279     OrigLeft.push_back(V0);
 280     OrigRight.push_back(V1);
 281
 282     Instruction *I0 = dyn_cast<Instruction>(V0);
 283     Instruction *I1 = dyn_cast<Instruction>(V1);
 284
 285     // Check whether all operands on one side have the same opcode. In this case
 286     // we want to preserve the original order and not make things worse by
 287     // reordering.
 288     AllSameOpcodeLeft = I0;
 289     AllSameOpcodeRight = I1;
 290
 291     if (i && AllSameOpcodeLeft) {
 292       if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) {
 293         if(P0->getOpcode() != I0->getOpcode())
 294           AllSameOpcodeLeft = false;
 295       } else
 296         AllSameOpcodeLeft = false;
 297     }
 298     if (i && AllSameOpcodeRight) {
 299       if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) {
 300         if(P1->getOpcode() != I1->getOpcode())
 301           AllSameOpcodeRight = false;
 302       } else
 303         AllSameOpcodeRight = false;
 304     }
 305
 306     // Sort two opcodes. In the code below we try to preserve the ability to use
 307     // broadcast of values instead of individual inserts.
 308     // vl1 = load
 309     // vl2 = phi
 310     // vr1 = load
 311     // vr2 = vr2
 312     //    = vl1 x vr1
 313     //    = vl2 x vr2
 314     // If we just sorted according to opcode we would leave the first line in
 315     // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
 316     //    = vl1 x vr1
 317     //    = vr2 x vl2
 318     // Because vr2 and vr1 are from the same load we loose the opportunity of a
 319     // broadcast for the packed right side in the backend: we have [vr1, vl2]
 320     // instead of [vr1, vr2=vr1].
 321     if (I0 && I1) {
 322        if(!i && I0->getOpcode() > I1->getOpcode()) {
 323          Left.push_back(I1);
 324          Right.push_back(I0);
 325        } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) {
 326          // Try not to destroy a broad cast for no apparent benefit.
 327          Left.push_back(I1);
 328          Right.push_back(I0);
 329        } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] ==  I0) {
 330          // Try preserve broadcasts.
 331          Left.push_back(I1);
 332          Right.push_back(I0);
 333        } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) {
 334          // Try preserve broadcasts.
 335          Left.push_back(I1);
 336          Right.push_back(I0);
 337        } else {
 338          Left.push_back(I0);
 339          Right.push_back(I1);
 340        }
 341        continue;
 342     }
 343     // One opcode, put the instruction on the right.
 344     if (I0) {
 345       Left.push_back(V1);
 346       Right.push_back(I0);
 347       continue;
 348     }
 349     Left.push_back(V0);
 350     Right.push_back(V1);
 351   }
 352
 353   bool LeftBroadcast = isSplat(Left);
 354   bool RightBroadcast = isSplat(Right);
 355
 356   // Don't reorder if the operands where good to begin with.
 357   if (!(LeftBroadcast || RightBroadcast) &&
 358       (AllSameOpcodeRight || AllSameOpcodeLeft)) {
 359     Left = OrigLeft;
 360     Right = OrigRight;
 361   }
 362 }
 363
 364 /// \returns True if in-tree use also needs extract. This refers to
 365 /// possible scalar operand in vectorized instruction.
 366 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
 367                                     TargetLibraryInfo *TLI) {
 368
 369   unsigned Opcode = UserInst->getOpcode();
 370   switch (Opcode) {
 371   case Instruction::Load: {
 372     LoadInst *LI = cast<LoadInst>(UserInst);
 373     return (LI->getPointerOperand() == Scalar);
 374   }
 375   case Instruction::Store: {
 376     StoreInst *SI = cast<StoreInst>(UserInst);
 377     return (SI->getPointerOperand() == Scalar);
 378   }
 379   case Instruction::Call: {
 380     CallInst *CI = cast<CallInst>(UserInst);
 381     Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
 382     if (hasVectorInstrinsicScalarOpd(ID, 1)) {
 383       return (CI->getArgOperand(1) == Scalar);
 384     }
 385   }
 386   default:
 387     return false;
 388   }
 389 }
 390
 391 /// Bottom Up SLP Vectorizer.
 392 class BoUpSLP {
 393 public:
 394   typedef SmallVector<Value *, 8> ValueList;
 395   typedef SmallVector<Instruction *, 16> InstrList;
 396   typedef SmallPtrSet<Value *, 16> ValueSet;
 397   typedef SmallVector<StoreInst *, 8> StoreList;
 398
 399   BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl,
 400           TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa,
 401           LoopInfo *Li, DominatorTree *Dt, AssumptionTracker *AT)
 402       : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0),
 403         F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
 404         Builder(Se->getContext()) {
 405     CodeMetrics::collectEphemeralValues(F, AT, EphValues);
 406   }
 407
 408   /// \brief Vectorize the tree that starts with the elements in \p VL.
 409   /// Returns the vectorized root.
 410   Value *vectorizeTree();
 411
 412   /// \returns the cost incurred by unwanted spills and fills, caused by
 413   /// holding live values over call sites.
 414   int getSpillCost();
 415
 416   /// \returns the vectorization cost of the subtree that starts at \p VL.
 417   /// A negative number means that this is profitable.
 418   int getTreeCost();
 419
 420   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
 421   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
 422   void buildTree(ArrayRef<Value *> Roots,
 423                  ArrayRef<Value *> UserIgnoreLst = None);
 424
 425   /// Clear the internal data structures that are created by 'buildTree'.
 426   void deleteTree() {
 427     VectorizableTree.clear();
 428     ScalarToTreeEntry.clear();
 429     MustGather.clear();
 430     ExternalUses.clear();
 431     NumLoadsWantToKeepOrder = 0;
 432     NumLoadsWantToChangeOrder = 0;
 433     for (auto &Iter : BlocksSchedules) {
 434       BlockScheduling *BS = Iter.second.get();
 435       BS->clear();
 436     }
 437   }
 438
 439   /// \returns true if the memory operations A and B are consecutive.
 440   bool isConsecutiveAccess(Value *A, Value *B);
 441
 442   /// \brief Perform LICM and CSE on the newly generated gather sequences.
 443   void optimizeGatherSequence();
 444
 445   /// \returns true if it is benefitial to reverse the vector order.
 446   bool shouldReorder() const {
 447     return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
 448   }
 449
 450 private:
 451   struct TreeEntry;
 452
 453   /// \returns the cost of the vectorizable entry.
 454   int getEntryCost(TreeEntry *E);
 455
 456   /// This is the recursive part of buildTree.
 457   void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
 458
 459   /// Vectorize a single entry in the tree.
 460   Value *vectorizeTree(TreeEntry *E);
 461
 462   /// Vectorize a single entry in the tree, starting in \p VL.
 463   Value *vectorizeTree(ArrayRef<Value *> VL);
 464
 465   /// \returns the pointer to the vectorized value if \p VL is already
 466   /// vectorized, or NULL. They may happen in cycles.
 467   Value *alreadyVectorized(ArrayRef<Value *> VL) const;
 468
 469   /// \brief Take the pointer operand from the Load/Store instruction.
 470   /// \returns NULL if this is not a valid Load/Store instruction.
 471   static Value *getPointerOperand(Value *I);
 472
 473   /// \brief Take the address space operand from the Load/Store instruction.
 474   /// \returns -1 if this is not a valid Load/Store instruction.
 475   static unsigned getAddressSpaceOperand(Value *I);
 476
 477   /// \returns the scalarization cost for this type. Scalarization in this
 478   /// context means the creation of vectors from a group of scalars.
 479   int getGatherCost(Type *Ty);
 480
 481   /// \returns the scalarization cost for this list of values. Assuming that
 482   /// this subtree gets vectorized, we may need to extract the values from the
 483   /// roots. This method calculates the cost of extracting the values.
 484   int getGatherCost(ArrayRef<Value *> VL);
 485
 486   /// \brief Set the Builder insert point to one after the last instruction in
 487   /// the bundle
 488   void setInsertPointAfterBundle(ArrayRef<Value *> VL);
 489
 490   /// \returns a vector from a collection of scalars in \p VL.
 491   Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
 492
 493   /// \returns whether the VectorizableTree is fully vectoriable and will
 494   /// be beneficial even the tree height is tiny.
 495   bool isFullyVectorizableTinyTree();
 496
 497   struct TreeEntry {
 498     TreeEntry() : Scalars(), VectorizedValue(nullptr),
 499     NeedToGather(0) {}
 500
 501     /// \returns true if the scalars in VL are equal to this entry.
 502     bool isSame(ArrayRef<Value *> VL) const {
 503       assert(VL.size() == Scalars.size() && "Invalid size");
 504       return std::equal(VL.begin(), VL.end(), Scalars.begin());
 505     }
 506
 507     /// A vector of scalars.
 508     ValueList Scalars;
 509
 510     /// The Scalars are vectorized into this value. It is initialized to Null.
 511     Value *VectorizedValue;
 512
 513     /// Do we need to gather this sequence ?
 514     bool NeedToGather;
 515   };
 516
 517   /// Create a new VectorizableTree entry.
 518   TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
 519     VectorizableTree.push_back(TreeEntry());
 520     int idx = VectorizableTree.size() - 1;
 521     TreeEntry *Last = &VectorizableTree[idx];
 522     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
 523     Last->NeedToGather = !Vectorized;
 524     if (Vectorized) {
 525       for (int i = 0, e = VL.size(); i != e; ++i) {
 526         assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
 527         ScalarToTreeEntry[VL[i]] = idx;
 528       }
 529     } else {
 530       MustGather.insert(VL.begin(), VL.end());
 531     }
 532     return Last;
 533   }
 534
 535   /// -- Vectorization State --
 536   /// Holds all of the tree entries.
 537   std::vector<TreeEntry> VectorizableTree;
 538
 539   /// Maps a specific scalar to its tree entry.
 540   SmallDenseMap<Value*, int> ScalarToTreeEntry;
 541
 542   /// A list of scalars that we found that we need to keep as scalars.
 543   ValueSet MustGather;
 544
 545   /// This POD struct describes one external user in the vectorized tree.
 546   struct ExternalUser {
 547     ExternalUser (Value *S, llvm::User *U, int L) :
 548       Scalar(S), User(U), Lane(L){};
 549     // Which scalar in our function.
 550     Value *Scalar;
 551     // Which user that uses the scalar.
 552     llvm::User *User;
 553     // Which lane does the scalar belong to.
 554     int Lane;
 555   };
 556   typedef SmallVector<ExternalUser, 16> UserList;
 557
 558   /// A list of values that need to extracted out of the tree.
 559   /// This list holds pairs of (Internal Scalar : External User).
 560   UserList ExternalUses;
 561
 562   /// Values used only by @llvm.assume calls.
 563   SmallPtrSet<const Value *, 32> EphValues;
 564
 565   /// Holds all of the instructions that we gathered.
 566   SetVector<Instruction *> GatherSeq;
 567   /// A list of blocks that we are going to CSE.
 568   SetVector<BasicBlock *> CSEBlocks;
 569
 570   /// Contains all scheduling relevant data for an instruction.
 571   /// A ScheduleData either represents a single instruction or a member of an
 572   /// instruction bundle (= a group of instructions which is combined into a
 573   /// vector instruction).
 574   struct ScheduleData {
 575
 576     // The initial value for the dependency counters. It means that the
 577     // dependencies are not calculated yet.
 578     enum { InvalidDeps = -1 };
 579
 580     ScheduleData()
 581         : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
 582           NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
 583           Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
 584           UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
 585
 586     void init(int BlockSchedulingRegionID) {
 587       FirstInBundle = this;
 588       NextInBundle = nullptr;
 589       NextLoadStore = nullptr;
 590       IsScheduled = false;
 591       SchedulingRegionID = BlockSchedulingRegionID;
 592       UnscheduledDepsInBundle = UnscheduledDeps;
 593       clearDependencies();
 594     }
 595
 596     /// Returns true if the dependency information has been calculated.
 597     bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
 598
 599     /// Returns true for single instructions and for bundle representatives
 600     /// (= the head of a bundle).
 601     bool isSchedulingEntity() const { return FirstInBundle == this; }
 602
 603     /// Returns true if it represents an instruction bundle and not only a
 604     /// single instruction.
 605     bool isPartOfBundle() const {
 606       return NextInBundle != nullptr || FirstInBundle != this;
 607     }
 608
 609     /// Returns true if it is ready for scheduling, i.e. it has no more
 610     /// unscheduled depending instructions/bundles.
 611     bool isReady() const {
 612       assert(isSchedulingEntity() &&
 613              "can't consider non-scheduling entity for ready list");
 614       return UnscheduledDepsInBundle == 0 && !IsScheduled;
 615     }
 616
 617     /// Modifies the number of unscheduled dependencies, also updating it for
 618     /// the whole bundle.
 619     int incrementUnscheduledDeps(int Incr) {
 620       UnscheduledDeps += Incr;
 621       return FirstInBundle->UnscheduledDepsInBundle += Incr;
 622     }
 623
 624     /// Sets the number of unscheduled dependencies to the number of
 625     /// dependencies.
 626     void resetUnscheduledDeps() {
 627       incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
 628     }
 629
 630     /// Clears all dependency information.
 631     void clearDependencies() {
 632       Dependencies = InvalidDeps;
 633       resetUnscheduledDeps();
 634       MemoryDependencies.clear();
 635     }
 636
 637     void dump(raw_ostream &os) const {
 638       if (!isSchedulingEntity()) {
 639         os << "/ " << *Inst;
 640       } else if (NextInBundle) {
 641         os << '[' << *Inst;
 642         ScheduleData *SD = NextInBundle;
 643         while (SD) {
 644           os << ';' << *SD->Inst;
 645           SD = SD->NextInBundle;
 646         }
 647         os << ']';
 648       } else {
 649         os << *Inst;
 650       }
 651     }
 652
 653     Instruction *Inst;
 654
 655     /// Points to the head in an instruction bundle (and always to this for
 656     /// single instructions).
 657     ScheduleData *FirstInBundle;
 658
 659     /// Single linked list of all instructions in a bundle. Null if it is a
 660     /// single instruction.
 661     ScheduleData *NextInBundle;
 662
 663     /// Single linked list of all memory instructions (e.g. load, store, call)
 664     /// in the block - until the end of the scheduling region.
 665     ScheduleData *NextLoadStore;
 666
 667     /// The dependent memory instructions.
 668     /// This list is derived on demand in calculateDependencies().
 669     SmallVector<ScheduleData *, 4> MemoryDependencies;
 670
 671     /// This ScheduleData is in the current scheduling region if this matches
 672     /// the current SchedulingRegionID of BlockScheduling.
 673     int SchedulingRegionID;
 674
 675     /// Used for getting a "good" final ordering of instructions.
 676     int SchedulingPriority;
 677
 678     /// The number of dependencies. Constitutes of the number of users of the
 679     /// instruction plus the number of dependent memory instructions (if any).
 680     /// This value is calculated on demand.
 681     /// If InvalidDeps, the number of dependencies is not calculated yet.
 682     ///
 683     int Dependencies;
 684
 685     /// The number of dependencies minus the number of dependencies of scheduled
 686     /// instructions. As soon as this is zero, the instruction/bundle gets ready
 687     /// for scheduling.
 688     /// Note that this is negative as long as Dependencies is not calculated.
 689     int UnscheduledDeps;
 690
 691     /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
 692     /// single instructions.
 693     int UnscheduledDepsInBundle;
 694
 695     /// True if this instruction is scheduled (or considered as scheduled in the
 696     /// dry-run).
 697     bool IsScheduled;
 698   };
 699
 700 #ifndef NDEBUG
 701   friend raw_ostream &operator<<(raw_ostream &os,
 702                                  const BoUpSLP::ScheduleData &SD);
 703 #endif
 704
 705   /// Contains all scheduling data for a basic block.
 706   ///
 707   struct BlockScheduling {
 708
 709     BlockScheduling(BasicBlock *BB)
 710         : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
 711           ScheduleStart(nullptr), ScheduleEnd(nullptr),
 712           FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
 713           // Make sure that the initial SchedulingRegionID is greater than the
 714           // initial SchedulingRegionID in ScheduleData (which is 0).
 715           SchedulingRegionID(1) {}
 716
 717     void clear() {
 718       ReadyInsts.clear();
 719       ScheduleStart = nullptr;
 720       ScheduleEnd = nullptr;
 721       FirstLoadStoreInRegion = nullptr;
 722       LastLoadStoreInRegion = nullptr;
 723
 724       // Make a new scheduling region, i.e. all existing ScheduleData is not
 725       // in the new region yet.
 726       ++SchedulingRegionID;
 727     }
 728
 729     ScheduleData *getScheduleData(Value *V) {
 730       ScheduleData *SD = ScheduleDataMap[V];
 731       if (SD && SD->SchedulingRegionID == SchedulingRegionID)
 732         return SD;
 733       return nullptr;
 734     }
 735
 736     bool isInSchedulingRegion(ScheduleData *SD) {
 737       return SD->SchedulingRegionID == SchedulingRegionID;
 738     }
 739
 740     /// Marks an instruction as scheduled and puts all dependent ready
 741     /// instructions into the ready-list.
 742     template <typename ReadyListType>
 743     void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
 744       SD->IsScheduled = true;
 745       DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");
 746
 747       ScheduleData *BundleMember = SD;
 748       while (BundleMember) {
 749         // Handle the def-use chain dependencies.
 750         for (Use &U : BundleMember->Inst->operands()) {
 751           ScheduleData *OpDef = getScheduleData(U.get());
 752           if (OpDef && OpDef->hasValidDependencies() &&
 753               OpDef->incrementUnscheduledDeps(-1) == 0) {
 754             // There are no more unscheduled dependencies after decrementing,
 755             // so we can put the dependent instruction into the ready list.
 756             ScheduleData *DepBundle = OpDef->FirstInBundle;
 757             assert(!DepBundle->IsScheduled &&
 758                    "already scheduled bundle gets ready");
 759             ReadyList.insert(DepBundle);
 760             DEBUG(dbgs() << "SLP:    gets ready (def): " << *DepBundle << "\n");
 761           }
 762         }
 763         // Handle the memory dependencies.
 764         for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
 765           if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
 766             // There are no more unscheduled dependencies after decrementing,
 767             // so we can put the dependent instruction into the ready list.
 768             ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
 769             assert(!DepBundle->IsScheduled &&
 770                    "already scheduled bundle gets ready");
 771             ReadyList.insert(DepBundle);
 772             DEBUG(dbgs() << "SLP:    gets ready (mem): " << *DepBundle << "\n");
 773           }
 774         }
 775         BundleMember = BundleMember->NextInBundle;
 776       }
 777     }
 778
 779     /// Put all instructions into the ReadyList which are ready for scheduling.
 780     template <typename ReadyListType>
 781     void initialFillReadyList(ReadyListType &ReadyList) {
 782       for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
 783         ScheduleData *SD = getScheduleData(I);
 784         if (SD->isSchedulingEntity() && SD->isReady()) {
 785           ReadyList.insert(SD);
 786           DEBUG(dbgs() << "SLP:    initially in ready list: " << *I << "\n");
 787         }
 788       }
 789     }
 790
 791     /// Checks if a bundle of instructions can be scheduled, i.e. has no
 792     /// cyclic dependencies. This is only a dry-run, no instructions are
 793     /// actually moved at this stage.
 794     bool tryScheduleBundle(ArrayRef<Value *> VL, AliasAnalysis *AA);
 795
 796     /// Un-bundles a group of instructions.
 797     void cancelScheduling(ArrayRef<Value *> VL);
 798
 799     /// Extends the scheduling region so that V is inside the region.
 800     void extendSchedulingRegion(Value *V);
 801
 802     /// Initialize the ScheduleData structures for new instructions in the
 803     /// scheduling region.
 804     void initScheduleData(Instruction *FromI, Instruction *ToI,
 805                           ScheduleData *PrevLoadStore,
 806                           ScheduleData *NextLoadStore);
 807
 808     /// Updates the dependency information of a bundle and of all instructions/
 809     /// bundles which depend on the original bundle.
 810     void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
 811                                AliasAnalysis *AA);
 812
 813     /// Sets all instruction in the scheduling region to un-scheduled.
 814     void resetSchedule();
 815
 816     BasicBlock *BB;
 817
 818     /// Simple memory allocation for ScheduleData.
 819     std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
 820
 821     /// The size of a ScheduleData array in ScheduleDataChunks.
 822     int ChunkSize;
 823
 824     /// The allocator position in the current chunk, which is the last entry
 825     /// of ScheduleDataChunks.
 826     int ChunkPos;
 827
 828     /// Attaches ScheduleData to Instruction.
 829     /// Note that the mapping survives during all vectorization iterations, i.e.
 830     /// ScheduleData structures are recycled.
 831     DenseMap<Value *, ScheduleData *> ScheduleDataMap;
 832
 833     struct ReadyList : SmallVector<ScheduleData *, 8> {
 834       void insert(ScheduleData *SD) { push_back(SD); }
 835     };
 836
 837     /// The ready-list for scheduling (only used for the dry-run).
 838     ReadyList ReadyInsts;
 839
 840     /// The first instruction of the scheduling region.
 841     Instruction *ScheduleStart;
 842
 843     /// The first instruction _after_ the scheduling region.
 844     Instruction *ScheduleEnd;
 845
 846     /// The first memory accessing instruction in the scheduling region
 847     /// (can be null).
 848     ScheduleData *FirstLoadStoreInRegion;
 849
 850     /// The last memory accessing instruction in the scheduling region
 851     /// (can be null).
 852     ScheduleData *LastLoadStoreInRegion;
 853
 854     /// The ID of the scheduling region. For a new vectorization iteration this
 855     /// is incremented which "removes" all ScheduleData from the region.
 856     int SchedulingRegionID;
 857   };
 858
 859   /// Attaches the BlockScheduling structures to basic blocks.
 860   DenseMap<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
 861
 862   /// Performs the "real" scheduling. Done before vectorization is actually
 863   /// performed in a basic block.
 864   void scheduleBlock(BlockScheduling *BS);
 865
 866   /// List of users to ignore during scheduling and that don't need extracting.
 867   ArrayRef<Value *> UserIgnoreList;
 868
 869   // Number of load-bundles, which contain consecutive loads.
 870   int NumLoadsWantToKeepOrder;
 871
 872   // Number of load-bundles of size 2, which are consecutive loads if reversed.
 873   int NumLoadsWantToChangeOrder;
 874
 875   // Analysis and block reference.
 876   Function *F;
 877   ScalarEvolution *SE;
 878   const DataLayout *DL;
 879   TargetTransformInfo *TTI;
 880   TargetLibraryInfo *TLI;
 881   AliasAnalysis *AA;
 882   LoopInfo *LI;
 883   DominatorTree *DT;
 884   /// Instruction builder to construct the vectorized tree.
 885   IRBuilder<> Builder;
 886 };
 887
 888 #ifndef NDEBUG
 889 raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
 890   SD.dump(os);
 891   return os;
 892 }
 893 #endif
 894
 895 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
 896                         ArrayRef<Value *> UserIgnoreLst) {
 897   deleteTree();
 898   UserIgnoreList = UserIgnoreLst;
 899   if (!getSameType(Roots))
 900     return;
 901   buildTree_rec(Roots, 0);
 902
 903   // Collect the values that we need to extract from the tree.
 904   for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
 905     TreeEntry *Entry = &VectorizableTree[EIdx];
 906
 907     // For each lane:
 908     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
 909       Value *Scalar = Entry->Scalars[Lane];
 910
 911       // No need to handle users of gathered values.
 912       if (Entry->NeedToGather)
 913         continue;
 914
 915       for (User *U : Scalar->users()) {
 916         DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
 917
 918         Instruction *UserInst = dyn_cast<Instruction>(U);
 919         if (!UserInst)
 920           continue;
 921
 922         // Skip in-tree scalars that become vectors
 923         if (ScalarToTreeEntry.count(U)) {
 924           int Idx = ScalarToTreeEntry[U];
 925           TreeEntry *UseEntry = &VectorizableTree[Idx];
 926           Value *UseScalar = UseEntry->Scalars[0];
 927           // Some in-tree scalars will remain as scalar in vectorized
 928           // instructions. If that is the case, the one in Lane 0 will
 929           // be used.
 930           if (UseScalar != U ||
 931               !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
 932             DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
 933                          << ".\n");
 934             assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
 935             continue;
 936           }
 937         }
 938
 939         // Ignore users in the user ignore list.
 940         if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
 941             UserIgnoreList.end())
 942           continue;
 943
 944         DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
 945               Lane << " from " << *Scalar << ".\n");
 946         ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
 947       }
 948     }
 949   }
 950 }
 951
 952
 953 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
 954   bool SameTy = getSameType(VL); (void)SameTy;
 955   bool isAltShuffle = false;
 956   assert(SameTy && "Invalid types!");
 957
 958   if (Depth == RecursionMaxDepth) {
 959     DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
 960     newTreeEntry(VL, false);
 961     return;
 962   }
 963
 964   // Don't handle vectors.
 965   if (VL[0]->getType()->isVectorTy()) {
 966     DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
 967     newTreeEntry(VL, false);
 968     return;
 969   }
 970
 971   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
 972     if (SI->getValueOperand()->getType()->isVectorTy()) {
 973       DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
 974       newTreeEntry(VL, false);
 975       return;
 976     }
 977   unsigned Opcode = getSameOpcode(VL);
 978
 979   // Check that this shuffle vector refers to the alternate
 980   // sequence of opcodes.
 981   if (Opcode == Instruction::ShuffleVector) {
 982     Instruction *I0 = dyn_cast<Instruction>(VL[0]);
 983     unsigned Op = I0->getOpcode();
 984     if (Op != Instruction::ShuffleVector)
 985       isAltShuffle = true;
 986   }
 987
 988   // If all of the operands are identical or constant we have a simple solution.
 989   if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
 990     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
 991     newTreeEntry(VL, false);
 992     return;
 993   }
 994
 995   // We now know that this is a vector of instructions of the same type from
 996   // the same block.
 997
 998   // Don't vectorize ephemeral values.
 999   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1000     if (EphValues.count(VL[i])) {
1001       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1002             ") is ephemeral.\n");
1003       newTreeEntry(VL, false);
1004       return;
1005     }
1006   }
1007
1008   // Check if this is a duplicate of another entry.
1009   if (ScalarToTreeEntry.count(VL[0])) {
1010     int Idx = ScalarToTreeEntry[VL[0]];
1011     TreeEntry *E = &VectorizableTree[Idx];
1012     for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1013       DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
1014       if (E->Scalars[i] != VL[i]) {
1015         DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
1016         newTreeEntry(VL, false);
1017         return;
1018       }
1019     }
1020     DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
1021     return;
1022   }
1023
1024   // Check that none of the instructions in the bundle are already in the tree.
1025   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1026     if (ScalarToTreeEntry.count(VL[i])) {
1027       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
1028             ") is already in tree.\n");
1029       newTreeEntry(VL, false);
1030       return;
1031     }
1032   }
1033
1034   // If any of the scalars appears in the table OR it is marked as a value that
1035   // needs to stat scalar then we need to gather the scalars.
1036   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
1037     if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) {
1038       DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n");
1039       newTreeEntry(VL, false);
1040       return;
1041     }
1042   }
1043
1044   // Check that all of the users of the scalars that we want to vectorize are
1045   // schedulable.
1046   Instruction *VL0 = cast<Instruction>(VL[0]);
1047   BasicBlock *BB = cast<Instruction>(VL0)->getParent();
1048
1049   if (!DT->isReachableFromEntry(BB)) {
1050     // Don't go into unreachable blocks. They may contain instructions with
1051     // dependency cycles which confuse the final scheduling.
1052     DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
1053     newTreeEntry(VL, false);
1054     return;
1055   }
1056
1057   // Check that every instructions appears once in this bundle.
1058   for (unsigned i = 0, e = VL.size(); i < e; ++i)
1059     for (unsigned j = i+1; j < e; ++j)
1060       if (VL[i] == VL[j]) {
1061         DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
1062         newTreeEntry(VL, false);
1063         return;
1064       }
1065
1066   auto &BSRef = BlocksSchedules[BB];
1067   if (!BSRef) {
1068     BSRef = llvm::make_unique<BlockScheduling>(BB);
1069   }
1070   BlockScheduling &BS = *BSRef.get();
1071
1072   if (!BS.tryScheduleBundle(VL, AA)) {
1073     DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
1074     BS.cancelScheduling(VL);
1075     newTreeEntry(VL, false);
1076     return;
1077   }
1078   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
1079
1080   switch (Opcode) {
1081     case Instruction::PHI: {
1082       PHINode *PH = dyn_cast<PHINode>(VL0);
1083
1084       // Check for terminator values (e.g. invoke).
1085       for (unsigned j = 0; j < VL.size(); ++j)
1086         for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1087           TerminatorInst *Term = dyn_cast<TerminatorInst>(
1088               cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
1089           if (Term) {
1090             DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
1091             BS.cancelScheduling(VL);
1092             newTreeEntry(VL, false);
1093             return;
1094           }
1095         }
1096
1097       newTreeEntry(VL, true);
1098       DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
1099
1100       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1101         ValueList Operands;
1102         // Prepare the operand vector.
1103         for (unsigned j = 0; j < VL.size(); ++j)
1104           Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
1105               PH->getIncomingBlock(i)));
1106
1107         buildTree_rec(Operands, Depth + 1);
1108       }
1109       return;
1110     }
1111     case Instruction::ExtractElement: {
1112       bool Reuse = CanReuseExtract(VL);
1113       if (Reuse) {
1114         DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
1115       } else {
1116         BS.cancelScheduling(VL);
1117       }
1118       newTreeEntry(VL, Reuse);
1119       return;
1120     }
1121     case Instruction::Load: {
1122       // Check if the loads are consecutive or of we need to swizzle them.
1123       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
1124         LoadInst *L = cast<LoadInst>(VL[i]);
1125         if (!L->isSimple()) {
1126           BS.cancelScheduling(VL);
1127           newTreeEntry(VL, false);
1128           DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
1129           return;
1130         }
1131         if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
1132           if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) {
1133             ++NumLoadsWantToChangeOrder;
1134           }
1135           BS.cancelScheduling(VL);
1136           newTreeEntry(VL, false);
1137           DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
1138           return;
1139         }
1140       }
1141       ++NumLoadsWantToKeepOrder;
1142       newTreeEntry(VL, true);
1143       DEBUG(dbgs() << "SLP: added a vector of loads.\n");
1144       return;
1145     }
1146     case Instruction::ZExt:
1147     case Instruction::SExt:
1148     case Instruction::FPToUI:
1149     case Instruction::FPToSI:
1150     case Instruction::FPExt:
1151     case Instruction::PtrToInt:
1152     case Instruction::IntToPtr:
1153     case Instruction::SIToFP:
1154     case Instruction::UIToFP:
1155     case Instruction::Trunc:
1156     case Instruction::FPTrunc:
1157     case Instruction::BitCast: {
1158       Type *SrcTy = VL0->getOperand(0)->getType();
1159       for (unsigned i = 0; i < VL.size(); ++i) {
1160         Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
1161         if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) {
1162           BS.cancelScheduling(VL);
1163           newTreeEntry(VL, false);
1164           DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
1165           return;
1166         }
1167       }
1168       newTreeEntry(VL, true);
1169       DEBUG(dbgs() << "SLP: added a vector of casts.\n");
1170
1171       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1172         ValueList Operands;
1173         // Prepare the operand vector.
1174         for (unsigned j = 0; j < VL.size(); ++j)
1175           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1176
1177         buildTree_rec(Operands, Depth+1);
1178       }
1179       return;
1180     }
1181     case Instruction::ICmp:
1182     case Instruction::FCmp: {
1183       // Check that all of the compares have the same predicate.
1184       CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
1185       Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
1186       for (unsigned i = 1, e = VL.size(); i < e; ++i) {
1187         CmpInst *Cmp = cast<CmpInst>(VL[i]);
1188         if (Cmp->getPredicate() != P0 ||
1189             Cmp->getOperand(0)->getType() != ComparedTy) {
1190           BS.cancelScheduling(VL);
1191           newTreeEntry(VL, false);
1192           DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
1193           return;
1194         }
1195       }
1196
1197       newTreeEntry(VL, true);
1198       DEBUG(dbgs() << "SLP: added a vector of compares.\n");
1199
1200       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1201         ValueList Operands;
1202         // Prepare the operand vector.
1203         for (unsigned j = 0; j < VL.size(); ++j)
1204           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1205
1206         buildTree_rec(Operands, Depth+1);
1207       }
1208       return;
1209     }
1210     case Instruction::Select:
1211     case Instruction::Add:
1212     case Instruction::FAdd:
1213     case Instruction::Sub:
1214     case Instruction::FSub:
1215     case Instruction::Mul:
1216     case Instruction::FMul:
1217     case Instruction::UDiv:
1218     case Instruction::SDiv:
1219     case Instruction::FDiv:
1220     case Instruction::URem:
1221     case Instruction::SRem:
1222     case Instruction::FRem:
1223     case Instruction::Shl:
1224     case Instruction::LShr:
1225     case Instruction::AShr:
1226     case Instruction::And:
1227     case Instruction::Or:
1228     case Instruction::Xor: {
1229       newTreeEntry(VL, true);
1230       DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
1231
1232       // Sort operands of the instructions so that each side is more likely to
1233       // have the same opcode.
1234       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
1235         ValueList Left, Right;
1236         reorderInputsAccordingToOpcode(VL, Left, Right);
1237         buildTree_rec(Left, Depth + 1);
1238         buildTree_rec(Right, Depth + 1);
1239         return;
1240       }
1241
1242       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1243         ValueList Operands;
1244         // Prepare the operand vector.
1245         for (unsigned j = 0; j < VL.size(); ++j)
1246           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1247
1248         buildTree_rec(Operands, Depth+1);
1249       }
1250       return;
1251     }
1252     case Instruction::GetElementPtr: {
1253       // We don't combine GEPs with complicated (nested) indexing.
1254       for (unsigned j = 0; j < VL.size(); ++j) {
1255         if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
1256           DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
1257           BS.cancelScheduling(VL);
1258           newTreeEntry(VL, false);
1259           return;
1260         }
1261       }
1262
1263       // We can't combine several GEPs into one vector if they operate on
1264       // different types.
1265       Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
1266       for (unsigned j = 0; j < VL.size(); ++j) {
1267         Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
1268         if (Ty0 != CurTy) {
1269           DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
1270           BS.cancelScheduling(VL);
1271           newTreeEntry(VL, false);
1272           return;
1273         }
1274       }
1275
1276       // We don't combine GEPs with non-constant indexes.
1277       for (unsigned j = 0; j < VL.size(); ++j) {
1278         auto Op = cast<Instruction>(VL[j])->getOperand(1);
1279         if (!isa<ConstantInt>(Op)) {
1280           DEBUG(
1281               dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
1282           BS.cancelScheduling(VL);
1283           newTreeEntry(VL, false);
1284           return;
1285         }
1286       }
1287
1288       newTreeEntry(VL, true);
1289       DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
1290       for (unsigned i = 0, e = 2; i < e; ++i) {
1291         ValueList Operands;
1292         // Prepare the operand vector.
1293         for (unsigned j = 0; j < VL.size(); ++j)
1294           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1295
1296         buildTree_rec(Operands, Depth + 1);
1297       }
1298       return;
1299     }
1300     case Instruction::Store: {
1301       // Check if the stores are consecutive or of we need to swizzle them.
1302       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
1303         if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
1304           BS.cancelScheduling(VL);
1305           newTreeEntry(VL, false);
1306           DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
1307           return;
1308         }
1309
1310       newTreeEntry(VL, true);
1311       DEBUG(dbgs() << "SLP: added a vector of stores.\n");
1312
1313       ValueList Operands;
1314       for (unsigned j = 0; j < VL.size(); ++j)
1315         Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
1316
1317       buildTree_rec(Operands, Depth + 1);
1318       return;
1319     }
1320     case Instruction::Call: {
1321       // Check if the calls are all to the same vectorizable intrinsic.
1322       CallInst *CI = cast<CallInst>(VL[0]);
1323       // Check if this is an Intrinsic call or something that can be
1324       // represented by an intrinsic call
1325       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
1326       if (!isTriviallyVectorizable(ID)) {
1327         BS.cancelScheduling(VL);
1328         newTreeEntry(VL, false);
1329         DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
1330         return;
1331       }
1332       Function *Int = CI->getCalledFunction();
1333       Value *A1I = nullptr;
1334       if (hasVectorInstrinsicScalarOpd(ID, 1))
1335         A1I = CI->getArgOperand(1);
1336       for (unsigned i = 1, e = VL.size(); i != e; ++i) {
1337         CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
1338         if (!CI2 || CI2->getCalledFunction() != Int ||
1339             getIntrinsicIDForCall(CI2, TLI) != ID) {
1340           BS.cancelScheduling(VL);
1341           newTreeEntry(VL, false);
1342           DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
1343                        << "\n");
1344           return;
1345         }
1346         // ctlz,cttz and powi are special intrinsics whose second argument
1347         // should be same in order for them to be vectorized.
1348         if (hasVectorInstrinsicScalarOpd(ID, 1)) {
1349           Value *A1J = CI2->getArgOperand(1);
1350           if (A1I != A1J) {
1351             BS.cancelScheduling(VL);
1352             newTreeEntry(VL, false);
1353             DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
1354                          << " argument "<< A1I<<"!=" << A1J
1355                          << "\n");
1356             return;
1357           }
1358         }
1359       }
1360
1361       newTreeEntry(VL, true);
1362       for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
1363         ValueList Operands;
1364         // Prepare the operand vector.
1365         for (unsigned j = 0; j < VL.size(); ++j) {
1366           CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
1367           Operands.push_back(CI2->getArgOperand(i));
1368         }
1369         buildTree_rec(Operands, Depth + 1);
1370       }
1371       return;
1372     }
1373     case Instruction::ShuffleVector: {
1374       // If this is not an alternate sequence of opcode like add-sub
1375       // then do not vectorize this instruction.
1376       if (!isAltShuffle) {
1377         BS.cancelScheduling(VL);
1378         newTreeEntry(VL, false);
1379         DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
1380         return;
1381       }
1382       newTreeEntry(VL, true);
1383       DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
1384       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
1385         ValueList Operands;
1386         // Prepare the operand vector.
1387         for (unsigned j = 0; j < VL.size(); ++j)
1388           Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
1389
1390         buildTree_rec(Operands, Depth + 1);
1391       }
1392       return;
1393     }
1394     default:
1395       BS.cancelScheduling(VL);
1396       newTreeEntry(VL, false);
1397       DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
1398       return;
1399   }
1400 }
1401
1402 int BoUpSLP::getEntryCost(TreeEntry *E) {
1403   ArrayRef<Value*> VL = E->Scalars;
1404
1405   Type *ScalarTy = VL[0]->getType();
1406   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1407     ScalarTy = SI->getValueOperand()->getType();
1408   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1409
1410   if (E->NeedToGather) {
1411     if (allConstant(VL))
1412       return 0;
1413     if (isSplat(VL)) {
1414       return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
1415     }
1416     return getGatherCost(E->Scalars);
1417   }
1418   unsigned Opcode = getSameOpcode(VL);
1419   assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
1420   Instruction *VL0 = cast<Instruction>(VL[0]);
1421   switch (Opcode) {
1422     case Instruction::PHI: {
1423       return 0;
1424     }
1425     case Instruction::ExtractElement: {
1426       if (CanReuseExtract(VL)) {
1427         int DeadCost = 0;
1428         for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1429           ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
1430           if (E->hasOneUse())
1431             // Take credit for instruction that will become dead.
1432             DeadCost +=
1433                 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
1434         }
1435         return -DeadCost;
1436       }
1437       return getGatherCost(VecTy);
1438     }
1439     case Instruction::ZExt:
1440     case Instruction::SExt:
1441     case Instruction::FPToUI:
1442     case Instruction::FPToSI:
1443     case Instruction::FPExt:
1444     case Instruction::PtrToInt:
1445     case Instruction::IntToPtr:
1446     case Instruction::SIToFP:
1447     case Instruction::UIToFP:
1448     case Instruction::Trunc:
1449     case Instruction::FPTrunc:
1450     case Instruction::BitCast: {
1451       Type *SrcTy = VL0->getOperand(0)->getType();
1452
1453       // Calculate the cost of this instruction.
1454       int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
1455                                                          VL0->getType(), SrcTy);
1456
1457       VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
1458       int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
1459       return VecCost - ScalarCost;
1460     }
1461     case Instruction::FCmp:
1462     case Instruction::ICmp:
1463     case Instruction::Select:
1464     case Instruction::Add:
1465     case Instruction::FAdd:
1466     case Instruction::Sub:
1467     case Instruction::FSub:
1468     case Instruction::Mul:
1469     case Instruction::FMul:
1470     case Instruction::UDiv:
1471     case Instruction::SDiv:
1472     case Instruction::FDiv:
1473     case Instruction::URem:
1474     case Instruction::SRem:
1475     case Instruction::FRem:
1476     case Instruction::Shl:
1477     case Instruction::LShr:
1478     case Instruction::AShr:
1479     case Instruction::And:
1480     case Instruction::Or:
1481     case Instruction::Xor: {
1482       // Calculate the cost of this instruction.
1483       int ScalarCost = 0;
1484       int VecCost = 0;
1485       if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
1486           Opcode == Instruction::Select) {
1487         VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
1488         ScalarCost = VecTy->getNumElements() *
1489         TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
1490         VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
1491       } else {
1492         // Certain instructions can be cheaper to vectorize if they have a
1493         // constant second vector operand.
1494         TargetTransformInfo::OperandValueKind Op1VK =
1495             TargetTransformInfo::OK_AnyValue;
1496         TargetTransformInfo::OperandValueKind Op2VK =
1497             TargetTransformInfo::OK_UniformConstantValue;
1498         TargetTransformInfo::OperandValueProperties Op1VP =
1499             TargetTransformInfo::OP_None;
1500         TargetTransformInfo::OperandValueProperties Op2VP =
1501             TargetTransformInfo::OP_None;
1502
1503         // If all operands are exactly the same ConstantInt then set the
1504         // operand kind to OK_UniformConstantValue.
1505         // If instead not all operands are constants, then set the operand kind
1506         // to OK_AnyValue. If all operands are constants but not the same,
1507         // then set the operand kind to OK_NonUniformConstantValue.
1508         ConstantInt *CInt = nullptr;
1509         for (unsigned i = 0; i < VL.size(); ++i) {
1510           const Instruction *I = cast<Instruction>(VL[i]);
1511           if (!isa<ConstantInt>(I->getOperand(1))) {
1512             Op2VK = TargetTransformInfo::OK_AnyValue;
1513             break;
1514           }
1515           if (i == 0) {
1516             CInt = cast<ConstantInt>(I->getOperand(1));
1517             continue;
1518           }
1519           if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
1520               CInt != cast<ConstantInt>(I->getOperand(1)))
1521             Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
1522         }
1523         // FIXME: Currently cost of model modification for division by
1524         // power of 2 is handled only for X86. Add support for other targets.
1525         if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
1526             CInt->getValue().isPowerOf2())
1527           Op2VP = TargetTransformInfo::OP_PowerOf2;
1528
1529         ScalarCost = VecTy->getNumElements() *
1530                      TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
1531                                                  Op1VP, Op2VP);
1532         VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
1533                                               Op1VP, Op2VP);
1534       }
1535       return VecCost - ScalarCost;
1536     }
1537     case Instruction::GetElementPtr: {
1538       TargetTransformInfo::OperandValueKind Op1VK =
1539           TargetTransformInfo::OK_AnyValue;
1540       TargetTransformInfo::OperandValueKind Op2VK =
1541           TargetTransformInfo::OK_UniformConstantValue;
1542
1543       int ScalarCost =
1544           VecTy->getNumElements() *
1545           TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
1546       int VecCost =
1547           TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
1548
1549       return VecCost - ScalarCost;
1550     }
1551     case Instruction::Load: {
1552       // Cost of wide load - cost of scalar loads.
1553       int ScalarLdCost = VecTy->getNumElements() *
1554       TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
1555       int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
1556       return VecLdCost - ScalarLdCost;
1557     }
1558     case Instruction::Store: {
1559       // We know that we can merge the stores. Calculate the cost.
1560       int ScalarStCost = VecTy->getNumElements() *
1561       TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
1562       int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
1563       return VecStCost - ScalarStCost;
1564     }
1565     case Instruction::Call: {
1566       CallInst *CI = cast<CallInst>(VL0);
1567       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
1568
1569       // Calculate the cost of the scalar and vector calls.
1570       SmallVector<Type*, 4> ScalarTys, VecTys;
1571       for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
1572         ScalarTys.push_back(CI->getArgOperand(op)->getType());
1573         VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
1574                                          VecTy->getNumElements()));
1575       }
1576
1577       int ScalarCallCost = VecTy->getNumElements() *
1578           TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
1579
1580       int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
1581
1582       DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
1583             << " (" << VecCallCost  << "-" <<  ScalarCallCost << ")"
1584             << " for " << *CI << "\n");
1585
1586       return VecCallCost - ScalarCallCost;
1587     }
1588     case Instruction::ShuffleVector: {
1589       TargetTransformInfo::OperandValueKind Op1VK =
1590           TargetTransformInfo::OK_AnyValue;
1591       TargetTransformInfo::OperandValueKind Op2VK =
1592           TargetTransformInfo::OK_AnyValue;
1593       int ScalarCost = 0;
1594       int VecCost = 0;
1595       for (unsigned i = 0; i < VL.size(); ++i) {
1596         Instruction *I = cast<Instruction>(VL[i]);
1597         if (!I)
1598           break;
1599         ScalarCost +=
1600             TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
1601       }
1602       // VecCost is equal to sum of the cost of creating 2 vectors
1603       // and the cost of creating shuffle.
1604       Instruction *I0 = cast<Instruction>(VL[0]);
1605       VecCost =
1606           TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
1607       Instruction *I1 = cast<Instruction>(VL[1]);
1608       VecCost +=
1609           TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
1610       VecCost +=
1611           TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
1612       return VecCost - ScalarCost;
1613     }
1614     default:
1615       llvm_unreachable("Unknown instruction");
1616   }
1617 }
1618
1619 bool BoUpSLP::isFullyVectorizableTinyTree() {
1620   DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
1621         VectorizableTree.size() << " is fully vectorizable .\n");
1622
1623   // We only handle trees of height 2.
1624   if (VectorizableTree.size() != 2)
1625     return false;
1626
1627   // Handle splat stores.
1628   if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars))
1629     return true;
1630
1631   // Gathering cost would be too much for tiny trees.
1632   if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
1633     return false;
1634
1635   return true;
1636 }
1637
1638 int BoUpSLP::getSpillCost() {
1639   // Walk from the bottom of the tree to the top, tracking which values are
1640   // live. When we see a call instruction that is not part of our tree,
1641   // query TTI to see if there is a cost to keeping values live over it
1642   // (for example, if spills and fills are required).
1643   unsigned BundleWidth = VectorizableTree.front().Scalars.size();
1644   int Cost = 0;
1645
1646   SmallPtrSet<Instruction*, 4> LiveValues;
1647   Instruction *PrevInst = nullptr;
1648
1649   for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
1650     Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
1651     if (!Inst)
1652       continue;
1653
1654     if (!PrevInst) {
1655       PrevInst = Inst;
1656       continue;
1657     }
1658
1659     DEBUG(
1660       dbgs() << "SLP: #LV: " << LiveValues.size();
1661       for (auto *X : LiveValues)
1662         dbgs() << " " << X->getName();
1663       dbgs() << ", Looking at ";
1664       Inst->dump();
1665       );
1666
1667     // Update LiveValues.
1668     LiveValues.erase(PrevInst);
1669     for (auto &J : PrevInst->operands()) {
1670       if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
1671         LiveValues.insert(cast<Instruction>(&*J));
1672     }
1673
1674     // Now find the sequence of instructions between PrevInst and Inst.
1675     BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst);
1676     --PrevInstIt;
1677     while (InstIt != PrevInstIt) {
1678       if (PrevInstIt == PrevInst->getParent()->rend()) {
1679         PrevInstIt = Inst->getParent()->rbegin();
1680         continue;
1681       }
1682
1683       if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
1684         SmallVector<Type*, 4> V;
1685         for (auto *II : LiveValues)
1686           V.push_back(VectorType::get(II->getType(), BundleWidth));
1687         Cost += TTI->getCostOfKeepingLiveOverCall(V);
1688       }
1689
1690       ++PrevInstIt;
1691     }
1692
1693     PrevInst = Inst;
1694   }
1695
1696   DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
1697   return Cost;
1698 }
1699
1700 int BoUpSLP::getTreeCost() {
1701   int Cost = 0;
1702   DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
1703         VectorizableTree.size() << ".\n");
1704
1705   // We only vectorize tiny trees if it is fully vectorizable.
1706   if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
1707     if (!VectorizableTree.size()) {
1708       assert(!ExternalUses.size() && "We should not have any external users");
1709     }
1710     return INT_MAX;
1711   }
1712
1713   unsigned BundleWidth = VectorizableTree[0].Scalars.size();
1714
1715   for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) {
1716     int C = getEntryCost(&VectorizableTree[i]);
1717     DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
1718           << *VectorizableTree[i].Scalars[0] << " .\n");
1719     Cost += C;
1720   }
1721
1722   SmallSet<Value *, 16> ExtractCostCalculated;
1723   int ExtractCost = 0;
1724   for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
1725        I != E; ++I) {
1726     // We only add extract cost once for the same scalar.
1727     if (!ExtractCostCalculated.insert(I->Scalar).second)
1728       continue;
1729
1730     // Uses by ephemeral values are free (because the ephemeral value will be
1731     // removed prior to code generation, and so the extraction will be
1732     // removed as well).
1733     if (EphValues.count(I->User))
1734       continue;
1735
1736     VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
1737     ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1738                                            I->Lane);
1739   }
1740
1741   Cost += getSpillCost();
1742
1743   DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
1744   return  Cost + ExtractCost;
1745 }
1746
1747 int BoUpSLP::getGatherCost(Type *Ty) {
1748   int Cost = 0;
1749   for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
1750     Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
1751   return Cost;
1752 }
1753
1754 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
1755   // Find the type of the operands in VL.
1756   Type *ScalarTy = VL[0]->getType();
1757   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1758     ScalarTy = SI->getValueOperand()->getType();
1759   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1760   // Find the cost of inserting/extracting values from the vector.
1761   return getGatherCost(VecTy);
1762 }
1763
1764 Value *BoUpSLP::getPointerOperand(Value *I) {
1765   if (LoadInst *LI = dyn_cast<LoadInst>(I))
1766     return LI->getPointerOperand();
1767   if (StoreInst *SI = dyn_cast<StoreInst>(I))
1768     return SI->getPointerOperand();
1769   return nullptr;
1770 }
1771
1772 unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
1773   if (LoadInst *L = dyn_cast<LoadInst>(I))
1774     return L->getPointerAddressSpace();
1775   if (StoreInst *S = dyn_cast<StoreInst>(I))
1776     return S->getPointerAddressSpace();
1777   return -1;
1778 }
1779
1780 bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
1781   Value *PtrA = getPointerOperand(A);
1782   Value *PtrB = getPointerOperand(B);
1783   unsigned ASA = getAddressSpaceOperand(A);
1784   unsigned ASB = getAddressSpaceOperand(B);
1785
1786   // Check that the address spaces match and that the pointers are valid.
1787   if (!PtrA || !PtrB || (ASA != ASB))
1788     return false;
1789
1790   // Make sure that A and B are different pointers of the same type.
1791   if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
1792     return false;
1793
1794   unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA);
1795   Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
1796   APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty));
1797
1798   APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
1799   PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA);
1800   PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB);
1801
1802   APInt OffsetDelta = OffsetB - OffsetA;
1803
1804   // Check if they are based on the same pointer. That makes the offsets
1805   // sufficient.
1806   if (PtrA == PtrB)
1807     return OffsetDelta == Size;
1808
1809   // Compute the necessary base pointer delta to have the necessary final delta
1810   // equal to the size.
1811   APInt BaseDelta = Size - OffsetDelta;
1812
1813   // Otherwise compute the distance with SCEV between the base pointers.
1814   const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
1815   const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
1816   const SCEV *C = SE->getConstant(BaseDelta);
1817   const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
1818   return X == PtrSCEVB;
1819 }
1820
1821 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
1822   Instruction *VL0 = cast<Instruction>(VL[0]);
1823   BasicBlock::iterator NextInst = VL0;
1824   ++NextInst;
1825   Builder.SetInsertPoint(VL0->getParent(), NextInst);
1826   Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
1827 }
1828
1829 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
1830   Value *Vec = UndefValue::get(Ty);
1831   // Generate the 'InsertElement' instruction.
1832   for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
1833     Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
1834     if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
1835       GatherSeq.insert(Insrt);
1836       CSEBlocks.insert(Insrt->getParent());
1837
1838       // Add to our 'need-to-extract' list.
1839       if (ScalarToTreeEntry.count(VL[i])) {
1840         int Idx = ScalarToTreeEntry[VL[i]];
1841         TreeEntry *E = &VectorizableTree[Idx];
1842         // Find which lane we need to extract.
1843         int FoundLane = -1;
1844         for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
1845           // Is this the lane of the scalar that we are looking for ?
1846           if (E->Scalars[Lane] == VL[i]) {
1847             FoundLane = Lane;
1848             break;
1849           }
1850         }
1851         assert(FoundLane >= 0 && "Could not find the correct lane");
1852         ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
1853       }
1854     }
1855   }
1856
1857   return Vec;
1858 }
1859
1860 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
1861   SmallDenseMap<Value*, int>::const_iterator Entry
1862     = ScalarToTreeEntry.find(VL[0]);
1863   if (Entry != ScalarToTreeEntry.end()) {
1864     int Idx = Entry->second;
1865     const TreeEntry *En = &VectorizableTree[Idx];
1866     if (En->isSame(VL) && En->VectorizedValue)
1867       return En->VectorizedValue;
1868   }
1869   return nullptr;
1870 }
1871
1872 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
1873   if (ScalarToTreeEntry.count(VL[0])) {
1874     int Idx = ScalarToTreeEntry[VL[0]];
1875     TreeEntry *E = &VectorizableTree[Idx];
1876     if (E->isSame(VL))
1877       return vectorizeTree(E);
1878   }
1879
1880   Type *ScalarTy = VL[0]->getType();
1881   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
1882     ScalarTy = SI->getValueOperand()->getType();
1883   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
1884
1885   return Gather(VL, VecTy);
1886 }
1887
1888 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
1889   IRBuilder<>::InsertPointGuard Guard(Builder);
1890
1891   if (E->VectorizedValue) {
1892     DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
1893     return E->VectorizedValue;
1894   }
1895
1896   Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
1897   Type *ScalarTy = VL0->getType();
1898   if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
1899     ScalarTy = SI->getValueOperand()->getType();
1900   VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
1901
1902   if (E->NeedToGather) {
1903     setInsertPointAfterBundle(E->Scalars);
1904     return Gather(E->Scalars, VecTy);
1905   }
1906
1907   unsigned Opcode = getSameOpcode(E->Scalars);
1908
1909   switch (Opcode) {
1910     case Instruction::PHI: {
1911       PHINode *PH = dyn_cast<PHINode>(VL0);
1912       Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
1913       Builder.SetCurrentDebugLocation(PH->getDebugLoc());
1914       PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
1915       E->VectorizedValue = NewPhi;
1916
1917       // PHINodes may have multiple entries from the same block. We want to
1918       // visit every block once.
1919       SmallSet<BasicBlock*, 4> VisitedBBs;
1920
1921       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
1922         ValueList Operands;
1923         BasicBlock *IBB = PH->getIncomingBlock(i);
1924
1925         if (!VisitedBBs.insert(IBB).second) {
1926           NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
1927           continue;
1928         }
1929
1930         // Prepare the operand vector.
1931         for (unsigned j = 0; j < E->Scalars.size(); ++j)
1932           Operands.push_back(cast<PHINode>(E->Scalars[j])->
1933                              getIncomingValueForBlock(IBB));
1934
1935         Builder.SetInsertPoint(IBB->getTerminator());
1936         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
1937         Value *Vec = vectorizeTree(Operands);
1938         NewPhi->addIncoming(Vec, IBB);
1939       }
1940
1941       assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
1942              "Invalid number of incoming values");
1943       return NewPhi;
1944     }
1945
1946     case Instruction::ExtractElement: {
1947       if (CanReuseExtract(E->Scalars)) {
1948         Value *V = VL0->getOperand(0);
1949         E->VectorizedValue = V;
1950         return V;
1951       }
1952       return Gather(E->Scalars, VecTy);
1953     }
1954     case Instruction::ZExt:
1955     case Instruction::SExt:
1956     case Instruction::FPToUI:
1957     case Instruction::FPToSI:
1958     case Instruction::FPExt:
1959     case Instruction::PtrToInt:
1960     case Instruction::IntToPtr:
1961     case Instruction::SIToFP:
1962     case Instruction::UIToFP:
1963     case Instruction::Trunc:
1964     case Instruction::FPTrunc:
1965     case Instruction::BitCast: {
1966       ValueList INVL;
1967       for (int i = 0, e = E->Scalars.size(); i < e; ++i)
1968         INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
1969
1970       setInsertPointAfterBundle(E->Scalars);
1971
1972       Value *InVec = vectorizeTree(INVL);
1973
1974       if (Value *V = alreadyVectorized(E->Scalars))
1975         return V;
1976
1977       CastInst *CI = dyn_cast<CastInst>(VL0);
1978       Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
1979       E->VectorizedValue = V;
1980       ++NumVectorInstructions;
1981       return V;
1982     }
1983     case Instruction::FCmp:
1984     case Instruction::ICmp: {
1985       ValueList LHSV, RHSV;
1986       for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
1987         LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
1988         RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
1989       }
1990
1991       setInsertPointAfterBundle(E->Scalars);
1992
1993       Value *L = vectorizeTree(LHSV);
1994       Value *R = vectorizeTree(RHSV);
1995
1996       if (Value *V = alreadyVectorized(E->Scalars))
1997         return V;
1998
1999       CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
2000       Value *V;
2001       if (Opcode == Instruction::FCmp)
2002         V = Builder.CreateFCmp(P0, L, R);
2003       else
2004         V = Builder.CreateICmp(P0, L, R);
2005
2006       E->VectorizedValue = V;
2007       ++NumVectorInstructions;
2008       return V;
2009     }
2010     case Instruction::Select: {
2011       ValueList TrueVec, FalseVec, CondVec;
2012       for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2013         CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2014         TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
2015         FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
2016       }
2017
2018       setInsertPointAfterBundle(E->Scalars);
2019
2020       Value *Cond = vectorizeTree(CondVec);
2021       Value *True = vectorizeTree(TrueVec);
2022       Value *False = vectorizeTree(FalseVec);
2023
2024       if (Value *V = alreadyVectorized(E->Scalars))
2025         return V;
2026
2027       Value *V = Builder.CreateSelect(Cond, True, False);
2028       E->VectorizedValue = V;
2029       ++NumVectorInstructions;
2030       return V;
2031     }
2032     case Instruction::Add:
2033     case Instruction::FAdd:
2034     case Instruction::Sub:
2035     case Instruction::FSub:
2036     case Instruction::Mul:
2037     case Instruction::FMul:
2038     case Instruction::UDiv:
2039     case Instruction::SDiv:
2040     case Instruction::FDiv:
2041     case Instruction::URem:
2042     case Instruction::SRem:
2043     case Instruction::FRem:
2044     case Instruction::Shl:
2045     case Instruction::LShr:
2046     case Instruction::AShr:
2047     case Instruction::And:
2048     case Instruction::Or:
2049     case Instruction::Xor: {
2050       ValueList LHSVL, RHSVL;
2051       if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
2052         reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
2053       else
2054         for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2055           LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2056           RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
2057         }
2058
2059       setInsertPointAfterBundle(E->Scalars);
2060
2061       Value *LHS = vectorizeTree(LHSVL);
2062       Value *RHS = vectorizeTree(RHSVL);
2063
2064       if (LHS == RHS && isa<Instruction>(LHS)) {
2065         assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
2066       }
2067
2068       if (Value *V = alreadyVectorized(E->Scalars))
2069         return V;
2070
2071       BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
2072       Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
2073       E->VectorizedValue = V;
2074       propagateIRFlags(E->VectorizedValue, E->Scalars);
2075       ++NumVectorInstructions;
2076
2077       if (Instruction *I = dyn_cast<Instruction>(V))
2078         return propagateMetadata(I, E->Scalars);
2079
2080       return V;
2081     }
2082     case Instruction::Load: {
2083       // Loads are inserted at the head of the tree because we don't want to
2084       // sink them all the way down past store instructions.
2085       setInsertPointAfterBundle(E->Scalars);
2086
2087       LoadInst *LI = cast<LoadInst>(VL0);
2088       Type *ScalarLoadTy = LI->getType();
2089       unsigned AS = LI->getPointerAddressSpace();
2090
2091       Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
2092                                             VecTy->getPointerTo(AS));
2093
2094       // The pointer operand uses an in-tree scalar so we add the new BitCast to
2095       // ExternalUses list to make sure that an extract will be generated in the
2096       // future.
2097       if (ScalarToTreeEntry.count(LI->getPointerOperand()))
2098         ExternalUses.push_back(
2099             ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
2100
2101       unsigned Alignment = LI->getAlignment();
2102       LI = Builder.CreateLoad(VecPtr);
2103       if (!Alignment)
2104         Alignment = DL->getABITypeAlignment(ScalarLoadTy);
2105       LI->setAlignment(Alignment);
2106       E->VectorizedValue = LI;
2107       ++NumVectorInstructions;
2108       return propagateMetadata(LI, E->Scalars);
2109     }
2110     case Instruction::Store: {
2111       StoreInst *SI = cast<StoreInst>(VL0);
2112       unsigned Alignment = SI->getAlignment();
2113       unsigned AS = SI->getPointerAddressSpace();
2114
2115       ValueList ValueOp;
2116       for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2117         ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
2118
2119       setInsertPointAfterBundle(E->Scalars);
2120
2121       Value *VecValue = vectorizeTree(ValueOp);
2122       Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
2123                                             VecTy->getPointerTo(AS));
2124       StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
2125
2126       // The pointer operand uses an in-tree scalar so we add the new BitCast to
2127       // ExternalUses list to make sure that an extract will be generated in the
2128       // future.
2129       if (ScalarToTreeEntry.count(SI->getPointerOperand()))
2130         ExternalUses.push_back(
2131             ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
2132
2133       if (!Alignment)
2134         Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
2135       S->setAlignment(Alignment);
2136       E->VectorizedValue = S;
2137       ++NumVectorInstructions;
2138       return propagateMetadata(S, E->Scalars);
2139     }
2140     case Instruction::GetElementPtr: {
2141       setInsertPointAfterBundle(E->Scalars);
2142
2143       ValueList Op0VL;
2144       for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2145         Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0));
2146
2147       Value *Op0 = vectorizeTree(Op0VL);
2148
2149       std::vector<Value *> OpVecs;
2150       for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
2151            ++j) {
2152         ValueList OpVL;
2153         for (int i = 0, e = E->Scalars.size(); i < e; ++i)
2154           OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j));
2155
2156         Value *OpVec = vectorizeTree(OpVL);
2157         OpVecs.push_back(OpVec);
2158       }
2159
2160       Value *V = Builder.CreateGEP(Op0, OpVecs);
2161       E->VectorizedValue = V;
2162       ++NumVectorInstructions;
2163
2164       if (Instruction *I = dyn_cast<Instruction>(V))
2165         return propagateMetadata(I, E->Scalars);
2166
2167       return V;
2168     }
2169     case Instruction::Call: {
2170       CallInst *CI = cast<CallInst>(VL0);
2171       setInsertPointAfterBundle(E->Scalars);
2172       Function *FI;
2173       Intrinsic::ID IID  = Intrinsic::not_intrinsic;
2174       Value *ScalarArg = nullptr;
2175       if (CI && (FI = CI->getCalledFunction())) {
2176         IID = (Intrinsic::ID) FI->getIntrinsicID();
2177       }
2178       std::vector<Value *> OpVecs;
2179       for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
2180         ValueList OpVL;
2181         // ctlz,cttz and powi are special intrinsics whose second argument is
2182         // a scalar. This argument should not be vectorized.
2183         if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
2184           CallInst *CEI = cast<CallInst>(E->Scalars[0]);
2185           ScalarArg = CEI->getArgOperand(j);
2186           OpVecs.push_back(CEI->getArgOperand(j));
2187           continue;
2188         }
2189         for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2190           CallInst *CEI = cast<CallInst>(E->Scalars[i]);
2191           OpVL.push_back(CEI->getArgOperand(j));
2192         }
2193
2194         Value *OpVec = vectorizeTree(OpVL);
2195         DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
2196         OpVecs.push_back(OpVec);
2197       }
2198
2199       Module *M = F->getParent();
2200       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
2201       Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
2202       Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
2203       Value *V = Builder.CreateCall(CF, OpVecs);
2204
2205       // The scalar argument uses an in-tree scalar so we add the new vectorized
2206       // call to ExternalUses list to make sure that an extract will be
2207       // generated in the future.
2208       if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
2209         ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
2210
2211       E->VectorizedValue = V;
2212       ++NumVectorInstructions;
2213       return V;
2214     }
2215     case Instruction::ShuffleVector: {
2216       ValueList LHSVL, RHSVL;
2217       for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
2218         LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
2219         RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
2220       }
2221       setInsertPointAfterBundle(E->Scalars);
2222
2223       Value *LHS = vectorizeTree(LHSVL);
2224       Value *RHS = vectorizeTree(RHSVL);
2225
2226       if (Value *V = alreadyVectorized(E->Scalars))
2227         return V;
2228
2229       // Create a vector of LHS op1 RHS
2230       BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
2231       Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
2232
2233       // Create a vector of LHS op2 RHS
2234       Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
2235       BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
2236       Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
2237
2238       // Create shuffle to take alternate operations from the vector.
2239       // Also, gather up odd and even scalar ops to propagate IR flags to
2240       // each vector operation.
2241       ValueList OddScalars, EvenScalars;
2242       unsigned e = E->Scalars.size();
2243       SmallVector<Constant *, 8> Mask(e);
2244       for (unsigned i = 0; i < e; ++i) {
2245         if (i & 1) {
2246           Mask[i] = Builder.getInt32(e + i);
2247           OddScalars.push_back(E->Scalars[i]);
2248         } else {
2249           Mask[i] = Builder.getInt32(i);
2250           EvenScalars.push_back(E->Scalars[i]);
2251         }
2252       }
2253
2254       Value *ShuffleMask = ConstantVector::get(Mask);
2255       propagateIRFlags(V0, EvenScalars);
2256       propagateIRFlags(V1, OddScalars);
2257
2258       Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
2259       E->VectorizedValue = V;
2260       ++NumVectorInstructions;
2261       if (Instruction *I = dyn_cast<Instruction>(V))
2262         return propagateMetadata(I, E->Scalars);
2263
2264       return V;
2265     }
2266     default:
2267     llvm_unreachable("unknown inst");
2268   }
2269   return nullptr;
2270 }
2271
2272 Value *BoUpSLP::vectorizeTree() {
2273
2274   // All blocks must be scheduled before any instructions are inserted.
2275   for (auto &BSIter : BlocksSchedules) {
2276     scheduleBlock(BSIter.second.get());
2277   }
2278
2279   Builder.SetInsertPoint(F->getEntryBlock().begin());
2280   vectorizeTree(&VectorizableTree[0]);
2281
2282   DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
2283
2284   // Extract all of the elements with the external uses.
2285   for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
2286        it != e; ++it) {
2287     Value *Scalar = it->Scalar;
2288     llvm::User *User = it->User;
2289
2290     // Skip users that we already RAUW. This happens when one instruction
2291     // has multiple uses of the same value.
2292     if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
2293         Scalar->user_end())
2294       continue;
2295     assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
2296
2297     int Idx = ScalarToTreeEntry[Scalar];
2298     TreeEntry *E = &VectorizableTree[Idx];
2299     assert(!E->NeedToGather && "Extracting from a gather list");
2300
2301     Value *Vec = E->VectorizedValue;
2302     assert(Vec && "Can't find vectorizable value");
2303
2304     Value *Lane = Builder.getInt32(it->Lane);
2305     // Generate extracts for out-of-tree users.
2306     // Find the insertion point for the extractelement lane.
2307     if (isa<Instruction>(Vec)){
2308       if (PHINode *PH = dyn_cast<PHINode>(User)) {
2309         for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
2310           if (PH->getIncomingValue(i) == Scalar) {
2311             Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
2312             Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2313             CSEBlocks.insert(PH->getIncomingBlock(i));
2314             PH->setOperand(i, Ex);
2315           }
2316         }
2317       } else {
2318         Builder.SetInsertPoint(cast<Instruction>(User));
2319         Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2320         CSEBlocks.insert(cast<Instruction>(User)->getParent());
2321         User->replaceUsesOfWith(Scalar, Ex);
2322      }
2323     } else {
2324       Builder.SetInsertPoint(F->getEntryBlock().begin());
2325       Value *Ex = Builder.CreateExtractElement(Vec, Lane);
2326       CSEBlocks.insert(&F->getEntryBlock());
2327       User->replaceUsesOfWith(Scalar, Ex);
2328     }
2329
2330     DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
2331   }
2332
2333   // For each vectorized value:
2334   for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
2335     TreeEntry *Entry = &VectorizableTree[EIdx];
2336
2337     // For each lane:
2338     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
2339       Value *Scalar = Entry->Scalars[Lane];
2340       // No need to handle users of gathered values.
2341       if (Entry->NeedToGather)
2342         continue;
2343
2344       assert(Entry->VectorizedValue && "Can't find vectorizable value");
2345
2346       Type *Ty = Scalar->getType();
2347       if (!Ty->isVoidTy()) {
2348 #ifndef NDEBUG
2349         for (User *U : Scalar->users()) {
2350           DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
2351
2352           assert((ScalarToTreeEntry.count(U) ||
2353                   // It is legal to replace users in the ignorelist by undef.
2354                   (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
2355                    UserIgnoreList.end())) &&
2356                  "Replacing out-of-tree value with undef");
2357         }
2358 #endif
2359         Value *Undef = UndefValue::get(Ty);
2360         Scalar->replaceAllUsesWith(Undef);
2361       }
2362       DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
2363       cast<Instruction>(Scalar)->eraseFromParent();
2364     }
2365   }
2366
2367   Builder.ClearInsertionPoint();
2368
2369   return VectorizableTree[0].VectorizedValue;
2370 }
2371
2372 void BoUpSLP::optimizeGatherSequence() {
2373   DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
2374         << " gather sequences instructions.\n");
2375   // LICM InsertElementInst sequences.
2376   for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
2377        e = GatherSeq.end(); it != e; ++it) {
2378     InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
2379
2380     if (!Insert)
2381       continue;
2382
2383     // Check if this block is inside a loop.
2384     Loop *L = LI->getLoopFor(Insert->getParent());
2385     if (!L)
2386       continue;
2387
2388     // Check if it has a preheader.
2389     BasicBlock *PreHeader = L->getLoopPreheader();
2390     if (!PreHeader)
2391       continue;
2392
2393     // If the vector or the element that we insert into it are
2394     // instructions that are defined in this basic block then we can't
2395     // hoist this instruction.
2396     Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
2397     Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
2398     if (CurrVec && L->contains(CurrVec))
2399       continue;
2400     if (NewElem && L->contains(NewElem))
2401       continue;
2402
2403     // We can hoist this instruction. Move it to the pre-header.
2404     Insert->moveBefore(PreHeader->getTerminator());
2405   }
2406
2407   // Make a list of all reachable blocks in our CSE queue.
2408   SmallVector<const DomTreeNode *, 8> CSEWorkList;
2409   CSEWorkList.reserve(CSEBlocks.size());
2410   for (BasicBlock *BB : CSEBlocks)
2411     if (DomTreeNode *N = DT->getNode(BB)) {
2412       assert(DT->isReachableFromEntry(N));
2413       CSEWorkList.push_back(N);
2414     }
2415
2416   // Sort blocks by domination. This ensures we visit a block after all blocks
2417   // dominating it are visited.
2418   std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
2419                    [this](const DomTreeNode *A, const DomTreeNode *B) {
2420     return DT->properlyDominates(A, B);
2421   });
2422
2423   // Perform O(N^2) search over the gather sequences and merge identical
2424   // instructions. TODO: We can further optimize this scan if we split the
2425   // instructions into different buckets based on the insert lane.
2426   SmallVector<Instruction *, 16> Visited;
2427   for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
2428     assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
2429            "Worklist not sorted properly!");
2430     BasicBlock *BB = (*I)->getBlock();
2431     // For all instructions in blocks containing gather sequences:
2432     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
2433       Instruction *In = it++;
2434       if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
2435         continue;
2436
2437       // Check if we can replace this instruction with any of the
2438       // visited instructions.
2439       for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
2440                                                     ve = Visited.end();
2441            v != ve; ++v) {
2442         if (In->isIdenticalTo(*v) &&
2443             DT->dominates((*v)->getParent(), In->getParent())) {
2444           In->replaceAllUsesWith(*v);
2445           In->eraseFromParent();
2446           In = nullptr;
2447           break;
2448         }
2449       }
2450       if (In) {
2451         assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
2452         Visited.push_back(In);
2453       }
2454     }
2455   }
2456   CSEBlocks.clear();
2457   GatherSeq.clear();
2458 }
2459
2460 // Groups the instructions to a bundle (which is then a single scheduling entity)
2461 // and schedules instructions until the bundle gets ready.
2462 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
2463                                                  AliasAnalysis *AA) {
2464   if (isa<PHINode>(VL[0]))
2465     return true;
2466
2467   // Initialize the instruction bundle.
2468   Instruction *OldScheduleEnd = ScheduleEnd;
2469   ScheduleData *PrevInBundle = nullptr;
2470   ScheduleData *Bundle = nullptr;
2471   bool ReSchedule = false;
2472   DEBUG(dbgs() << "SLP:  bundle: " << *VL[0] << "\n");
2473   for (Value *V : VL) {
2474     extendSchedulingRegion(V);
2475     ScheduleData *BundleMember = getScheduleData(V);
2476     assert(BundleMember &&
2477            "no ScheduleData for bundle member (maybe not in same basic block)");
2478     if (BundleMember->IsScheduled) {
2479       // A bundle member was scheduled as single instruction before and now
2480       // needs to be scheduled as part of the bundle. We just get rid of the
2481       // existing schedule.
2482       DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
2483                    << " was already scheduled\n");
2484       ReSchedule = true;
2485     }
2486     assert(BundleMember->isSchedulingEntity() &&
2487            "bundle member already part of other bundle");
2488     if (PrevInBundle) {
2489       PrevInBundle->NextInBundle = BundleMember;
2490     } else {
2491       Bundle = BundleMember;
2492     }
2493     BundleMember->UnscheduledDepsInBundle = 0;
2494     Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
2495
2496     // Group the instructions to a bundle.
2497     BundleMember->FirstInBundle = Bundle;
2498     PrevInBundle = BundleMember;
2499   }
2500   if (ScheduleEnd != OldScheduleEnd) {
2501     // The scheduling region got new instructions at the lower end (or it is a
2502     // new region for the first bundle). This makes it necessary to
2503     // recalculate all dependencies.
2504     // It is seldom that this needs to be done a second time after adding the
2505     // initial bundle to the region.
2506     for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2507       ScheduleData *SD = getScheduleData(I);
2508       SD->clearDependencies();
2509     }
2510     ReSchedule = true;
2511   }
2512   if (ReSchedule) {
2513     resetSchedule();
2514     initialFillReadyList(ReadyInsts);
2515   }
2516
2517   DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
2518                << BB->getName() << "\n");
2519
2520   calculateDependencies(Bundle, true, AA);
2521
2522   // Now try to schedule the new bundle. As soon as the bundle is "ready" it
2523   // means that there are no cyclic dependencies and we can schedule it.
2524   // Note that's important that we don't "schedule" the bundle yet (see
2525   // cancelScheduling).
2526   while (!Bundle->isReady() && !ReadyInsts.empty()) {
2527
2528     ScheduleData *pickedSD = ReadyInsts.back();
2529     ReadyInsts.pop_back();
2530
2531     if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
2532       schedule(pickedSD, ReadyInsts);
2533     }
2534   }
2535   return Bundle->isReady();
2536 }
2537
2538 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
2539   if (isa<PHINode>(VL[0]))
2540     return;
2541
2542   ScheduleData *Bundle = getScheduleData(VL[0]);
2543   DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
2544   assert(!Bundle->IsScheduled &&
2545          "Can't cancel bundle which is already scheduled");
2546   assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
2547          "tried to unbundle something which is not a bundle");
2548
2549   // Un-bundle: make single instructions out of the bundle.
2550   ScheduleData *BundleMember = Bundle;
2551   while (BundleMember) {
2552     assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
2553     BundleMember->FirstInBundle = BundleMember;
2554     ScheduleData *Next = BundleMember->NextInBundle;
2555     BundleMember->NextInBundle = nullptr;
2556     BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
2557     if (BundleMember->UnscheduledDepsInBundle == 0) {
2558       ReadyInsts.insert(BundleMember);
2559     }
2560     BundleMember = Next;
2561   }
2562 }
2563
2564 void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
2565   if (getScheduleData(V))
2566     return;
2567   Instruction *I = dyn_cast<Instruction>(V);
2568   assert(I && "bundle member must be an instruction");
2569   assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
2570   if (!ScheduleStart) {
2571     // It's the first instruction in the new region.
2572     initScheduleData(I, I->getNextNode(), nullptr, nullptr);
2573     ScheduleStart = I;
2574     ScheduleEnd = I->getNextNode();
2575     assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2576     DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
2577     return;
2578   }
2579   // Search up and down at the same time, because we don't know if the new
2580   // instruction is above or below the existing scheduling region.
2581   BasicBlock::reverse_iterator UpIter(ScheduleStart);
2582   BasicBlock::reverse_iterator UpperEnd = BB->rend();
2583   BasicBlock::iterator DownIter(ScheduleEnd);
2584   BasicBlock::iterator LowerEnd = BB->end();
2585   for (;;) {
2586     if (UpIter != UpperEnd) {
2587       if (&*UpIter == I) {
2588         initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
2589         ScheduleStart = I;
2590         DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I << "\n");
2591         return;
2592       }
2593       UpIter++;
2594     }
2595     if (DownIter != LowerEnd) {
2596       if (&*DownIter == I) {
2597         initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
2598                          nullptr);
2599         ScheduleEnd = I->getNextNode();
2600         assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
2601         DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
2602         return;
2603       }
2604       DownIter++;
2605     }
2606     assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
2607            "instruction not found in block");
2608   }
2609 }
2610
2611 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
2612                                                 Instruction *ToI,
2613                                                 ScheduleData *PrevLoadStore,
2614                                                 ScheduleData *NextLoadStore) {
2615   ScheduleData *CurrentLoadStore = PrevLoadStore;
2616   for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
2617     ScheduleData *SD = ScheduleDataMap[I];
2618     if (!SD) {
2619       // Allocate a new ScheduleData for the instruction.
2620       if (ChunkPos >= ChunkSize) {
2621         ScheduleDataChunks.push_back(
2622             llvm::make_unique<ScheduleData[]>(ChunkSize));
2623         ChunkPos = 0;
2624       }
2625       SD = &(ScheduleDataChunks.back()[ChunkPos++]);
2626       ScheduleDataMap[I] = SD;
2627       SD->Inst = I;
2628     }
2629     assert(!isInSchedulingRegion(SD) &&
2630            "new ScheduleData already in scheduling region");
2631     SD->init(SchedulingRegionID);
2632
2633     if (I->mayReadOrWriteMemory()) {
2634       // Update the linked list of memory accessing instructions.
2635       if (CurrentLoadStore) {
2636         CurrentLoadStore->NextLoadStore = SD;
2637       } else {
2638         FirstLoadStoreInRegion = SD;
2639       }
2640       CurrentLoadStore = SD;
2641     }
2642   }
2643   if (NextLoadStore) {
2644     if (CurrentLoadStore)
2645       CurrentLoadStore->NextLoadStore = NextLoadStore;
2646   } else {
2647     LastLoadStoreInRegion = CurrentLoadStore;
2648   }
2649 }
2650
2651 /// \returns the AA location that is being access by the instruction.
2652 static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) {
2653   if (StoreInst *SI = dyn_cast<StoreInst>(I))
2654     return AA->getLocation(SI);
2655   if (LoadInst *LI = dyn_cast<LoadInst>(I))
2656     return AA->getLocation(LI);
2657   return AliasAnalysis::Location();
2658 }
2659
2660 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
2661                                                      bool InsertInReadyList,
2662                                                      AliasAnalysis *AA) {
2663   assert(SD->isSchedulingEntity());
2664
2665   SmallVector<ScheduleData *, 10> WorkList;
2666   WorkList.push_back(SD);
2667
2668   while (!WorkList.empty()) {
2669     ScheduleData *SD = WorkList.back();
2670     WorkList.pop_back();
2671
2672     ScheduleData *BundleMember = SD;
2673     while (BundleMember) {
2674       assert(isInSchedulingRegion(BundleMember));
2675       if (!BundleMember->hasValidDependencies()) {
2676
2677         DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember << "\n");
2678         BundleMember->Dependencies = 0;
2679         BundleMember->resetUnscheduledDeps();
2680
2681         // Handle def-use chain dependencies.
2682         for (User *U : BundleMember->Inst->users()) {
2683           if (isa<Instruction>(U)) {
2684             ScheduleData *UseSD = getScheduleData(U);
2685             if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
2686               BundleMember->Dependencies++;
2687               ScheduleData *DestBundle = UseSD->FirstInBundle;
2688               if (!DestBundle->IsScheduled) {
2689                 BundleMember->incrementUnscheduledDeps(1);
2690               }
2691               if (!DestBundle->hasValidDependencies()) {
2692                 WorkList.push_back(DestBundle);
2693               }
2694             }
2695           } else {
2696             // I'm not sure if this can ever happen. But we need to be safe.
2697             // This lets the instruction/bundle never be scheduled and eventally
2698             // disable vectorization.
2699             BundleMember->Dependencies++;
2700             BundleMember->incrementUnscheduledDeps(1);
2701           }
2702         }
2703
2704         // Handle the memory dependencies.
2705         ScheduleData *DepDest = BundleMember->NextLoadStore;
2706         if (DepDest) {
2707           AliasAnalysis::Location SrcLoc = getLocation(BundleMember->Inst, AA);
2708           bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
2709
2710           while (DepDest) {
2711             assert(isInSchedulingRegion(DepDest));
2712             if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) {
2713               AliasAnalysis::Location DstLoc = getLocation(DepDest->Inst, AA);
2714               if (!SrcLoc.Ptr || !DstLoc.Ptr || AA->alias(SrcLoc, DstLoc)) {
2715                 DepDest->MemoryDependencies.push_back(BundleMember);
2716                 BundleMember->Dependencies++;
2717                 ScheduleData *DestBundle = DepDest->FirstInBundle;
2718                 if (!DestBundle->IsScheduled) {
2719                   BundleMember->incrementUnscheduledDeps(1);
2720                 }
2721                 if (!DestBundle->hasValidDependencies()) {
2722                   WorkList.push_back(DestBundle);
2723                 }
2724               }
2725             }
2726             DepDest = DepDest->NextLoadStore;
2727           }
2728         }
2729       }
2730       BundleMember = BundleMember->NextInBundle;
2731     }
2732     if (InsertInReadyList && SD->isReady()) {
2733       ReadyInsts.push_back(SD);
2734       DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst << "\n");
2735     }
2736   }
2737 }
2738
2739 void BoUpSLP::BlockScheduling::resetSchedule() {
2740   assert(ScheduleStart &&
2741          "tried to reset schedule on block which has not been scheduled");
2742   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
2743     ScheduleData *SD = getScheduleData(I);
2744     assert(isInSchedulingRegion(SD));
2745     SD->IsScheduled = false;
2746     SD->resetUnscheduledDeps();
2747   }
2748   ReadyInsts.clear();
2749 }
2750
2751 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
2752
2753   if (!BS->ScheduleStart)
2754     return;
2755
2756   DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
2757
2758   BS->resetSchedule();
2759
2760   // For the real scheduling we use a more sophisticated ready-list: it is
2761   // sorted by the original instruction location. This lets the final schedule
2762   // be as  close as possible to the original instruction order.
2763   struct ScheduleDataCompare {
2764     bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
2765       return SD2->SchedulingPriority < SD1->SchedulingPriority;
2766     }
2767   };
2768   std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
2769
2770   // Ensure that all depencency data is updated and fill the ready-list with
2771   // initial instructions.
2772   int Idx = 0;
2773   int NumToSchedule = 0;
2774   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
2775        I = I->getNextNode()) {
2776     ScheduleData *SD = BS->getScheduleData(I);
2777     assert(
2778         SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
2779         "scheduler and vectorizer have different opinion on what is a bundle");
2780     SD->FirstInBundle->SchedulingPriority = Idx++;
2781     if (SD->isSchedulingEntity()) {
2782       BS->calculateDependencies(SD, false, AA);
2783       NumToSchedule++;
2784     }
2785   }
2786   BS->initialFillReadyList(ReadyInsts);
2787
2788   Instruction *LastScheduledInst = BS->ScheduleEnd;
2789
2790   // Do the "real" scheduling.
2791   while (!ReadyInsts.empty()) {
2792     ScheduleData *picked = *ReadyInsts.begin();
2793     ReadyInsts.erase(ReadyInsts.begin());
2794
2795     // Move the scheduled instruction(s) to their dedicated places, if not
2796     // there yet.
2797     ScheduleData *BundleMember = picked;
2798     while (BundleMember) {
2799       Instruction *pickedInst = BundleMember->Inst;
2800       if (LastScheduledInst->getNextNode() != pickedInst) {
2801         BS->BB->getInstList().remove(pickedInst);
2802         BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
2803       }
2804       LastScheduledInst = pickedInst;
2805       BundleMember = BundleMember->NextInBundle;
2806     }
2807
2808     BS->schedule(picked, ReadyInsts);
2809     NumToSchedule--;
2810   }
2811   assert(NumToSchedule == 0 && "could not schedule all instructions");
2812
2813   // Avoid duplicate scheduling of the block.
2814   BS->ScheduleStart = nullptr;
2815 }
2816
2817 /// The SLPVectorizer Pass.
2818 struct SLPVectorizer : public FunctionPass {
2819   typedef SmallVector<StoreInst *, 8> StoreList;
2820   typedef MapVector<Value *, StoreList> StoreListMap;
2821
2822   /// Pass identification, replacement for typeid
2823   static char ID;
2824
2825   explicit SLPVectorizer() : FunctionPass(ID) {
2826     initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
2827   }
2828
2829   ScalarEvolution *SE;
2830   const DataLayout *DL;
2831   TargetTransformInfo *TTI;
2832   TargetLibraryInfo *TLI;
2833   AliasAnalysis *AA;
2834   LoopInfo *LI;
2835   DominatorTree *DT;
2836   AssumptionTracker *AT;
2837
2838   bool runOnFunction(Function &F) override {
2839     if (skipOptnoneFunction(F))
2840       return false;
2841
2842     SE = &getAnalysis<ScalarEvolution>();
2843     DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
2844     DL = DLP ? &DLP->getDataLayout() : nullptr;
2845     TTI = &getAnalysis<TargetTransformInfo>();
2846     TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
2847     AA = &getAnalysis<AliasAnalysis>();
2848     LI = &getAnalysis<LoopInfo>();
2849     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2850     AT = &getAnalysis<AssumptionTracker>();
2851
2852     StoreRefs.clear();
2853     bool Changed = false;
2854
2855     // If the target claims to have no vector registers don't attempt
2856     // vectorization.
2857     if (!TTI->getNumberOfRegisters(true))
2858       return false;
2859
2860     // Must have DataLayout. We can't require it because some tests run w/o
2861     // triple.
2862     if (!DL)
2863       return false;
2864
2865     // Don't vectorize when the attribute NoImplicitFloat is used.
2866     if (F.hasFnAttribute(Attribute::NoImplicitFloat))
2867       return false;
2868
2869     DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
2870
2871     // Use the bottom up slp vectorizer to construct chains that start with
2872     // store instructions.
2873     BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT, AT);
2874
2875     // Scan the blocks in the function in post order.
2876     for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
2877          e = po_end(&F.getEntryBlock()); it != e; ++it) {
2878       BasicBlock *BB = *it;
2879       // Vectorize trees that end at stores.
2880       if (unsigned count = collectStores(BB, R)) {
2881         (void)count;
2882         DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
2883         Changed |= vectorizeStoreChains(R);
2884       }
2885
2886       // Vectorize trees that end at reductions.
2887       Changed |= vectorizeChainsInBlock(BB, R);
2888     }
2889
2890     if (Changed) {
2891       R.optimizeGatherSequence();
2892       DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
2893       DEBUG(verifyFunction(F));
2894     }
2895     return Changed;
2896   }
2897
2898   void getAnalysisUsage(AnalysisUsage &AU) const override {
2899     FunctionPass::getAnalysisUsage(AU);
2900     AU.addRequired<AssumptionTracker>();
2901     AU.addRequired<ScalarEvolution>();
2902     AU.addRequired<AliasAnalysis>();
2903     AU.addRequired<TargetTransformInfo>();
2904     AU.addRequired<LoopInfo>();
2905     AU.addRequired<DominatorTreeWrapperPass>();
2906     AU.addPreserved<LoopInfo>();
2907     AU.addPreserved<DominatorTreeWrapperPass>();
2908     AU.setPreservesCFG();
2909   }
2910
2911 private:
2912
2913   /// \brief Collect memory references and sort them according to their base
2914   /// object. We sort the stores to their base objects to reduce the cost of the
2915   /// quadratic search on the stores. TODO: We can further reduce this cost
2916   /// if we flush the chain creation every time we run into a memory barrier.
2917   unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
2918
2919   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
2920   bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
2921
2922   /// \brief Try to vectorize a list of operands.
2923   /// \@param BuildVector A list of users to ignore for the purpose of
2924   ///                     scheduling and that don't need extracting.
2925   /// \returns true if a value was vectorized.
2926   bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
2927                           ArrayRef<Value *> BuildVector = None,
2928                           bool allowReorder = false);
2929
2930   /// \brief Try to vectorize a chain that may start at the operands of \V;
2931   bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
2932
2933   /// \brief Vectorize the stores that were collected in StoreRefs.
2934   bool vectorizeStoreChains(BoUpSLP &R);
2935
2936   /// \brief Scan the basic block and look for patterns that are likely to start
2937   /// a vectorization chain.
2938   bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
2939
2940   bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
2941                            BoUpSLP &R);
2942
2943   bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
2944                        BoUpSLP &R);
2945 private:
2946   StoreListMap StoreRefs;
2947 };
2948
2949 /// \brief Check that the Values in the slice in VL array are still existent in
2950 /// the WeakVH array.
2951 /// Vectorization of part of the VL array may cause later values in the VL array
2952 /// to become invalid. We track when this has happened in the WeakVH array.
2953 static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL,
2954                                SmallVectorImpl<WeakVH> &VH,
2955                                unsigned SliceBegin,
2956                                unsigned SliceSize) {
2957   for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i)
2958     if (VH[i] != VL[i])
2959       return true;
2960
2961   return false;
2962 }
2963
2964 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
2965                                           int CostThreshold, BoUpSLP &R) {
2966   unsigned ChainLen = Chain.size();
2967   DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
2968         << "\n");
2969   Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
2970   unsigned Sz = DL->getTypeSizeInBits(StoreTy);
2971   unsigned VF = MinVecRegSize / Sz;
2972
2973   if (!isPowerOf2_32(Sz) || VF < 2)
2974     return false;
2975
2976   // Keep track of values that were deleted by vectorizing in the loop below.
2977   SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
2978
2979   bool Changed = false;
2980   // Look for profitable vectorizable trees at all offsets, starting at zero.
2981   for (unsigned i = 0, e = ChainLen; i < e; ++i) {
2982     if (i + VF > e)
2983       break;
2984
2985     // Check that a previous iteration of this loop did not delete the Value.
2986     if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
2987       continue;
2988
2989     DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
2990           << "\n");
2991     ArrayRef<Value *> Operands = Chain.slice(i, VF);
2992
2993     R.buildTree(Operands);
2994
2995     int Cost = R.getTreeCost();
2996
2997     DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
2998     if (Cost < CostThreshold) {
2999       DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
3000       R.vectorizeTree();
3001
3002       // Move to the next bundle.
3003       i += VF - 1;
3004       Changed = true;
3005     }
3006   }
3007
3008   return Changed;
3009 }
3010
3011 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
3012                                     int costThreshold, BoUpSLP &R) {
3013   SetVector<Value *> Heads, Tails;
3014   SmallDenseMap<Value *, Value *> ConsecutiveChain;
3015
3016   // We may run into multiple chains that merge into a single chain. We mark the
3017   // stores that we vectorized so that we don't visit the same store twice.
3018   BoUpSLP::ValueSet VectorizedStores;
3019   bool Changed = false;
3020
3021   // Do a quadratic search on all of the given stores and find
3022   // all of the pairs of stores that follow each other.
3023   for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
3024     for (unsigned j = 0; j < e; ++j) {
3025       if (i == j)
3026         continue;
3027
3028       if (R.isConsecutiveAccess(Stores[i], Stores[j])) {
3029         Tails.insert(Stores[j]);
3030         Heads.insert(Stores[i]);
3031         ConsecutiveChain[Stores[i]] = Stores[j];
3032       }
3033     }
3034   }
3035
3036   // For stores that start but don't end a link in the chain:
3037   for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
3038        it != e; ++it) {
3039     if (Tails.count(*it))
3040       continue;
3041
3042     // We found a store instr that starts a chain. Now follow the chain and try
3043     // to vectorize it.
3044     BoUpSLP::ValueList Operands;
3045     Value *I = *it;
3046     // Collect the chain into a list.
3047     while (Tails.count(I) || Heads.count(I)) {
3048       if (VectorizedStores.count(I))
3049         break;
3050       Operands.push_back(I);
3051       // Move to the next value in the chain.
3052       I = ConsecutiveChain[I];
3053     }
3054
3055     bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
3056
3057     // Mark the vectorized stores so that we don't vectorize them again.
3058     if (Vectorized)
3059       VectorizedStores.insert(Operands.begin(), Operands.end());
3060     Changed |= Vectorized;
3061   }
3062
3063   return Changed;
3064 }
3065
3066
3067 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
3068   unsigned count = 0;
3069   StoreRefs.clear();
3070   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
3071     StoreInst *SI = dyn_cast<StoreInst>(it);
3072     if (!SI)
3073       continue;
3074
3075     // Don't touch volatile stores.
3076     if (!SI->isSimple())
3077       continue;
3078
3079     // Check that the pointer points to scalars.
3080     Type *Ty = SI->getValueOperand()->getType();
3081     if (Ty->isAggregateType() || Ty->isVectorTy())
3082       continue;
3083
3084     // Find the base pointer.
3085     Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
3086
3087     // Save the store locations.
3088     StoreRefs[Ptr].push_back(SI);
3089     count++;
3090   }
3091   return count;
3092 }
3093
3094 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
3095   if (!A || !B)
3096     return false;
3097   Value *VL[] = { A, B };
3098   return tryToVectorizeList(VL, R, None, true);
3099 }
3100
3101 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
3102                                        ArrayRef<Value *> BuildVector,
3103                                        bool allowReorder) {
3104   if (VL.size() < 2)
3105     return false;
3106
3107   DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
3108
3109   // Check that all of the parts are scalar instructions of the same type.
3110   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
3111   if (!I0)
3112     return false;
3113
3114   unsigned Opcode0 = I0->getOpcode();
3115
3116   Type *Ty0 = I0->getType();
3117   unsigned Sz = DL->getTypeSizeInBits(Ty0);
3118   unsigned VF = MinVecRegSize / Sz;
3119
3120   for (int i = 0, e = VL.size(); i < e; ++i) {
3121     Type *Ty = VL[i]->getType();
3122     if (Ty->isAggregateType() || Ty->isVectorTy())
3123       return false;
3124     Instruction *Inst = dyn_cast<Instruction>(VL[i]);
3125     if (!Inst || Inst->getOpcode() != Opcode0)
3126       return false;
3127   }
3128
3129   bool Changed = false;
3130
3131   // Keep track of values that were deleted by vectorizing in the loop below.
3132   SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
3133
3134   for (unsigned i = 0, e = VL.size(); i < e; ++i) {
3135     unsigned OpsWidth = 0;
3136
3137     if (i + VF > e)
3138       OpsWidth = e - i;
3139     else
3140       OpsWidth = VF;
3141
3142     if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
3143       break;
3144
3145     // Check that a previous iteration of this loop did not delete the Value.
3146     if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
3147       continue;
3148
3149     DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
3150                  << "\n");
3151     ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
3152
3153     ArrayRef<Value *> BuildVectorSlice;
3154     if (!BuildVector.empty())
3155       BuildVectorSlice = BuildVector.slice(i, OpsWidth);
3156
3157     R.buildTree(Ops, BuildVectorSlice);
3158     // TODO: check if we can allow reordering also for other cases than
3159     // tryToVectorizePair()
3160     if (allowReorder && R.shouldReorder()) {
3161       assert(Ops.size() == 2);
3162       assert(BuildVectorSlice.empty());
3163       Value *ReorderedOps[] = { Ops[1], Ops[0] };
3164       R.buildTree(ReorderedOps, None);
3165     }
3166     int Cost = R.getTreeCost();
3167
3168     if (Cost < -SLPCostThreshold) {
3169       DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
3170       Value *VectorizedRoot = R.vectorizeTree();
3171
3172       // Reconstruct the build vector by extracting the vectorized root. This
3173       // way we handle the case where some elements of the vector are undefined.
3174       //  (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
3175       if (!BuildVectorSlice.empty()) {
3176         // The insert point is the last build vector instruction. The vectorized
3177         // root will precede it. This guarantees that we get an instruction. The
3178         // vectorized tree could have been constant folded.
3179         Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
3180         unsigned VecIdx = 0;
3181         for (auto &V : BuildVectorSlice) {
3182           IRBuilder<true, NoFolder> Builder(
3183               ++BasicBlock::iterator(InsertAfter));
3184           InsertElementInst *IE = cast<InsertElementInst>(V);
3185           Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
3186               VectorizedRoot, Builder.getInt32(VecIdx++)));
3187           IE->setOperand(1, Extract);
3188           IE->removeFromParent();
3189           IE->insertAfter(Extract);
3190           InsertAfter = IE;
3191         }
3192       }
3193       // Move to the next bundle.
3194       i += VF - 1;
3195       Changed = true;
3196     }
3197   }
3198
3199   return Changed;
3200 }
3201
3202 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
3203   if (!V)
3204     return false;
3205
3206   // Try to vectorize V.
3207   if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
3208     return true;
3209
3210   BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
3211   BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
3212   // Try to skip B.
3213   if (B && B->hasOneUse()) {
3214     BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
3215     BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
3216     if (tryToVectorizePair(A, B0, R)) {
3217       return true;
3218     }
3219     if (tryToVectorizePair(A, B1, R)) {
3220       return true;
3221     }
3222   }
3223
3224   // Try to skip A.
3225   if (A && A->hasOneUse()) {
3226     BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
3227     BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
3228     if (tryToVectorizePair(A0, B, R)) {
3229       return true;
3230     }
3231     if (tryToVectorizePair(A1, B, R)) {
3232       return true;
3233     }
3234   }
3235   return 0;
3236 }
3237
3238 /// \brief Generate a shuffle mask to be used in a reduction tree.
3239 ///
3240 /// \param VecLen The length of the vector to be reduced.
3241 /// \param NumEltsToRdx The number of elements that should be reduced in the
3242 ///        vector.
3243 /// \param IsPairwise Whether the reduction is a pairwise or splitting
3244 ///        reduction. A pairwise reduction will generate a mask of
3245 ///        <0,2,...> or <1,3,..> while a splitting reduction will generate
3246 ///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
3247 /// \param IsLeft True will generate a mask of even elements, odd otherwise.
3248 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
3249                                    bool IsPairwise, bool IsLeft,
3250                                    IRBuilder<> &Builder) {
3251   assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
3252
3253   SmallVector<Constant *, 32> ShuffleMask(
3254       VecLen, UndefValue::get(Builder.getInt32Ty()));
3255
3256   if (IsPairwise)
3257     // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
3258     for (unsigned i = 0; i != NumEltsToRdx; ++i)
3259       ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
3260   else
3261     // Move the upper half of the vector to the lower half.
3262     for (unsigned i = 0; i != NumEltsToRdx; ++i)
3263       ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
3264
3265   return ConstantVector::get(ShuffleMask);
3266 }
3267
3268
3269 /// Model horizontal reductions.
3270 ///
3271 /// A horizontal reduction is a tree of reduction operations (currently add and
3272 /// fadd) that has operations that can be put into a vector as its leaf.
3273 /// For example, this tree:
3274 ///
3275 /// mul mul mul mul
3276 ///  \  /    \  /
3277 ///   +       +
3278 ///    \     /
3279 ///       +
3280 /// This tree has "mul" as its reduced values and "+" as its reduction
3281 /// operations. A reduction might be feeding into a store or a binary operation
3282 /// feeding a phi.
3283 ///    ...
3284 ///    \  /
3285 ///     +
3286 ///     |
3287 ///  phi +=
3288 ///
3289 ///  Or:
3290 ///    ...
3291 ///    \  /
3292 ///     +
3293 ///     |
3294 ///   *p =
3295 ///
3296 class HorizontalReduction {
3297   SmallVector<Value *, 16> ReductionOps;
3298   SmallVector<Value *, 32> ReducedVals;
3299
3300   BinaryOperator *ReductionRoot;
3301   PHINode *ReductionPHI;
3302
3303   /// The opcode of the reduction.
3304   unsigned ReductionOpcode;
3305   /// The opcode of the values we perform a reduction on.
3306   unsigned ReducedValueOpcode;
3307   /// The width of one full horizontal reduction operation.
3308   unsigned ReduxWidth;
3309   /// Should we model this reduction as a pairwise reduction tree or a tree that
3310   /// splits the vector in halves and adds those halves.
3311   bool IsPairwiseReduction;
3312
3313 public:
3314   HorizontalReduction()
3315     : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
3316     ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
3317
3318   /// \brief Try to find a reduction tree.
3319   bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
3320                                  const DataLayout *DL) {
3321     assert((!Phi ||
3322             std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
3323            "Thi phi needs to use the binary operator");
3324
3325     // We could have a initial reductions that is not an add.
3326     //  r *= v1 + v2 + v3 + v4
3327     // In such a case start looking for a tree rooted in the first '+'.
3328     if (Phi) {
3329       if (B->getOperand(0) == Phi) {
3330         Phi = nullptr;
3331         B = dyn_cast<BinaryOperator>(B->getOperand(1));
3332       } else if (B->getOperand(1) == Phi) {
3333         Phi = nullptr;
3334         B = dyn_cast<BinaryOperator>(B->getOperand(0));
3335       }
3336     }
3337
3338     if (!B)
3339       return false;
3340
3341     Type *Ty = B->getType();
3342     if (Ty->isVectorTy())
3343       return false;
3344
3345     ReductionOpcode = B->getOpcode();
3346     ReducedValueOpcode = 0;
3347     ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
3348     ReductionRoot = B;
3349     ReductionPHI = Phi;
3350
3351     if (ReduxWidth < 4)
3352       return false;
3353
3354     // We currently only support adds.
3355     if (ReductionOpcode != Instruction::Add &&
3356         ReductionOpcode != Instruction::FAdd)
3357       return false;
3358
3359     // Post order traverse the reduction tree starting at B. We only handle true
3360     // trees containing only binary operators.
3361     SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
3362     Stack.push_back(std::make_pair(B, 0));
3363     while (!Stack.empty()) {
3364       BinaryOperator *TreeN = Stack.back().first;
3365       unsigned EdgeToVist = Stack.back().second++;
3366       bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
3367
3368       // Only handle trees in the current basic block.
3369       if (TreeN->getParent() != B->getParent())
3370         return false;
3371
3372       // Each tree node needs to have one user except for the ultimate
3373       // reduction.
3374       if (!TreeN->hasOneUse() && TreeN != B)
3375         return false;
3376
3377       // Postorder vist.
3378       if (EdgeToVist == 2 || IsReducedValue) {
3379         if (IsReducedValue) {
3380           // Make sure that the opcodes of the operations that we are going to
3381           // reduce match.
3382           if (!ReducedValueOpcode)
3383             ReducedValueOpcode = TreeN->getOpcode();
3384           else if (ReducedValueOpcode != TreeN->getOpcode())
3385             return false;
3386           ReducedVals.push_back(TreeN);
3387         } else {
3388           // We need to be able to reassociate the adds.
3389           if (!TreeN->isAssociative())
3390             return false;
3391           ReductionOps.push_back(TreeN);
3392         }
3393         // Retract.
3394         Stack.pop_back();
3395         continue;
3396       }
3397
3398       // Visit left or right.
3399       Value *NextV = TreeN->getOperand(EdgeToVist);
3400       BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
3401       if (Next)
3402         Stack.push_back(std::make_pair(Next, 0));
3403       else if (NextV != Phi)
3404         return false;
3405     }
3406     return true;
3407   }
3408
3409   /// \brief Attempt to vectorize the tree found by
3410   /// matchAssociativeReduction.
3411   bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
3412     if (ReducedVals.empty())
3413       return false;
3414
3415     unsigned NumReducedVals = ReducedVals.size();
3416     if (NumReducedVals < ReduxWidth)
3417       return false;
3418
3419     Value *VectorizedTree = nullptr;
3420     IRBuilder<> Builder(ReductionRoot);
3421     FastMathFlags Unsafe;
3422     Unsafe.setUnsafeAlgebra();
3423     Builder.SetFastMathFlags(Unsafe);
3424     unsigned i = 0;
3425
3426     for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
3427       V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
3428
3429       // Estimate cost.
3430       int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
3431       if (Cost >= -SLPCostThreshold)
3432         break;
3433
3434       DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
3435                    << ". (HorRdx)\n");
3436
3437       // Vectorize a tree.
3438       DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
3439       Value *VectorizedRoot = V.vectorizeTree();
3440
3441       // Emit a reduction.
3442       Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
3443       if (VectorizedTree) {
3444         Builder.SetCurrentDebugLocation(Loc);
3445         VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
3446                                      ReducedSubTree, "bin.rdx");
3447       } else
3448         VectorizedTree = ReducedSubTree;
3449     }
3450
3451     if (VectorizedTree) {
3452       // Finish the reduction.
3453       for (; i < NumReducedVals; ++i) {
3454         Builder.SetCurrentDebugLocation(
3455           cast<Instruction>(ReducedVals[i])->getDebugLoc());
3456         VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
3457                                      ReducedVals[i]);
3458       }
3459       // Update users.
3460       if (ReductionPHI) {
3461         assert(ReductionRoot && "Need a reduction operation");
3462         ReductionRoot->setOperand(0, VectorizedTree);
3463         ReductionRoot->setOperand(1, ReductionPHI);
3464       } else
3465         ReductionRoot->replaceAllUsesWith(VectorizedTree);
3466     }
3467     return VectorizedTree != nullptr;
3468   }
3469
3470 private:
3471
3472   /// \brief Calcuate the cost of a reduction.
3473   int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
3474     Type *ScalarTy = FirstReducedVal->getType();
3475     Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
3476
3477     int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
3478     int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
3479
3480     IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
3481     int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
3482
3483     int ScalarReduxCost =
3484         ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
3485
3486     DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
3487                  << " for reduction that starts with " << *FirstReducedVal
3488                  << " (It is a "
3489                  << (IsPairwiseReduction ? "pairwise" : "splitting")
3490                  << " reduction)\n");
3491
3492     return VecReduxCost - ScalarReduxCost;
3493   }
3494
3495   static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
3496                             Value *R, const Twine &Name = "") {
3497     if (Opcode == Instruction::FAdd)
3498       return Builder.CreateFAdd(L, R, Name);
3499     return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
3500   }
3501
3502   /// \brief Emit a horizontal reduction of the vectorized value.
3503   Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
3504     assert(VectorizedValue && "Need to have a vectorized tree node");
3505     Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue);
3506     assert(isPowerOf2_32(ReduxWidth) &&
3507            "We only handle power-of-two reductions for now");
3508
3509     Value *TmpVec = ValToReduce;
3510     for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
3511       if (IsPairwiseReduction) {
3512         Value *LeftMask =
3513           createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
3514         Value *RightMask =
3515           createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
3516
3517         Value *LeftShuf = Builder.CreateShuffleVector(
3518           TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
3519         Value *RightShuf = Builder.CreateShuffleVector(
3520           TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
3521           "rdx.shuf.r");
3522         TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
3523                              "bin.rdx");
3524       } else {
3525         Value *UpperHalf =
3526           createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
3527         Value *Shuf = Builder.CreateShuffleVector(
3528           TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
3529         TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
3530       }
3531     }
3532
3533     // The result is in the first element of the vector.
3534     return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
3535   }
3536 };
3537
3538 /// \brief Recognize construction of vectors like
3539 ///  %ra = insertelement <4 x float> undef, float %s0, i32 0
3540 ///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
3541 ///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
3542 ///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
3543 ///
3544 /// Returns true if it matches
3545 ///
3546 static bool findBuildVector(InsertElementInst *FirstInsertElem,
3547                             SmallVectorImpl<Value *> &BuildVector,
3548                             SmallVectorImpl<Value *> &BuildVectorOpds) {
3549   if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
3550     return false;
3551
3552   InsertElementInst *IE = FirstInsertElem;
3553   while (true) {
3554     BuildVector.push_back(IE);
3555     BuildVectorOpds.push_back(IE->getOperand(1));
3556
3557     if (IE->use_empty())
3558       return false;
3559
3560     InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
3561     if (!NextUse)
3562       return true;
3563
3564     // If this isn't the final use, make sure the next insertelement is the only
3565     // use. It's OK if the final constructed vector is used multiple times
3566     if (!IE->hasOneUse())
3567       return false;
3568
3569     IE = NextUse;
3570   }
3571
3572   return false;
3573 }
3574
3575 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
3576   return V->getType() < V2->getType();
3577 }
3578
3579 bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
3580   bool Changed = false;
3581   SmallVector<Value *, 4> Incoming;
3582   SmallSet<Value *, 16> VisitedInstrs;
3583
3584   bool HaveVectorizedPhiNodes = true;
3585   while (HaveVectorizedPhiNodes) {
3586     HaveVectorizedPhiNodes = false;
3587
3588     // Collect the incoming values from the PHIs.
3589     Incoming.clear();
3590     for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
3591          ++instr) {
3592       PHINode *P = dyn_cast<PHINode>(instr);
3593       if (!P)
3594         break;
3595
3596       if (!VisitedInstrs.count(P))
3597         Incoming.push_back(P);
3598     }
3599
3600     // Sort by type.
3601     std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
3602
3603     // Try to vectorize elements base on their type.
3604     for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
3605                                            E = Incoming.end();
3606          IncIt != E;) {
3607
3608       // Look for the next elements with the same type.
3609       SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
3610       while (SameTypeIt != E &&
3611              (*SameTypeIt)->getType() == (*IncIt)->getType()) {
3612         VisitedInstrs.insert(*SameTypeIt);
3613         ++SameTypeIt;
3614       }
3615
3616       // Try to vectorize them.
3617       unsigned NumElts = (SameTypeIt - IncIt);
3618       DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
3619       if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
3620         // Success start over because instructions might have been changed.
3621         HaveVectorizedPhiNodes = true;
3622         Changed = true;
3623         break;
3624       }
3625
3626       // Start over at the next instruction of a different type (or the end).
3627       IncIt = SameTypeIt;
3628     }
3629   }
3630
3631   VisitedInstrs.clear();
3632
3633   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
3634     // We may go through BB multiple times so skip the one we have checked.
3635     if (!VisitedInstrs.insert(it).second)
3636       continue;
3637
3638     if (isa<DbgInfoIntrinsic>(it))
3639       continue;
3640
3641     // Try to vectorize reductions that use PHINodes.
3642     if (PHINode *P = dyn_cast<PHINode>(it)) {
3643       // Check that the PHI is a reduction PHI.
3644       if (P->getNumIncomingValues() != 2)
3645         return Changed;
3646       Value *Rdx =
3647           (P->getIncomingBlock(0) == BB
3648                ? (P->getIncomingValue(0))
3649                : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
3650                                                : nullptr));
3651       // Check if this is a Binary Operator.
3652       BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
3653       if (!BI)
3654         continue;
3655
3656       // Try to match and vectorize a horizontal reduction.
3657       HorizontalReduction HorRdx;
3658       if (ShouldVectorizeHor &&
3659           HorRdx.matchAssociativeReduction(P, BI, DL) &&
3660           HorRdx.tryToReduce(R, TTI)) {
3661         Changed = true;
3662         it = BB->begin();
3663         e = BB->end();
3664         continue;
3665       }
3666
3667      Value *Inst = BI->getOperand(0);
3668       if (Inst == P)
3669         Inst = BI->getOperand(1);
3670
3671       if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
3672         // We would like to start over since some instructions are deleted
3673         // and the iterator may become invalid value.
3674         Changed = true;
3675         it = BB->begin();
3676         e = BB->end();
3677         continue;
3678       }
3679
3680       continue;
3681     }
3682
3683     // Try to vectorize horizontal reductions feeding into a store.
3684     if (ShouldStartVectorizeHorAtStore)
3685       if (StoreInst *SI = dyn_cast<StoreInst>(it))
3686         if (BinaryOperator *BinOp =
3687                 dyn_cast<BinaryOperator>(SI->getValueOperand())) {
3688           HorizontalReduction HorRdx;
3689           if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) &&
3690                 HorRdx.tryToReduce(R, TTI)) ||
3691                tryToVectorize(BinOp, R))) {
3692             Changed = true;
3693             it = BB->begin();
3694             e = BB->end();
3695             continue;
3696           }
3697         }
3698
3699     // Try to vectorize horizontal reductions feeding into a return.
3700     if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
3701       if (RI->getNumOperands() != 0)
3702         if (BinaryOperator *BinOp =
3703                 dyn_cast<BinaryOperator>(RI->getOperand(0))) {
3704           DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
3705           if (tryToVectorizePair(BinOp->getOperand(0),
3706                                  BinOp->getOperand(1), R)) {
3707             Changed = true;
3708             it = BB->begin();
3709             e = BB->end();
3710             continue;
3711           }
3712         }
3713
3714     // Try to vectorize trees that start at compare instructions.
3715     if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
3716       if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
3717         Changed = true;
3718         // We would like to start over since some instructions are deleted
3719         // and the iterator may become invalid value.
3720         it = BB->begin();
3721         e = BB->end();
3722         continue;
3723       }
3724
3725       for (int i = 0; i < 2; ++i) {
3726         if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
3727           if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
3728             Changed = true;
3729             // We would like to start over since some instructions are deleted
3730             // and the iterator may become invalid value.
3731             it = BB->begin();
3732             e = BB->end();
3733           }
3734         }
3735       }
3736       continue;
3737     }
3738
3739     // Try to vectorize trees that start at insertelement instructions.
3740     if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
3741       SmallVector<Value *, 16> BuildVector;
3742       SmallVector<Value *, 16> BuildVectorOpds;
3743       if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
3744         continue;
3745
3746       // Vectorize starting with the build vector operands ignoring the
3747       // BuildVector instructions for the purpose of scheduling and user
3748       // extraction.
3749       if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
3750         Changed = true;
3751         it = BB->begin();
3752         e = BB->end();
3753       }
3754
3755       continue;
3756     }
3757   }
3758
3759   return Changed;
3760 }
3761
3762 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
3763   bool Changed = false;
3764   // Attempt to sort and vectorize each of the store-groups.
3765   for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
3766        it != e; ++it) {
3767     if (it->second.size() < 2)
3768       continue;
3769
3770     DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
3771           << it->second.size() << ".\n");
3772
3773     // Process the stores in chunks of 16.
3774     for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
3775       unsigned Len = std::min<unsigned>(CE - CI, 16);
3776       Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
3777                                  -SLPCostThreshold, R);
3778     }
3779   }
3780   return Changed;
3781 }
3782
3783 } // end anonymous namespace
3784
3785 char SLPVectorizer::ID = 0;
3786 static const char lv_name[] = "SLP Vectorizer";
3787 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
3788 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
3789 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
3790 INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
3791 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
3792 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
3793 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
3794
3795 namespace llvm {
3796 Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
3797 }