lib/Transforms/Scalar/LoopStrengthReduce.cpp

   1 //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This transformation analyzes and transforms the induction variables (and
  11 // computations derived from them) into forms suitable for efficient execution
  12 // on the target.
  13 //
  14 // This pass performs a strength reduction on array references inside loops that
  15 // have as one or more of their components the loop induction variable, it
  16 // rewrites expressions to take advantage of scaled-index addressing modes
  17 // available on the target, and it performs a variety of other optimizations
  18 // related to loop induction variables.
  19 //
  20 // Terminology note: this code has a lot of handling for "post-increment" or
  21 // "post-inc" users. This is not talking about post-increment addressing modes;
  22 // it is instead talking about code like this:
  23 //
  24 //   %i = phi [ 0, %entry ], [ %i.next, %latch ]
  25 //   ...
  26 //   %i.next = add %i, 1
  27 //   %c = icmp eq %i.next, %n
  28 //
  29 // The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
  30 // it's useful to think about these as the same register, with some uses using
  31 // the value of the register before the add and some using // it after. In this
  32 // example, the icmp is a post-increment user, since it uses %i.next, which is
  33 // the value of the induction variable after the increment. The other common
  34 // case of post-increment users is users outside the loop.
  35 //
  36 // TODO: More sophistication in the way Formulae are generated and filtered.
  37 //
  38 // TODO: Handle multiple loops at a time.
  39 //
  40 // TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
  41 //       of a GlobalValue?
  42 //
  43 // TODO: When truncation is free, truncate ICmp users' operands to make it a
  44 //       smaller encoding (on x86 at least).
  45 //
  46 // TODO: When a negated register is used by an add (such as in a list of
  47 //       multiple base registers, or as the increment expression in an addrec),
  48 //       we may not actually need both reg and (-1 * reg) in registers; the
  49 //       negation can be implemented by using a sub instead of an add. The
  50 //       lack of support for taking this into consideration when making
  51 //       register pressure decisions is partly worked around by the "Special"
  52 //       use kind.
  53 //
  54 //===----------------------------------------------------------------------===//
  55
  56 #include "llvm/Transforms/Scalar.h"
  57 #include "llvm/ADT/DenseSet.h"
  58 #include "llvm/ADT/Hashing.h"
  59 #include "llvm/ADT/STLExtras.h"
  60 #include "llvm/ADT/SetVector.h"
  61 #include "llvm/ADT/SmallBitVector.h"
  62 #include "llvm/Analysis/IVUsers.h"
  63 #include "llvm/Analysis/LoopPass.h"
  64 #include "llvm/Analysis/ScalarEvolutionExpander.h"
  65 #include "llvm/Analysis/TargetTransformInfo.h"
  66 #include "llvm/IR/Constants.h"
  67 #include "llvm/IR/DerivedTypes.h"
  68 #include "llvm/IR/Dominators.h"
  69 #include "llvm/IR/Instructions.h"
  70 #include "llvm/IR/IntrinsicInst.h"
  71 #include "llvm/IR/ValueHandle.h"
  72 #include "llvm/Support/CommandLine.h"
  73 #include "llvm/Support/Debug.h"
  74 #include "llvm/Support/raw_ostream.h"
  75 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  76 #include "llvm/Transforms/Utils/Local.h"
  77 #include <algorithm>
  78 using namespace llvm;
  79
  80 #define DEBUG_TYPE "loop-reduce"
  81
  82 /// MaxIVUsers is an arbitrary threshold that provides an early opportunitiy for
  83 /// bail out. This threshold is far beyond the number of users that LSR can
  84 /// conceivably solve, so it should not affect generated code, but catches the
  85 /// worst cases before LSR burns too much compile time and stack space.
  86 static const unsigned MaxIVUsers = 200;
  87
  88 // Temporary flag to cleanup congruent phis after LSR phi expansion.
  89 // It's currently disabled until we can determine whether it's truly useful or
  90 // not. The flag should be removed after the v3.0 release.
  91 // This is now needed for ivchains.
  92 static cl::opt<bool> EnablePhiElim(
  93   "enable-lsr-phielim", cl::Hidden, cl::init(true),
  94   cl::desc("Enable LSR phi elimination"));
  95
  96 #ifndef NDEBUG
  97 // Stress test IV chain generation.
  98 static cl::opt<bool> StressIVChain(
  99   "stress-ivchain", cl::Hidden, cl::init(false),
 100   cl::desc("Stress test LSR IV chains"));
 101 #else
 102 static bool StressIVChain = false;
 103 #endif
 104
 105 namespace {
 106
 107 /// RegSortData - This class holds data which is used to order reuse candidates.
 108 class RegSortData {
 109 public:
 110   /// UsedByIndices - This represents the set of LSRUse indices which reference
 111   /// a particular register.
 112   SmallBitVector UsedByIndices;
 113
 114   RegSortData() {}
 115
 116   void print(raw_ostream &OS) const;
 117   void dump() const;
 118 };
 119
 120 }
 121
 122 void RegSortData::print(raw_ostream &OS) const {
 123   OS << "[NumUses=" << UsedByIndices.count() << ']';
 124 }
 125
 126 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 127 void RegSortData::dump() const {
 128   print(errs()); errs() << '\n';
 129 }
 130 #endif
 131
 132 namespace {
 133
 134 /// RegUseTracker - Map register candidates to information about how they are
 135 /// used.
 136 class RegUseTracker {
 137   typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
 138
 139   RegUsesTy RegUsesMap;
 140   SmallVector<const SCEV *, 16> RegSequence;
 141
 142 public:
 143   void CountRegister(const SCEV *Reg, size_t LUIdx);
 144   void DropRegister(const SCEV *Reg, size_t LUIdx);
 145   void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx);
 146
 147   bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
 148
 149   const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
 150
 151   void clear();
 152
 153   typedef SmallVectorImpl<const SCEV *>::iterator iterator;
 154   typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
 155   iterator begin() { return RegSequence.begin(); }
 156   iterator end()   { return RegSequence.end(); }
 157   const_iterator begin() const { return RegSequence.begin(); }
 158   const_iterator end() const   { return RegSequence.end(); }
 159 };
 160
 161 }
 162
 163 void
 164 RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) {
 165   std::pair<RegUsesTy::iterator, bool> Pair =
 166     RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
 167   RegSortData &RSD = Pair.first->second;
 168   if (Pair.second)
 169     RegSequence.push_back(Reg);
 170   RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
 171   RSD.UsedByIndices.set(LUIdx);
 172 }
 173
 174 void
 175 RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) {
 176   RegUsesTy::iterator It = RegUsesMap.find(Reg);
 177   assert(It != RegUsesMap.end());
 178   RegSortData &RSD = It->second;
 179   assert(RSD.UsedByIndices.size() > LUIdx);
 180   RSD.UsedByIndices.reset(LUIdx);
 181 }
 182
 183 void
 184 RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
 185   assert(LUIdx <= LastLUIdx);
 186
 187   // Update RegUses. The data structure is not optimized for this purpose;
 188   // we must iterate through it and update each of the bit vectors.
 189   for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end();
 190        I != E; ++I) {
 191     SmallBitVector &UsedByIndices = I->second.UsedByIndices;
 192     if (LUIdx < UsedByIndices.size())
 193       UsedByIndices[LUIdx] =
 194         LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0;
 195     UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
 196   }
 197 }
 198
 199 bool
 200 RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
 201   RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
 202   if (I == RegUsesMap.end())
 203     return false;
 204   const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
 205   int i = UsedByIndices.find_first();
 206   if (i == -1) return false;
 207   if ((size_t)i != LUIdx) return true;
 208   return UsedByIndices.find_next(i) != -1;
 209 }
 210
 211 const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
 212   RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
 213   assert(I != RegUsesMap.end() && "Unknown register!");
 214   return I->second.UsedByIndices;
 215 }
 216
 217 void RegUseTracker::clear() {
 218   RegUsesMap.clear();
 219   RegSequence.clear();
 220 }
 221
 222 namespace {
 223
 224 /// Formula - This class holds information that describes a formula for
 225 /// computing satisfying a use. It may include broken-out immediates and scaled
 226 /// registers.
 227 struct Formula {
 228   /// Global base address used for complex addressing.
 229   GlobalValue *BaseGV;
 230
 231   /// Base offset for complex addressing.
 232   int64_t BaseOffset;
 233
 234   /// Whether any complex addressing has a base register.
 235   bool HasBaseReg;
 236
 237   /// The scale of any complex addressing.
 238   int64_t Scale;
 239
 240   /// BaseRegs - The list of "base" registers for this use. When this is
 241   /// non-empty,
 242   SmallVector<const SCEV *, 4> BaseRegs;
 243
 244   /// ScaledReg - The 'scaled' register for this use. This should be non-null
 245   /// when Scale is not zero.
 246   const SCEV *ScaledReg;
 247
 248   /// UnfoldedOffset - An additional constant offset which added near the
 249   /// use. This requires a temporary register, but the offset itself can
 250   /// live in an add immediate field rather than a register.
 251   int64_t UnfoldedOffset;
 252
 253   Formula()
 254       : BaseGV(0), BaseOffset(0), HasBaseReg(false), Scale(0), ScaledReg(0),
 255         UnfoldedOffset(0) {}
 256
 257   void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
 258
 259   unsigned getNumRegs() const;
 260   Type *getType() const;
 261
 262   void DeleteBaseReg(const SCEV *&S);
 263
 264   bool referencesReg(const SCEV *S) const;
 265   bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
 266                                   const RegUseTracker &RegUses) const;
 267
 268   void print(raw_ostream &OS) const;
 269   void dump() const;
 270 };
 271
 272 }
 273
 274 /// DoInitialMatch - Recursion helper for InitialMatch.
 275 static void DoInitialMatch(const SCEV *S, Loop *L,
 276                            SmallVectorImpl<const SCEV *> &Good,
 277                            SmallVectorImpl<const SCEV *> &Bad,
 278                            ScalarEvolution &SE) {
 279   // Collect expressions which properly dominate the loop header.
 280   if (SE.properlyDominates(S, L->getHeader())) {
 281     Good.push_back(S);
 282     return;
 283   }
 284
 285   // Look at add operands.
 286   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 287     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
 288          I != E; ++I)
 289       DoInitialMatch(*I, L, Good, Bad, SE);
 290     return;
 291   }
 292
 293   // Look at addrec operands.
 294   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
 295     if (!AR->getStart()->isZero()) {
 296       DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
 297       DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
 298                                       AR->getStepRecurrence(SE),
 299                                       // FIXME: AR->getNoWrapFlags()
 300                                       AR->getLoop(), SCEV::FlagAnyWrap),
 301                      L, Good, Bad, SE);
 302       return;
 303     }
 304
 305   // Handle a multiplication by -1 (negation) if it didn't fold.
 306   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
 307     if (Mul->getOperand(0)->isAllOnesValue()) {
 308       SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
 309       const SCEV *NewMul = SE.getMulExpr(Ops);
 310
 311       SmallVector<const SCEV *, 4> MyGood;
 312       SmallVector<const SCEV *, 4> MyBad;
 313       DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
 314       const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
 315         SE.getEffectiveSCEVType(NewMul->getType())));
 316       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(),
 317            E = MyGood.end(); I != E; ++I)
 318         Good.push_back(SE.getMulExpr(NegOne, *I));
 319       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(),
 320            E = MyBad.end(); I != E; ++I)
 321         Bad.push_back(SE.getMulExpr(NegOne, *I));
 322       return;
 323     }
 324
 325   // Ok, we can't do anything interesting. Just stuff the whole thing into a
 326   // register and hope for the best.
 327   Bad.push_back(S);
 328 }
 329
 330 /// InitialMatch - Incorporate loop-variant parts of S into this Formula,
 331 /// attempting to keep all loop-invariant and loop-computable values in a
 332 /// single base register.
 333 void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
 334   SmallVector<const SCEV *, 4> Good;
 335   SmallVector<const SCEV *, 4> Bad;
 336   DoInitialMatch(S, L, Good, Bad, SE);
 337   if (!Good.empty()) {
 338     const SCEV *Sum = SE.getAddExpr(Good);
 339     if (!Sum->isZero())
 340       BaseRegs.push_back(Sum);
 341     HasBaseReg = true;
 342   }
 343   if (!Bad.empty()) {
 344     const SCEV *Sum = SE.getAddExpr(Bad);
 345     if (!Sum->isZero())
 346       BaseRegs.push_back(Sum);
 347     HasBaseReg = true;
 348   }
 349 }
 350
 351 /// getNumRegs - Return the total number of register operands used by this
 352 /// formula. This does not include register uses implied by non-constant
 353 /// addrec strides.
 354 unsigned Formula::getNumRegs() const {
 355   return !!ScaledReg + BaseRegs.size();
 356 }
 357
 358 /// getType - Return the type of this formula, if it has one, or null
 359 /// otherwise. This type is meaningless except for the bit size.
 360 Type *Formula::getType() const {
 361   return !BaseRegs.empty() ? BaseRegs.front()->getType() :
 362          ScaledReg ? ScaledReg->getType() :
 363          BaseGV ? BaseGV->getType() :
 364          0;
 365 }
 366
 367 /// DeleteBaseReg - Delete the given base reg from the BaseRegs list.
 368 void Formula::DeleteBaseReg(const SCEV *&S) {
 369   if (&S != &BaseRegs.back())
 370     std::swap(S, BaseRegs.back());
 371   BaseRegs.pop_back();
 372 }
 373
 374 /// referencesReg - Test if this formula references the given register.
 375 bool Formula::referencesReg(const SCEV *S) const {
 376   return S == ScaledReg ||
 377          std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
 378 }
 379
 380 /// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers
 381 /// which are used by uses other than the use with the given index.
 382 bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
 383                                          const RegUseTracker &RegUses) const {
 384   if (ScaledReg)
 385     if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
 386       return true;
 387   for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
 388        E = BaseRegs.end(); I != E; ++I)
 389     if (RegUses.isRegUsedByUsesOtherThan(*I, LUIdx))
 390       return true;
 391   return false;
 392 }
 393
 394 void Formula::print(raw_ostream &OS) const {
 395   bool First = true;
 396   if (BaseGV) {
 397     if (!First) OS << " + "; else First = false;
 398     BaseGV->printAsOperand(OS, /*PrintType=*/false);
 399   }
 400   if (BaseOffset != 0) {
 401     if (!First) OS << " + "; else First = false;
 402     OS << BaseOffset;
 403   }
 404   for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
 405        E = BaseRegs.end(); I != E; ++I) {
 406     if (!First) OS << " + "; else First = false;
 407     OS << "reg(" << **I << ')';
 408   }
 409   if (HasBaseReg && BaseRegs.empty()) {
 410     if (!First) OS << " + "; else First = false;
 411     OS << "**error: HasBaseReg**";
 412   } else if (!HasBaseReg && !BaseRegs.empty()) {
 413     if (!First) OS << " + "; else First = false;
 414     OS << "**error: !HasBaseReg**";
 415   }
 416   if (Scale != 0) {
 417     if (!First) OS << " + "; else First = false;
 418     OS << Scale << "*reg(";
 419     if (ScaledReg)
 420       OS << *ScaledReg;
 421     else
 422       OS << "<unknown>";
 423     OS << ')';
 424   }
 425   if (UnfoldedOffset != 0) {
 426     if (!First) OS << " + ";
 427     OS << "imm(" << UnfoldedOffset << ')';
 428   }
 429 }
 430
 431 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 432 void Formula::dump() const {
 433   print(errs()); errs() << '\n';
 434 }
 435 #endif
 436
 437 /// isAddRecSExtable - Return true if the given addrec can be sign-extended
 438 /// without changing its value.
 439 static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
 440   Type *WideTy =
 441     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
 442   return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
 443 }
 444
 445 /// isAddSExtable - Return true if the given add can be sign-extended
 446 /// without changing its value.
 447 static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
 448   Type *WideTy =
 449     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
 450   return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
 451 }
 452
 453 /// isMulSExtable - Return true if the given mul can be sign-extended
 454 /// without changing its value.
 455 static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
 456   Type *WideTy =
 457     IntegerType::get(SE.getContext(),
 458                      SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
 459   return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
 460 }
 461
 462 /// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
 463 /// and if the remainder is known to be zero,  or null otherwise. If
 464 /// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified
 465 /// to Y, ignoring that the multiplication may overflow, which is useful when
 466 /// the result will be used in a context where the most significant bits are
 467 /// ignored.
 468 static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
 469                                 ScalarEvolution &SE,
 470                                 bool IgnoreSignificantBits = false) {
 471   // Handle the trivial case, which works for any SCEV type.
 472   if (LHS == RHS)
 473     return SE.getConstant(LHS->getType(), 1);
 474
 475   // Handle a few RHS special cases.
 476   const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
 477   if (RC) {
 478     const APInt &RA = RC->getValue()->getValue();
 479     // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
 480     // some folding.
 481     if (RA.isAllOnesValue())
 482       return SE.getMulExpr(LHS, RC);
 483     // Handle x /s 1 as x.
 484     if (RA == 1)
 485       return LHS;
 486   }
 487
 488   // Check for a division of a constant by a constant.
 489   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
 490     if (!RC)
 491       return 0;
 492     const APInt &LA = C->getValue()->getValue();
 493     const APInt &RA = RC->getValue()->getValue();
 494     if (LA.srem(RA) != 0)
 495       return 0;
 496     return SE.getConstant(LA.sdiv(RA));
 497   }
 498
 499   // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
 500   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
 501     if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
 502       const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
 503                                       IgnoreSignificantBits);
 504       if (!Step) return 0;
 505       const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
 506                                        IgnoreSignificantBits);
 507       if (!Start) return 0;
 508       // FlagNW is independent of the start value, step direction, and is
 509       // preserved with smaller magnitude steps.
 510       // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
 511       return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
 512     }
 513     return 0;
 514   }
 515
 516   // Distribute the sdiv over add operands, if the add doesn't overflow.
 517   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
 518     if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
 519       SmallVector<const SCEV *, 8> Ops;
 520       for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
 521            I != E; ++I) {
 522         const SCEV *Op = getExactSDiv(*I, RHS, SE,
 523                                       IgnoreSignificantBits);
 524         if (!Op) return 0;
 525         Ops.push_back(Op);
 526       }
 527       return SE.getAddExpr(Ops);
 528     }
 529     return 0;
 530   }
 531
 532   // Check for a multiply operand that we can pull RHS out of.
 533   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
 534     if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
 535       SmallVector<const SCEV *, 4> Ops;
 536       bool Found = false;
 537       for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
 538            I != E; ++I) {
 539         const SCEV *S = *I;
 540         if (!Found)
 541           if (const SCEV *Q = getExactSDiv(S, RHS, SE,
 542                                            IgnoreSignificantBits)) {
 543             S = Q;
 544             Found = true;
 545           }
 546         Ops.push_back(S);
 547       }
 548       return Found ? SE.getMulExpr(Ops) : 0;
 549     }
 550     return 0;
 551   }
 552
 553   // Otherwise we don't know.
 554   return 0;
 555 }
 556
 557 /// ExtractImmediate - If S involves the addition of a constant integer value,
 558 /// return that integer value, and mutate S to point to a new SCEV with that
 559 /// value excluded.
 560 static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
 561   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
 562     if (C->getValue()->getValue().getMinSignedBits() <= 64) {
 563       S = SE.getConstant(C->getType(), 0);
 564       return C->getValue()->getSExtValue();
 565     }
 566   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 567     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
 568     int64_t Result = ExtractImmediate(NewOps.front(), SE);
 569     if (Result != 0)
 570       S = SE.getAddExpr(NewOps);
 571     return Result;
 572   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
 573     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
 574     int64_t Result = ExtractImmediate(NewOps.front(), SE);
 575     if (Result != 0)
 576       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
 577                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
 578                            SCEV::FlagAnyWrap);
 579     return Result;
 580   }
 581   return 0;
 582 }
 583
 584 /// ExtractSymbol - If S involves the addition of a GlobalValue address,
 585 /// return that symbol, and mutate S to point to a new SCEV with that
 586 /// value excluded.
 587 static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
 588   if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
 589     if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
 590       S = SE.getConstant(GV->getType(), 0);
 591       return GV;
 592     }
 593   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 594     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
 595     GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
 596     if (Result)
 597       S = SE.getAddExpr(NewOps);
 598     return Result;
 599   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
 600     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
 601     GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
 602     if (Result)
 603       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
 604                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
 605                            SCEV::FlagAnyWrap);
 606     return Result;
 607   }
 608   return 0;
 609 }
 610
 611 /// isAddressUse - Returns true if the specified instruction is using the
 612 /// specified value as an address.
 613 static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
 614   bool isAddress = isa<LoadInst>(Inst);
 615   if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
 616     if (SI->getOperand(1) == OperandVal)
 617       isAddress = true;
 618   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
 619     // Addressing modes can also be folded into prefetches and a variety
 620     // of intrinsics.
 621     switch (II->getIntrinsicID()) {
 622       default: break;
 623       case Intrinsic::prefetch:
 624       case Intrinsic::x86_sse_storeu_ps:
 625       case Intrinsic::x86_sse2_storeu_pd:
 626       case Intrinsic::x86_sse2_storeu_dq:
 627       case Intrinsic::x86_sse2_storel_dq:
 628         if (II->getArgOperand(0) == OperandVal)
 629           isAddress = true;
 630         break;
 631     }
 632   }
 633   return isAddress;
 634 }
 635
 636 /// getAccessType - Return the type of the memory being accessed.
 637 static Type *getAccessType(const Instruction *Inst) {
 638   Type *AccessTy = Inst->getType();
 639   if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
 640     AccessTy = SI->getOperand(0)->getType();
 641   else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
 642     // Addressing modes can also be folded into prefetches and a variety
 643     // of intrinsics.
 644     switch (II->getIntrinsicID()) {
 645     default: break;
 646     case Intrinsic::x86_sse_storeu_ps:
 647     case Intrinsic::x86_sse2_storeu_pd:
 648     case Intrinsic::x86_sse2_storeu_dq:
 649     case Intrinsic::x86_sse2_storel_dq:
 650       AccessTy = II->getArgOperand(0)->getType();
 651       break;
 652     }
 653   }
 654
 655   // All pointers have the same requirements, so canonicalize them to an
 656   // arbitrary pointer type to minimize variation.
 657   if (PointerType *PTy = dyn_cast<PointerType>(AccessTy))
 658     AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
 659                                 PTy->getAddressSpace());
 660
 661   return AccessTy;
 662 }
 663
 664 /// isExistingPhi - Return true if this AddRec is already a phi in its loop.
 665 static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
 666   for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
 667        PHINode *PN = dyn_cast<PHINode>(I); ++I) {
 668     if (SE.isSCEVable(PN->getType()) &&
 669         (SE.getEffectiveSCEVType(PN->getType()) ==
 670          SE.getEffectiveSCEVType(AR->getType())) &&
 671         SE.getSCEV(PN) == AR)
 672       return true;
 673   }
 674   return false;
 675 }
 676
 677 /// Check if expanding this expression is likely to incur significant cost. This
 678 /// is tricky because SCEV doesn't track which expressions are actually computed
 679 /// by the current IR.
 680 ///
 681 /// We currently allow expansion of IV increments that involve adds,
 682 /// multiplication by constants, and AddRecs from existing phis.
 683 ///
 684 /// TODO: Allow UDivExpr if we can find an existing IV increment that is an
 685 /// obvious multiple of the UDivExpr.
 686 static bool isHighCostExpansion(const SCEV *S,
 687                                 SmallPtrSet<const SCEV*, 8> &Processed,
 688                                 ScalarEvolution &SE) {
 689   // Zero/One operand expressions
 690   switch (S->getSCEVType()) {
 691   case scUnknown:
 692   case scConstant:
 693     return false;
 694   case scTruncate:
 695     return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
 696                                Processed, SE);
 697   case scZeroExtend:
 698     return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
 699                                Processed, SE);
 700   case scSignExtend:
 701     return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
 702                                Processed, SE);
 703   }
 704
 705   if (!Processed.insert(S))
 706     return false;
 707
 708   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 709     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
 710          I != E; ++I) {
 711       if (isHighCostExpansion(*I, Processed, SE))
 712         return true;
 713     }
 714     return false;
 715   }
 716
 717   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
 718     if (Mul->getNumOperands() == 2) {
 719       // Multiplication by a constant is ok
 720       if (isa<SCEVConstant>(Mul->getOperand(0)))
 721         return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
 722
 723       // If we have the value of one operand, check if an existing
 724       // multiplication already generates this expression.
 725       if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
 726         Value *UVal = U->getValue();
 727         for (User *UR : UVal->users()) {
 728           // If U is a constant, it may be used by a ConstantExpr.
 729           Instruction *UI = dyn_cast<Instruction>(UR);
 730           if (UI && UI->getOpcode() == Instruction::Mul &&
 731               SE.isSCEVable(UI->getType())) {
 732             return SE.getSCEV(UI) == Mul;
 733           }
 734         }
 735       }
 736     }
 737   }
 738
 739   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
 740     if (isExistingPhi(AR, SE))
 741       return false;
 742   }
 743
 744   // Fow now, consider any other type of expression (div/mul/min/max) high cost.
 745   return true;
 746 }
 747
 748 /// DeleteTriviallyDeadInstructions - If any of the instructions is the
 749 /// specified set are trivially dead, delete them and see if this makes any of
 750 /// their operands subsequently dead.
 751 static bool
 752 DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
 753   bool Changed = false;
 754
 755   while (!DeadInsts.empty()) {
 756     Value *V = DeadInsts.pop_back_val();
 757     Instruction *I = dyn_cast_or_null<Instruction>(V);
 758
 759     if (I == 0 || !isInstructionTriviallyDead(I))
 760       continue;
 761
 762     for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
 763       if (Instruction *U = dyn_cast<Instruction>(*OI)) {
 764         *OI = 0;
 765         if (U->use_empty())
 766           DeadInsts.push_back(U);
 767       }
 768
 769     I->eraseFromParent();
 770     Changed = true;
 771   }
 772
 773   return Changed;
 774 }
 775
 776 namespace {
 777 class LSRUse;
 778 }
 779 // Check if it is legal to fold 2 base registers.
 780 static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU,
 781                              const Formula &F);
 782 // Get the cost of the scaling factor used in F for LU.
 783 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
 784                                      const LSRUse &LU, const Formula &F);
 785
 786 namespace {
 787
 788 /// Cost - This class is used to measure and compare candidate formulae.
 789 class Cost {
 790   /// TODO: Some of these could be merged. Also, a lexical ordering
 791   /// isn't always optimal.
 792   unsigned NumRegs;
 793   unsigned AddRecCost;
 794   unsigned NumIVMuls;
 795   unsigned NumBaseAdds;
 796   unsigned ImmCost;
 797   unsigned SetupCost;
 798   unsigned ScaleCost;
 799
 800 public:
 801   Cost()
 802     : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
 803       SetupCost(0), ScaleCost(0) {}
 804
 805   bool operator<(const Cost &Other) const;
 806
 807   void Lose();
 808
 809 #ifndef NDEBUG
 810   // Once any of the metrics loses, they must all remain losers.
 811   bool isValid() {
 812     return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
 813              | ImmCost | SetupCost | ScaleCost) != ~0u)
 814       || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
 815            & ImmCost & SetupCost & ScaleCost) == ~0u);
 816   }
 817 #endif
 818
 819   bool isLoser() {
 820     assert(isValid() && "invalid cost");
 821     return NumRegs == ~0u;
 822   }
 823
 824   void RateFormula(const TargetTransformInfo &TTI,
 825                    const Formula &F,
 826                    SmallPtrSet<const SCEV *, 16> &Regs,
 827                    const DenseSet<const SCEV *> &VisitedRegs,
 828                    const Loop *L,
 829                    const SmallVectorImpl<int64_t> &Offsets,
 830                    ScalarEvolution &SE, DominatorTree &DT,
 831                    const LSRUse &LU,
 832                    SmallPtrSet<const SCEV *, 16> *LoserRegs = 0);
 833
 834   void print(raw_ostream &OS) const;
 835   void dump() const;
 836
 837 private:
 838   void RateRegister(const SCEV *Reg,
 839                     SmallPtrSet<const SCEV *, 16> &Regs,
 840                     const Loop *L,
 841                     ScalarEvolution &SE, DominatorTree &DT);
 842   void RatePrimaryRegister(const SCEV *Reg,
 843                            SmallPtrSet<const SCEV *, 16> &Regs,
 844                            const Loop *L,
 845                            ScalarEvolution &SE, DominatorTree &DT,
 846                            SmallPtrSet<const SCEV *, 16> *LoserRegs);
 847 };
 848
 849 }
 850
 851 /// RateRegister - Tally up interesting quantities from the given register.
 852 void Cost::RateRegister(const SCEV *Reg,
 853                         SmallPtrSet<const SCEV *, 16> &Regs,
 854                         const Loop *L,
 855                         ScalarEvolution &SE, DominatorTree &DT) {
 856   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
 857     // If this is an addrec for another loop, don't second-guess its addrec phi
 858     // nodes. LSR isn't currently smart enough to reason about more than one
 859     // loop at a time. LSR has already run on inner loops, will not run on outer
 860     // loops, and cannot be expected to change sibling loops.
 861     if (AR->getLoop() != L) {
 862       // If the AddRec exists, consider it's register free and leave it alone.
 863       if (isExistingPhi(AR, SE))
 864         return;
 865
 866       // Otherwise, do not consider this formula at all.
 867       Lose();
 868       return;
 869     }
 870     AddRecCost += 1; /// TODO: This should be a function of the stride.
 871
 872     // Add the step value register, if it needs one.
 873     // TODO: The non-affine case isn't precisely modeled here.
 874     if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
 875       if (!Regs.count(AR->getOperand(1))) {
 876         RateRegister(AR->getOperand(1), Regs, L, SE, DT);
 877         if (isLoser())
 878           return;
 879       }
 880     }
 881   }
 882   ++NumRegs;
 883
 884   // Rough heuristic; favor registers which don't require extra setup
 885   // instructions in the preheader.
 886   if (!isa<SCEVUnknown>(Reg) &&
 887       !isa<SCEVConstant>(Reg) &&
 888       !(isa<SCEVAddRecExpr>(Reg) &&
 889         (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
 890          isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
 891     ++SetupCost;
 892
 893     NumIVMuls += isa<SCEVMulExpr>(Reg) &&
 894                  SE.hasComputableLoopEvolution(Reg, L);
 895 }
 896
 897 /// RatePrimaryRegister - Record this register in the set. If we haven't seen it
 898 /// before, rate it. Optional LoserRegs provides a way to declare any formula
 899 /// that refers to one of those regs an instant loser.
 900 void Cost::RatePrimaryRegister(const SCEV *Reg,
 901                                SmallPtrSet<const SCEV *, 16> &Regs,
 902                                const Loop *L,
 903                                ScalarEvolution &SE, DominatorTree &DT,
 904                                SmallPtrSet<const SCEV *, 16> *LoserRegs) {
 905   if (LoserRegs && LoserRegs->count(Reg)) {
 906     Lose();
 907     return;
 908   }
 909   if (Regs.insert(Reg)) {
 910     RateRegister(Reg, Regs, L, SE, DT);
 911     if (LoserRegs && isLoser())
 912       LoserRegs->insert(Reg);
 913   }
 914 }
 915
 916 void Cost::RateFormula(const TargetTransformInfo &TTI,
 917                        const Formula &F,
 918                        SmallPtrSet<const SCEV *, 16> &Regs,
 919                        const DenseSet<const SCEV *> &VisitedRegs,
 920                        const Loop *L,
 921                        const SmallVectorImpl<int64_t> &Offsets,
 922                        ScalarEvolution &SE, DominatorTree &DT,
 923                        const LSRUse &LU,
 924                        SmallPtrSet<const SCEV *, 16> *LoserRegs) {
 925   // Tally up the registers.
 926   if (const SCEV *ScaledReg = F.ScaledReg) {
 927     if (VisitedRegs.count(ScaledReg)) {
 928       Lose();
 929       return;
 930     }
 931     RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
 932     if (isLoser())
 933       return;
 934   }
 935   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
 936        E = F.BaseRegs.end(); I != E; ++I) {
 937     const SCEV *BaseReg = *I;
 938     if (VisitedRegs.count(BaseReg)) {
 939       Lose();
 940       return;
 941     }
 942     RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
 943     if (isLoser())
 944       return;
 945   }
 946
 947   // Determine how many (unfolded) adds we'll need inside the loop.
 948   size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
 949   if (NumBaseParts > 1)
 950     // Do not count the base and a possible second register if the target
 951     // allows to fold 2 registers.
 952     NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F));
 953
 954   // Accumulate non-free scaling amounts.
 955   ScaleCost += getScalingFactorCost(TTI, LU, F);
 956
 957   // Tally up the non-zero immediates.
 958   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
 959        E = Offsets.end(); I != E; ++I) {
 960     int64_t Offset = (uint64_t)*I + F.BaseOffset;
 961     if (F.BaseGV)
 962       ImmCost += 64; // Handle symbolic values conservatively.
 963                      // TODO: This should probably be the pointer size.
 964     else if (Offset != 0)
 965       ImmCost += APInt(64, Offset, true).getMinSignedBits();
 966   }
 967   assert(isValid() && "invalid cost");
 968 }
 969
 970 /// Lose - Set this cost to a losing value.
 971 void Cost::Lose() {
 972   NumRegs = ~0u;
 973   AddRecCost = ~0u;
 974   NumIVMuls = ~0u;
 975   NumBaseAdds = ~0u;
 976   ImmCost = ~0u;
 977   SetupCost = ~0u;
 978   ScaleCost = ~0u;
 979 }
 980
 981 /// operator< - Choose the lower cost.
 982 bool Cost::operator<(const Cost &Other) const {
 983   return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
 984                   ImmCost, SetupCost) <
 985          std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
 986                   Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost,
 987                   Other.SetupCost);
 988 }
 989
 990 void Cost::print(raw_ostream &OS) const {
 991   OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
 992   if (AddRecCost != 0)
 993     OS << ", with addrec cost " << AddRecCost;
 994   if (NumIVMuls != 0)
 995     OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
 996   if (NumBaseAdds != 0)
 997     OS << ", plus " << NumBaseAdds << " base add"
 998        << (NumBaseAdds == 1 ? "" : "s");
 999   if (ScaleCost != 0)
1000     OS << ", plus " << ScaleCost << " scale cost";
1001   if (ImmCost != 0)
1002     OS << ", plus " << ImmCost << " imm cost";
1003   if (SetupCost != 0)
1004     OS << ", plus " << SetupCost << " setup cost";
1005 }
1006
1007 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1008 void Cost::dump() const {
1009   print(errs()); errs() << '\n';
1010 }
1011 #endif
1012
1013 namespace {
1014
1015 /// LSRFixup - An operand value in an instruction which is to be replaced
1016 /// with some equivalent, possibly strength-reduced, replacement.
1017 struct LSRFixup {
1018   /// UserInst - The instruction which will be updated.
1019   Instruction *UserInst;
1020
1021   /// OperandValToReplace - The operand of the instruction which will
1022   /// be replaced. The operand may be used more than once; every instance
1023   /// will be replaced.
1024   Value *OperandValToReplace;
1025
1026   /// PostIncLoops - If this user is to use the post-incremented value of an
1027   /// induction variable, this variable is non-null and holds the loop
1028   /// associated with the induction variable.
1029   PostIncLoopSet PostIncLoops;
1030
1031   /// LUIdx - The index of the LSRUse describing the expression which
1032   /// this fixup needs, minus an offset (below).
1033   size_t LUIdx;
1034
1035   /// Offset - A constant offset to be added to the LSRUse expression.
1036   /// This allows multiple fixups to share the same LSRUse with different
1037   /// offsets, for example in an unrolled loop.
1038   int64_t Offset;
1039
1040   bool isUseFullyOutsideLoop(const Loop *L) const;
1041
1042   LSRFixup();
1043
1044   void print(raw_ostream &OS) const;
1045   void dump() const;
1046 };
1047
1048 }
1049
1050 LSRFixup::LSRFixup()
1051   : UserInst(0), OperandValToReplace(0), LUIdx(~size_t(0)), Offset(0) {}
1052
1053 /// isUseFullyOutsideLoop - Test whether this fixup always uses its
1054 /// value outside of the given loop.
1055 bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1056   // PHI nodes use their value in their incoming blocks.
1057   if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1058     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1059       if (PN->getIncomingValue(i) == OperandValToReplace &&
1060           L->contains(PN->getIncomingBlock(i)))
1061         return false;
1062     return true;
1063   }
1064
1065   return !L->contains(UserInst);
1066 }
1067
1068 void LSRFixup::print(raw_ostream &OS) const {
1069   OS << "UserInst=";
1070   // Store is common and interesting enough to be worth special-casing.
1071   if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1072     OS << "store ";
1073     Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1074   } else if (UserInst->getType()->isVoidTy())
1075     OS << UserInst->getOpcodeName();
1076   else
1077     UserInst->printAsOperand(OS, /*PrintType=*/false);
1078
1079   OS << ", OperandValToReplace=";
1080   OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1081
1082   for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(),
1083        E = PostIncLoops.end(); I != E; ++I) {
1084     OS << ", PostIncLoop=";
1085     (*I)->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1086   }
1087
1088   if (LUIdx != ~size_t(0))
1089     OS << ", LUIdx=" << LUIdx;
1090
1091   if (Offset != 0)
1092     OS << ", Offset=" << Offset;
1093 }
1094
1095 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1096 void LSRFixup::dump() const {
1097   print(errs()); errs() << '\n';
1098 }
1099 #endif
1100
1101 namespace {
1102
1103 /// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding
1104 /// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*.
1105 struct UniquifierDenseMapInfo {
1106   static SmallVector<const SCEV *, 4> getEmptyKey() {
1107     SmallVector<const SCEV *, 4>  V;
1108     V.push_back(reinterpret_cast<const SCEV *>(-1));
1109     return V;
1110   }
1111
1112   static SmallVector<const SCEV *, 4> getTombstoneKey() {
1113     SmallVector<const SCEV *, 4> V;
1114     V.push_back(reinterpret_cast<const SCEV *>(-2));
1115     return V;
1116   }
1117
1118   static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
1119     return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
1120   }
1121
1122   static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
1123                       const SmallVector<const SCEV *, 4> &RHS) {
1124     return LHS == RHS;
1125   }
1126 };
1127
1128 /// LSRUse - This class holds the state that LSR keeps for each use in
1129 /// IVUsers, as well as uses invented by LSR itself. It includes information
1130 /// about what kinds of things can be folded into the user, information about
1131 /// the user itself, and information about how the use may be satisfied.
1132 /// TODO: Represent multiple users of the same expression in common?
1133 class LSRUse {
1134   DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
1135
1136 public:
1137   /// KindType - An enum for a kind of use, indicating what types of
1138   /// scaled and immediate operands it might support.
1139   enum KindType {
1140     Basic,   ///< A normal use, with no folding.
1141     Special, ///< A special case of basic, allowing -1 scales.
1142     Address, ///< An address use; folding according to TargetLowering
1143     ICmpZero ///< An equality icmp with both operands folded into one.
1144     // TODO: Add a generic icmp too?
1145   };
1146
1147   typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;
1148
1149   KindType Kind;
1150   Type *AccessTy;
1151
1152   SmallVector<int64_t, 8> Offsets;
1153   int64_t MinOffset;
1154   int64_t MaxOffset;
1155
1156   /// AllFixupsOutsideLoop - This records whether all of the fixups using this
1157   /// LSRUse are outside of the loop, in which case some special-case heuristics
1158   /// may be used.
1159   bool AllFixupsOutsideLoop;
1160
1161   /// RigidFormula is set to true to guarantee that this use will be associated
1162   /// with a single formula--the one that initially matched. Some SCEV
1163   /// expressions cannot be expanded. This allows LSR to consider the registers
1164   /// used by those expressions without the need to expand them later after
1165   /// changing the formula.
1166   bool RigidFormula;
1167
1168   /// WidestFixupType - This records the widest use type for any fixup using
1169   /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different
1170   /// max fixup widths to be equivalent, because the narrower one may be relying
1171   /// on the implicit truncation to truncate away bogus bits.
1172   Type *WidestFixupType;
1173
1174   /// Formulae - A list of ways to build a value that can satisfy this user.
1175   /// After the list is populated, one of these is selected heuristically and
1176   /// used to formulate a replacement for OperandValToReplace in UserInst.
1177   SmallVector<Formula, 12> Formulae;
1178
1179   /// Regs - The set of register candidates used by all formulae in this LSRUse.
1180   SmallPtrSet<const SCEV *, 4> Regs;
1181
1182   LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T),
1183                                       MinOffset(INT64_MAX),
1184                                       MaxOffset(INT64_MIN),
1185                                       AllFixupsOutsideLoop(true),
1186                                       RigidFormula(false),
1187                                       WidestFixupType(0) {}
1188
1189   bool HasFormulaWithSameRegs(const Formula &F) const;
1190   bool InsertFormula(const Formula &F);
1191   void DeleteFormula(Formula &F);
1192   void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1193
1194   void print(raw_ostream &OS) const;
1195   void dump() const;
1196 };
1197
1198 }
1199
1200 /// HasFormula - Test whether this use as a formula which has the same
1201 /// registers as the given formula.
1202 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1203   SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1204   if (F.ScaledReg) Key.push_back(F.ScaledReg);
1205   // Unstable sort by host order ok, because this is only used for uniquifying.
1206   std::sort(Key.begin(), Key.end());
1207   return Uniquifier.count(Key);
1208 }
1209
1210 /// InsertFormula - If the given formula has not yet been inserted, add it to
1211 /// the list, and return true. Return false otherwise.
1212 bool LSRUse::InsertFormula(const Formula &F) {
1213   if (!Formulae.empty() && RigidFormula)
1214     return false;
1215
1216   SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1217   if (F.ScaledReg) Key.push_back(F.ScaledReg);
1218   // Unstable sort by host order ok, because this is only used for uniquifying.
1219   std::sort(Key.begin(), Key.end());
1220
1221   if (!Uniquifier.insert(Key).second)
1222     return false;
1223
1224   // Using a register to hold the value of 0 is not profitable.
1225   assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1226          "Zero allocated in a scaled register!");
1227 #ifndef NDEBUG
1228   for (SmallVectorImpl<const SCEV *>::const_iterator I =
1229        F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I)
1230     assert(!(*I)->isZero() && "Zero allocated in a base register!");
1231 #endif
1232
1233   // Add the formula to the list.
1234   Formulae.push_back(F);
1235
1236   // Record registers now being used by this use.
1237   Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1238
1239   return true;
1240 }
1241
1242 /// DeleteFormula - Remove the given formula from this use's list.
1243 void LSRUse::DeleteFormula(Formula &F) {
1244   if (&F != &Formulae.back())
1245     std::swap(F, Formulae.back());
1246   Formulae.pop_back();
1247 }
1248
1249 /// RecomputeRegs - Recompute the Regs field, and update RegUses.
1250 void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1251   // Now that we've filtered out some formulae, recompute the Regs set.
1252   SmallPtrSet<const SCEV *, 4> OldRegs = Regs;
1253   Regs.clear();
1254   for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(),
1255        E = Formulae.end(); I != E; ++I) {
1256     const Formula &F = *I;
1257     if (F.ScaledReg) Regs.insert(F.ScaledReg);
1258     Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1259   }
1260
1261   // Update the RegTracker.
1262   for (SmallPtrSet<const SCEV *, 4>::iterator I = OldRegs.begin(),
1263        E = OldRegs.end(); I != E; ++I)
1264     if (!Regs.count(*I))
1265       RegUses.DropRegister(*I, LUIdx);
1266 }
1267
1268 void LSRUse::print(raw_ostream &OS) const {
1269   OS << "LSR Use: Kind=";
1270   switch (Kind) {
1271   case Basic:    OS << "Basic"; break;
1272   case Special:  OS << "Special"; break;
1273   case ICmpZero: OS << "ICmpZero"; break;
1274   case Address:
1275     OS << "Address of ";
1276     if (AccessTy->isPointerTy())
1277       OS << "pointer"; // the full pointer type could be really verbose
1278     else
1279       OS << *AccessTy;
1280   }
1281
1282   OS << ", Offsets={";
1283   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
1284        E = Offsets.end(); I != E; ++I) {
1285     OS << *I;
1286     if (std::next(I) != E)
1287       OS << ',';
1288   }
1289   OS << '}';
1290
1291   if (AllFixupsOutsideLoop)
1292     OS << ", all-fixups-outside-loop";
1293
1294   if (WidestFixupType)
1295     OS << ", widest fixup type: " << *WidestFixupType;
1296 }
1297
1298 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1299 void LSRUse::dump() const {
1300   print(errs()); errs() << '\n';
1301 }
1302 #endif
1303
1304 /// isLegalUse - Test whether the use described by AM is "legal", meaning it can
1305 /// be completely folded into the user instruction at isel time. This includes
1306 /// address-mode folding and special icmp tricks.
1307 static bool isLegalUse(const TargetTransformInfo &TTI, LSRUse::KindType Kind,
1308                        Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset,
1309                        bool HasBaseReg, int64_t Scale) {
1310   switch (Kind) {
1311   case LSRUse::Address:
1312     return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale);
1313
1314     // Otherwise, just guess that reg+reg addressing is legal.
1315     //return ;
1316
1317   case LSRUse::ICmpZero:
1318     // There's not even a target hook for querying whether it would be legal to
1319     // fold a GV into an ICmp.
1320     if (BaseGV)
1321       return false;
1322
1323     // ICmp only has two operands; don't allow more than two non-trivial parts.
1324     if (Scale != 0 && HasBaseReg && BaseOffset != 0)
1325       return false;
1326
1327     // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1328     // putting the scaled register in the other operand of the icmp.
1329     if (Scale != 0 && Scale != -1)
1330       return false;
1331
1332     // If we have low-level target information, ask the target if it can fold an
1333     // integer immediate on an icmp.
1334     if (BaseOffset != 0) {
1335       // We have one of:
1336       // ICmpZero     BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1337       // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1338       // Offs is the ICmp immediate.
1339       if (Scale == 0)
1340         // The cast does the right thing with INT64_MIN.
1341         BaseOffset = -(uint64_t)BaseOffset;
1342       return TTI.isLegalICmpImmediate(BaseOffset);
1343     }
1344
1345     // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1346     return true;
1347
1348   case LSRUse::Basic:
1349     // Only handle single-register values.
1350     return !BaseGV && Scale == 0 && BaseOffset == 0;
1351
1352   case LSRUse::Special:
1353     // Special case Basic to handle -1 scales.
1354     return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
1355   }
1356
1357   llvm_unreachable("Invalid LSRUse Kind!");
1358 }
1359
1360 static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
1361                        int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy,
1362                        GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg,
1363                        int64_t Scale) {
1364   // Check for overflow.
1365   if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
1366       (MinOffset > 0))
1367     return false;
1368   MinOffset = (uint64_t)BaseOffset + MinOffset;
1369   if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
1370       (MaxOffset > 0))
1371     return false;
1372   MaxOffset = (uint64_t)BaseOffset + MaxOffset;
1373
1374   return isLegalUse(TTI, Kind, AccessTy, BaseGV, MinOffset, HasBaseReg,
1375                     Scale) &&
1376          isLegalUse(TTI, Kind, AccessTy, BaseGV, MaxOffset, HasBaseReg, Scale);
1377 }
1378
1379 static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
1380                        int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy,
1381                        const Formula &F) {
1382   return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1383                     F.BaseOffset, F.HasBaseReg, F.Scale);
1384 }
1385
1386 static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU,
1387                              const Formula &F) {
1388   // If F is used as an Addressing Mode, it may fold one Base plus one
1389   // scaled register. If the scaled register is nil, do as if another
1390   // element of the base regs is a 1-scaled register.
1391   // This is possible if BaseRegs has at least 2 registers.
1392
1393   // If this is not an address calculation, this is not an addressing mode
1394   // use.
1395   if (LU.Kind !=  LSRUse::Address)
1396     return false;
1397
1398   // F is already scaled.
1399   if (F.Scale != 0)
1400     return false;
1401
1402   // We need to keep one register for the base and one to scale.
1403   if (F.BaseRegs.size() < 2)
1404     return false;
1405
1406   return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
1407                     F.BaseGV, F.BaseOffset, F.HasBaseReg, 1);
1408  }
1409
1410 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
1411                                      const LSRUse &LU, const Formula &F) {
1412   if (!F.Scale)
1413     return 0;
1414   assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1415                     LU.AccessTy, F) && "Illegal formula in use.");
1416
1417   switch (LU.Kind) {
1418   case LSRUse::Address: {
1419     // Check the scaling factor cost with both the min and max offsets.
1420     int ScaleCostMinOffset =
1421       TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV,
1422                                F.BaseOffset + LU.MinOffset,
1423                                F.HasBaseReg, F.Scale);
1424     int ScaleCostMaxOffset =
1425       TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV,
1426                                F.BaseOffset + LU.MaxOffset,
1427                                F.HasBaseReg, F.Scale);
1428
1429     assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
1430            "Legal addressing mode has an illegal cost!");
1431     return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1432   }
1433   case LSRUse::ICmpZero:
1434     // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg.
1435     // Therefore, return 0 in case F.Scale == -1.
1436     return F.Scale != -1;
1437
1438   case LSRUse::Basic:
1439   case LSRUse::Special:
1440     return 0;
1441   }
1442
1443   llvm_unreachable("Invalid LSRUse Kind!");
1444 }
1445
1446 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
1447                              LSRUse::KindType Kind, Type *AccessTy,
1448                              GlobalValue *BaseGV, int64_t BaseOffset,
1449                              bool HasBaseReg) {
1450   // Fast-path: zero is always foldable.
1451   if (BaseOffset == 0 && !BaseGV) return true;
1452
1453   // Conservatively, create an address with an immediate and a
1454   // base and a scale.
1455   int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1456
1457   // Canonicalize a scale of 1 to a base register if the formula doesn't
1458   // already have a base register.
1459   if (!HasBaseReg && Scale == 1) {
1460     Scale = 0;
1461     HasBaseReg = true;
1462   }
1463
1464   return isLegalUse(TTI, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale);
1465 }
1466
1467 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
1468                              ScalarEvolution &SE, int64_t MinOffset,
1469                              int64_t MaxOffset, LSRUse::KindType Kind,
1470                              Type *AccessTy, const SCEV *S, bool HasBaseReg) {
1471   // Fast-path: zero is always foldable.
1472   if (S->isZero()) return true;
1473
1474   // Conservatively, create an address with an immediate and a
1475   // base and a scale.
1476   int64_t BaseOffset = ExtractImmediate(S, SE);
1477   GlobalValue *BaseGV = ExtractSymbol(S, SE);
1478
1479   // If there's anything else involved, it's not foldable.
1480   if (!S->isZero()) return false;
1481
1482   // Fast-path: zero is always foldable.
1483   if (BaseOffset == 0 && !BaseGV) return true;
1484
1485   // Conservatively, create an address with an immediate and a
1486   // base and a scale.
1487   int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1488
1489   return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1490                     BaseOffset, HasBaseReg, Scale);
1491 }
1492
1493 namespace {
1494
1495 /// IVInc - An individual increment in a Chain of IV increments.
1496 /// Relate an IV user to an expression that computes the IV it uses from the IV
1497 /// used by the previous link in the Chain.
1498 ///
1499 /// For the head of a chain, IncExpr holds the absolute SCEV expression for the
1500 /// original IVOperand. The head of the chain's IVOperand is only valid during
1501 /// chain collection, before LSR replaces IV users. During chain generation,
1502 /// IncExpr can be used to find the new IVOperand that computes the same
1503 /// expression.
1504 struct IVInc {
1505   Instruction *UserInst;
1506   Value* IVOperand;
1507   const SCEV *IncExpr;
1508
1509   IVInc(Instruction *U, Value *O, const SCEV *E):
1510     UserInst(U), IVOperand(O), IncExpr(E) {}
1511 };
1512
1513 // IVChain - The list of IV increments in program order.
1514 // We typically add the head of a chain without finding subsequent links.
1515 struct IVChain {
1516   SmallVector<IVInc,1> Incs;
1517   const SCEV *ExprBase;
1518
1519   IVChain() : ExprBase(0) {}
1520
1521   IVChain(const IVInc &Head, const SCEV *Base)
1522     : Incs(1, Head), ExprBase(Base) {}
1523
1524   typedef SmallVectorImpl<IVInc>::const_iterator const_iterator;
1525
1526   // begin - return the first increment in the chain.
1527   const_iterator begin() const {
1528     assert(!Incs.empty());
1529     return std::next(Incs.begin());
1530   }
1531   const_iterator end() const {
1532     return Incs.end();
1533   }
1534
1535   // hasIncs - Returns true if this chain contains any increments.
1536   bool hasIncs() const { return Incs.size() >= 2; }
1537
1538   // add - Add an IVInc to the end of this chain.
1539   void add(const IVInc &X) { Incs.push_back(X); }
1540
1541   // tailUserInst - Returns the last UserInst in the chain.
1542   Instruction *tailUserInst() const { return Incs.back().UserInst; }
1543
1544   // isProfitableIncrement - Returns true if IncExpr can be profitably added to
1545   // this chain.
1546   bool isProfitableIncrement(const SCEV *OperExpr,
1547                              const SCEV *IncExpr,
1548                              ScalarEvolution&);
1549 };
1550
1551 /// ChainUsers - Helper for CollectChains to track multiple IV increment uses.
1552 /// Distinguish between FarUsers that definitely cross IV increments and
1553 /// NearUsers that may be used between IV increments.
1554 struct ChainUsers {
1555   SmallPtrSet<Instruction*, 4> FarUsers;
1556   SmallPtrSet<Instruction*, 4> NearUsers;
1557 };
1558
1559 /// LSRInstance - This class holds state for the main loop strength reduction
1560 /// logic.
1561 class LSRInstance {
1562   IVUsers &IU;
1563   ScalarEvolution &SE;
1564   DominatorTree &DT;
1565   LoopInfo &LI;
1566   const TargetTransformInfo &TTI;
1567   Loop *const L;
1568   bool Changed;
1569
1570   /// IVIncInsertPos - This is the insert position that the current loop's
1571   /// induction variable increment should be placed. In simple loops, this is
1572   /// the latch block's terminator. But in more complicated cases, this is a
1573   /// position which will dominate all the in-loop post-increment users.
1574   Instruction *IVIncInsertPos;
1575
1576   /// Factors - Interesting factors between use strides.
1577   SmallSetVector<int64_t, 8> Factors;
1578
1579   /// Types - Interesting use types, to facilitate truncation reuse.
1580   SmallSetVector<Type *, 4> Types;
1581
1582   /// Fixups - The list of operands which are to be replaced.
1583   SmallVector<LSRFixup, 16> Fixups;
1584
1585   /// Uses - The list of interesting uses.
1586   SmallVector<LSRUse, 16> Uses;
1587
1588   /// RegUses - Track which uses use which register candidates.
1589   RegUseTracker RegUses;
1590
1591   // Limit the number of chains to avoid quadratic behavior. We don't expect to
1592   // have more than a few IV increment chains in a loop. Missing a Chain falls
1593   // back to normal LSR behavior for those uses.
1594   static const unsigned MaxChains = 8;
1595
1596   /// IVChainVec - IV users can form a chain of IV increments.
1597   SmallVector<IVChain, MaxChains> IVChainVec;
1598
1599   /// IVIncSet - IV users that belong to profitable IVChains.
1600   SmallPtrSet<Use*, MaxChains> IVIncSet;
1601
1602   void OptimizeShadowIV();
1603   bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
1604   ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
1605   void OptimizeLoopTermCond();
1606
1607   void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
1608                         SmallVectorImpl<ChainUsers> &ChainUsersVec);
1609   void FinalizeChain(IVChain &Chain);
1610   void CollectChains();
1611   void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
1612                        SmallVectorImpl<WeakVH> &DeadInsts);
1613
1614   void CollectInterestingTypesAndFactors();
1615   void CollectFixupsAndInitialFormulae();
1616
1617   LSRFixup &getNewFixup() {
1618     Fixups.push_back(LSRFixup());
1619     return Fixups.back();
1620   }
1621
1622   // Support for sharing of LSRUses between LSRFixups.
1623   typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy;
1624   UseMapTy UseMap;
1625
1626   bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
1627                           LSRUse::KindType Kind, Type *AccessTy);
1628
1629   std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
1630                                     LSRUse::KindType Kind,
1631                                     Type *AccessTy);
1632
1633   void DeleteUse(LSRUse &LU, size_t LUIdx);
1634
1635   LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
1636
1637   void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1638   void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1639   void CountRegisters(const Formula &F, size_t LUIdx);
1640   bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
1641
1642   void CollectLoopInvariantFixupsAndFormulae();
1643
1644   void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
1645                               unsigned Depth = 0);
1646   void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
1647   void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
1648   void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
1649   void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
1650   void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
1651   void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
1652   void GenerateCrossUseConstantOffsets();
1653   void GenerateAllReuseFormulae();
1654
1655   void FilterOutUndesirableDedicatedRegisters();
1656
1657   size_t EstimateSearchSpaceComplexity() const;
1658   void NarrowSearchSpaceByDetectingSupersets();
1659   void NarrowSearchSpaceByCollapsingUnrolledCode();
1660   void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
1661   void NarrowSearchSpaceByPickingWinnerRegs();
1662   void NarrowSearchSpaceUsingHeuristics();
1663
1664   void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
1665                     Cost &SolutionCost,
1666                     SmallVectorImpl<const Formula *> &Workspace,
1667                     const Cost &CurCost,
1668                     const SmallPtrSet<const SCEV *, 16> &CurRegs,
1669                     DenseSet<const SCEV *> &VisitedRegs) const;
1670   void Solve(SmallVectorImpl<const Formula *> &Solution) const;
1671
1672   BasicBlock::iterator
1673     HoistInsertPosition(BasicBlock::iterator IP,
1674                         const SmallVectorImpl<Instruction *> &Inputs) const;
1675   BasicBlock::iterator
1676     AdjustInsertPositionForExpand(BasicBlock::iterator IP,
1677                                   const LSRFixup &LF,
1678                                   const LSRUse &LU,
1679                                   SCEVExpander &Rewriter) const;
1680
1681   Value *Expand(const LSRFixup &LF,
1682                 const Formula &F,
1683                 BasicBlock::iterator IP,
1684                 SCEVExpander &Rewriter,
1685                 SmallVectorImpl<WeakVH> &DeadInsts) const;
1686   void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
1687                      const Formula &F,
1688                      SCEVExpander &Rewriter,
1689                      SmallVectorImpl<WeakVH> &DeadInsts,
1690                      Pass *P) const;
1691   void Rewrite(const LSRFixup &LF,
1692                const Formula &F,
1693                SCEVExpander &Rewriter,
1694                SmallVectorImpl<WeakVH> &DeadInsts,
1695                Pass *P) const;
1696   void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
1697                          Pass *P);
1698
1699 public:
1700   LSRInstance(Loop *L, Pass *P);
1701
1702   bool getChanged() const { return Changed; }
1703
1704   void print_factors_and_types(raw_ostream &OS) const;
1705   void print_fixups(raw_ostream &OS) const;
1706   void print_uses(raw_ostream &OS) const;
1707   void print(raw_ostream &OS) const;
1708   void dump() const;
1709 };
1710
1711 }
1712
1713 /// OptimizeShadowIV - If IV is used in a int-to-float cast
1714 /// inside the loop then try to eliminate the cast operation.
1715 void LSRInstance::OptimizeShadowIV() {
1716   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1717   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1718     return;
1719
1720   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
1721        UI != E; /* empty */) {
1722     IVUsers::const_iterator CandidateUI = UI;
1723     ++UI;
1724     Instruction *ShadowUse = CandidateUI->getUser();
1725     Type *DestTy = 0;
1726     bool IsSigned = false;
1727
1728     /* If shadow use is a int->float cast then insert a second IV
1729        to eliminate this cast.
1730
1731          for (unsigned i = 0; i < n; ++i)
1732            foo((double)i);
1733
1734        is transformed into
1735
1736          double d = 0.0;
1737          for (unsigned i = 0; i < n; ++i, ++d)
1738            foo(d);
1739     */
1740     if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
1741       IsSigned = false;
1742       DestTy = UCast->getDestTy();
1743     }
1744     else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
1745       IsSigned = true;
1746       DestTy = SCast->getDestTy();
1747     }
1748     if (!DestTy) continue;
1749
1750     // If target does not support DestTy natively then do not apply
1751     // this transformation.
1752     if (!TTI.isTypeLegal(DestTy)) continue;
1753
1754     PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
1755     if (!PH) continue;
1756     if (PH->getNumIncomingValues() != 2) continue;
1757
1758     Type *SrcTy = PH->getType();
1759     int Mantissa = DestTy->getFPMantissaWidth();
1760     if (Mantissa == -1) continue;
1761     if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
1762       continue;
1763
1764     unsigned Entry, Latch;
1765     if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
1766       Entry = 0;
1767       Latch = 1;
1768     } else {
1769       Entry = 1;
1770       Latch = 0;
1771     }
1772
1773     ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
1774     if (!Init) continue;
1775     Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
1776                                         (double)Init->getSExtValue() :
1777                                         (double)Init->getZExtValue());
1778
1779     BinaryOperator *Incr =
1780       dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
1781     if (!Incr) continue;
1782     if (Incr->getOpcode() != Instruction::Add
1783         && Incr->getOpcode() != Instruction::Sub)
1784       continue;
1785
1786     /* Initialize new IV, double d = 0.0 in above example. */
1787     ConstantInt *C = 0;
1788     if (Incr->getOperand(0) == PH)
1789       C = dyn_cast<ConstantInt>(Incr->getOperand(1));
1790     else if (Incr->getOperand(1) == PH)
1791       C = dyn_cast<ConstantInt>(Incr->getOperand(0));
1792     else
1793       continue;
1794
1795     if (!C) continue;
1796
1797     // Ignore negative constants, as the code below doesn't handle them
1798     // correctly. TODO: Remove this restriction.
1799     if (!C->getValue().isStrictlyPositive()) continue;
1800
1801     /* Add new PHINode. */
1802     PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
1803
1804     /* create new increment. '++d' in above example. */
1805     Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
1806     BinaryOperator *NewIncr =
1807       BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
1808                                Instruction::FAdd : Instruction::FSub,
1809                              NewPH, CFP, "IV.S.next.", Incr);
1810
1811     NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
1812     NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
1813
1814     /* Remove cast operation */
1815     ShadowUse->replaceAllUsesWith(NewPH);
1816     ShadowUse->eraseFromParent();
1817     Changed = true;
1818     break;
1819   }
1820 }
1821
1822 /// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
1823 /// set the IV user and stride information and return true, otherwise return
1824 /// false.
1825 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
1826   for (IVUsers::iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
1827     if (UI->getUser() == Cond) {
1828       // NOTE: we could handle setcc instructions with multiple uses here, but
1829       // InstCombine does it as well for simple uses, it's not clear that it
1830       // occurs enough in real life to handle.
1831       CondUse = UI;
1832       return true;
1833     }
1834   return false;
1835 }
1836
1837 /// OptimizeMax - Rewrite the loop's terminating condition if it uses
1838 /// a max computation.
1839 ///
1840 /// This is a narrow solution to a specific, but acute, problem. For loops
1841 /// like this:
1842 ///
1843 ///   i = 0;
1844 ///   do {
1845 ///     p[i] = 0.0;
1846 ///   } while (++i < n);
1847 ///
1848 /// the trip count isn't just 'n', because 'n' might not be positive. And
1849 /// unfortunately this can come up even for loops where the user didn't use
1850 /// a C do-while loop. For example, seemingly well-behaved top-test loops
1851 /// will commonly be lowered like this:
1852 //
1853 ///   if (n > 0) {
1854 ///     i = 0;
1855 ///     do {
1856 ///       p[i] = 0.0;
1857 ///     } while (++i < n);
1858 ///   }
1859 ///
1860 /// and then it's possible for subsequent optimization to obscure the if
1861 /// test in such a way that indvars can't find it.
1862 ///
1863 /// When indvars can't find the if test in loops like this, it creates a
1864 /// max expression, which allows it to give the loop a canonical
1865 /// induction variable:
1866 ///
1867 ///   i = 0;
1868 ///   max = n < 1 ? 1 : n;
1869 ///   do {
1870 ///     p[i] = 0.0;
1871 ///   } while (++i != max);
1872 ///
1873 /// Canonical induction variables are necessary because the loop passes
1874 /// are designed around them. The most obvious example of this is the
1875 /// LoopInfo analysis, which doesn't remember trip count values. It
1876 /// expects to be able to rediscover the trip count each time it is
1877 /// needed, and it does this using a simple analysis that only succeeds if
1878 /// the loop has a canonical induction variable.
1879 ///
1880 /// However, when it comes time to generate code, the maximum operation
1881 /// can be quite costly, especially if it's inside of an outer loop.
1882 ///
1883 /// This function solves this problem by detecting this type of loop and
1884 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
1885 /// the instructions for the maximum computation.
1886 ///
1887 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
1888   // Check that the loop matches the pattern we're looking for.
1889   if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
1890       Cond->getPredicate() != CmpInst::ICMP_NE)
1891     return Cond;
1892
1893   SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
1894   if (!Sel || !Sel->hasOneUse()) return Cond;
1895
1896   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1897   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1898     return Cond;
1899   const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
1900
1901   // Add one to the backedge-taken count to get the trip count.
1902   const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
1903   if (IterationCount != SE.getSCEV(Sel)) return Cond;
1904
1905   // Check for a max calculation that matches the pattern. There's no check
1906   // for ICMP_ULE here because the comparison would be with zero, which
1907   // isn't interesting.
1908   CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
1909   const SCEVNAryExpr *Max = 0;
1910   if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
1911     Pred = ICmpInst::ICMP_SLE;
1912     Max = S;
1913   } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
1914     Pred = ICmpInst::ICMP_SLT;
1915     Max = S;
1916   } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
1917     Pred = ICmpInst::ICMP_ULT;
1918     Max = U;
1919   } else {
1920     // No match; bail.
1921     return Cond;
1922   }
1923
1924   // To handle a max with more than two operands, this optimization would
1925   // require additional checking and setup.
1926   if (Max->getNumOperands() != 2)
1927     return Cond;
1928
1929   const SCEV *MaxLHS = Max->getOperand(0);
1930   const SCEV *MaxRHS = Max->getOperand(1);
1931
1932   // ScalarEvolution canonicalizes constants to the left. For < and >, look
1933   // for a comparison with 1. For <= and >=, a comparison with zero.
1934   if (!MaxLHS ||
1935       (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
1936     return Cond;
1937
1938   // Check the relevant induction variable for conformance to
1939   // the pattern.
1940   const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
1941   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
1942   if (!AR || !AR->isAffine() ||
1943       AR->getStart() != One ||
1944       AR->getStepRecurrence(SE) != One)
1945     return Cond;
1946
1947   assert(AR->getLoop() == L &&
1948          "Loop condition operand is an addrec in a different loop!");
1949
1950   // Check the right operand of the select, and remember it, as it will
1951   // be used in the new comparison instruction.
1952   Value *NewRHS = 0;
1953   if (ICmpInst::isTrueWhenEqual(Pred)) {
1954     // Look for n+1, and grab n.
1955     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
1956       if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
1957          if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
1958            NewRHS = BO->getOperand(0);
1959     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
1960       if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
1961         if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
1962           NewRHS = BO->getOperand(0);
1963     if (!NewRHS)
1964       return Cond;
1965   } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
1966     NewRHS = Sel->getOperand(1);
1967   else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
1968     NewRHS = Sel->getOperand(2);
1969   else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
1970     NewRHS = SU->getValue();
1971   else
1972     // Max doesn't match expected pattern.
1973     return Cond;
1974
1975   // Determine the new comparison opcode. It may be signed or unsigned,
1976   // and the original comparison may be either equality or inequality.
1977   if (Cond->getPredicate() == CmpInst::ICMP_EQ)
1978     Pred = CmpInst::getInversePredicate(Pred);
1979
1980   // Ok, everything looks ok to change the condition into an SLT or SGE and
1981   // delete the max calculation.
1982   ICmpInst *NewCond =
1983     new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
1984
1985   // Delete the max calculation instructions.
1986   Cond->replaceAllUsesWith(NewCond);
1987   CondUse->setUser(NewCond);
1988   Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
1989   Cond->eraseFromParent();
1990   Sel->eraseFromParent();
1991   if (Cmp->use_empty())
1992     Cmp->eraseFromParent();
1993   return NewCond;
1994 }
1995
1996 /// OptimizeLoopTermCond - Change loop terminating condition to use the
1997 /// postinc iv when possible.
1998 void
1999 LSRInstance::OptimizeLoopTermCond() {
2000   SmallPtrSet<Instruction *, 4> PostIncs;
2001
2002   BasicBlock *LatchBlock = L->getLoopLatch();
2003   SmallVector<BasicBlock*, 8> ExitingBlocks;
2004   L->getExitingBlocks(ExitingBlocks);
2005
2006   for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
2007     BasicBlock *ExitingBlock = ExitingBlocks[i];
2008
2009     // Get the terminating condition for the loop if possible.  If we
2010     // can, we want to change it to use a post-incremented version of its
2011     // induction variable, to allow coalescing the live ranges for the IV into
2012     // one register value.
2013
2014     BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2015     if (!TermBr)
2016       continue;
2017     // FIXME: Overly conservative, termination condition could be an 'or' etc..
2018     if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2019       continue;
2020
2021     // Search IVUsesByStride to find Cond's IVUse if there is one.
2022     IVStrideUse *CondUse = 0;
2023     ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
2024     if (!FindIVUserForCond(Cond, CondUse))
2025       continue;
2026
2027     // If the trip count is computed in terms of a max (due to ScalarEvolution
2028     // being unable to find a sufficient guard, for example), change the loop
2029     // comparison to use SLT or ULT instead of NE.
2030     // One consequence of doing this now is that it disrupts the count-down
2031     // optimization. That's not always a bad thing though, because in such
2032     // cases it may still be worthwhile to avoid a max.
2033     Cond = OptimizeMax(Cond, CondUse);
2034
2035     // If this exiting block dominates the latch block, it may also use
2036     // the post-inc value if it won't be shared with other uses.
2037     // Check for dominance.
2038     if (!DT.dominates(ExitingBlock, LatchBlock))
2039       continue;
2040
2041     // Conservatively avoid trying to use the post-inc value in non-latch
2042     // exits if there may be pre-inc users in intervening blocks.
2043     if (LatchBlock != ExitingBlock)
2044       for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
2045         // Test if the use is reachable from the exiting block. This dominator
2046         // query is a conservative approximation of reachability.
2047         if (&*UI != CondUse &&
2048             !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
2049           // Conservatively assume there may be reuse if the quotient of their
2050           // strides could be a legal scale.
2051           const SCEV *A = IU.getStride(*CondUse, L);
2052           const SCEV *B = IU.getStride(*UI, L);
2053           if (!A || !B) continue;
2054           if (SE.getTypeSizeInBits(A->getType()) !=
2055               SE.getTypeSizeInBits(B->getType())) {
2056             if (SE.getTypeSizeInBits(A->getType()) >
2057                 SE.getTypeSizeInBits(B->getType()))
2058               B = SE.getSignExtendExpr(B, A->getType());
2059             else
2060               A = SE.getSignExtendExpr(A, B->getType());
2061           }
2062           if (const SCEVConstant *D =
2063                 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2064             const ConstantInt *C = D->getValue();
2065             // Stride of one or negative one can have reuse with non-addresses.
2066             if (C->isOne() || C->isAllOnesValue())
2067               goto decline_post_inc;
2068             // Avoid weird situations.
2069             if (C->getValue().getMinSignedBits() >= 64 ||
2070                 C->getValue().isMinSignedValue())
2071               goto decline_post_inc;
2072             // Check for possible scaled-address reuse.
2073             Type *AccessTy = getAccessType(UI->getUser());
2074             int64_t Scale = C->getSExtValue();
2075             if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0,
2076                                           /*BaseOffset=*/ 0,
2077                                           /*HasBaseReg=*/ false, Scale))
2078               goto decline_post_inc;
2079             Scale = -Scale;
2080             if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0,
2081                                           /*BaseOffset=*/ 0,
2082                                           /*HasBaseReg=*/ false, Scale))
2083               goto decline_post_inc;
2084           }
2085         }
2086
2087     DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
2088                  << *Cond << '\n');
2089
2090     // It's possible for the setcc instruction to be anywhere in the loop, and
2091     // possible for it to have multiple users.  If it is not immediately before
2092     // the exiting block branch, move it.
2093     if (&*++BasicBlock::iterator(Cond) != TermBr) {
2094       if (Cond->hasOneUse()) {
2095         Cond->moveBefore(TermBr);
2096       } else {
2097         // Clone the terminating condition and insert into the loopend.
2098         ICmpInst *OldCond = Cond;
2099         Cond = cast<ICmpInst>(Cond->clone());
2100         Cond->setName(L->getHeader()->getName() + ".termcond");
2101         ExitingBlock->getInstList().insert(TermBr, Cond);
2102
2103         // Clone the IVUse, as the old use still exists!
2104         CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2105         TermBr->replaceUsesOfWith(OldCond, Cond);
2106       }
2107     }
2108
2109     // If we get to here, we know that we can transform the setcc instruction to
2110     // use the post-incremented version of the IV, allowing us to coalesce the
2111     // live ranges for the IV correctly.
2112     CondUse->transformToPostInc(L);
2113     Changed = true;
2114
2115     PostIncs.insert(Cond);
2116   decline_post_inc:;
2117   }
2118
2119   // Determine an insertion point for the loop induction variable increment. It
2120   // must dominate all the post-inc comparisons we just set up, and it must
2121   // dominate the loop latch edge.
2122   IVIncInsertPos = L->getLoopLatch()->getTerminator();
2123   for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(),
2124        E = PostIncs.end(); I != E; ++I) {
2125     BasicBlock *BB =
2126       DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
2127                                     (*I)->getParent());
2128     if (BB == (*I)->getParent())
2129       IVIncInsertPos = *I;
2130     else if (BB != IVIncInsertPos->getParent())
2131       IVIncInsertPos = BB->getTerminator();
2132   }
2133 }
2134
2135 /// reconcileNewOffset - Determine if the given use can accommodate a fixup
2136 /// at the given offset and other details. If so, update the use and
2137 /// return true.
2138 bool
2139 LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
2140                                 LSRUse::KindType Kind, Type *AccessTy) {
2141   int64_t NewMinOffset = LU.MinOffset;
2142   int64_t NewMaxOffset = LU.MaxOffset;
2143   Type *NewAccessTy = AccessTy;
2144
2145   // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2146   // something conservative, however this can pessimize in the case that one of
2147   // the uses will have all its uses outside the loop, for example.
2148   if (LU.Kind != Kind)
2149     return false;
2150   // Conservatively assume HasBaseReg is true for now.
2151   if (NewOffset < LU.MinOffset) {
2152     if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0,
2153                           LU.MaxOffset - NewOffset, HasBaseReg))
2154       return false;
2155     NewMinOffset = NewOffset;
2156   } else if (NewOffset > LU.MaxOffset) {
2157     if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0,
2158                           NewOffset - LU.MinOffset, HasBaseReg))
2159       return false;
2160     NewMaxOffset = NewOffset;
2161   }
2162   // Check for a mismatched access type, and fall back conservatively as needed.
2163   // TODO: Be less conservative when the type is similar and can use the same
2164   // addressing modes.
2165   if (Kind == LSRUse::Address && AccessTy != LU.AccessTy)
2166     NewAccessTy = Type::getVoidTy(AccessTy->getContext());
2167
2168   // Update the use.
2169   LU.MinOffset = NewMinOffset;
2170   LU.MaxOffset = NewMaxOffset;
2171   LU.AccessTy = NewAccessTy;
2172   if (NewOffset != LU.Offsets.back())
2173     LU.Offsets.push_back(NewOffset);
2174   return true;
2175 }
2176
2177 /// getUse - Return an LSRUse index and an offset value for a fixup which
2178 /// needs the given expression, with the given kind and optional access type.
2179 /// Either reuse an existing use or create a new one, as needed.
2180 std::pair<size_t, int64_t>
2181 LSRInstance::getUse(const SCEV *&Expr,
2182                     LSRUse::KindType Kind, Type *AccessTy) {
2183   const SCEV *Copy = Expr;
2184   int64_t Offset = ExtractImmediate(Expr, SE);
2185
2186   // Basic uses can't accept any offset, for example.
2187   if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0,
2188                         Offset, /*HasBaseReg=*/ true)) {
2189     Expr = Copy;
2190     Offset = 0;
2191   }
2192
2193   std::pair<UseMapTy::iterator, bool> P =
2194     UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
2195   if (!P.second) {
2196     // A use already existed with this base.
2197     size_t LUIdx = P.first->second;
2198     LSRUse &LU = Uses[LUIdx];
2199     if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2200       // Reuse this use.
2201       return std::make_pair(LUIdx, Offset);
2202   }
2203
2204   // Create a new use.
2205   size_t LUIdx = Uses.size();
2206   P.first->second = LUIdx;
2207   Uses.push_back(LSRUse(Kind, AccessTy));
2208   LSRUse &LU = Uses[LUIdx];
2209
2210   // We don't need to track redundant offsets, but we don't need to go out
2211   // of our way here to avoid them.
2212   if (LU.Offsets.empty() || Offset != LU.Offsets.back())
2213     LU.Offsets.push_back(Offset);
2214
2215   LU.MinOffset = Offset;
2216   LU.MaxOffset = Offset;
2217   return std::make_pair(LUIdx, Offset);
2218 }
2219
2220 /// DeleteUse - Delete the given use from the Uses list.
2221 void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2222   if (&LU != &Uses.back())
2223     std::swap(LU, Uses.back());
2224   Uses.pop_back();
2225
2226   // Update RegUses.
2227   RegUses.SwapAndDropUse(LUIdx, Uses.size());
2228 }
2229
2230 /// FindUseWithFormula - Look for a use distinct from OrigLU which is has
2231 /// a formula that has the same registers as the given formula.
2232 LSRUse *
2233 LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2234                                        const LSRUse &OrigLU) {
2235   // Search all uses for the formula. This could be more clever.
2236   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2237     LSRUse &LU = Uses[LUIdx];
2238     // Check whether this use is close enough to OrigLU, to see whether it's
2239     // worthwhile looking through its formulae.
2240     // Ignore ICmpZero uses because they may contain formulae generated by
2241     // GenerateICmpZeroScales, in which case adding fixup offsets may
2242     // be invalid.
2243     if (&LU != &OrigLU &&
2244         LU.Kind != LSRUse::ICmpZero &&
2245         LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2246         LU.WidestFixupType == OrigLU.WidestFixupType &&
2247         LU.HasFormulaWithSameRegs(OrigF)) {
2248       // Scan through this use's formulae.
2249       for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
2250            E = LU.Formulae.end(); I != E; ++I) {
2251         const Formula &F = *I;
2252         // Check to see if this formula has the same registers and symbols
2253         // as OrigF.
2254         if (F.BaseRegs == OrigF.BaseRegs &&
2255             F.ScaledReg == OrigF.ScaledReg &&
2256             F.BaseGV == OrigF.BaseGV &&
2257             F.Scale == OrigF.Scale &&
2258             F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2259           if (F.BaseOffset == 0)
2260             return &LU;
2261           // This is the formula where all the registers and symbols matched;
2262           // there aren't going to be any others. Since we declined it, we
2263           // can skip the rest of the formulae and proceed to the next LSRUse.
2264           break;
2265         }
2266       }
2267     }
2268   }
2269
2270   // Nothing looked good.
2271   return 0;
2272 }
2273
2274 void LSRInstance::CollectInterestingTypesAndFactors() {
2275   SmallSetVector<const SCEV *, 4> Strides;
2276
2277   // Collect interesting types and strides.
2278   SmallVector<const SCEV *, 4> Worklist;
2279   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
2280     const SCEV *Expr = IU.getExpr(*UI);
2281
2282     // Collect interesting types.
2283     Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2284
2285     // Add strides for mentioned loops.
2286     Worklist.push_back(Expr);
2287     do {
2288       const SCEV *S = Worklist.pop_back_val();
2289       if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2290         if (AR->getLoop() == L)
2291           Strides.insert(AR->getStepRecurrence(SE));
2292         Worklist.push_back(AR->getStart());
2293       } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2294         Worklist.append(Add->op_begin(), Add->op_end());
2295       }
2296     } while (!Worklist.empty());
2297   }
2298
2299   // Compute interesting factors from the set of interesting strides.
2300   for (SmallSetVector<const SCEV *, 4>::const_iterator
2301        I = Strides.begin(), E = Strides.end(); I != E; ++I)
2302     for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2303          std::next(I); NewStrideIter != E; ++NewStrideIter) {
2304       const SCEV *OldStride = *I;
2305       const SCEV *NewStride = *NewStrideIter;
2306
2307       if (SE.getTypeSizeInBits(OldStride->getType()) !=
2308           SE.getTypeSizeInBits(NewStride->getType())) {
2309         if (SE.getTypeSizeInBits(OldStride->getType()) >
2310             SE.getTypeSizeInBits(NewStride->getType()))
2311           NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2312         else
2313           OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2314       }
2315       if (const SCEVConstant *Factor =
2316             dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2317                                                         SE, true))) {
2318         if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
2319           Factors.insert(Factor->getValue()->getValue().getSExtValue());
2320       } else if (const SCEVConstant *Factor =
2321                    dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2322                                                                NewStride,
2323                                                                SE, true))) {
2324         if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
2325           Factors.insert(Factor->getValue()->getValue().getSExtValue());
2326       }
2327     }
2328
2329   // If all uses use the same type, don't bother looking for truncation-based
2330   // reuse.
2331   if (Types.size() == 1)
2332     Types.clear();
2333
2334   DEBUG(print_factors_and_types(dbgs()));
2335 }
2336
2337 /// findIVOperand - Helper for CollectChains that finds an IV operand (computed
2338 /// by an AddRec in this loop) within [OI,OE) or returns OE. If IVUsers mapped
2339 /// Instructions to IVStrideUses, we could partially skip this.
2340 static User::op_iterator
2341 findIVOperand(User::op_iterator OI, User::op_iterator OE,
2342               Loop *L, ScalarEvolution &SE) {
2343   for(; OI != OE; ++OI) {
2344     if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2345       if (!SE.isSCEVable(Oper->getType()))
2346         continue;
2347
2348       if (const SCEVAddRecExpr *AR =
2349           dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2350         if (AR->getLoop() == L)
2351           break;
2352       }
2353     }
2354   }
2355   return OI;
2356 }
2357
2358 /// getWideOperand - IVChain logic must consistenctly peek base TruncInst
2359 /// operands, so wrap it in a convenient helper.
2360 static Value *getWideOperand(Value *Oper) {
2361   if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2362     return Trunc->getOperand(0);
2363   return Oper;
2364 }
2365
2366 /// isCompatibleIVType - Return true if we allow an IV chain to include both
2367 /// types.
2368 static bool isCompatibleIVType(Value *LVal, Value *RVal) {
2369   Type *LType = LVal->getType();
2370   Type *RType = RVal->getType();
2371   return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());
2372 }
2373
2374 /// getExprBase - Return an approximation of this SCEV expression's "base", or
2375 /// NULL for any constant. Returning the expression itself is
2376 /// conservative. Returning a deeper subexpression is more precise and valid as
2377 /// long as it isn't less complex than another subexpression. For expressions
2378 /// involving multiple unscaled values, we need to return the pointer-type
2379 /// SCEVUnknown. This avoids forming chains across objects, such as:
2380 /// PrevOper==a[i], IVOper==b[i], IVInc==b-a.
2381 ///
2382 /// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2383 /// SCEVUnknown, we simply return the rightmost SCEV operand.
2384 static const SCEV *getExprBase(const SCEV *S) {
2385   switch (S->getSCEVType()) {
2386   default: // uncluding scUnknown.
2387     return S;
2388   case scConstant:
2389     return 0;
2390   case scTruncate:
2391     return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
2392   case scZeroExtend:
2393     return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
2394   case scSignExtend:
2395     return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
2396   case scAddExpr: {
2397     // Skip over scaled operands (scMulExpr) to follow add operands as long as
2398     // there's nothing more complex.
2399     // FIXME: not sure if we want to recognize negation.
2400     const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
2401     for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
2402            E(Add->op_begin()); I != E; ++I) {
2403       const SCEV *SubExpr = *I;
2404       if (SubExpr->getSCEVType() == scAddExpr)
2405         return getExprBase(SubExpr);
2406
2407       if (SubExpr->getSCEVType() != scMulExpr)
2408         return SubExpr;
2409     }
2410     return S; // all operands are scaled, be conservative.
2411   }
2412   case scAddRecExpr:
2413     return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
2414   }
2415 }
2416
2417 /// Return true if the chain increment is profitable to expand into a loop
2418 /// invariant value, which may require its own register. A profitable chain
2419 /// increment will be an offset relative to the same base. We allow such offsets
2420 /// to potentially be used as chain increment as long as it's not obviously
2421 /// expensive to expand using real instructions.
2422 bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
2423                                     const SCEV *IncExpr,
2424                                     ScalarEvolution &SE) {
2425   // Aggressively form chains when -stress-ivchain.
2426   if (StressIVChain)
2427     return true;
2428
2429   // Do not replace a constant offset from IV head with a nonconstant IV
2430   // increment.
2431   if (!isa<SCEVConstant>(IncExpr)) {
2432     const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
2433     if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
2434       return 0;
2435   }
2436
2437   SmallPtrSet<const SCEV*, 8> Processed;
2438   return !isHighCostExpansion(IncExpr, Processed, SE);
2439 }
2440
2441 /// Return true if the number of registers needed for the chain is estimated to
2442 /// be less than the number required for the individual IV users. First prohibit
2443 /// any IV users that keep the IV live across increments (the Users set should
2444 /// be empty). Next count the number and type of increments in the chain.
2445 ///
2446 /// Chaining IVs can lead to considerable code bloat if ISEL doesn't
2447 /// effectively use postinc addressing modes. Only consider it profitable it the
2448 /// increments can be computed in fewer registers when chained.
2449 ///
2450 /// TODO: Consider IVInc free if it's already used in another chains.
2451 static bool
2452 isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
2453                   ScalarEvolution &SE, const TargetTransformInfo &TTI) {
2454   if (StressIVChain)
2455     return true;
2456
2457   if (!Chain.hasIncs())
2458     return false;
2459
2460   if (!Users.empty()) {
2461     DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
2462           for (SmallPtrSet<Instruction*, 4>::const_iterator I = Users.begin(),
2463                  E = Users.end(); I != E; ++I) {
2464             dbgs() << "  " << **I << "\n";
2465           });
2466     return false;
2467   }
2468   assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
2469
2470   // The chain itself may require a register, so intialize cost to 1.
2471   int cost = 1;
2472
2473   // A complete chain likely eliminates the need for keeping the original IV in
2474   // a register. LSR does not currently know how to form a complete chain unless
2475   // the header phi already exists.
2476   if (isa<PHINode>(Chain.tailUserInst())
2477       && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
2478     --cost;
2479   }
2480   const SCEV *LastIncExpr = 0;
2481   unsigned NumConstIncrements = 0;
2482   unsigned NumVarIncrements = 0;
2483   unsigned NumReusedIncrements = 0;
2484   for (IVChain::const_iterator I = Chain.begin(), E = Chain.end();
2485        I != E; ++I) {
2486
2487     if (I->IncExpr->isZero())
2488       continue;
2489
2490     // Incrementing by zero or some constant is neutral. We assume constants can
2491     // be folded into an addressing mode or an add's immediate operand.
2492     if (isa<SCEVConstant>(I->IncExpr)) {
2493       ++NumConstIncrements;
2494       continue;
2495     }
2496
2497     if (I->IncExpr == LastIncExpr)
2498       ++NumReusedIncrements;
2499     else
2500       ++NumVarIncrements;
2501
2502     LastIncExpr = I->IncExpr;
2503   }
2504   // An IV chain with a single increment is handled by LSR's postinc
2505   // uses. However, a chain with multiple increments requires keeping the IV's
2506   // value live longer than it needs to be if chained.
2507   if (NumConstIncrements > 1)
2508     --cost;
2509
2510   // Materializing increment expressions in the preheader that didn't exist in
2511   // the original code may cost a register. For example, sign-extended array
2512   // indices can produce ridiculous increments like this:
2513   // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
2514   cost += NumVarIncrements;
2515
2516   // Reusing variable increments likely saves a register to hold the multiple of
2517   // the stride.
2518   cost -= NumReusedIncrements;
2519
2520   DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
2521                << "\n");
2522
2523   return cost < 0;
2524 }
2525
2526 /// ChainInstruction - Add this IV user to an existing chain or make it the head
2527 /// of a new chain.
2528 void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2529                                    SmallVectorImpl<ChainUsers> &ChainUsersVec) {
2530   // When IVs are used as types of varying widths, they are generally converted
2531   // to a wider type with some uses remaining narrow under a (free) trunc.
2532   Value *const NextIV = getWideOperand(IVOper);
2533   const SCEV *const OperExpr = SE.getSCEV(NextIV);
2534   const SCEV *const OperExprBase = getExprBase(OperExpr);
2535
2536   // Visit all existing chains. Check if its IVOper can be computed as a
2537   // profitable loop invariant increment from the last link in the Chain.
2538   unsigned ChainIdx = 0, NChains = IVChainVec.size();
2539   const SCEV *LastIncExpr = 0;
2540   for (; ChainIdx < NChains; ++ChainIdx) {
2541     IVChain &Chain = IVChainVec[ChainIdx];
2542
2543     // Prune the solution space aggressively by checking that both IV operands
2544     // are expressions that operate on the same unscaled SCEVUnknown. This
2545     // "base" will be canceled by the subsequent getMinusSCEV call. Checking
2546     // first avoids creating extra SCEV expressions.
2547     if (!StressIVChain && Chain.ExprBase != OperExprBase)
2548       continue;
2549
2550     Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
2551     if (!isCompatibleIVType(PrevIV, NextIV))
2552       continue;
2553
2554     // A phi node terminates a chain.
2555     if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
2556       continue;
2557
2558     // The increment must be loop-invariant so it can be kept in a register.
2559     const SCEV *PrevExpr = SE.getSCEV(PrevIV);
2560     const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
2561     if (!SE.isLoopInvariant(IncExpr, L))
2562       continue;
2563
2564     if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
2565       LastIncExpr = IncExpr;
2566       break;
2567     }
2568   }
2569   // If we haven't found a chain, create a new one, unless we hit the max. Don't
2570   // bother for phi nodes, because they must be last in the chain.
2571   if (ChainIdx == NChains) {
2572     if (isa<PHINode>(UserInst))
2573       return;
2574     if (NChains >= MaxChains && !StressIVChain) {
2575       DEBUG(dbgs() << "IV Chain Limit\n");
2576       return;
2577     }
2578     LastIncExpr = OperExpr;
2579     // IVUsers may have skipped over sign/zero extensions. We don't currently
2580     // attempt to form chains involving extensions unless they can be hoisted
2581     // into this loop's AddRec.
2582     if (!isa<SCEVAddRecExpr>(LastIncExpr))
2583       return;
2584     ++NChains;
2585     IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
2586                                  OperExprBase));
2587     ChainUsersVec.resize(NChains);
2588     DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
2589                  << ") IV=" << *LastIncExpr << "\n");
2590   } else {
2591     DEBUG(dbgs() << "IV Chain#" << ChainIdx << "  Inc: (" << *UserInst
2592                  << ") IV+" << *LastIncExpr << "\n");
2593     // Add this IV user to the end of the chain.
2594     IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
2595   }
2596   IVChain &Chain = IVChainVec[ChainIdx];
2597
2598   SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
2599   // This chain's NearUsers become FarUsers.
2600   if (!LastIncExpr->isZero()) {
2601     ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
2602                                             NearUsers.end());
2603     NearUsers.clear();
2604   }
2605
2606   // All other uses of IVOperand become near uses of the chain.
2607   // We currently ignore intermediate values within SCEV expressions, assuming
2608   // they will eventually be used be the current chain, or can be computed
2609   // from one of the chain increments. To be more precise we could
2610   // transitively follow its user and only add leaf IV users to the set.
2611   for (User *U : IVOper->users()) {
2612     Instruction *OtherUse = dyn_cast<Instruction>(U);
2613     if (!OtherUse)
2614       continue;
2615     // Uses in the chain will no longer be uses if the chain is formed.
2616     // Include the head of the chain in this iteration (not Chain.begin()).
2617     IVChain::const_iterator IncIter = Chain.Incs.begin();
2618     IVChain::const_iterator IncEnd = Chain.Incs.end();
2619     for( ; IncIter != IncEnd; ++IncIter) {
2620       if (IncIter->UserInst == OtherUse)
2621         break;
2622     }
2623     if (IncIter != IncEnd)
2624       continue;
2625
2626     if (SE.isSCEVable(OtherUse->getType())
2627         && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
2628         && IU.isIVUserOrOperand(OtherUse)) {
2629       continue;
2630     }
2631     NearUsers.insert(OtherUse);
2632   }
2633
2634   // Since this user is part of the chain, it's no longer considered a use
2635   // of the chain.
2636   ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
2637 }
2638
2639 /// CollectChains - Populate the vector of Chains.
2640 ///
2641 /// This decreases ILP at the architecture level. Targets with ample registers,
2642 /// multiple memory ports, and no register renaming probably don't want
2643 /// this. However, such targets should probably disable LSR altogether.
2644 ///
2645 /// The job of LSR is to make a reasonable choice of induction variables across
2646 /// the loop. Subsequent passes can easily "unchain" computation exposing more
2647 /// ILP *within the loop* if the target wants it.
2648 ///
2649 /// Finding the best IV chain is potentially a scheduling problem. Since LSR
2650 /// will not reorder memory operations, it will recognize this as a chain, but
2651 /// will generate redundant IV increments. Ideally this would be corrected later
2652 /// by a smart scheduler:
2653 ///        = A[i]
2654 ///        = A[i+x]
2655 /// A[i]   =
2656 /// A[i+x] =
2657 ///
2658 /// TODO: Walk the entire domtree within this loop, not just the path to the
2659 /// loop latch. This will discover chains on side paths, but requires
2660 /// maintaining multiple copies of the Chains state.
2661 void LSRInstance::CollectChains() {
2662   DEBUG(dbgs() << "Collecting IV Chains.\n");
2663   SmallVector<ChainUsers, 8> ChainUsersVec;
2664
2665   SmallVector<BasicBlock *,8> LatchPath;
2666   BasicBlock *LoopHeader = L->getHeader();
2667   for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
2668        Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
2669     LatchPath.push_back(Rung->getBlock());
2670   }
2671   LatchPath.push_back(LoopHeader);
2672
2673   // Walk the instruction stream from the loop header to the loop latch.
2674   for (SmallVectorImpl<BasicBlock *>::reverse_iterator
2675          BBIter = LatchPath.rbegin(), BBEnd = LatchPath.rend();
2676        BBIter != BBEnd; ++BBIter) {
2677     for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end();
2678          I != E; ++I) {
2679       // Skip instructions that weren't seen by IVUsers analysis.
2680       if (isa<PHINode>(I) || !IU.isIVUserOrOperand(I))
2681         continue;
2682
2683       // Ignore users that are part of a SCEV expression. This way we only
2684       // consider leaf IV Users. This effectively rediscovers a portion of
2685       // IVUsers analysis but in program order this time.
2686       if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(I)))
2687         continue;
2688
2689       // Remove this instruction from any NearUsers set it may be in.
2690       for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
2691            ChainIdx < NChains; ++ChainIdx) {
2692         ChainUsersVec[ChainIdx].NearUsers.erase(I);
2693       }
2694       // Search for operands that can be chained.
2695       SmallPtrSet<Instruction*, 4> UniqueOperands;
2696       User::op_iterator IVOpEnd = I->op_end();
2697       User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE);
2698       while (IVOpIter != IVOpEnd) {
2699         Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
2700         if (UniqueOperands.insert(IVOpInst))
2701           ChainInstruction(I, IVOpInst, ChainUsersVec);
2702         IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
2703       }
2704     } // Continue walking down the instructions.
2705   } // Continue walking down the domtree.
2706   // Visit phi backedges to determine if the chain can generate the IV postinc.
2707   for (BasicBlock::iterator I = L->getHeader()->begin();
2708        PHINode *PN = dyn_cast<PHINode>(I); ++I) {
2709     if (!SE.isSCEVable(PN->getType()))
2710       continue;
2711
2712     Instruction *IncV =
2713       dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch()));
2714     if (IncV)
2715       ChainInstruction(PN, IncV, ChainUsersVec);
2716   }
2717   // Remove any unprofitable chains.
2718   unsigned ChainIdx = 0;
2719   for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
2720        UsersIdx < NChains; ++UsersIdx) {
2721     if (!isProfitableChain(IVChainVec[UsersIdx],
2722                            ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
2723       continue;
2724     // Preserve the chain at UsesIdx.
2725     if (ChainIdx != UsersIdx)
2726       IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
2727     FinalizeChain(IVChainVec[ChainIdx]);
2728     ++ChainIdx;
2729   }
2730   IVChainVec.resize(ChainIdx);
2731 }
2732
2733 void LSRInstance::FinalizeChain(IVChain &Chain) {
2734   assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
2735   DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
2736
2737   for (IVChain::const_iterator I = Chain.begin(), E = Chain.end();
2738        I != E; ++I) {
2739     DEBUG(dbgs() << "        Inc: " << *I->UserInst << "\n");
2740     User::op_iterator UseI =
2741       std::find(I->UserInst->op_begin(), I->UserInst->op_end(), I->IVOperand);
2742     assert(UseI != I->UserInst->op_end() && "cannot find IV operand");
2743     IVIncSet.insert(UseI);
2744   }
2745 }
2746
2747 /// Return true if the IVInc can be folded into an addressing mode.
2748 static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
2749                              Value *Operand, const TargetTransformInfo &TTI) {
2750   const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
2751   if (!IncConst || !isAddressUse(UserInst, Operand))
2752     return false;
2753
2754   if (IncConst->getValue()->getValue().getMinSignedBits() > 64)
2755     return false;
2756
2757   int64_t IncOffset = IncConst->getValue()->getSExtValue();
2758   if (!isAlwaysFoldable(TTI, LSRUse::Address,
2759                         getAccessType(UserInst), /*BaseGV=*/ 0,
2760                         IncOffset, /*HaseBaseReg=*/ false))
2761     return false;
2762
2763   return true;
2764 }
2765
2766 /// GenerateIVChains - Generate an add or subtract for each IVInc in a chain to
2767 /// materialize the IV user's operand from the previous IV user's operand.
2768 void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
2769                                   SmallVectorImpl<WeakVH> &DeadInsts) {
2770   // Find the new IVOperand for the head of the chain. It may have been replaced
2771   // by LSR.
2772   const IVInc &Head = Chain.Incs[0];
2773   User::op_iterator IVOpEnd = Head.UserInst->op_end();
2774   // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
2775   User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
2776                                              IVOpEnd, L, SE);
2777   Value *IVSrc = 0;
2778   while (IVOpIter != IVOpEnd) {
2779     IVSrc = getWideOperand(*IVOpIter);
2780
2781     // If this operand computes the expression that the chain needs, we may use
2782     // it. (Check this after setting IVSrc which is used below.)
2783     //
2784     // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
2785     // narrow for the chain, so we can no longer use it. We do allow using a
2786     // wider phi, assuming the LSR checked for free truncation. In that case we
2787     // should already have a truncate on this operand such that
2788     // getSCEV(IVSrc) == IncExpr.
2789     if (SE.getSCEV(*IVOpIter) == Head.IncExpr
2790         || SE.getSCEV(IVSrc) == Head.IncExpr) {
2791       break;
2792     }
2793     IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
2794   }
2795   if (IVOpIter == IVOpEnd) {
2796     // Gracefully give up on this chain.
2797     DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
2798     return;
2799   }
2800
2801   DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
2802   Type *IVTy = IVSrc->getType();
2803   Type *IntTy = SE.getEffectiveSCEVType(IVTy);
2804   const SCEV *LeftOverExpr = 0;
2805   for (IVChain::const_iterator IncI = Chain.begin(),
2806          IncE = Chain.end(); IncI != IncE; ++IncI) {
2807
2808     Instruction *InsertPt = IncI->UserInst;
2809     if (isa<PHINode>(InsertPt))
2810       InsertPt = L->getLoopLatch()->getTerminator();
2811
2812     // IVOper will replace the current IV User's operand. IVSrc is the IV
2813     // value currently held in a register.
2814     Value *IVOper = IVSrc;
2815     if (!IncI->IncExpr->isZero()) {
2816       // IncExpr was the result of subtraction of two narrow values, so must
2817       // be signed.
2818       const SCEV *IncExpr = SE.getNoopOrSignExtend(IncI->IncExpr, IntTy);
2819       LeftOverExpr = LeftOverExpr ?
2820         SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
2821     }
2822     if (LeftOverExpr && !LeftOverExpr->isZero()) {
2823       // Expand the IV increment.
2824       Rewriter.clearPostInc();
2825       Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
2826       const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
2827                                              SE.getUnknown(IncV));
2828       IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
2829
2830       // If an IV increment can't be folded, use it as the next IV value.
2831       if (!canFoldIVIncExpr(LeftOverExpr, IncI->UserInst, IncI->IVOperand,
2832                             TTI)) {
2833         assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
2834         IVSrc = IVOper;
2835         LeftOverExpr = 0;
2836       }
2837     }
2838     Type *OperTy = IncI->IVOperand->getType();
2839     if (IVTy != OperTy) {
2840       assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
2841              "cannot extend a chained IV");
2842       IRBuilder<> Builder(InsertPt);
2843       IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
2844     }
2845     IncI->UserInst->replaceUsesOfWith(IncI->IVOperand, IVOper);
2846     DeadInsts.push_back(IncI->IVOperand);
2847   }
2848   // If LSR created a new, wider phi, we may also replace its postinc. We only
2849   // do this if we also found a wide value for the head of the chain.
2850   if (isa<PHINode>(Chain.tailUserInst())) {
2851     for (BasicBlock::iterator I = L->getHeader()->begin();
2852          PHINode *Phi = dyn_cast<PHINode>(I); ++I) {
2853       if (!isCompatibleIVType(Phi, IVSrc))
2854         continue;
2855       Instruction *PostIncV = dyn_cast<Instruction>(
2856         Phi->getIncomingValueForBlock(L->getLoopLatch()));
2857       if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
2858         continue;
2859       Value *IVOper = IVSrc;
2860       Type *PostIncTy = PostIncV->getType();
2861       if (IVTy != PostIncTy) {
2862         assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
2863         IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
2864         Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
2865         IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
2866       }
2867       Phi->replaceUsesOfWith(PostIncV, IVOper);
2868       DeadInsts.push_back(PostIncV);
2869     }
2870   }
2871 }
2872
2873 void LSRInstance::CollectFixupsAndInitialFormulae() {
2874   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
2875     Instruction *UserInst = UI->getUser();
2876     // Skip IV users that are part of profitable IV Chains.
2877     User::op_iterator UseI = std::find(UserInst->op_begin(), UserInst->op_end(),
2878                                        UI->getOperandValToReplace());
2879     assert(UseI != UserInst->op_end() && "cannot find IV operand");
2880     if (IVIncSet.count(UseI))
2881       continue;
2882
2883     // Record the uses.
2884     LSRFixup &LF = getNewFixup();
2885     LF.UserInst = UserInst;
2886     LF.OperandValToReplace = UI->getOperandValToReplace();
2887     LF.PostIncLoops = UI->getPostIncLoops();
2888
2889     LSRUse::KindType Kind = LSRUse::Basic;
2890     Type *AccessTy = 0;
2891     if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
2892       Kind = LSRUse::Address;
2893       AccessTy = getAccessType(LF.UserInst);
2894     }
2895
2896     const SCEV *S = IU.getExpr(*UI);
2897
2898     // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
2899     // (N - i == 0), and this allows (N - i) to be the expression that we work
2900     // with rather than just N or i, so we can consider the register
2901     // requirements for both N and i at the same time. Limiting this code to
2902     // equality icmps is not a problem because all interesting loops use
2903     // equality icmps, thanks to IndVarSimplify.
2904     if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
2905       if (CI->isEquality()) {
2906         // Swap the operands if needed to put the OperandValToReplace on the
2907         // left, for consistency.
2908         Value *NV = CI->getOperand(1);
2909         if (NV == LF.OperandValToReplace) {
2910           CI->setOperand(1, CI->getOperand(0));
2911           CI->setOperand(0, NV);
2912           NV = CI->getOperand(1);
2913           Changed = true;
2914         }
2915
2916         // x == y  -->  x - y == 0
2917         const SCEV *N = SE.getSCEV(NV);
2918         if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
2919           // S is normalized, so normalize N before folding it into S
2920           // to keep the result normalized.
2921           N = TransformForPostIncUse(Normalize, N, CI, 0,
2922                                      LF.PostIncLoops, SE, DT);
2923           Kind = LSRUse::ICmpZero;
2924           S = SE.getMinusSCEV(N, S);
2925         }
2926
2927         // -1 and the negations of all interesting strides (except the negation
2928         // of -1) are now also interesting.
2929         for (size_t i = 0, e = Factors.size(); i != e; ++i)
2930           if (Factors[i] != -1)
2931             Factors.insert(-(uint64_t)Factors[i]);
2932         Factors.insert(-1);
2933       }
2934
2935     // Set up the initial formula for this use.
2936     std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
2937     LF.LUIdx = P.first;
2938     LF.Offset = P.second;
2939     LSRUse &LU = Uses[LF.LUIdx];
2940     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
2941     if (!LU.WidestFixupType ||
2942         SE.getTypeSizeInBits(LU.WidestFixupType) <
2943         SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
2944       LU.WidestFixupType = LF.OperandValToReplace->getType();
2945
2946     // If this is the first use of this LSRUse, give it a formula.
2947     if (LU.Formulae.empty()) {
2948       InsertInitialFormula(S, LU, LF.LUIdx);
2949       CountRegisters(LU.Formulae.back(), LF.LUIdx);
2950     }
2951   }
2952
2953   DEBUG(print_fixups(dbgs()));
2954 }
2955
2956 /// InsertInitialFormula - Insert a formula for the given expression into
2957 /// the given use, separating out loop-variant portions from loop-invariant
2958 /// and loop-computable portions.
2959 void
2960 LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
2961   // Mark uses whose expressions cannot be expanded.
2962   if (!isSafeToExpand(S, SE))
2963     LU.RigidFormula = true;
2964
2965   Formula F;
2966   F.InitialMatch(S, L, SE);
2967   bool Inserted = InsertFormula(LU, LUIdx, F);
2968   assert(Inserted && "Initial formula already exists!"); (void)Inserted;
2969 }
2970
2971 /// InsertSupplementalFormula - Insert a simple single-register formula for
2972 /// the given expression into the given use.
2973 void
2974 LSRInstance::InsertSupplementalFormula(const SCEV *S,
2975                                        LSRUse &LU, size_t LUIdx) {
2976   Formula F;
2977   F.BaseRegs.push_back(S);
2978   F.HasBaseReg = true;
2979   bool Inserted = InsertFormula(LU, LUIdx, F);
2980   assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
2981 }
2982
2983 /// CountRegisters - Note which registers are used by the given formula,
2984 /// updating RegUses.
2985 void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
2986   if (F.ScaledReg)
2987     RegUses.CountRegister(F.ScaledReg, LUIdx);
2988   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
2989        E = F.BaseRegs.end(); I != E; ++I)
2990     RegUses.CountRegister(*I, LUIdx);
2991 }
2992
2993 /// InsertFormula - If the given formula has not yet been inserted, add it to
2994 /// the list, and return true. Return false otherwise.
2995 bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
2996   if (!LU.InsertFormula(F))
2997     return false;
2998
2999   CountRegisters(F, LUIdx);
3000   return true;
3001 }
3002
3003 /// CollectLoopInvariantFixupsAndFormulae - Check for other uses of
3004 /// loop-invariant values which we're tracking. These other uses will pin these
3005 /// values in registers, making them less profitable for elimination.
3006 /// TODO: This currently misses non-constant addrec step registers.
3007 /// TODO: Should this give more weight to users inside the loop?
3008 void
3009 LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3010   SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3011   SmallPtrSet<const SCEV *, 8> Inserted;
3012
3013   while (!Worklist.empty()) {
3014     const SCEV *S = Worklist.pop_back_val();
3015
3016     if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3017       Worklist.append(N->op_begin(), N->op_end());
3018     else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
3019       Worklist.push_back(C->getOperand());
3020     else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3021       Worklist.push_back(D->getLHS());
3022       Worklist.push_back(D->getRHS());
3023     } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3024       if (!Inserted.insert(US)) continue;
3025       const Value *V = US->getValue();
3026       if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3027         // Look for instructions defined outside the loop.
3028         if (L->contains(Inst)) continue;
3029       } else if (isa<UndefValue>(V))
3030         // Undef doesn't have a live range, so it doesn't matter.
3031         continue;
3032       for (const Use &U : V->uses()) {
3033         const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3034         // Ignore non-instructions.
3035         if (!UserInst)
3036           continue;
3037         // Ignore instructions in other functions (as can happen with
3038         // Constants).
3039         if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3040           continue;
3041         // Ignore instructions not dominated by the loop.
3042         const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3043           UserInst->getParent() :
3044           cast<PHINode>(UserInst)->getIncomingBlock(
3045             PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3046         if (!DT.dominates(L->getHeader(), UseBB))
3047           continue;
3048         // Ignore uses which are part of other SCEV expressions, to avoid
3049         // analyzing them multiple times.
3050         if (SE.isSCEVable(UserInst->getType())) {
3051           const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3052           // If the user is a no-op, look through to its uses.
3053           if (!isa<SCEVUnknown>(UserS))
3054             continue;
3055           if (UserS == US) {
3056             Worklist.push_back(
3057               SE.getUnknown(const_cast<Instruction *>(UserInst)));
3058             continue;
3059           }
3060         }
3061         // Ignore icmp instructions which are already being analyzed.
3062         if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3063           unsigned OtherIdx = !U.getOperandNo();
3064           Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
3065           if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3066             continue;
3067         }
3068
3069         LSRFixup &LF = getNewFixup();
3070         LF.UserInst = const_cast<Instruction *>(UserInst);
3071         LF.OperandValToReplace = U;
3072         std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0);
3073         LF.LUIdx = P.first;
3074         LF.Offset = P.second;
3075         LSRUse &LU = Uses[LF.LUIdx];
3076         LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3077         if (!LU.WidestFixupType ||
3078             SE.getTypeSizeInBits(LU.WidestFixupType) <
3079             SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3080           LU.WidestFixupType = LF.OperandValToReplace->getType();
3081         InsertSupplementalFormula(US, LU, LF.LUIdx);
3082         CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3083         break;
3084       }
3085     }
3086   }
3087 }
3088
3089 /// CollectSubexprs - Split S into subexpressions which can be pulled out into
3090 /// separate registers. If C is non-null, multiply each subexpression by C.
3091 ///
3092 /// Return remainder expression after factoring the subexpressions captured by
3093 /// Ops. If Ops is complete, return NULL.
3094 static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3095                                    SmallVectorImpl<const SCEV *> &Ops,
3096                                    const Loop *L,
3097                                    ScalarEvolution &SE,
3098                                    unsigned Depth = 0) {
3099   // Arbitrarily cap recursion to protect compile time.
3100   if (Depth >= 3)
3101     return S;
3102
3103   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3104     // Break out add operands.
3105     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
3106          I != E; ++I) {
3107       const SCEV *Remainder = CollectSubexprs(*I, C, Ops, L, SE, Depth+1);
3108       if (Remainder)
3109         Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3110     }
3111     return 0;
3112   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
3113     // Split a non-zero base out of an addrec.
3114     if (AR->getStart()->isZero())
3115       return S;
3116
3117     const SCEV *Remainder = CollectSubexprs(AR->getStart(),
3118                                             C, Ops, L, SE, Depth+1);
3119     // Split the non-zero AddRec unless it is part of a nested recurrence that
3120     // does not pertain to this loop.
3121     if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
3122       Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3123       Remainder = 0;
3124     }
3125     if (Remainder != AR->getStart()) {
3126       if (!Remainder)
3127         Remainder = SE.getConstant(AR->getType(), 0);
3128       return SE.getAddRecExpr(Remainder,
3129                               AR->getStepRecurrence(SE),
3130                               AR->getLoop(),
3131                               //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3132                               SCEV::FlagAnyWrap);
3133     }
3134   } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
3135     // Break (C * (a + b + c)) into C*a + C*b + C*c.
3136     if (Mul->getNumOperands() != 2)
3137       return S;
3138     if (const SCEVConstant *Op0 =
3139         dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
3140       C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3141       const SCEV *Remainder =
3142         CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
3143       if (Remainder)
3144         Ops.push_back(SE.getMulExpr(C, Remainder));
3145       return 0;
3146     }
3147   }
3148   return S;
3149 }
3150
3151 /// GenerateReassociations - Split out subexpressions from adds and the bases of
3152 /// addrecs.
3153 void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
3154                                          Formula Base,
3155                                          unsigned Depth) {
3156   // Arbitrarily cap recursion to protect compile time.
3157   if (Depth >= 3) return;
3158
3159   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
3160     const SCEV *BaseReg = Base.BaseRegs[i];
3161
3162     SmallVector<const SCEV *, 8> AddOps;
3163     const SCEV *Remainder = CollectSubexprs(BaseReg, 0, AddOps, L, SE);
3164     if (Remainder)
3165       AddOps.push_back(Remainder);
3166
3167     if (AddOps.size() == 1) continue;
3168
3169     for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
3170          JE = AddOps.end(); J != JE; ++J) {
3171
3172       // Loop-variant "unknown" values are uninteresting; we won't be able to
3173       // do anything meaningful with them.
3174       if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3175         continue;
3176
3177       // Don't pull a constant into a register if the constant could be folded
3178       // into an immediate field.
3179       if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3180                            LU.AccessTy, *J, Base.getNumRegs() > 1))
3181         continue;
3182
3183       // Collect all operands except *J.
3184       SmallVector<const SCEV *, 8> InnerAddOps(
3185           ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
3186       InnerAddOps.append(std::next(J),
3187                          ((const SmallVector<const SCEV *, 8> &)AddOps).end());
3188
3189       // Don't leave just a constant behind in a register if the constant could
3190       // be folded into an immediate field.
3191       if (InnerAddOps.size() == 1 &&
3192           isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3193                            LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3194         continue;
3195
3196       const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3197       if (InnerSum->isZero())
3198         continue;
3199       Formula F = Base;
3200
3201       // Add the remaining pieces of the add back into the new formula.
3202       const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3203       if (InnerSumSC &&
3204           SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3205           TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
3206                                   InnerSumSC->getValue()->getZExtValue())) {
3207         F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
3208                            InnerSumSC->getValue()->getZExtValue();
3209         F.BaseRegs.erase(F.BaseRegs.begin() + i);
3210       } else
3211         F.BaseRegs[i] = InnerSum;
3212
3213       // Add J as its own register, or an unfolded immediate.
3214       const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3215       if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3216           TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
3217                                   SC->getValue()->getZExtValue()))
3218         F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
3219                            SC->getValue()->getZExtValue();
3220       else
3221         F.BaseRegs.push_back(*J);
3222
3223       if (InsertFormula(LU, LUIdx, F))
3224         // If that formula hadn't been seen before, recurse to find more like
3225         // it.
3226         GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1);
3227     }
3228   }
3229 }
3230
3231 /// GenerateCombinations - Generate a formula consisting of all of the
3232 /// loop-dominating registers added into a single register.
3233 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
3234                                        Formula Base) {
3235   // This method is only interesting on a plurality of registers.
3236   if (Base.BaseRegs.size() <= 1) return;
3237
3238   Formula F = Base;
3239   F.BaseRegs.clear();
3240   SmallVector<const SCEV *, 4> Ops;
3241   for (SmallVectorImpl<const SCEV *>::const_iterator
3242        I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) {
3243     const SCEV *BaseReg = *I;
3244     if (SE.properlyDominates(BaseReg, L->getHeader()) &&
3245         !SE.hasComputableLoopEvolution(BaseReg, L))
3246       Ops.push_back(BaseReg);
3247     else
3248       F.BaseRegs.push_back(BaseReg);
3249   }
3250   if (Ops.size() > 1) {
3251     const SCEV *Sum = SE.getAddExpr(Ops);
3252     // TODO: If Sum is zero, it probably means ScalarEvolution missed an
3253     // opportunity to fold something. For now, just ignore such cases
3254     // rather than proceed with zero in a register.
3255     if (!Sum->isZero()) {
3256       F.BaseRegs.push_back(Sum);
3257       (void)InsertFormula(LU, LUIdx, F);
3258     }
3259   }
3260 }
3261
3262 /// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets.
3263 void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
3264                                           Formula Base) {
3265   // We can't add a symbolic offset if the address already contains one.
3266   if (Base.BaseGV) return;
3267
3268   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
3269     const SCEV *G = Base.BaseRegs[i];
3270     GlobalValue *GV = ExtractSymbol(G, SE);
3271     if (G->isZero() || !GV)
3272       continue;
3273     Formula F = Base;
3274     F.BaseGV = GV;
3275     if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
3276       continue;
3277     F.BaseRegs[i] = G;
3278     (void)InsertFormula(LU, LUIdx, F);
3279   }
3280 }
3281
3282 /// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
3283 void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
3284                                           Formula Base) {
3285   // TODO: For now, just add the min and max offset, because it usually isn't
3286   // worthwhile looking at everything inbetween.
3287   SmallVector<int64_t, 2> Worklist;
3288   Worklist.push_back(LU.MinOffset);
3289   if (LU.MaxOffset != LU.MinOffset)
3290     Worklist.push_back(LU.MaxOffset);
3291
3292   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
3293     const SCEV *G = Base.BaseRegs[i];
3294
3295     for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(),
3296          E = Worklist.end(); I != E; ++I) {
3297       Formula F = Base;
3298       F.BaseOffset = (uint64_t)Base.BaseOffset - *I;
3299       if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind,
3300                      LU.AccessTy, F)) {
3301         // Add the offset to the base register.
3302         const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G);
3303         // If it cancelled out, drop the base register, otherwise update it.
3304         if (NewG->isZero()) {
3305           std::swap(F.BaseRegs[i], F.BaseRegs.back());
3306           F.BaseRegs.pop_back();
3307         } else
3308           F.BaseRegs[i] = NewG;
3309
3310         (void)InsertFormula(LU, LUIdx, F);
3311       }
3312     }
3313
3314     int64_t Imm = ExtractImmediate(G, SE);
3315     if (G->isZero() || Imm == 0)
3316       continue;
3317     Formula F = Base;
3318     F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
3319     if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
3320       continue;
3321     F.BaseRegs[i] = G;
3322     (void)InsertFormula(LU, LUIdx, F);
3323   }
3324 }
3325
3326 /// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up
3327 /// the comparison. For example, x == y -> x*c == y*c.
3328 void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
3329                                          Formula Base) {
3330   if (LU.Kind != LSRUse::ICmpZero) return;
3331
3332   // Determine the integer type for the base formula.
3333   Type *IntTy = Base.getType();
3334   if (!IntTy) return;
3335   if (SE.getTypeSizeInBits(IntTy) > 64) return;
3336
3337   // Don't do this if there is more than one offset.
3338   if (LU.MinOffset != LU.MaxOffset) return;
3339
3340   assert(!Base.BaseGV && "ICmpZero use is not legal!");
3341
3342   // Check each interesting stride.
3343   for (SmallSetVector<int64_t, 8>::const_iterator
3344        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
3345     int64_t Factor = *I;
3346
3347     // Check that the multiplication doesn't overflow.
3348     if (Base.BaseOffset == INT64_MIN && Factor == -1)
3349       continue;
3350     int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
3351     if (NewBaseOffset / Factor != Base.BaseOffset)
3352       continue;
3353     // If the offset will be truncated at this use, check that it is in bounds.
3354     if (!IntTy->isPointerTy() &&
3355         !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
3356       continue;
3357
3358     // Check that multiplying with the use offset doesn't overflow.
3359     int64_t Offset = LU.MinOffset;
3360     if (Offset == INT64_MIN && Factor == -1)
3361       continue;
3362     Offset = (uint64_t)Offset * Factor;
3363     if (Offset / Factor != LU.MinOffset)
3364       continue;
3365     // If the offset will be truncated at this use, check that it is in bounds.
3366     if (!IntTy->isPointerTy() &&
3367         !ConstantInt::isValueValidForType(IntTy, Offset))
3368       continue;
3369
3370     Formula F = Base;
3371     F.BaseOffset = NewBaseOffset;
3372
3373     // Check that this scale is legal.
3374     if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
3375       continue;
3376
3377     // Compensate for the use having MinOffset built into it.
3378     F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
3379
3380     const SCEV *FactorS = SE.getConstant(IntTy, Factor);
3381
3382     // Check that multiplying with each base register doesn't overflow.
3383     for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
3384       F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
3385       if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
3386         goto next;
3387     }
3388
3389     // Check that multiplying with the scaled register doesn't overflow.
3390     if (F.ScaledReg) {
3391       F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
3392       if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
3393         continue;
3394     }
3395
3396     // Check that multiplying with the unfolded offset doesn't overflow.
3397     if (F.UnfoldedOffset != 0) {
3398       if (F.UnfoldedOffset == INT64_MIN && Factor == -1)
3399         continue;
3400       F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
3401       if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
3402         continue;
3403       // If the offset will be truncated, check that it is in bounds.
3404       if (!IntTy->isPointerTy() &&
3405           !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
3406         continue;
3407     }
3408
3409     // If we make it here and it's legal, add it.
3410     (void)InsertFormula(LU, LUIdx, F);
3411   next:;
3412   }
3413 }
3414
3415 /// GenerateScales - Generate stride factor reuse formulae by making use of
3416 /// scaled-offset address modes, for example.
3417 void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
3418   // Determine the integer type for the base formula.
3419   Type *IntTy = Base.getType();
3420   if (!IntTy) return;
3421
3422   // If this Formula already has a scaled register, we can't add another one.
3423   if (Base.Scale != 0) return;
3424
3425   // Check each interesting stride.
3426   for (SmallSetVector<int64_t, 8>::const_iterator
3427        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
3428     int64_t Factor = *I;
3429
3430     Base.Scale = Factor;
3431     Base.HasBaseReg = Base.BaseRegs.size() > 1;
3432     // Check whether this scale is going to be legal.
3433     if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
3434                     Base)) {
3435       // As a special-case, handle special out-of-loop Basic users specially.
3436       // TODO: Reconsider this special case.
3437       if (LU.Kind == LSRUse::Basic &&
3438           isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
3439                      LU.AccessTy, Base) &&
3440           LU.AllFixupsOutsideLoop)
3441         LU.Kind = LSRUse::Special;
3442       else
3443         continue;
3444     }
3445     // For an ICmpZero, negating a solitary base register won't lead to
3446     // new solutions.
3447     if (LU.Kind == LSRUse::ICmpZero &&
3448         !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
3449       continue;
3450     // For each addrec base reg, apply the scale, if possible.
3451     for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3452       if (const SCEVAddRecExpr *AR =
3453             dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
3454         const SCEV *FactorS = SE.getConstant(IntTy, Factor);
3455         if (FactorS->isZero())
3456           continue;
3457         // Divide out the factor, ignoring high bits, since we'll be
3458         // scaling the value back up in the end.
3459         if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
3460           // TODO: This could be optimized to avoid all the copying.
3461           Formula F = Base;
3462           F.ScaledReg = Quotient;
3463           F.DeleteBaseReg(F.BaseRegs[i]);
3464           (void)InsertFormula(LU, LUIdx, F);
3465         }
3466       }
3467   }
3468 }
3469
3470 /// GenerateTruncates - Generate reuse formulae from different IV types.
3471 void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
3472   // Don't bother truncating symbolic values.
3473   if (Base.BaseGV) return;
3474
3475   // Determine the integer type for the base formula.
3476   Type *DstTy = Base.getType();
3477   if (!DstTy) return;
3478   DstTy = SE.getEffectiveSCEVType(DstTy);
3479
3480   for (SmallSetVector<Type *, 4>::const_iterator
3481        I = Types.begin(), E = Types.end(); I != E; ++I) {
3482     Type *SrcTy = *I;
3483     if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
3484       Formula F = Base;
3485
3486       if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
3487       for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
3488            JE = F.BaseRegs.end(); J != JE; ++J)
3489         *J = SE.getAnyExtendExpr(*J, SrcTy);
3490
3491       // TODO: This assumes we've done basic processing on all uses and
3492       // have an idea what the register usage is.
3493       if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
3494         continue;
3495
3496       (void)InsertFormula(LU, LUIdx, F);
3497     }
3498   }
3499 }
3500
3501 namespace {
3502
3503 /// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to
3504 /// defer modifications so that the search phase doesn't have to worry about
3505 /// the data structures moving underneath it.
3506 struct WorkItem {
3507   size_t LUIdx;
3508   int64_t Imm;
3509   const SCEV *OrigReg;
3510
3511   WorkItem(size_t LI, int64_t I, const SCEV *R)
3512     : LUIdx(LI), Imm(I), OrigReg(R) {}
3513
3514   void print(raw_ostream &OS) const;
3515   void dump() const;
3516 };
3517
3518 }
3519
3520 void WorkItem::print(raw_ostream &OS) const {
3521   OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
3522      << " , add offset " << Imm;
3523 }
3524
3525 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3526 void WorkItem::dump() const {
3527   print(errs()); errs() << '\n';
3528 }
3529 #endif
3530
3531 /// GenerateCrossUseConstantOffsets - Look for registers which are a constant
3532 /// distance apart and try to form reuse opportunities between them.
3533 void LSRInstance::GenerateCrossUseConstantOffsets() {
3534   // Group the registers by their value without any added constant offset.
3535   typedef std::map<int64_t, const SCEV *> ImmMapTy;
3536   typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
3537   RegMapTy Map;
3538   DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
3539   SmallVector<const SCEV *, 8> Sequence;
3540   for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
3541        I != E; ++I) {
3542     const SCEV *Reg = *I;
3543     int64_t Imm = ExtractImmediate(Reg, SE);
3544     std::pair<RegMapTy::iterator, bool> Pair =
3545       Map.insert(std::make_pair(Reg, ImmMapTy()));
3546     if (Pair.second)
3547       Sequence.push_back(Reg);
3548     Pair.first->second.insert(std::make_pair(Imm, *I));
3549     UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I);
3550   }
3551
3552   // Now examine each set of registers with the same base value. Build up
3553   // a list of work to do and do the work in a separate step so that we're
3554   // not adding formulae and register counts while we're searching.
3555   SmallVector<WorkItem, 32> WorkItems;
3556   SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
3557   for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
3558        E = Sequence.end(); I != E; ++I) {
3559     const SCEV *Reg = *I;
3560     const ImmMapTy &Imms = Map.find(Reg)->second;
3561
3562     // It's not worthwhile looking for reuse if there's only one offset.
3563     if (Imms.size() == 1)
3564       continue;
3565
3566     DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
3567           for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
3568                J != JE; ++J)
3569             dbgs() << ' ' << J->first;
3570           dbgs() << '\n');
3571
3572     // Examine each offset.
3573     for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
3574          J != JE; ++J) {
3575       const SCEV *OrigReg = J->second;
3576
3577       int64_t JImm = J->first;
3578       const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
3579
3580       if (!isa<SCEVConstant>(OrigReg) &&
3581           UsedByIndicesMap[Reg].count() == 1) {
3582         DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
3583         continue;
3584       }
3585
3586       // Conservatively examine offsets between this orig reg a few selected
3587       // other orig regs.
3588       ImmMapTy::const_iterator OtherImms[] = {
3589         Imms.begin(), std::prev(Imms.end()),
3590         Imms.lower_bound((Imms.begin()->first + std::prev(Imms.end())->first) /
3591                          2)
3592       };
3593       for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
3594         ImmMapTy::const_iterator M = OtherImms[i];
3595         if (M == J || M == JE) continue;
3596
3597         // Compute the difference between the two.
3598         int64_t Imm = (uint64_t)JImm - M->first;
3599         for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
3600              LUIdx = UsedByIndices.find_next(LUIdx))
3601           // Make a memo of this use, offset, and register tuple.
3602           if (UniqueItems.insert(std::make_pair(LUIdx, Imm)))
3603             WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
3604       }
3605     }
3606   }
3607
3608   Map.clear();
3609   Sequence.clear();
3610   UsedByIndicesMap.clear();
3611   UniqueItems.clear();
3612
3613   // Now iterate through the worklist and add new formulae.
3614   for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(),
3615        E = WorkItems.end(); I != E; ++I) {
3616     const WorkItem &WI = *I;
3617     size_t LUIdx = WI.LUIdx;
3618     LSRUse &LU = Uses[LUIdx];
3619     int64_t Imm = WI.Imm;
3620     const SCEV *OrigReg = WI.OrigReg;
3621
3622     Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
3623     const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
3624     unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
3625
3626     // TODO: Use a more targeted data structure.
3627     for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
3628       const Formula &F = LU.Formulae[L];
3629       // Use the immediate in the scaled register.
3630       if (F.ScaledReg == OrigReg) {
3631         int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
3632         // Don't create 50 + reg(-50).
3633         if (F.referencesReg(SE.getSCEV(
3634                    ConstantInt::get(IntTy, -(uint64_t)Offset))))
3635           continue;
3636         Formula NewF = F;
3637         NewF.BaseOffset = Offset;
3638         if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
3639                         NewF))
3640           continue;
3641         NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
3642
3643         // If the new scale is a constant in a register, and adding the constant
3644         // value to the immediate would produce a value closer to zero than the
3645         // immediate itself, then the formula isn't worthwhile.
3646         if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
3647           if (C->getValue()->isNegative() !=
3648                 (NewF.BaseOffset < 0) &&
3649               (C->getValue()->getValue().abs() * APInt(BitWidth, F.Scale))
3650                 .ule(abs64(NewF.BaseOffset)))
3651             continue;
3652
3653         // OK, looks good.
3654         (void)InsertFormula(LU, LUIdx, NewF);
3655       } else {
3656         // Use the immediate in a base register.
3657         for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
3658           const SCEV *BaseReg = F.BaseRegs[N];
3659           if (BaseReg != OrigReg)
3660             continue;
3661           Formula NewF = F;
3662           NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
3663           if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
3664                           LU.Kind, LU.AccessTy, NewF)) {
3665             if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
3666               continue;
3667             NewF = F;
3668             NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
3669           }
3670           NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
3671
3672           // If the new formula has a constant in a register, and adding the
3673           // constant value to the immediate would produce a value closer to
3674           // zero than the immediate itself, then the formula isn't worthwhile.
3675           for (SmallVectorImpl<const SCEV *>::const_iterator
3676                J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end();
3677                J != JE; ++J)
3678             if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J))
3679               if ((C->getValue()->getValue() + NewF.BaseOffset).abs().slt(
3680                    abs64(NewF.BaseOffset)) &&
3681                   (C->getValue()->getValue() +
3682                    NewF.BaseOffset).countTrailingZeros() >=
3683                    countTrailingZeros<uint64_t>(NewF.BaseOffset))
3684                 goto skip_formula;
3685
3686           // Ok, looks good.
3687           (void)InsertFormula(LU, LUIdx, NewF);
3688           break;
3689         skip_formula:;
3690         }
3691       }
3692     }
3693   }
3694 }
3695
3696 /// GenerateAllReuseFormulae - Generate formulae for each use.
3697 void
3698 LSRInstance::GenerateAllReuseFormulae() {
3699   // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
3700   // queries are more precise.
3701   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3702     LSRUse &LU = Uses[LUIdx];
3703     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3704       GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
3705     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3706       GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
3707   }
3708   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3709     LSRUse &LU = Uses[LUIdx];
3710     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3711       GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
3712     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3713       GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
3714     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3715       GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
3716     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3717       GenerateScales(LU, LUIdx, LU.Formulae[i]);
3718   }
3719   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3720     LSRUse &LU = Uses[LUIdx];
3721     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3722       GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
3723   }
3724
3725   GenerateCrossUseConstantOffsets();
3726
3727   DEBUG(dbgs() << "\n"
3728                   "After generating reuse formulae:\n";
3729         print_uses(dbgs()));
3730 }
3731
3732 /// If there are multiple formulae with the same set of registers used
3733 /// by other uses, pick the best one and delete the others.
3734 void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
3735   DenseSet<const SCEV *> VisitedRegs;
3736   SmallPtrSet<const SCEV *, 16> Regs;
3737   SmallPtrSet<const SCEV *, 16> LoserRegs;
3738 #ifndef NDEBUG
3739   bool ChangedFormulae = false;
3740 #endif
3741
3742   // Collect the best formula for each unique set of shared registers. This
3743   // is reset for each use.
3744   typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>
3745     BestFormulaeTy;
3746   BestFormulaeTy BestFormulae;
3747
3748   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3749     LSRUse &LU = Uses[LUIdx];
3750     DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
3751
3752     bool Any = false;
3753     for (size_t FIdx = 0, NumForms = LU.Formulae.size();
3754          FIdx != NumForms; ++FIdx) {
3755       Formula &F = LU.Formulae[FIdx];
3756
3757       // Some formulas are instant losers. For example, they may depend on
3758       // nonexistent AddRecs from other loops. These need to be filtered
3759       // immediately, otherwise heuristics could choose them over others leading
3760       // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
3761       // avoids the need to recompute this information across formulae using the
3762       // same bad AddRec. Passing LoserRegs is also essential unless we remove
3763       // the corresponding bad register from the Regs set.
3764       Cost CostF;
3765       Regs.clear();
3766       CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU,
3767                         &LoserRegs);
3768       if (CostF.isLoser()) {
3769         // During initial formula generation, undesirable formulae are generated
3770         // by uses within other loops that have some non-trivial address mode or
3771         // use the postinc form of the IV. LSR needs to provide these formulae
3772         // as the basis of rediscovering the desired formula that uses an AddRec
3773         // corresponding to the existing phi. Once all formulae have been
3774         // generated, these initial losers may be pruned.
3775         DEBUG(dbgs() << "  Filtering loser "; F.print(dbgs());
3776               dbgs() << "\n");
3777       }
3778       else {
3779         SmallVector<const SCEV *, 4> Key;
3780         for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(),
3781                JE = F.BaseRegs.end(); J != JE; ++J) {
3782           const SCEV *Reg = *J;
3783           if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
3784             Key.push_back(Reg);
3785         }
3786         if (F.ScaledReg &&
3787             RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
3788           Key.push_back(F.ScaledReg);
3789         // Unstable sort by host order ok, because this is only used for
3790         // uniquifying.
3791         std::sort(Key.begin(), Key.end());
3792
3793         std::pair<BestFormulaeTy::const_iterator, bool> P =
3794           BestFormulae.insert(std::make_pair(Key, FIdx));
3795         if (P.second)
3796           continue;
3797
3798         Formula &Best = LU.Formulae[P.first->second];
3799
3800         Cost CostBest;
3801         Regs.clear();
3802         CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
3803                              DT, LU);
3804         if (CostF < CostBest)
3805           std::swap(F, Best);
3806         DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
3807               dbgs() << "\n"
3808                         "    in favor of formula "; Best.print(dbgs());
3809               dbgs() << '\n');
3810       }
3811 #ifndef NDEBUG
3812       ChangedFormulae = true;
3813 #endif
3814       LU.DeleteFormula(F);
3815       --FIdx;
3816       --NumForms;
3817       Any = true;
3818     }
3819
3820     // Now that we've filtered out some formulae, recompute the Regs set.
3821     if (Any)
3822       LU.RecomputeRegs(LUIdx, RegUses);
3823
3824     // Reset this to prepare for the next use.
3825     BestFormulae.clear();
3826   }
3827
3828   DEBUG(if (ChangedFormulae) {
3829           dbgs() << "\n"
3830                     "After filtering out undesirable candidates:\n";
3831           print_uses(dbgs());
3832         });
3833 }
3834
3835 // This is a rough guess that seems to work fairly well.
3836 static const size_t ComplexityLimit = UINT16_MAX;
3837
3838 /// EstimateSearchSpaceComplexity - Estimate the worst-case number of
3839 /// solutions the solver might have to consider. It almost never considers
3840 /// this many solutions because it prune the search space, but the pruning
3841 /// isn't always sufficient.
3842 size_t LSRInstance::EstimateSearchSpaceComplexity() const {
3843   size_t Power = 1;
3844   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
3845        E = Uses.end(); I != E; ++I) {
3846     size_t FSize = I->Formulae.size();
3847     if (FSize >= ComplexityLimit) {
3848       Power = ComplexityLimit;
3849       break;
3850     }
3851     Power *= FSize;
3852     if (Power >= ComplexityLimit)
3853       break;
3854   }
3855   return Power;
3856 }
3857
3858 /// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset
3859 /// of the registers of another formula, it won't help reduce register
3860 /// pressure (though it may not necessarily hurt register pressure); remove
3861 /// it to simplify the system.
3862 void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
3863   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
3864     DEBUG(dbgs() << "The search space is too complex.\n");
3865
3866     DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
3867                     "which use a superset of registers used by other "
3868                     "formulae.\n");
3869
3870     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3871       LSRUse &LU = Uses[LUIdx];
3872       bool Any = false;
3873       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
3874         Formula &F = LU.Formulae[i];
3875         // Look for a formula with a constant or GV in a register. If the use
3876         // also has a formula with that same value in an immediate field,
3877         // delete the one that uses a register.
3878         for (SmallVectorImpl<const SCEV *>::const_iterator
3879              I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
3880           if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
3881             Formula NewF = F;
3882             NewF.BaseOffset += C->getValue()->getSExtValue();
3883             NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
3884                                 (I - F.BaseRegs.begin()));
3885             if (LU.HasFormulaWithSameRegs(NewF)) {
3886               DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
3887               LU.DeleteFormula(F);
3888               --i;
3889               --e;
3890               Any = true;
3891               break;
3892             }
3893           } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
3894             if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
3895               if (!F.BaseGV) {
3896                 Formula NewF = F;
3897                 NewF.BaseGV = GV;
3898                 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
3899                                     (I - F.BaseRegs.begin()));
3900                 if (LU.HasFormulaWithSameRegs(NewF)) {
3901                   DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
3902                         dbgs() << '\n');
3903                   LU.DeleteFormula(F);
3904                   --i;
3905                   --e;
3906                   Any = true;
3907                   break;
3908                 }
3909               }
3910           }
3911         }
3912       }
3913       if (Any)
3914         LU.RecomputeRegs(LUIdx, RegUses);
3915     }
3916
3917     DEBUG(dbgs() << "After pre-selection:\n";
3918           print_uses(dbgs()));
3919   }
3920 }
3921
3922 /// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers
3923 /// for expressions like A, A+1, A+2, etc., allocate a single register for
3924 /// them.
3925 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
3926   if (EstimateSearchSpaceComplexity() < ComplexityLimit)
3927     return;
3928
3929   DEBUG(dbgs() << "The search space is too complex.\n"
3930                   "Narrowing the search space by assuming that uses separated "
3931                   "by a constant offset will use the same registers.\n");
3932
3933   // This is especially useful for unrolled loops.
3934
3935   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3936     LSRUse &LU = Uses[LUIdx];
3937     for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
3938          E = LU.Formulae.end(); I != E; ++I) {
3939       const Formula &F = *I;
3940       if (F.BaseOffset == 0 || F.Scale != 0)
3941         continue;
3942
3943       LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
3944       if (!LUThatHas)
3945         continue;
3946
3947       if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
3948                               LU.Kind, LU.AccessTy))
3949         continue;
3950
3951       DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs()); dbgs() << '\n');
3952
3953       LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
3954
3955       // Update the relocs to reference the new use.
3956       for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(),
3957            E = Fixups.end(); I != E; ++I) {
3958         LSRFixup &Fixup = *I;
3959         if (Fixup.LUIdx == LUIdx) {
3960           Fixup.LUIdx = LUThatHas - &Uses.front();
3961           Fixup.Offset += F.BaseOffset;
3962           // Add the new offset to LUThatHas' offset list.
3963           if (LUThatHas->Offsets.back() != Fixup.Offset) {
3964             LUThatHas->Offsets.push_back(Fixup.Offset);
3965             if (Fixup.Offset > LUThatHas->MaxOffset)
3966               LUThatHas->MaxOffset = Fixup.Offset;
3967             if (Fixup.Offset < LUThatHas->MinOffset)
3968               LUThatHas->MinOffset = Fixup.Offset;
3969           }
3970           DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
3971         }
3972         if (Fixup.LUIdx == NumUses-1)
3973           Fixup.LUIdx = LUIdx;
3974       }
3975
3976       // Delete formulae from the new use which are no longer legal.
3977       bool Any = false;
3978       for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
3979         Formula &F = LUThatHas->Formulae[i];
3980         if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
3981                         LUThatHas->Kind, LUThatHas->AccessTy, F)) {
3982           DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
3983                 dbgs() << '\n');
3984           LUThatHas->DeleteFormula(F);
3985           --i;
3986           --e;
3987           Any = true;
3988         }
3989       }
3990
3991       if (Any)
3992         LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
3993
3994       // Delete the old use.
3995       DeleteUse(LU, LUIdx);
3996       --LUIdx;
3997       --NumUses;
3998       break;
3999     }
4000   }
4001
4002   DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4003 }
4004
4005 /// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call
4006 /// FilterOutUndesirableDedicatedRegisters again, if necessary, now that
4007 /// we've done more filtering, as it may be able to find more formulae to
4008 /// eliminate.
4009 void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
4010   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4011     DEBUG(dbgs() << "The search space is too complex.\n");
4012
4013     DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
4014                     "undesirable dedicated registers.\n");
4015
4016     FilterOutUndesirableDedicatedRegisters();
4017
4018     DEBUG(dbgs() << "After pre-selection:\n";
4019           print_uses(dbgs()));
4020   }
4021 }
4022
4023 /// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely
4024 /// to be profitable, and then in any use which has any reference to that
4025 /// register, delete all formulae which do not reference that register.
4026 void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
4027   // With all other options exhausted, loop until the system is simple
4028   // enough to handle.
4029   SmallPtrSet<const SCEV *, 4> Taken;
4030   while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4031     // Ok, we have too many of formulae on our hands to conveniently handle.
4032     // Use a rough heuristic to thin out the list.
4033     DEBUG(dbgs() << "The search space is too complex.\n");
4034
4035     // Pick the register which is used by the most LSRUses, which is likely
4036     // to be a good reuse register candidate.
4037     const SCEV *Best = 0;
4038     unsigned BestNum = 0;
4039     for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
4040          I != E; ++I) {
4041       const SCEV *Reg = *I;
4042       if (Taken.count(Reg))
4043         continue;
4044       if (!Best)
4045         Best = Reg;
4046       else {
4047         unsigned Count = RegUses.getUsedByIndices(Reg).count();
4048         if (Count > BestNum) {
4049           Best = Reg;
4050           BestNum = Count;
4051         }
4052       }
4053     }
4054
4055     DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
4056                  << " will yield profitable reuse.\n");
4057     Taken.insert(Best);
4058
4059     // In any use with formulae which references this register, delete formulae
4060     // which don't reference it.
4061     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4062       LSRUse &LU = Uses[LUIdx];
4063       if (!LU.Regs.count(Best)) continue;
4064
4065       bool Any = false;
4066       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4067         Formula &F = LU.Formulae[i];
4068         if (!F.referencesReg(Best)) {
4069           DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
4070           LU.DeleteFormula(F);
4071           --e;
4072           --i;
4073           Any = true;
4074           assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
4075           continue;
4076         }
4077       }
4078
4079       if (Any)
4080         LU.RecomputeRegs(LUIdx, RegUses);
4081     }
4082
4083     DEBUG(dbgs() << "After pre-selection:\n";
4084           print_uses(dbgs()));
4085   }
4086 }
4087
4088 /// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of
4089 /// formulae to choose from, use some rough heuristics to prune down the number
4090 /// of formulae. This keeps the main solver from taking an extraordinary amount
4091 /// of time in some worst-case scenarios.
4092 void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
4093   NarrowSearchSpaceByDetectingSupersets();
4094   NarrowSearchSpaceByCollapsingUnrolledCode();
4095   NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
4096   NarrowSearchSpaceByPickingWinnerRegs();
4097 }
4098
4099 /// SolveRecurse - This is the recursive solver.
4100 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
4101                                Cost &SolutionCost,
4102                                SmallVectorImpl<const Formula *> &Workspace,
4103                                const Cost &CurCost,
4104                                const SmallPtrSet<const SCEV *, 16> &CurRegs,
4105                                DenseSet<const SCEV *> &VisitedRegs) const {
4106   // Some ideas:
4107   //  - prune more:
4108   //    - use more aggressive filtering
4109   //    - sort the formula so that the most profitable solutions are found first
4110   //    - sort the uses too
4111   //  - search faster:
4112   //    - don't compute a cost, and then compare. compare while computing a cost
4113   //      and bail early.
4114   //    - track register sets with SmallBitVector
4115
4116   const LSRUse &LU = Uses[Workspace.size()];
4117
4118   // If this use references any register that's already a part of the
4119   // in-progress solution, consider it a requirement that a formula must
4120   // reference that register in order to be considered. This prunes out
4121   // unprofitable searching.
4122   SmallSetVector<const SCEV *, 4> ReqRegs;
4123   for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
4124        E = CurRegs.end(); I != E; ++I)
4125     if (LU.Regs.count(*I))
4126       ReqRegs.insert(*I);
4127
4128   SmallPtrSet<const SCEV *, 16> NewRegs;
4129   Cost NewCost;
4130   for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
4131        E = LU.Formulae.end(); I != E; ++I) {
4132     const Formula &F = *I;
4133
4134     // Ignore formulae which do not use any of the required registers.
4135     bool SatisfiedReqReg = true;
4136     for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(),
4137          JE = ReqRegs.end(); J != JE; ++J) {
4138       const SCEV *Reg = *J;
4139       if ((!F.ScaledReg || F.ScaledReg != Reg) &&
4140           std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) ==
4141           F.BaseRegs.end()) {
4142         SatisfiedReqReg = false;
4143         break;
4144       }
4145     }
4146     if (!SatisfiedReqReg) {
4147       // If none of the formulae satisfied the required registers, then we could
4148       // clear ReqRegs and try again. Currently, we simply give up in this case.
4149       continue;
4150     }
4151
4152     // Evaluate the cost of the current formula. If it's already worse than
4153     // the current best, prune the search at that point.
4154     NewCost = CurCost;
4155     NewRegs = CurRegs;
4156     NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT,
4157                         LU);
4158     if (NewCost < SolutionCost) {
4159       Workspace.push_back(&F);
4160       if (Workspace.size() != Uses.size()) {
4161         SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
4162                      NewRegs, VisitedRegs);
4163         if (F.getNumRegs() == 1 && Workspace.size() == 1)
4164           VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
4165       } else {
4166         DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
4167               dbgs() << ".\n Regs:";
4168               for (SmallPtrSet<const SCEV *, 16>::const_iterator
4169                    I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I)
4170                 dbgs() << ' ' << **I;
4171               dbgs() << '\n');
4172
4173         SolutionCost = NewCost;
4174         Solution = Workspace;
4175       }
4176       Workspace.pop_back();
4177     }
4178   }
4179 }
4180
4181 /// Solve - Choose one formula from each use. Return the results in the given
4182 /// Solution vector.
4183 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
4184   SmallVector<const Formula *, 8> Workspace;
4185   Cost SolutionCost;
4186   SolutionCost.Lose();
4187   Cost CurCost;
4188   SmallPtrSet<const SCEV *, 16> CurRegs;
4189   DenseSet<const SCEV *> VisitedRegs;
4190   Workspace.reserve(Uses.size());
4191
4192   // SolveRecurse does all the work.
4193   SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
4194                CurRegs, VisitedRegs);
4195   if (Solution.empty()) {
4196     DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
4197     return;
4198   }
4199
4200   // Ok, we've now made all our decisions.
4201   DEBUG(dbgs() << "\n"
4202                   "The chosen solution requires "; SolutionCost.print(dbgs());
4203         dbgs() << ":\n";
4204         for (size_t i = 0, e = Uses.size(); i != e; ++i) {
4205           dbgs() << "  ";
4206           Uses[i].print(dbgs());
4207           dbgs() << "\n"
4208                     "    ";
4209           Solution[i]->print(dbgs());
4210           dbgs() << '\n';
4211         });
4212
4213   assert(Solution.size() == Uses.size() && "Malformed solution!");
4214 }
4215
4216 /// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up
4217 /// the dominator tree far as we can go while still being dominated by the
4218 /// input positions. This helps canonicalize the insert position, which
4219 /// encourages sharing.
4220 BasicBlock::iterator
4221 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
4222                                  const SmallVectorImpl<Instruction *> &Inputs)
4223                                                                          const {
4224   for (;;) {
4225     const Loop *IPLoop = LI.getLoopFor(IP->getParent());
4226     unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
4227
4228     BasicBlock *IDom;
4229     for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
4230       if (!Rung) return IP;
4231       Rung = Rung->getIDom();
4232       if (!Rung) return IP;
4233       IDom = Rung->getBlock();
4234
4235       // Don't climb into a loop though.
4236       const Loop *IDomLoop = LI.getLoopFor(IDom);
4237       unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
4238       if (IDomDepth <= IPLoopDepth &&
4239           (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
4240         break;
4241     }
4242
4243     bool AllDominate = true;
4244     Instruction *BetterPos = 0;
4245     Instruction *Tentative = IDom->getTerminator();
4246     for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
4247          E = Inputs.end(); I != E; ++I) {
4248       Instruction *Inst = *I;
4249       if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
4250         AllDominate = false;
4251         break;
4252       }
4253       // Attempt to find an insert position in the middle of the block,
4254       // instead of at the end, so that it can be used for other expansions.
4255       if (IDom == Inst->getParent() &&
4256           (!BetterPos || !DT.dominates(Inst, BetterPos)))
4257         BetterPos = std::next(BasicBlock::iterator(Inst));
4258     }
4259     if (!AllDominate)
4260       break;
4261     if (BetterPos)
4262       IP = BetterPos;
4263     else
4264       IP = Tentative;
4265   }
4266
4267   return IP;
4268 }
4269
4270 /// AdjustInsertPositionForExpand - Determine an input position which will be
4271 /// dominated by the operands and which will dominate the result.
4272 BasicBlock::iterator
4273 LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
4274                                            const LSRFixup &LF,
4275                                            const LSRUse &LU,
4276                                            SCEVExpander &Rewriter) const {
4277   // Collect some instructions which must be dominated by the
4278   // expanding replacement. These must be dominated by any operands that
4279   // will be required in the expansion.
4280   SmallVector<Instruction *, 4> Inputs;
4281   if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
4282     Inputs.push_back(I);
4283   if (LU.Kind == LSRUse::ICmpZero)
4284     if (Instruction *I =
4285           dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
4286       Inputs.push_back(I);
4287   if (LF.PostIncLoops.count(L)) {
4288     if (LF.isUseFullyOutsideLoop(L))
4289       Inputs.push_back(L->getLoopLatch()->getTerminator());
4290     else
4291       Inputs.push_back(IVIncInsertPos);
4292   }
4293   // The expansion must also be dominated by the increment positions of any
4294   // loops it for which it is using post-inc mode.
4295   for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(),
4296        E = LF.PostIncLoops.end(); I != E; ++I) {
4297     const Loop *PIL = *I;
4298     if (PIL == L) continue;
4299
4300     // Be dominated by the loop exit.
4301     SmallVector<BasicBlock *, 4> ExitingBlocks;
4302     PIL->getExitingBlocks(ExitingBlocks);
4303     if (!ExitingBlocks.empty()) {
4304       BasicBlock *BB = ExitingBlocks[0];
4305       for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
4306         BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
4307       Inputs.push_back(BB->getTerminator());
4308     }
4309   }
4310
4311   assert(!isa<PHINode>(LowestIP) && !isa<LandingPadInst>(LowestIP)
4312          && !isa<DbgInfoIntrinsic>(LowestIP) &&
4313          "Insertion point must be a normal instruction");
4314
4315   // Then, climb up the immediate dominator tree as far as we can go while
4316   // still being dominated by the input positions.
4317   BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
4318
4319   // Don't insert instructions before PHI nodes.
4320   while (isa<PHINode>(IP)) ++IP;
4321
4322   // Ignore landingpad instructions.
4323   while (isa<LandingPadInst>(IP)) ++IP;
4324
4325   // Ignore debug intrinsics.
4326   while (isa<DbgInfoIntrinsic>(IP)) ++IP;
4327
4328   // Set IP below instructions recently inserted by SCEVExpander. This keeps the
4329   // IP consistent across expansions and allows the previously inserted
4330   // instructions to be reused by subsequent expansion.
4331   while (Rewriter.isInsertedInstruction(IP) && IP != LowestIP) ++IP;
4332
4333   return IP;
4334 }
4335
4336 /// Expand - Emit instructions for the leading candidate expression for this
4337 /// LSRUse (this is called "expanding").
4338 Value *LSRInstance::Expand(const LSRFixup &LF,
4339                            const Formula &F,
4340                            BasicBlock::iterator IP,
4341                            SCEVExpander &Rewriter,
4342                            SmallVectorImpl<WeakVH> &DeadInsts) const {
4343   const LSRUse &LU = Uses[LF.LUIdx];
4344   if (LU.RigidFormula)
4345     return LF.OperandValToReplace;
4346
4347   // Determine an input position which will be dominated by the operands and
4348   // which will dominate the result.
4349   IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter);
4350
4351   // Inform the Rewriter if we have a post-increment use, so that it can
4352   // perform an advantageous expansion.
4353   Rewriter.setPostInc(LF.PostIncLoops);
4354
4355   // This is the type that the user actually needs.
4356   Type *OpTy = LF.OperandValToReplace->getType();
4357   // This will be the type that we'll initially expand to.
4358   Type *Ty = F.getType();
4359   if (!Ty)
4360     // No type known; just expand directly to the ultimate type.
4361     Ty = OpTy;
4362   else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
4363     // Expand directly to the ultimate type if it's the right size.
4364     Ty = OpTy;
4365   // This is the type to do integer arithmetic in.
4366   Type *IntTy = SE.getEffectiveSCEVType(Ty);
4367
4368   // Build up a list of operands to add together to form the full base.
4369   SmallVector<const SCEV *, 8> Ops;
4370
4371   // Expand the BaseRegs portion.
4372   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
4373        E = F.BaseRegs.end(); I != E; ++I) {
4374     const SCEV *Reg = *I;
4375     assert(!Reg->isZero() && "Zero allocated in a base register!");
4376
4377     // If we're expanding for a post-inc user, make the post-inc adjustment.
4378     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
4379     Reg = TransformForPostIncUse(Denormalize, Reg,
4380                                  LF.UserInst, LF.OperandValToReplace,
4381                                  Loops, SE, DT);
4382
4383     Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));
4384   }
4385
4386   // Expand the ScaledReg portion.
4387   Value *ICmpScaledV = 0;
4388   if (F.Scale != 0) {
4389     const SCEV *ScaledS = F.ScaledReg;
4390
4391     // If we're expanding for a post-inc user, make the post-inc adjustment.
4392     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
4393     ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
4394                                      LF.UserInst, LF.OperandValToReplace,
4395                                      Loops, SE, DT);
4396
4397     if (LU.Kind == LSRUse::ICmpZero) {
4398       // An interesting way of "folding" with an icmp is to use a negated
4399       // scale, which we'll implement by inserting it into the other operand
4400       // of the icmp.
4401       assert(F.Scale == -1 &&
4402              "The only scale supported by ICmpZero uses is -1!");
4403       ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP);
4404     } else {
4405       // Otherwise just expand the scaled register and an explicit scale,
4406       // which is expected to be matched as part of the address.
4407
4408       // Flush the operand list to suppress SCEVExpander hoisting address modes.
4409       if (!Ops.empty() && LU.Kind == LSRUse::Address) {
4410         Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
4411         Ops.clear();
4412         Ops.push_back(SE.getUnknown(FullV));
4413       }
4414       ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
4415       ScaledS = SE.getMulExpr(ScaledS,
4416                               SE.getConstant(ScaledS->getType(), F.Scale));
4417       Ops.push_back(ScaledS);
4418     }
4419   }
4420
4421   // Expand the GV portion.
4422   if (F.BaseGV) {
4423     // Flush the operand list to suppress SCEVExpander hoisting.
4424     if (!Ops.empty()) {
4425       Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
4426       Ops.clear();
4427       Ops.push_back(SE.getUnknown(FullV));
4428     }
4429     Ops.push_back(SE.getUnknown(F.BaseGV));
4430   }
4431
4432   // Flush the operand list to suppress SCEVExpander hoisting of both folded and
4433   // unfolded offsets. LSR assumes they both live next to their uses.
4434   if (!Ops.empty()) {
4435     Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
4436     Ops.clear();
4437     Ops.push_back(SE.getUnknown(FullV));
4438   }
4439
4440   // Expand the immediate portion.
4441   int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
4442   if (Offset != 0) {
4443     if (LU.Kind == LSRUse::ICmpZero) {
4444       // The other interesting way of "folding" with an ICmpZero is to use a
4445       // negated immediate.
4446       if (!ICmpScaledV)
4447         ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
4448       else {
4449         Ops.push_back(SE.getUnknown(ICmpScaledV));
4450         ICmpScaledV = ConstantInt::get(IntTy, Offset);
4451       }
4452     } else {
4453       // Just add the immediate values. These again are expected to be matched
4454       // as part of the address.
4455       Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
4456     }
4457   }
4458
4459   // Expand the unfolded offset portion.
4460   int64_t UnfoldedOffset = F.UnfoldedOffset;
4461   if (UnfoldedOffset != 0) {
4462     // Just add the immediate values.
4463     Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
4464                                                        UnfoldedOffset)));
4465   }
4466
4467   // Emit instructions summing all the operands.
4468   const SCEV *FullS = Ops.empty() ?
4469                       SE.getConstant(IntTy, 0) :
4470                       SE.getAddExpr(Ops);
4471   Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);
4472
4473   // We're done expanding now, so reset the rewriter.
4474   Rewriter.clearPostInc();
4475
4476   // An ICmpZero Formula represents an ICmp which we're handling as a
4477   // comparison against zero. Now that we've expanded an expression for that
4478   // form, update the ICmp's other operand.
4479   if (LU.Kind == LSRUse::ICmpZero) {
4480     ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
4481     DeadInsts.push_back(CI->getOperand(1));
4482     assert(!F.BaseGV && "ICmp does not support folding a global value and "
4483                            "a scale at the same time!");
4484     if (F.Scale == -1) {
4485       if (ICmpScaledV->getType() != OpTy) {
4486         Instruction *Cast =
4487           CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
4488                                                    OpTy, false),
4489                            ICmpScaledV, OpTy, "tmp", CI);
4490         ICmpScaledV = Cast;
4491       }
4492       CI->setOperand(1, ICmpScaledV);
4493     } else {
4494       assert(F.Scale == 0 &&
4495              "ICmp does not support folding a global value and "
4496              "a scale at the same time!");
4497       Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
4498                                            -(uint64_t)Offset);
4499       if (C->getType() != OpTy)
4500         C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
4501                                                           OpTy, false),
4502                                   C, OpTy);
4503
4504       CI->setOperand(1, C);
4505     }
4506   }
4507
4508   return FullV;
4509 }
4510
4511 /// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use
4512 /// of their operands effectively happens in their predecessor blocks, so the
4513 /// expression may need to be expanded in multiple places.
4514 void LSRInstance::RewriteForPHI(PHINode *PN,
4515                                 const LSRFixup &LF,
4516                                 const Formula &F,
4517                                 SCEVExpander &Rewriter,
4518                                 SmallVectorImpl<WeakVH> &DeadInsts,
4519                                 Pass *P) const {
4520   DenseMap<BasicBlock *, Value *> Inserted;
4521   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
4522     if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
4523       BasicBlock *BB = PN->getIncomingBlock(i);
4524
4525       // If this is a critical edge, split the edge so that we do not insert
4526       // the code on all predecessor/successor paths.  We do this unless this
4527       // is the canonical backedge for this loop, which complicates post-inc
4528       // users.
4529       if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
4530           !isa<IndirectBrInst>(BB->getTerminator())) {
4531         BasicBlock *Parent = PN->getParent();
4532         Loop *PNLoop = LI.getLoopFor(Parent);
4533         if (!PNLoop || Parent != PNLoop->getHeader()) {
4534           // Split the critical edge.
4535           BasicBlock *NewBB = 0;
4536           if (!Parent->isLandingPad()) {
4537             NewBB = SplitCriticalEdge(BB, Parent, P,
4538                                       /*MergeIdenticalEdges=*/true,
4539                                       /*DontDeleteUselessPhis=*/true);
4540           } else {
4541             SmallVector<BasicBlock*, 2> NewBBs;
4542             SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs);
4543             NewBB = NewBBs[0];
4544           }
4545           // If NewBB==NULL, then SplitCriticalEdge refused to split because all
4546           // phi predecessors are identical. The simple thing to do is skip
4547           // splitting in this case rather than complicate the API.
4548           if (NewBB) {
4549             // If PN is outside of the loop and BB is in the loop, we want to
4550             // move the block to be immediately before the PHI block, not
4551             // immediately after BB.
4552             if (L->contains(BB) && !L->contains(PN))
4553               NewBB->moveBefore(PN->getParent());
4554
4555             // Splitting the edge can reduce the number of PHI entries we have.
4556             e = PN->getNumIncomingValues();
4557             BB = NewBB;
4558             i = PN->getBasicBlockIndex(BB);
4559           }
4560         }
4561       }
4562
4563       std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
4564         Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
4565       if (!Pair.second)
4566         PN->setIncomingValue(i, Pair.first->second);
4567       else {
4568         Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts);
4569
4570         // If this is reuse-by-noop-cast, insert the noop cast.
4571         Type *OpTy = LF.OperandValToReplace->getType();
4572         if (FullV->getType() != OpTy)
4573           FullV =
4574             CastInst::Create(CastInst::getCastOpcode(FullV, false,
4575                                                      OpTy, false),
4576                              FullV, LF.OperandValToReplace->getType(),
4577                              "tmp", BB->getTerminator());
4578
4579         PN->setIncomingValue(i, FullV);
4580         Pair.first->second = FullV;
4581       }
4582     }
4583 }
4584
4585 /// Rewrite - Emit instructions for the leading candidate expression for this
4586 /// LSRUse (this is called "expanding"), and update the UserInst to reference
4587 /// the newly expanded value.
4588 void LSRInstance::Rewrite(const LSRFixup &LF,
4589                           const Formula &F,
4590                           SCEVExpander &Rewriter,
4591                           SmallVectorImpl<WeakVH> &DeadInsts,
4592                           Pass *P) const {
4593   // First, find an insertion point that dominates UserInst. For PHI nodes,
4594   // find the nearest block which dominates all the relevant uses.
4595   if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
4596     RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P);
4597   } else {
4598     Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts);
4599
4600     // If this is reuse-by-noop-cast, insert the noop cast.
4601     Type *OpTy = LF.OperandValToReplace->getType();
4602     if (FullV->getType() != OpTy) {
4603       Instruction *Cast =
4604         CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
4605                          FullV, OpTy, "tmp", LF.UserInst);
4606       FullV = Cast;
4607     }
4608
4609     // Update the user. ICmpZero is handled specially here (for now) because
4610     // Expand may have updated one of the operands of the icmp already, and
4611     // its new value may happen to be equal to LF.OperandValToReplace, in
4612     // which case doing replaceUsesOfWith leads to replacing both operands
4613     // with the same value. TODO: Reorganize this.
4614     if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
4615       LF.UserInst->setOperand(0, FullV);
4616     else
4617       LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
4618   }
4619
4620   DeadInsts.push_back(LF.OperandValToReplace);
4621 }
4622
4623 /// ImplementSolution - Rewrite all the fixup locations with new values,
4624 /// following the chosen solution.
4625 void
4626 LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
4627                                Pass *P) {
4628   // Keep track of instructions we may have made dead, so that
4629   // we can remove them after we are done working.
4630   SmallVector<WeakVH, 16> DeadInsts;
4631
4632   SCEVExpander Rewriter(SE, "lsr");
4633 #ifndef NDEBUG
4634   Rewriter.setDebugType(DEBUG_TYPE);
4635 #endif
4636   Rewriter.disableCanonicalMode();
4637   Rewriter.enableLSRMode();
4638   Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
4639
4640   // Mark phi nodes that terminate chains so the expander tries to reuse them.
4641   for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(),
4642          ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) {
4643     if (PHINode *PN = dyn_cast<PHINode>(ChainI->tailUserInst()))
4644       Rewriter.setChainedPhi(PN);
4645   }
4646
4647   // Expand the new value definitions and update the users.
4648   for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
4649        E = Fixups.end(); I != E; ++I) {
4650     const LSRFixup &Fixup = *I;
4651
4652     Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P);
4653
4654     Changed = true;
4655   }
4656
4657   for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(),
4658          ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) {
4659     GenerateIVChain(*ChainI, Rewriter, DeadInsts);
4660     Changed = true;
4661   }
4662   // Clean up after ourselves. This must be done before deleting any
4663   // instructions.
4664   Rewriter.clear();
4665
4666   Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
4667 }
4668
4669 LSRInstance::LSRInstance(Loop *L, Pass *P)
4670     : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()),
4671       DT(P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()),
4672       LI(P->getAnalysis<LoopInfo>()),
4673       TTI(P->getAnalysis<TargetTransformInfo>()), L(L), Changed(false),
4674       IVIncInsertPos(0) {
4675   // If LoopSimplify form is not available, stay out of trouble.
4676   if (!L->isLoopSimplifyForm())
4677     return;
4678
4679   // If there's no interesting work to be done, bail early.
4680   if (IU.empty()) return;
4681
4682   // If there's too much analysis to be done, bail early. We won't be able to
4683   // model the problem anyway.
4684   unsigned NumUsers = 0;
4685   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
4686     if (++NumUsers > MaxIVUsers) {
4687       DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << *L
4688             << "\n");
4689       return;
4690     }
4691   }
4692
4693 #ifndef NDEBUG
4694   // All dominating loops must have preheaders, or SCEVExpander may not be able
4695   // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
4696   //
4697   // IVUsers analysis should only create users that are dominated by simple loop
4698   // headers. Since this loop should dominate all of its users, its user list
4699   // should be empty if this loop itself is not within a simple loop nest.
4700   for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
4701        Rung; Rung = Rung->getIDom()) {
4702     BasicBlock *BB = Rung->getBlock();
4703     const Loop *DomLoop = LI.getLoopFor(BB);
4704     if (DomLoop && DomLoop->getHeader() == BB) {
4705       assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
4706     }
4707   }
4708 #endif // DEBUG
4709
4710   DEBUG(dbgs() << "\nLSR on loop ";
4711         L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
4712         dbgs() << ":\n");
4713
4714   // First, perform some low-level loop optimizations.
4715   OptimizeShadowIV();
4716   OptimizeLoopTermCond();
4717
4718   // If loop preparation eliminates all interesting IV users, bail.
4719   if (IU.empty()) return;
4720
4721   // Skip nested loops until we can model them better with formulae.
4722   if (!L->empty()) {
4723     DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
4724     return;
4725   }
4726
4727   // Start collecting data and preparing for the solver.
4728   CollectChains();
4729   CollectInterestingTypesAndFactors();
4730   CollectFixupsAndInitialFormulae();
4731   CollectLoopInvariantFixupsAndFormulae();
4732
4733   assert(!Uses.empty() && "IVUsers reported at least one use");
4734   DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
4735         print_uses(dbgs()));
4736
4737   // Now use the reuse data to generate a bunch of interesting ways
4738   // to formulate the values needed for the uses.
4739   GenerateAllReuseFormulae();
4740
4741   FilterOutUndesirableDedicatedRegisters();
4742   NarrowSearchSpaceUsingHeuristics();
4743
4744   SmallVector<const Formula *, 8> Solution;
4745   Solve(Solution);
4746
4747   // Release memory that is no longer needed.
4748   Factors.clear();
4749   Types.clear();
4750   RegUses.clear();
4751
4752   if (Solution.empty())
4753     return;
4754
4755 #ifndef NDEBUG
4756   // Formulae should be legal.
4757   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), E = Uses.end();
4758        I != E; ++I) {
4759     const LSRUse &LU = *I;
4760     for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
4761                                                   JE = LU.Formulae.end();
4762          J != JE; ++J)
4763       assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4764                         *J) && "Illegal formula generated!");
4765   };
4766 #endif
4767
4768   // Now that we've decided what we want, make it so.
4769   ImplementSolution(Solution, P);
4770 }
4771
4772 void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
4773   if (Factors.empty() && Types.empty()) return;
4774
4775   OS << "LSR has identified the following interesting factors and types: ";
4776   bool First = true;
4777
4778   for (SmallSetVector<int64_t, 8>::const_iterator
4779        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
4780     if (!First) OS << ", ";
4781     First = false;
4782     OS << '*' << *I;
4783   }
4784
4785   for (SmallSetVector<Type *, 4>::const_iterator
4786        I = Types.begin(), E = Types.end(); I != E; ++I) {
4787     if (!First) OS << ", ";
4788     First = false;
4789     OS << '(' << **I << ')';
4790   }
4791   OS << '\n';
4792 }
4793
4794 void LSRInstance::print_fixups(raw_ostream &OS) const {
4795   OS << "LSR is examining the following fixup sites:\n";
4796   for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
4797        E = Fixups.end(); I != E; ++I) {
4798     dbgs() << "  ";
4799     I->print(OS);
4800     OS << '\n';
4801   }
4802 }
4803
4804 void LSRInstance::print_uses(raw_ostream &OS) const {
4805   OS << "LSR is examining the following uses:\n";
4806   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
4807        E = Uses.end(); I != E; ++I) {
4808     const LSRUse &LU = *I;
4809     dbgs() << "  ";
4810     LU.print(OS);
4811     OS << '\n';
4812     for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
4813          JE = LU.Formulae.end(); J != JE; ++J) {
4814       OS << "    ";
4815       J->print(OS);
4816       OS << '\n';
4817     }
4818   }
4819 }
4820
4821 void LSRInstance::print(raw_ostream &OS) const {
4822   print_factors_and_types(OS);
4823   print_fixups(OS);
4824   print_uses(OS);
4825 }
4826
4827 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4828 void LSRInstance::dump() const {
4829   print(errs()); errs() << '\n';
4830 }
4831 #endif
4832
4833 namespace {
4834
4835 class LoopStrengthReduce : public LoopPass {
4836 public:
4837   static char ID; // Pass ID, replacement for typeid
4838   LoopStrengthReduce();
4839
4840 private:
4841   bool runOnLoop(Loop *L, LPPassManager &LPM) override;
4842   void getAnalysisUsage(AnalysisUsage &AU) const override;
4843 };
4844
4845 }
4846
4847 char LoopStrengthReduce::ID = 0;
4848 INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
4849                 "Loop Strength Reduction", false, false)
4850 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
4851 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
4852 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
4853 INITIALIZE_PASS_DEPENDENCY(IVUsers)
4854 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
4855 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
4856 INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
4857                 "Loop Strength Reduction", false, false)
4858
4859
4860 Pass *llvm::createLoopStrengthReducePass() {
4861   return new LoopStrengthReduce();
4862 }
4863
4864 LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
4865   initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
4866 }
4867
4868 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
4869   // We split critical edges, so we change the CFG.  However, we do update
4870   // many analyses if they are around.
4871   AU.addPreservedID(LoopSimplifyID);
4872
4873   AU.addRequired<LoopInfo>();
4874   AU.addPreserved<LoopInfo>();
4875   AU.addRequiredID(LoopSimplifyID);
4876   AU.addRequired<DominatorTreeWrapperPass>();
4877   AU.addPreserved<DominatorTreeWrapperPass>();
4878   AU.addRequired<ScalarEvolution>();
4879   AU.addPreserved<ScalarEvolution>();
4880   // Requiring LoopSimplify a second time here prevents IVUsers from running
4881   // twice, since LoopSimplify was invalidated by running ScalarEvolution.
4882   AU.addRequiredID(LoopSimplifyID);
4883   AU.addRequired<IVUsers>();
4884   AU.addPreserved<IVUsers>();
4885   AU.addRequired<TargetTransformInfo>();
4886 }
4887
4888 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
4889   if (skipOptnoneFunction(L))
4890     return false;
4891
4892   bool Changed = false;
4893
4894   // Run the main LSR transformation.
4895   Changed |= LSRInstance(L, this).getChanged();
4896
4897   // Remove any extra phis created by processing inner loops.
4898   Changed |= DeleteDeadPHIs(L->getHeader());
4899   if (EnablePhiElim && L->isLoopSimplifyForm()) {
4900     SmallVector<WeakVH, 16> DeadInsts;
4901     SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), "lsr");
4902 #ifndef NDEBUG
4903     Rewriter.setDebugType(DEBUG_TYPE);
4904 #endif
4905     unsigned numFolded = Rewriter.replaceCongruentIVs(
4906         L, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), DeadInsts,
4907         &getAnalysis<TargetTransformInfo>());
4908     if (numFolded) {
4909       Changed = true;
4910       DeleteTriviallyDeadInstructions(DeadInsts);
4911       DeleteDeadPHIs(L->getHeader());
4912     }
4913   }
4914   return Changed;
4915 }