lib/Transforms/Scalar/LoopStrengthReduce.cpp

   1 //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This transformation analyzes and transforms the induction variables (and
  11 // computations derived from them) into forms suitable for efficient execution
  12 // on the target.
  13 //
  14 // This pass performs a strength reduction on array references inside loops that
  15 // have as one or more of their components the loop induction variable, it
  16 // rewrites expressions to take advantage of scaled-index addressing modes
  17 // available on the target, and it performs a variety of other optimizations
  18 // related to loop induction variables.
  19 //
  20 // Terminology note: this code has a lot of handling for "post-increment" or
  21 // "post-inc" users. This is not talking about post-increment addressing modes;
  22 // it is instead talking about code like this:
  23 //
  24 //   %i = phi [ 0, %entry ], [ %i.next, %latch ]
  25 //   ...
  26 //   %i.next = add %i, 1
  27 //   %c = icmp eq %i.next, %n
  28 //
  29 // The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
  30 // it's useful to think about these as the same register, with some uses using
  31 // the value of the register before the add and some using // it after. In this
  32 // example, the icmp is a post-increment user, since it uses %i.next, which is
  33 // the value of the induction variable after the increment. The other common
  34 // case of post-increment users is users outside the loop.
  35 //
  36 // TODO: More sophistication in the way Formulae are generated and filtered.
  37 //
  38 // TODO: Handle multiple loops at a time.
  39 //
  40 // TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr
  41 //       instead of a GlobalValue?
  42 //
  43 // TODO: When truncation is free, truncate ICmp users' operands to make it a
  44 //       smaller encoding (on x86 at least).
  45 //
  46 // TODO: When a negated register is used by an add (such as in a list of
  47 //       multiple base registers, or as the increment expression in an addrec),
  48 //       we may not actually need both reg and (-1 * reg) in registers; the
  49 //       negation can be implemented by using a sub instead of an add. The
  50 //       lack of support for taking this into consideration when making
  51 //       register pressure decisions is partly worked around by the "Special"
  52 //       use kind.
  53 //
  54 //===----------------------------------------------------------------------===//
  55
  56 #define DEBUG_TYPE "loop-reduce"
  57 #include "llvm/Transforms/Scalar.h"
  58 #include "llvm/Constants.h"
  59 #include "llvm/Instructions.h"
  60 #include "llvm/IntrinsicInst.h"
  61 #include "llvm/DerivedTypes.h"
  62 #include "llvm/Analysis/IVUsers.h"
  63 #include "llvm/Analysis/Dominators.h"
  64 #include "llvm/Analysis/LoopPass.h"
  65 #include "llvm/Analysis/ScalarEvolutionExpander.h"
  66 #include "llvm/Assembly/Writer.h"
  67 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  68 #include "llvm/Transforms/Utils/Local.h"
  69 #include "llvm/ADT/SmallBitVector.h"
  70 #include "llvm/ADT/SetVector.h"
  71 #include "llvm/ADT/DenseSet.h"
  72 #include "llvm/Support/Debug.h"
  73 #include "llvm/Support/CommandLine.h"
  74 #include "llvm/Support/ValueHandle.h"
  75 #include "llvm/Support/raw_ostream.h"
  76 #include "llvm/Target/TargetLowering.h"
  77 #include <algorithm>
  78 using namespace llvm;
  79
  80 /// MaxIVUsers is an arbitrary threshold that provides an early opportunitiy for
  81 /// bail out. This threshold is far beyond the number of users that LSR can
  82 /// conceivably solve, so it should not affect generated code, but catches the
  83 /// worst cases before LSR burns too much compile time and stack space.
  84 static const unsigned MaxIVUsers = 200;
  85
  86 // Temporary flag to cleanup congruent phis after LSR phi expansion.
  87 // It's currently disabled until we can determine whether it's truly useful or
  88 // not. The flag should be removed after the v3.0 release.
  89 // This is now needed for ivchains.
  90 static cl::opt<bool> EnablePhiElim(
  91   "enable-lsr-phielim", cl::Hidden, cl::init(true),
  92   cl::desc("Enable LSR phi elimination"));
  93
  94 #ifndef NDEBUG
  95 // Stress test IV chain generation.
  96 static cl::opt<bool> StressIVChain(
  97   "stress-ivchain", cl::Hidden, cl::init(false),
  98   cl::desc("Stress test LSR IV chains"));
  99 #else
 100 static bool StressIVChain = false;
 101 #endif
 102
 103 namespace {
 104
 105 /// RegSortData - This class holds data which is used to order reuse candidates.
 106 class RegSortData {
 107 public:
 108   /// UsedByIndices - This represents the set of LSRUse indices which reference
 109   /// a particular register.
 110   SmallBitVector UsedByIndices;
 111
 112   RegSortData() {}
 113
 114   void print(raw_ostream &OS) const;
 115   void dump() const;
 116 };
 117
 118 }
 119
 120 void RegSortData::print(raw_ostream &OS) const {
 121   OS << "[NumUses=" << UsedByIndices.count() << ']';
 122 }
 123
 124 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 125 void RegSortData::dump() const {
 126   print(errs()); errs() << '\n';
 127 }
 128 #endif
 129
 130 namespace {
 131
 132 /// RegUseTracker - Map register candidates to information about how they are
 133 /// used.
 134 class RegUseTracker {
 135   typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
 136
 137   RegUsesTy RegUsesMap;
 138   SmallVector<const SCEV *, 16> RegSequence;
 139
 140 public:
 141   void CountRegister(const SCEV *Reg, size_t LUIdx);
 142   void DropRegister(const SCEV *Reg, size_t LUIdx);
 143   void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx);
 144
 145   bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
 146
 147   const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
 148
 149   void clear();
 150
 151   typedef SmallVectorImpl<const SCEV *>::iterator iterator;
 152   typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
 153   iterator begin() { return RegSequence.begin(); }
 154   iterator end()   { return RegSequence.end(); }
 155   const_iterator begin() const { return RegSequence.begin(); }
 156   const_iterator end() const   { return RegSequence.end(); }
 157 };
 158
 159 }
 160
 161 void
 162 RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) {
 163   std::pair<RegUsesTy::iterator, bool> Pair =
 164     RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
 165   RegSortData &RSD = Pair.first->second;
 166   if (Pair.second)
 167     RegSequence.push_back(Reg);
 168   RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
 169   RSD.UsedByIndices.set(LUIdx);
 170 }
 171
 172 void
 173 RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) {
 174   RegUsesTy::iterator It = RegUsesMap.find(Reg);
 175   assert(It != RegUsesMap.end());
 176   RegSortData &RSD = It->second;
 177   assert(RSD.UsedByIndices.size() > LUIdx);
 178   RSD.UsedByIndices.reset(LUIdx);
 179 }
 180
 181 void
 182 RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
 183   assert(LUIdx <= LastLUIdx);
 184
 185   // Update RegUses. The data structure is not optimized for this purpose;
 186   // we must iterate through it and update each of the bit vectors.
 187   for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end();
 188        I != E; ++I) {
 189     SmallBitVector &UsedByIndices = I->second.UsedByIndices;
 190     if (LUIdx < UsedByIndices.size())
 191       UsedByIndices[LUIdx] =
 192         LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0;
 193     UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
 194   }
 195 }
 196
 197 bool
 198 RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
 199   RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
 200   if (I == RegUsesMap.end())
 201     return false;
 202   const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
 203   int i = UsedByIndices.find_first();
 204   if (i == -1) return false;
 205   if ((size_t)i != LUIdx) return true;
 206   return UsedByIndices.find_next(i) != -1;
 207 }
 208
 209 const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
 210   RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
 211   assert(I != RegUsesMap.end() && "Unknown register!");
 212   return I->second.UsedByIndices;
 213 }
 214
 215 void RegUseTracker::clear() {
 216   RegUsesMap.clear();
 217   RegSequence.clear();
 218 }
 219
 220 namespace {
 221
 222 /// Formula - This class holds information that describes a formula for
 223 /// computing satisfying a use. It may include broken-out immediates and scaled
 224 /// registers.
 225 struct Formula {
 226   /// AM - This is used to represent complex addressing, as well as other kinds
 227   /// of interesting uses.
 228   TargetLowering::AddrMode AM;
 229
 230   /// BaseRegs - The list of "base" registers for this use. When this is
 231   /// non-empty, AM.HasBaseReg should be set to true.
 232   SmallVector<const SCEV *, 2> BaseRegs;
 233
 234   /// ScaledReg - The 'scaled' register for this use. This should be non-null
 235   /// when AM.Scale is not zero.
 236   const SCEV *ScaledReg;
 237
 238   /// UnfoldedOffset - An additional constant offset which added near the
 239   /// use. This requires a temporary register, but the offset itself can
 240   /// live in an add immediate field rather than a register.
 241   int64_t UnfoldedOffset;
 242
 243   Formula() : ScaledReg(0), UnfoldedOffset(0) {}
 244
 245   void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
 246
 247   unsigned getNumRegs() const;
 248   Type *getType() const;
 249
 250   void DeleteBaseReg(const SCEV *&S);
 251
 252   bool referencesReg(const SCEV *S) const;
 253   bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
 254                                   const RegUseTracker &RegUses) const;
 255
 256   void print(raw_ostream &OS) const;
 257   void dump() const;
 258 };
 259
 260 }
 261
 262 /// DoInitialMatch - Recursion helper for InitialMatch.
 263 static void DoInitialMatch(const SCEV *S, Loop *L,
 264                            SmallVectorImpl<const SCEV *> &Good,
 265                            SmallVectorImpl<const SCEV *> &Bad,
 266                            ScalarEvolution &SE) {
 267   // Collect expressions which properly dominate the loop header.
 268   if (SE.properlyDominates(S, L->getHeader())) {
 269     Good.push_back(S);
 270     return;
 271   }
 272
 273   // Look at add operands.
 274   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 275     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
 276          I != E; ++I)
 277       DoInitialMatch(*I, L, Good, Bad, SE);
 278     return;
 279   }
 280
 281   // Look at addrec operands.
 282   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
 283     if (!AR->getStart()->isZero()) {
 284       DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
 285       DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
 286                                       AR->getStepRecurrence(SE),
 287                                       // FIXME: AR->getNoWrapFlags()
 288                                       AR->getLoop(), SCEV::FlagAnyWrap),
 289                      L, Good, Bad, SE);
 290       return;
 291     }
 292
 293   // Handle a multiplication by -1 (negation) if it didn't fold.
 294   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
 295     if (Mul->getOperand(0)->isAllOnesValue()) {
 296       SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
 297       const SCEV *NewMul = SE.getMulExpr(Ops);
 298
 299       SmallVector<const SCEV *, 4> MyGood;
 300       SmallVector<const SCEV *, 4> MyBad;
 301       DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
 302       const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
 303         SE.getEffectiveSCEVType(NewMul->getType())));
 304       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(),
 305            E = MyGood.end(); I != E; ++I)
 306         Good.push_back(SE.getMulExpr(NegOne, *I));
 307       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(),
 308            E = MyBad.end(); I != E; ++I)
 309         Bad.push_back(SE.getMulExpr(NegOne, *I));
 310       return;
 311     }
 312
 313   // Ok, we can't do anything interesting. Just stuff the whole thing into a
 314   // register and hope for the best.
 315   Bad.push_back(S);
 316 }
 317
 318 /// InitialMatch - Incorporate loop-variant parts of S into this Formula,
 319 /// attempting to keep all loop-invariant and loop-computable values in a
 320 /// single base register.
 321 void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
 322   SmallVector<const SCEV *, 4> Good;
 323   SmallVector<const SCEV *, 4> Bad;
 324   DoInitialMatch(S, L, Good, Bad, SE);
 325   if (!Good.empty()) {
 326     const SCEV *Sum = SE.getAddExpr(Good);
 327     if (!Sum->isZero())
 328       BaseRegs.push_back(Sum);
 329     AM.HasBaseReg = true;
 330   }
 331   if (!Bad.empty()) {
 332     const SCEV *Sum = SE.getAddExpr(Bad);
 333     if (!Sum->isZero())
 334       BaseRegs.push_back(Sum);
 335     AM.HasBaseReg = true;
 336   }
 337 }
 338
 339 /// getNumRegs - Return the total number of register operands used by this
 340 /// formula. This does not include register uses implied by non-constant
 341 /// addrec strides.
 342 unsigned Formula::getNumRegs() const {
 343   return !!ScaledReg + BaseRegs.size();
 344 }
 345
 346 /// getType - Return the type of this formula, if it has one, or null
 347 /// otherwise. This type is meaningless except for the bit size.
 348 Type *Formula::getType() const {
 349   return !BaseRegs.empty() ? BaseRegs.front()->getType() :
 350          ScaledReg ? ScaledReg->getType() :
 351          AM.BaseGV ? AM.BaseGV->getType() :
 352          0;
 353 }
 354
 355 /// DeleteBaseReg - Delete the given base reg from the BaseRegs list.
 356 void Formula::DeleteBaseReg(const SCEV *&S) {
 357   if (&S != &BaseRegs.back())
 358     std::swap(S, BaseRegs.back());
 359   BaseRegs.pop_back();
 360 }
 361
 362 /// referencesReg - Test if this formula references the given register.
 363 bool Formula::referencesReg(const SCEV *S) const {
 364   return S == ScaledReg ||
 365          std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
 366 }
 367
 368 /// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers
 369 /// which are used by uses other than the use with the given index.
 370 bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
 371                                          const RegUseTracker &RegUses) const {
 372   if (ScaledReg)
 373     if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
 374       return true;
 375   for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
 376        E = BaseRegs.end(); I != E; ++I)
 377     if (RegUses.isRegUsedByUsesOtherThan(*I, LUIdx))
 378       return true;
 379   return false;
 380 }
 381
 382 void Formula::print(raw_ostream &OS) const {
 383   bool First = true;
 384   if (AM.BaseGV) {
 385     if (!First) OS << " + "; else First = false;
 386     WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false);
 387   }
 388   if (AM.BaseOffs != 0) {
 389     if (!First) OS << " + "; else First = false;
 390     OS << AM.BaseOffs;
 391   }
 392   for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
 393        E = BaseRegs.end(); I != E; ++I) {
 394     if (!First) OS << " + "; else First = false;
 395     OS << "reg(" << **I << ')';
 396   }
 397   if (AM.HasBaseReg && BaseRegs.empty()) {
 398     if (!First) OS << " + "; else First = false;
 399     OS << "**error: HasBaseReg**";
 400   } else if (!AM.HasBaseReg && !BaseRegs.empty()) {
 401     if (!First) OS << " + "; else First = false;
 402     OS << "**error: !HasBaseReg**";
 403   }
 404   if (AM.Scale != 0) {
 405     if (!First) OS << " + "; else First = false;
 406     OS << AM.Scale << "*reg(";
 407     if (ScaledReg)
 408       OS << *ScaledReg;
 409     else
 410       OS << "<unknown>";
 411     OS << ')';
 412   }
 413   if (UnfoldedOffset != 0) {
 414     if (!First) OS << " + "; else First = false;
 415     OS << "imm(" << UnfoldedOffset << ')';
 416   }
 417 }
 418
 419 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 420 void Formula::dump() const {
 421   print(errs()); errs() << '\n';
 422 }
 423 #endif
 424
 425 /// isAddRecSExtable - Return true if the given addrec can be sign-extended
 426 /// without changing its value.
 427 static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
 428   Type *WideTy =
 429     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
 430   return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
 431 }
 432
 433 /// isAddSExtable - Return true if the given add can be sign-extended
 434 /// without changing its value.
 435 static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
 436   Type *WideTy =
 437     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
 438   return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
 439 }
 440
 441 /// isMulSExtable - Return true if the given mul can be sign-extended
 442 /// without changing its value.
 443 static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
 444   Type *WideTy =
 445     IntegerType::get(SE.getContext(),
 446                      SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
 447   return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
 448 }
 449
 450 /// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
 451 /// and if the remainder is known to be zero,  or null otherwise. If
 452 /// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified
 453 /// to Y, ignoring that the multiplication may overflow, which is useful when
 454 /// the result will be used in a context where the most significant bits are
 455 /// ignored.
 456 static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
 457                                 ScalarEvolution &SE,
 458                                 bool IgnoreSignificantBits = false) {
 459   // Handle the trivial case, which works for any SCEV type.
 460   if (LHS == RHS)
 461     return SE.getConstant(LHS->getType(), 1);
 462
 463   // Handle a few RHS special cases.
 464   const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
 465   if (RC) {
 466     const APInt &RA = RC->getValue()->getValue();
 467     // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
 468     // some folding.
 469     if (RA.isAllOnesValue())
 470       return SE.getMulExpr(LHS, RC);
 471     // Handle x /s 1 as x.
 472     if (RA == 1)
 473       return LHS;
 474   }
 475
 476   // Check for a division of a constant by a constant.
 477   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
 478     if (!RC)
 479       return 0;
 480     const APInt &LA = C->getValue()->getValue();
 481     const APInt &RA = RC->getValue()->getValue();
 482     if (LA.srem(RA) != 0)
 483       return 0;
 484     return SE.getConstant(LA.sdiv(RA));
 485   }
 486
 487   // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
 488   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
 489     if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
 490       const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
 491                                       IgnoreSignificantBits);
 492       if (!Step) return 0;
 493       const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
 494                                        IgnoreSignificantBits);
 495       if (!Start) return 0;
 496       // FlagNW is independent of the start value, step direction, and is
 497       // preserved with smaller magnitude steps.
 498       // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
 499       return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
 500     }
 501     return 0;
 502   }
 503
 504   // Distribute the sdiv over add operands, if the add doesn't overflow.
 505   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
 506     if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
 507       SmallVector<const SCEV *, 8> Ops;
 508       for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
 509            I != E; ++I) {
 510         const SCEV *Op = getExactSDiv(*I, RHS, SE,
 511                                       IgnoreSignificantBits);
 512         if (!Op) return 0;
 513         Ops.push_back(Op);
 514       }
 515       return SE.getAddExpr(Ops);
 516     }
 517     return 0;
 518   }
 519
 520   // Check for a multiply operand that we can pull RHS out of.
 521   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
 522     if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
 523       SmallVector<const SCEV *, 4> Ops;
 524       bool Found = false;
 525       for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
 526            I != E; ++I) {
 527         const SCEV *S = *I;
 528         if (!Found)
 529           if (const SCEV *Q = getExactSDiv(S, RHS, SE,
 530                                            IgnoreSignificantBits)) {
 531             S = Q;
 532             Found = true;
 533           }
 534         Ops.push_back(S);
 535       }
 536       return Found ? SE.getMulExpr(Ops) : 0;
 537     }
 538     return 0;
 539   }
 540
 541   // Otherwise we don't know.
 542   return 0;
 543 }
 544
 545 /// ExtractImmediate - If S involves the addition of a constant integer value,
 546 /// return that integer value, and mutate S to point to a new SCEV with that
 547 /// value excluded.
 548 static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
 549   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
 550     if (C->getValue()->getValue().getMinSignedBits() <= 64) {
 551       S = SE.getConstant(C->getType(), 0);
 552       return C->getValue()->getSExtValue();
 553     }
 554   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 555     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
 556     int64_t Result = ExtractImmediate(NewOps.front(), SE);
 557     if (Result != 0)
 558       S = SE.getAddExpr(NewOps);
 559     return Result;
 560   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
 561     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
 562     int64_t Result = ExtractImmediate(NewOps.front(), SE);
 563     if (Result != 0)
 564       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
 565                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
 566                            SCEV::FlagAnyWrap);
 567     return Result;
 568   }
 569   return 0;
 570 }
 571
 572 /// ExtractSymbol - If S involves the addition of a GlobalValue address,
 573 /// return that symbol, and mutate S to point to a new SCEV with that
 574 /// value excluded.
 575 static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
 576   if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
 577     if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
 578       S = SE.getConstant(GV->getType(), 0);
 579       return GV;
 580     }
 581   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 582     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
 583     GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
 584     if (Result)
 585       S = SE.getAddExpr(NewOps);
 586     return Result;
 587   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
 588     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
 589     GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
 590     if (Result)
 591       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
 592                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
 593                            SCEV::FlagAnyWrap);
 594     return Result;
 595   }
 596   return 0;
 597 }
 598
 599 /// isAddressUse - Returns true if the specified instruction is using the
 600 /// specified value as an address.
 601 static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
 602   bool isAddress = isa<LoadInst>(Inst);
 603   if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
 604     if (SI->getOperand(1) == OperandVal)
 605       isAddress = true;
 606   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
 607     // Addressing modes can also be folded into prefetches and a variety
 608     // of intrinsics.
 609     switch (II->getIntrinsicID()) {
 610       default: break;
 611       case Intrinsic::prefetch:
 612       case Intrinsic::x86_sse_storeu_ps:
 613       case Intrinsic::x86_sse2_storeu_pd:
 614       case Intrinsic::x86_sse2_storeu_dq:
 615       case Intrinsic::x86_sse2_storel_dq:
 616         if (II->getArgOperand(0) == OperandVal)
 617           isAddress = true;
 618         break;
 619     }
 620   }
 621   return isAddress;
 622 }
 623
 624 /// getAccessType - Return the type of the memory being accessed.
 625 static Type *getAccessType(const Instruction *Inst) {
 626   Type *AccessTy = Inst->getType();
 627   if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
 628     AccessTy = SI->getOperand(0)->getType();
 629   else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
 630     // Addressing modes can also be folded into prefetches and a variety
 631     // of intrinsics.
 632     switch (II->getIntrinsicID()) {
 633     default: break;
 634     case Intrinsic::x86_sse_storeu_ps:
 635     case Intrinsic::x86_sse2_storeu_pd:
 636     case Intrinsic::x86_sse2_storeu_dq:
 637     case Intrinsic::x86_sse2_storel_dq:
 638       AccessTy = II->getArgOperand(0)->getType();
 639       break;
 640     }
 641   }
 642
 643   // All pointers have the same requirements, so canonicalize them to an
 644   // arbitrary pointer type to minimize variation.
 645   if (PointerType *PTy = dyn_cast<PointerType>(AccessTy))
 646     AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
 647                                 PTy->getAddressSpace());
 648
 649   return AccessTy;
 650 }
 651
 652 /// isExistingPhi - Return true if this AddRec is already a phi in its loop.
 653 static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
 654   for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
 655        PHINode *PN = dyn_cast<PHINode>(I); ++I) {
 656     if (SE.isSCEVable(PN->getType()) &&
 657         (SE.getEffectiveSCEVType(PN->getType()) ==
 658          SE.getEffectiveSCEVType(AR->getType())) &&
 659         SE.getSCEV(PN) == AR)
 660       return true;
 661   }
 662   return false;
 663 }
 664
 665 /// Check if expanding this expression is likely to incur significant cost. This
 666 /// is tricky because SCEV doesn't track which expressions are actually computed
 667 /// by the current IR.
 668 ///
 669 /// We currently allow expansion of IV increments that involve adds,
 670 /// multiplication by constants, and AddRecs from existing phis.
 671 ///
 672 /// TODO: Allow UDivExpr if we can find an existing IV increment that is an
 673 /// obvious multiple of the UDivExpr.
 674 static bool isHighCostExpansion(const SCEV *S,
 675                                 SmallPtrSet<const SCEV*, 8> &Processed,
 676                                 ScalarEvolution &SE) {
 677   // Zero/One operand expressions
 678   switch (S->getSCEVType()) {
 679   case scUnknown:
 680   case scConstant:
 681     return false;
 682   case scTruncate:
 683     return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
 684                                Processed, SE);
 685   case scZeroExtend:
 686     return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
 687                                Processed, SE);
 688   case scSignExtend:
 689     return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
 690                                Processed, SE);
 691   }
 692
 693   if (!Processed.insert(S))
 694     return false;
 695
 696   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 697     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
 698          I != E; ++I) {
 699       if (isHighCostExpansion(*I, Processed, SE))
 700         return true;
 701     }
 702     return false;
 703   }
 704
 705   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
 706     if (Mul->getNumOperands() == 2) {
 707       // Multiplication by a constant is ok
 708       if (isa<SCEVConstant>(Mul->getOperand(0)))
 709         return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
 710
 711       // If we have the value of one operand, check if an existing
 712       // multiplication already generates this expression.
 713       if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
 714         Value *UVal = U->getValue();
 715         for (Value::use_iterator UI = UVal->use_begin(), UE = UVal->use_end();
 716              UI != UE; ++UI) {
 717           // If U is a constant, it may be used by a ConstantExpr.
 718           Instruction *User = dyn_cast<Instruction>(*UI);
 719           if (User && User->getOpcode() == Instruction::Mul
 720               && SE.isSCEVable(User->getType())) {
 721             return SE.getSCEV(User) == Mul;
 722           }
 723         }
 724       }
 725     }
 726   }
 727
 728   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
 729     if (isExistingPhi(AR, SE))
 730       return false;
 731   }
 732
 733   // Fow now, consider any other type of expression (div/mul/min/max) high cost.
 734   return true;
 735 }
 736
 737 /// DeleteTriviallyDeadInstructions - If any of the instructions is the
 738 /// specified set are trivially dead, delete them and see if this makes any of
 739 /// their operands subsequently dead.
 740 static bool
 741 DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
 742   bool Changed = false;
 743
 744   while (!DeadInsts.empty()) {
 745     Value *V = DeadInsts.pop_back_val();
 746     Instruction *I = dyn_cast_or_null<Instruction>(V);
 747
 748     if (I == 0 || !isInstructionTriviallyDead(I))
 749       continue;
 750
 751     for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
 752       if (Instruction *U = dyn_cast<Instruction>(*OI)) {
 753         *OI = 0;
 754         if (U->use_empty())
 755           DeadInsts.push_back(U);
 756       }
 757
 758     I->eraseFromParent();
 759     Changed = true;
 760   }
 761
 762   return Changed;
 763 }
 764
 765 namespace {
 766
 767 /// Cost - This class is used to measure and compare candidate formulae.
 768 class Cost {
 769   /// TODO: Some of these could be merged. Also, a lexical ordering
 770   /// isn't always optimal.
 771   unsigned NumRegs;
 772   unsigned AddRecCost;
 773   unsigned NumIVMuls;
 774   unsigned NumBaseAdds;
 775   unsigned ImmCost;
 776   unsigned SetupCost;
 777
 778 public:
 779   Cost()
 780     : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
 781       SetupCost(0) {}
 782
 783   bool operator<(const Cost &Other) const;
 784
 785   void Loose();
 786
 787 #ifndef NDEBUG
 788   // Once any of the metrics loses, they must all remain losers.
 789   bool isValid() {
 790     return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
 791              | ImmCost | SetupCost) != ~0u)
 792       || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
 793            & ImmCost & SetupCost) == ~0u);
 794   }
 795 #endif
 796
 797   bool isLoser() {
 798     assert(isValid() && "invalid cost");
 799     return NumRegs == ~0u;
 800   }
 801
 802   void RateFormula(const Formula &F,
 803                    SmallPtrSet<const SCEV *, 16> &Regs,
 804                    const DenseSet<const SCEV *> &VisitedRegs,
 805                    const Loop *L,
 806                    const SmallVectorImpl<int64_t> &Offsets,
 807                    ScalarEvolution &SE, DominatorTree &DT,
 808                    SmallPtrSet<const SCEV *, 16> *LoserRegs = 0);
 809
 810   void print(raw_ostream &OS) const;
 811   void dump() const;
 812
 813 private:
 814   void RateRegister(const SCEV *Reg,
 815                     SmallPtrSet<const SCEV *, 16> &Regs,
 816                     const Loop *L,
 817                     ScalarEvolution &SE, DominatorTree &DT);
 818   void RatePrimaryRegister(const SCEV *Reg,
 819                            SmallPtrSet<const SCEV *, 16> &Regs,
 820                            const Loop *L,
 821                            ScalarEvolution &SE, DominatorTree &DT,
 822                            SmallPtrSet<const SCEV *, 16> *LoserRegs);
 823 };
 824
 825 }
 826
 827 /// RateRegister - Tally up interesting quantities from the given register.
 828 void Cost::RateRegister(const SCEV *Reg,
 829                         SmallPtrSet<const SCEV *, 16> &Regs,
 830                         const Loop *L,
 831                         ScalarEvolution &SE, DominatorTree &DT) {
 832   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
 833     // If this is an addrec for another loop, don't second-guess its addrec phi
 834     // nodes. LSR isn't currently smart enough to reason about more than one
 835     // loop at a time. LSR has already run on inner loops, will not run on outer
 836     // loops, and cannot be expected to change sibling loops.
 837     if (AR->getLoop() != L) {
 838       // If the AddRec exists, consider it's register free and leave it alone.
 839       if (isExistingPhi(AR, SE))
 840         return;
 841
 842       // Otherwise, do not consider this formula at all.
 843       Loose();
 844       return;
 845     }
 846     AddRecCost += 1; /// TODO: This should be a function of the stride.
 847
 848     // Add the step value register, if it needs one.
 849     // TODO: The non-affine case isn't precisely modeled here.
 850     if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
 851       if (!Regs.count(AR->getOperand(1))) {
 852         RateRegister(AR->getOperand(1), Regs, L, SE, DT);
 853         if (isLoser())
 854           return;
 855       }
 856     }
 857   }
 858   ++NumRegs;
 859
 860   // Rough heuristic; favor registers which don't require extra setup
 861   // instructions in the preheader.
 862   if (!isa<SCEVUnknown>(Reg) &&
 863       !isa<SCEVConstant>(Reg) &&
 864       !(isa<SCEVAddRecExpr>(Reg) &&
 865         (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
 866          isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
 867     ++SetupCost;
 868
 869     NumIVMuls += isa<SCEVMulExpr>(Reg) &&
 870                  SE.hasComputableLoopEvolution(Reg, L);
 871 }
 872
 873 /// RatePrimaryRegister - Record this register in the set. If we haven't seen it
 874 /// before, rate it. Optional LoserRegs provides a way to declare any formula
 875 /// that refers to one of those regs an instant loser.
 876 void Cost::RatePrimaryRegister(const SCEV *Reg,
 877                                SmallPtrSet<const SCEV *, 16> &Regs,
 878                                const Loop *L,
 879                                ScalarEvolution &SE, DominatorTree &DT,
 880                                SmallPtrSet<const SCEV *, 16> *LoserRegs) {
 881   if (LoserRegs && LoserRegs->count(Reg)) {
 882     Loose();
 883     return;
 884   }
 885   if (Regs.insert(Reg)) {
 886     RateRegister(Reg, Regs, L, SE, DT);
 887     if (isLoser())
 888       LoserRegs->insert(Reg);
 889   }
 890 }
 891
 892 void Cost::RateFormula(const Formula &F,
 893                        SmallPtrSet<const SCEV *, 16> &Regs,
 894                        const DenseSet<const SCEV *> &VisitedRegs,
 895                        const Loop *L,
 896                        const SmallVectorImpl<int64_t> &Offsets,
 897                        ScalarEvolution &SE, DominatorTree &DT,
 898                        SmallPtrSet<const SCEV *, 16> *LoserRegs) {
 899   // Tally up the registers.
 900   if (const SCEV *ScaledReg = F.ScaledReg) {
 901     if (VisitedRegs.count(ScaledReg)) {
 902       Loose();
 903       return;
 904     }
 905     RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
 906     if (isLoser())
 907       return;
 908   }
 909   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
 910        E = F.BaseRegs.end(); I != E; ++I) {
 911     const SCEV *BaseReg = *I;
 912     if (VisitedRegs.count(BaseReg)) {
 913       Loose();
 914       return;
 915     }
 916     RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
 917     if (isLoser())
 918       return;
 919   }
 920
 921   // Determine how many (unfolded) adds we'll need inside the loop.
 922   size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
 923   if (NumBaseParts > 1)
 924     NumBaseAdds += NumBaseParts - 1;
 925
 926   // Tally up the non-zero immediates.
 927   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
 928        E = Offsets.end(); I != E; ++I) {
 929     int64_t Offset = (uint64_t)*I + F.AM.BaseOffs;
 930     if (F.AM.BaseGV)
 931       ImmCost += 64; // Handle symbolic values conservatively.
 932                      // TODO: This should probably be the pointer size.
 933     else if (Offset != 0)
 934       ImmCost += APInt(64, Offset, true).getMinSignedBits();
 935   }
 936   assert(isValid() && "invalid cost");
 937 }
 938
 939 /// Loose - Set this cost to a losing value.
 940 void Cost::Loose() {
 941   NumRegs = ~0u;
 942   AddRecCost = ~0u;
 943   NumIVMuls = ~0u;
 944   NumBaseAdds = ~0u;
 945   ImmCost = ~0u;
 946   SetupCost = ~0u;
 947 }
 948
 949 /// operator< - Choose the lower cost.
 950 bool Cost::operator<(const Cost &Other) const {
 951   if (NumRegs != Other.NumRegs)
 952     return NumRegs < Other.NumRegs;
 953   if (AddRecCost != Other.AddRecCost)
 954     return AddRecCost < Other.AddRecCost;
 955   if (NumIVMuls != Other.NumIVMuls)
 956     return NumIVMuls < Other.NumIVMuls;
 957   if (NumBaseAdds != Other.NumBaseAdds)
 958     return NumBaseAdds < Other.NumBaseAdds;
 959   if (ImmCost != Other.ImmCost)
 960     return ImmCost < Other.ImmCost;
 961   if (SetupCost != Other.SetupCost)
 962     return SetupCost < Other.SetupCost;
 963   return false;
 964 }
 965
 966 void Cost::print(raw_ostream &OS) const {
 967   OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
 968   if (AddRecCost != 0)
 969     OS << ", with addrec cost " << AddRecCost;
 970   if (NumIVMuls != 0)
 971     OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
 972   if (NumBaseAdds != 0)
 973     OS << ", plus " << NumBaseAdds << " base add"
 974        << (NumBaseAdds == 1 ? "" : "s");
 975   if (ImmCost != 0)
 976     OS << ", plus " << ImmCost << " imm cost";
 977   if (SetupCost != 0)
 978     OS << ", plus " << SetupCost << " setup cost";
 979 }
 980
 981 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 982 void Cost::dump() const {
 983   print(errs()); errs() << '\n';
 984 }
 985 #endif
 986
 987 namespace {
 988
 989 /// LSRFixup - An operand value in an instruction which is to be replaced
 990 /// with some equivalent, possibly strength-reduced, replacement.
 991 struct LSRFixup {
 992   /// UserInst - The instruction which will be updated.
 993   Instruction *UserInst;
 994
 995   /// OperandValToReplace - The operand of the instruction which will
 996   /// be replaced. The operand may be used more than once; every instance
 997   /// will be replaced.
 998   Value *OperandValToReplace;
 999
1000   /// PostIncLoops - If this user is to use the post-incremented value of an
1001   /// induction variable, this variable is non-null and holds the loop
1002   /// associated with the induction variable.
1003   PostIncLoopSet PostIncLoops;
1004
1005   /// LUIdx - The index of the LSRUse describing the expression which
1006   /// this fixup needs, minus an offset (below).
1007   size_t LUIdx;
1008
1009   /// Offset - A constant offset to be added to the LSRUse expression.
1010   /// This allows multiple fixups to share the same LSRUse with different
1011   /// offsets, for example in an unrolled loop.
1012   int64_t Offset;
1013
1014   bool isUseFullyOutsideLoop(const Loop *L) const;
1015
1016   LSRFixup();
1017
1018   void print(raw_ostream &OS) const;
1019   void dump() const;
1020 };
1021
1022 }
1023
1024 LSRFixup::LSRFixup()
1025   : UserInst(0), OperandValToReplace(0), LUIdx(~size_t(0)), Offset(0) {}
1026
1027 /// isUseFullyOutsideLoop - Test whether this fixup always uses its
1028 /// value outside of the given loop.
1029 bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1030   // PHI nodes use their value in their incoming blocks.
1031   if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1032     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1033       if (PN->getIncomingValue(i) == OperandValToReplace &&
1034           L->contains(PN->getIncomingBlock(i)))
1035         return false;
1036     return true;
1037   }
1038
1039   return !L->contains(UserInst);
1040 }
1041
1042 void LSRFixup::print(raw_ostream &OS) const {
1043   OS << "UserInst=";
1044   // Store is common and interesting enough to be worth special-casing.
1045   if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1046     OS << "store ";
1047     WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false);
1048   } else if (UserInst->getType()->isVoidTy())
1049     OS << UserInst->getOpcodeName();
1050   else
1051     WriteAsOperand(OS, UserInst, /*PrintType=*/false);
1052
1053   OS << ", OperandValToReplace=";
1054   WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false);
1055
1056   for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(),
1057        E = PostIncLoops.end(); I != E; ++I) {
1058     OS << ", PostIncLoop=";
1059     WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false);
1060   }
1061
1062   if (LUIdx != ~size_t(0))
1063     OS << ", LUIdx=" << LUIdx;
1064
1065   if (Offset != 0)
1066     OS << ", Offset=" << Offset;
1067 }
1068
1069 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1070 void LSRFixup::dump() const {
1071   print(errs()); errs() << '\n';
1072 }
1073 #endif
1074
1075 namespace {
1076
1077 /// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding
1078 /// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*.
1079 struct UniquifierDenseMapInfo {
1080   static SmallVector<const SCEV *, 2> getEmptyKey() {
1081     SmallVector<const SCEV *, 2> V;
1082     V.push_back(reinterpret_cast<const SCEV *>(-1));
1083     return V;
1084   }
1085
1086   static SmallVector<const SCEV *, 2> getTombstoneKey() {
1087     SmallVector<const SCEV *, 2> V;
1088     V.push_back(reinterpret_cast<const SCEV *>(-2));
1089     return V;
1090   }
1091
1092   static unsigned getHashValue(const SmallVector<const SCEV *, 2> &V) {
1093     unsigned Result = 0;
1094     for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(),
1095          E = V.end(); I != E; ++I)
1096       Result ^= DenseMapInfo<const SCEV *>::getHashValue(*I);
1097     return Result;
1098   }
1099
1100   static bool isEqual(const SmallVector<const SCEV *, 2> &LHS,
1101                       const SmallVector<const SCEV *, 2> &RHS) {
1102     return LHS == RHS;
1103   }
1104 };
1105
1106 /// LSRUse - This class holds the state that LSR keeps for each use in
1107 /// IVUsers, as well as uses invented by LSR itself. It includes information
1108 /// about what kinds of things can be folded into the user, information about
1109 /// the user itself, and information about how the use may be satisfied.
1110 /// TODO: Represent multiple users of the same expression in common?
1111 class LSRUse {
1112   DenseSet<SmallVector<const SCEV *, 2>, UniquifierDenseMapInfo> Uniquifier;
1113
1114 public:
1115   /// KindType - An enum for a kind of use, indicating what types of
1116   /// scaled and immediate operands it might support.
1117   enum KindType {
1118     Basic,   ///< A normal use, with no folding.
1119     Special, ///< A special case of basic, allowing -1 scales.
1120     Address, ///< An address use; folding according to TargetLowering
1121     ICmpZero ///< An equality icmp with both operands folded into one.
1122     // TODO: Add a generic icmp too?
1123   };
1124
1125   KindType Kind;
1126   Type *AccessTy;
1127
1128   SmallVector<int64_t, 8> Offsets;
1129   int64_t MinOffset;
1130   int64_t MaxOffset;
1131
1132   /// AllFixupsOutsideLoop - This records whether all of the fixups using this
1133   /// LSRUse are outside of the loop, in which case some special-case heuristics
1134   /// may be used.
1135   bool AllFixupsOutsideLoop;
1136
1137   /// WidestFixupType - This records the widest use type for any fixup using
1138   /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different
1139   /// max fixup widths to be equivalent, because the narrower one may be relying
1140   /// on the implicit truncation to truncate away bogus bits.
1141   Type *WidestFixupType;
1142
1143   /// Formulae - A list of ways to build a value that can satisfy this user.
1144   /// After the list is populated, one of these is selected heuristically and
1145   /// used to formulate a replacement for OperandValToReplace in UserInst.
1146   SmallVector<Formula, 12> Formulae;
1147
1148   /// Regs - The set of register candidates used by all formulae in this LSRUse.
1149   SmallPtrSet<const SCEV *, 4> Regs;
1150
1151   LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T),
1152                                       MinOffset(INT64_MAX),
1153                                       MaxOffset(INT64_MIN),
1154                                       AllFixupsOutsideLoop(true),
1155                                       WidestFixupType(0) {}
1156
1157   bool HasFormulaWithSameRegs(const Formula &F) const;
1158   bool InsertFormula(const Formula &F);
1159   void DeleteFormula(Formula &F);
1160   void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1161
1162   void print(raw_ostream &OS) const;
1163   void dump() const;
1164 };
1165
1166 }
1167
1168 /// HasFormula - Test whether this use as a formula which has the same
1169 /// registers as the given formula.
1170 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1171   SmallVector<const SCEV *, 2> Key = F.BaseRegs;
1172   if (F.ScaledReg) Key.push_back(F.ScaledReg);
1173   // Unstable sort by host order ok, because this is only used for uniquifying.
1174   std::sort(Key.begin(), Key.end());
1175   return Uniquifier.count(Key);
1176 }
1177
1178 /// InsertFormula - If the given formula has not yet been inserted, add it to
1179 /// the list, and return true. Return false otherwise.
1180 bool LSRUse::InsertFormula(const Formula &F) {
1181   SmallVector<const SCEV *, 2> Key = F.BaseRegs;
1182   if (F.ScaledReg) Key.push_back(F.ScaledReg);
1183   // Unstable sort by host order ok, because this is only used for uniquifying.
1184   std::sort(Key.begin(), Key.end());
1185
1186   if (!Uniquifier.insert(Key).second)
1187     return false;
1188
1189   // Using a register to hold the value of 0 is not profitable.
1190   assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1191          "Zero allocated in a scaled register!");
1192 #ifndef NDEBUG
1193   for (SmallVectorImpl<const SCEV *>::const_iterator I =
1194        F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I)
1195     assert(!(*I)->isZero() && "Zero allocated in a base register!");
1196 #endif
1197
1198   // Add the formula to the list.
1199   Formulae.push_back(F);
1200
1201   // Record registers now being used by this use.
1202   Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1203
1204   return true;
1205 }
1206
1207 /// DeleteFormula - Remove the given formula from this use's list.
1208 void LSRUse::DeleteFormula(Formula &F) {
1209   if (&F != &Formulae.back())
1210     std::swap(F, Formulae.back());
1211   Formulae.pop_back();
1212 }
1213
1214 /// RecomputeRegs - Recompute the Regs field, and update RegUses.
1215 void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1216   // Now that we've filtered out some formulae, recompute the Regs set.
1217   SmallPtrSet<const SCEV *, 4> OldRegs = Regs;
1218   Regs.clear();
1219   for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(),
1220        E = Formulae.end(); I != E; ++I) {
1221     const Formula &F = *I;
1222     if (F.ScaledReg) Regs.insert(F.ScaledReg);
1223     Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1224   }
1225
1226   // Update the RegTracker.
1227   for (SmallPtrSet<const SCEV *, 4>::iterator I = OldRegs.begin(),
1228        E = OldRegs.end(); I != E; ++I)
1229     if (!Regs.count(*I))
1230       RegUses.DropRegister(*I, LUIdx);
1231 }
1232
1233 void LSRUse::print(raw_ostream &OS) const {
1234   OS << "LSR Use: Kind=";
1235   switch (Kind) {
1236   case Basic:    OS << "Basic"; break;
1237   case Special:  OS << "Special"; break;
1238   case ICmpZero: OS << "ICmpZero"; break;
1239   case Address:
1240     OS << "Address of ";
1241     if (AccessTy->isPointerTy())
1242       OS << "pointer"; // the full pointer type could be really verbose
1243     else
1244       OS << *AccessTy;
1245   }
1246
1247   OS << ", Offsets={";
1248   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
1249        E = Offsets.end(); I != E; ++I) {
1250     OS << *I;
1251     if (llvm::next(I) != E)
1252       OS << ',';
1253   }
1254   OS << '}';
1255
1256   if (AllFixupsOutsideLoop)
1257     OS << ", all-fixups-outside-loop";
1258
1259   if (WidestFixupType)
1260     OS << ", widest fixup type: " << *WidestFixupType;
1261 }
1262
1263 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1264 void LSRUse::dump() const {
1265   print(errs()); errs() << '\n';
1266 }
1267 #endif
1268
1269 /// isLegalUse - Test whether the use described by AM is "legal", meaning it can
1270 /// be completely folded into the user instruction at isel time. This includes
1271 /// address-mode folding and special icmp tricks.
1272 static bool isLegalUse(const TargetLowering::AddrMode &AM,
1273                        LSRUse::KindType Kind, Type *AccessTy,
1274                        const TargetLowering *TLI) {
1275   switch (Kind) {
1276   case LSRUse::Address:
1277     // If we have low-level target information, ask the target if it can
1278     // completely fold this address.
1279     if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy);
1280
1281     // Otherwise, just guess that reg+reg addressing is legal.
1282     return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1;
1283
1284   case LSRUse::ICmpZero:
1285     // There's not even a target hook for querying whether it would be legal to
1286     // fold a GV into an ICmp.
1287     if (AM.BaseGV)
1288       return false;
1289
1290     // ICmp only has two operands; don't allow more than two non-trivial parts.
1291     if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0)
1292       return false;
1293
1294     // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1295     // putting the scaled register in the other operand of the icmp.
1296     if (AM.Scale != 0 && AM.Scale != -1)
1297       return false;
1298
1299     // If we have low-level target information, ask the target if it can fold an
1300     // integer immediate on an icmp.
1301     if (AM.BaseOffs != 0) {
1302       if (!TLI)
1303         return false;
1304       // We have one of:
1305       // ICmpZero     BaseReg + Offset => ICmp BaseReg, -Offset
1306       // ICmpZero -1*ScaleReg + Offset => ICmp ScaleReg, Offset
1307       // Offs is the ICmp immediate.
1308       int64_t Offs = AM.BaseOffs;
1309       if (AM.Scale == 0)
1310         Offs = -(uint64_t)Offs; // The cast does the right thing with INT64_MIN.
1311       return TLI->isLegalICmpImmediate(Offs);
1312     }
1313
1314     // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1315     return true;
1316
1317   case LSRUse::Basic:
1318     // Only handle single-register values.
1319     return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0;
1320
1321   case LSRUse::Special:
1322     // Special case Basic to handle -1 scales.
1323     return !AM.BaseGV && (AM.Scale == 0 || AM.Scale == -1) && AM.BaseOffs == 0;
1324   }
1325
1326   llvm_unreachable("Invalid LSRUse Kind!");
1327 }
1328
1329 static bool isLegalUse(TargetLowering::AddrMode AM,
1330                        int64_t MinOffset, int64_t MaxOffset,
1331                        LSRUse::KindType Kind, Type *AccessTy,
1332                        const TargetLowering *TLI) {
1333   // Check for overflow.
1334   if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) !=
1335       (MinOffset > 0))
1336     return false;
1337   AM.BaseOffs = (uint64_t)AM.BaseOffs + MinOffset;
1338   if (isLegalUse(AM, Kind, AccessTy, TLI)) {
1339     AM.BaseOffs = (uint64_t)AM.BaseOffs - MinOffset;
1340     // Check for overflow.
1341     if (((int64_t)((uint64_t)AM.BaseOffs + MaxOffset) > AM.BaseOffs) !=
1342         (MaxOffset > 0))
1343       return false;
1344     AM.BaseOffs = (uint64_t)AM.BaseOffs + MaxOffset;
1345     return isLegalUse(AM, Kind, AccessTy, TLI);
1346   }
1347   return false;
1348 }
1349
1350 static bool isAlwaysFoldable(int64_t BaseOffs,
1351                              GlobalValue *BaseGV,
1352                              bool HasBaseReg,
1353                              LSRUse::KindType Kind, Type *AccessTy,
1354                              const TargetLowering *TLI) {
1355   // Fast-path: zero is always foldable.
1356   if (BaseOffs == 0 && !BaseGV) return true;
1357
1358   // Conservatively, create an address with an immediate and a
1359   // base and a scale.
1360   TargetLowering::AddrMode AM;
1361   AM.BaseOffs = BaseOffs;
1362   AM.BaseGV = BaseGV;
1363   AM.HasBaseReg = HasBaseReg;
1364   AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1365
1366   // Canonicalize a scale of 1 to a base register if the formula doesn't
1367   // already have a base register.
1368   if (!AM.HasBaseReg && AM.Scale == 1) {
1369     AM.Scale = 0;
1370     AM.HasBaseReg = true;
1371   }
1372
1373   return isLegalUse(AM, Kind, AccessTy, TLI);
1374 }
1375
1376 static bool isAlwaysFoldable(const SCEV *S,
1377                              int64_t MinOffset, int64_t MaxOffset,
1378                              bool HasBaseReg,
1379                              LSRUse::KindType Kind, Type *AccessTy,
1380                              const TargetLowering *TLI,
1381                              ScalarEvolution &SE) {
1382   // Fast-path: zero is always foldable.
1383   if (S->isZero()) return true;
1384
1385   // Conservatively, create an address with an immediate and a
1386   // base and a scale.
1387   int64_t BaseOffs = ExtractImmediate(S, SE);
1388   GlobalValue *BaseGV = ExtractSymbol(S, SE);
1389
1390   // If there's anything else involved, it's not foldable.
1391   if (!S->isZero()) return false;
1392
1393   // Fast-path: zero is always foldable.
1394   if (BaseOffs == 0 && !BaseGV) return true;
1395
1396   // Conservatively, create an address with an immediate and a
1397   // base and a scale.
1398   TargetLowering::AddrMode AM;
1399   AM.BaseOffs = BaseOffs;
1400   AM.BaseGV = BaseGV;
1401   AM.HasBaseReg = HasBaseReg;
1402   AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1403
1404   return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI);
1405 }
1406
1407 namespace {
1408
1409 /// UseMapDenseMapInfo - A DenseMapInfo implementation for holding
1410 /// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind.
1411 struct UseMapDenseMapInfo {
1412   static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() {
1413     return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic);
1414   }
1415
1416   static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() {
1417     return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic);
1418   }
1419
1420   static unsigned
1421   getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) {
1422     unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first);
1423     Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second));
1424     return Result;
1425   }
1426
1427   static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS,
1428                       const std::pair<const SCEV *, LSRUse::KindType> &RHS) {
1429     return LHS == RHS;
1430   }
1431 };
1432
1433 /// IVInc - An individual increment in a Chain of IV increments.
1434 /// Relate an IV user to an expression that computes the IV it uses from the IV
1435 /// used by the previous link in the Chain.
1436 ///
1437 /// For the head of a chain, IncExpr holds the absolute SCEV expression for the
1438 /// original IVOperand. The head of the chain's IVOperand is only valid during
1439 /// chain collection, before LSR replaces IV users. During chain generation,
1440 /// IncExpr can be used to find the new IVOperand that computes the same
1441 /// expression.
1442 struct IVInc {
1443   Instruction *UserInst;
1444   Value* IVOperand;
1445   const SCEV *IncExpr;
1446
1447   IVInc(Instruction *U, Value *O, const SCEV *E):
1448     UserInst(U), IVOperand(O), IncExpr(E) {}
1449 };
1450
1451 // IVChain - The list of IV increments in program order.
1452 // We typically add the head of a chain without finding subsequent links.
1453 struct IVChain {
1454   SmallVector<IVInc,1> Incs;
1455   const SCEV *ExprBase;
1456
1457   IVChain() : ExprBase(0) {}
1458
1459   IVChain(const IVInc &Head, const SCEV *Base)
1460     : Incs(1, Head), ExprBase(Base) {}
1461
1462   typedef SmallVectorImpl<IVInc>::const_iterator const_iterator;
1463
1464   // begin - return the first increment in the chain.
1465   const_iterator begin() const {
1466     assert(!Incs.empty());
1467     return llvm::next(Incs.begin());
1468   }
1469   const_iterator end() const {
1470     return Incs.end();
1471   }
1472
1473   // hasIncs - Returns true if this chain contains any increments.
1474   bool hasIncs() const { return Incs.size() >= 2; }
1475
1476   // add - Add an IVInc to the end of this chain.
1477   void add(const IVInc &X) { Incs.push_back(X); }
1478
1479   // tailUserInst - Returns the last UserInst in the chain.
1480   Instruction *tailUserInst() const { return Incs.back().UserInst; }
1481
1482   // isProfitableIncrement - Returns true if IncExpr can be profitably added to
1483   // this chain.
1484   bool isProfitableIncrement(const SCEV *OperExpr,
1485                              const SCEV *IncExpr,
1486                              ScalarEvolution&);
1487 };
1488
1489 /// ChainUsers - Helper for CollectChains to track multiple IV increment uses.
1490 /// Distinguish between FarUsers that definitely cross IV increments and
1491 /// NearUsers that may be used between IV increments.
1492 struct ChainUsers {
1493   SmallPtrSet<Instruction*, 4> FarUsers;
1494   SmallPtrSet<Instruction*, 4> NearUsers;
1495 };
1496
1497 /// LSRInstance - This class holds state for the main loop strength reduction
1498 /// logic.
1499 class LSRInstance {
1500   IVUsers &IU;
1501   ScalarEvolution &SE;
1502   DominatorTree &DT;
1503   LoopInfo &LI;
1504   const TargetLowering *const TLI;
1505   Loop *const L;
1506   bool Changed;
1507
1508   /// IVIncInsertPos - This is the insert position that the current loop's
1509   /// induction variable increment should be placed. In simple loops, this is
1510   /// the latch block's terminator. But in more complicated cases, this is a
1511   /// position which will dominate all the in-loop post-increment users.
1512   Instruction *IVIncInsertPos;
1513
1514   /// Factors - Interesting factors between use strides.
1515   SmallSetVector<int64_t, 8> Factors;
1516
1517   /// Types - Interesting use types, to facilitate truncation reuse.
1518   SmallSetVector<Type *, 4> Types;
1519
1520   /// Fixups - The list of operands which are to be replaced.
1521   SmallVector<LSRFixup, 16> Fixups;
1522
1523   /// Uses - The list of interesting uses.
1524   SmallVector<LSRUse, 16> Uses;
1525
1526   /// RegUses - Track which uses use which register candidates.
1527   RegUseTracker RegUses;
1528
1529   // Limit the number of chains to avoid quadratic behavior. We don't expect to
1530   // have more than a few IV increment chains in a loop. Missing a Chain falls
1531   // back to normal LSR behavior for those uses.
1532   static const unsigned MaxChains = 8;
1533
1534   /// IVChainVec - IV users can form a chain of IV increments.
1535   SmallVector<IVChain, MaxChains> IVChainVec;
1536
1537   /// IVIncSet - IV users that belong to profitable IVChains.
1538   SmallPtrSet<Use*, MaxChains> IVIncSet;
1539
1540   void OptimizeShadowIV();
1541   bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
1542   ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
1543   void OptimizeLoopTermCond();
1544
1545   void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
1546                         SmallVectorImpl<ChainUsers> &ChainUsersVec);
1547   void FinalizeChain(IVChain &Chain);
1548   void CollectChains();
1549   void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
1550                        SmallVectorImpl<WeakVH> &DeadInsts);
1551
1552   void CollectInterestingTypesAndFactors();
1553   void CollectFixupsAndInitialFormulae();
1554
1555   LSRFixup &getNewFixup() {
1556     Fixups.push_back(LSRFixup());
1557     return Fixups.back();
1558   }
1559
1560   // Support for sharing of LSRUses between LSRFixups.
1561   typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>,
1562                    size_t,
1563                    UseMapDenseMapInfo> UseMapTy;
1564   UseMapTy UseMap;
1565
1566   bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
1567                           LSRUse::KindType Kind, Type *AccessTy);
1568
1569   std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
1570                                     LSRUse::KindType Kind,
1571                                     Type *AccessTy);
1572
1573   void DeleteUse(LSRUse &LU, size_t LUIdx);
1574
1575   LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
1576
1577   void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1578   void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1579   void CountRegisters(const Formula &F, size_t LUIdx);
1580   bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
1581
1582   void CollectLoopInvariantFixupsAndFormulae();
1583
1584   void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
1585                               unsigned Depth = 0);
1586   void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
1587   void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
1588   void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
1589   void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
1590   void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
1591   void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
1592   void GenerateCrossUseConstantOffsets();
1593   void GenerateAllReuseFormulae();
1594
1595   void FilterOutUndesirableDedicatedRegisters();
1596
1597   size_t EstimateSearchSpaceComplexity() const;
1598   void NarrowSearchSpaceByDetectingSupersets();
1599   void NarrowSearchSpaceByCollapsingUnrolledCode();
1600   void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
1601   void NarrowSearchSpaceByPickingWinnerRegs();
1602   void NarrowSearchSpaceUsingHeuristics();
1603
1604   void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
1605                     Cost &SolutionCost,
1606                     SmallVectorImpl<const Formula *> &Workspace,
1607                     const Cost &CurCost,
1608                     const SmallPtrSet<const SCEV *, 16> &CurRegs,
1609                     DenseSet<const SCEV *> &VisitedRegs) const;
1610   void Solve(SmallVectorImpl<const Formula *> &Solution) const;
1611
1612   BasicBlock::iterator
1613     HoistInsertPosition(BasicBlock::iterator IP,
1614                         const SmallVectorImpl<Instruction *> &Inputs) const;
1615   BasicBlock::iterator
1616     AdjustInsertPositionForExpand(BasicBlock::iterator IP,
1617                                   const LSRFixup &LF,
1618                                   const LSRUse &LU,
1619                                   SCEVExpander &Rewriter) const;
1620
1621   Value *Expand(const LSRFixup &LF,
1622                 const Formula &F,
1623                 BasicBlock::iterator IP,
1624                 SCEVExpander &Rewriter,
1625                 SmallVectorImpl<WeakVH> &DeadInsts) const;
1626   void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
1627                      const Formula &F,
1628                      SCEVExpander &Rewriter,
1629                      SmallVectorImpl<WeakVH> &DeadInsts,
1630                      Pass *P) const;
1631   void Rewrite(const LSRFixup &LF,
1632                const Formula &F,
1633                SCEVExpander &Rewriter,
1634                SmallVectorImpl<WeakVH> &DeadInsts,
1635                Pass *P) const;
1636   void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
1637                          Pass *P);
1638
1639 public:
1640   LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);
1641
1642   bool getChanged() const { return Changed; }
1643
1644   void print_factors_and_types(raw_ostream &OS) const;
1645   void print_fixups(raw_ostream &OS) const;
1646   void print_uses(raw_ostream &OS) const;
1647   void print(raw_ostream &OS) const;
1648   void dump() const;
1649 };
1650
1651 }
1652
1653 /// OptimizeShadowIV - If IV is used in a int-to-float cast
1654 /// inside the loop then try to eliminate the cast operation.
1655 void LSRInstance::OptimizeShadowIV() {
1656   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1657   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1658     return;
1659
1660   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
1661        UI != E; /* empty */) {
1662     IVUsers::const_iterator CandidateUI = UI;
1663     ++UI;
1664     Instruction *ShadowUse = CandidateUI->getUser();
1665     Type *DestTy = NULL;
1666     bool IsSigned = false;
1667
1668     /* If shadow use is a int->float cast then insert a second IV
1669        to eliminate this cast.
1670
1671          for (unsigned i = 0; i < n; ++i)
1672            foo((double)i);
1673
1674        is transformed into
1675
1676          double d = 0.0;
1677          for (unsigned i = 0; i < n; ++i, ++d)
1678            foo(d);
1679     */
1680     if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
1681       IsSigned = false;
1682       DestTy = UCast->getDestTy();
1683     }
1684     else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
1685       IsSigned = true;
1686       DestTy = SCast->getDestTy();
1687     }
1688     if (!DestTy) continue;
1689
1690     if (TLI) {
1691       // If target does not support DestTy natively then do not apply
1692       // this transformation.
1693       EVT DVT = TLI->getValueType(DestTy);
1694       if (!TLI->isTypeLegal(DVT)) continue;
1695     }
1696
1697     PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
1698     if (!PH) continue;
1699     if (PH->getNumIncomingValues() != 2) continue;
1700
1701     Type *SrcTy = PH->getType();
1702     int Mantissa = DestTy->getFPMantissaWidth();
1703     if (Mantissa == -1) continue;
1704     if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
1705       continue;
1706
1707     unsigned Entry, Latch;
1708     if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
1709       Entry = 0;
1710       Latch = 1;
1711     } else {
1712       Entry = 1;
1713       Latch = 0;
1714     }
1715
1716     ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
1717     if (!Init) continue;
1718     Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
1719                                         (double)Init->getSExtValue() :
1720                                         (double)Init->getZExtValue());
1721
1722     BinaryOperator *Incr =
1723       dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
1724     if (!Incr) continue;
1725     if (Incr->getOpcode() != Instruction::Add
1726         && Incr->getOpcode() != Instruction::Sub)
1727       continue;
1728
1729     /* Initialize new IV, double d = 0.0 in above example. */
1730     ConstantInt *C = NULL;
1731     if (Incr->getOperand(0) == PH)
1732       C = dyn_cast<ConstantInt>(Incr->getOperand(1));
1733     else if (Incr->getOperand(1) == PH)
1734       C = dyn_cast<ConstantInt>(Incr->getOperand(0));
1735     else
1736       continue;
1737
1738     if (!C) continue;
1739
1740     // Ignore negative constants, as the code below doesn't handle them
1741     // correctly. TODO: Remove this restriction.
1742     if (!C->getValue().isStrictlyPositive()) continue;
1743
1744     /* Add new PHINode. */
1745     PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
1746
1747     /* create new increment. '++d' in above example. */
1748     Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
1749     BinaryOperator *NewIncr =
1750       BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
1751                                Instruction::FAdd : Instruction::FSub,
1752                              NewPH, CFP, "IV.S.next.", Incr);
1753
1754     NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
1755     NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
1756
1757     /* Remove cast operation */
1758     ShadowUse->replaceAllUsesWith(NewPH);
1759     ShadowUse->eraseFromParent();
1760     Changed = true;
1761     break;
1762   }
1763 }
1764
1765 /// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
1766 /// set the IV user and stride information and return true, otherwise return
1767 /// false.
1768 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
1769   for (IVUsers::iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
1770     if (UI->getUser() == Cond) {
1771       // NOTE: we could handle setcc instructions with multiple uses here, but
1772       // InstCombine does it as well for simple uses, it's not clear that it
1773       // occurs enough in real life to handle.
1774       CondUse = UI;
1775       return true;
1776     }
1777   return false;
1778 }
1779
1780 /// OptimizeMax - Rewrite the loop's terminating condition if it uses
1781 /// a max computation.
1782 ///
1783 /// This is a narrow solution to a specific, but acute, problem. For loops
1784 /// like this:
1785 ///
1786 ///   i = 0;
1787 ///   do {
1788 ///     p[i] = 0.0;
1789 ///   } while (++i < n);
1790 ///
1791 /// the trip count isn't just 'n', because 'n' might not be positive. And
1792 /// unfortunately this can come up even for loops where the user didn't use
1793 /// a C do-while loop. For example, seemingly well-behaved top-test loops
1794 /// will commonly be lowered like this:
1795 //
1796 ///   if (n > 0) {
1797 ///     i = 0;
1798 ///     do {
1799 ///       p[i] = 0.0;
1800 ///     } while (++i < n);
1801 ///   }
1802 ///
1803 /// and then it's possible for subsequent optimization to obscure the if
1804 /// test in such a way that indvars can't find it.
1805 ///
1806 /// When indvars can't find the if test in loops like this, it creates a
1807 /// max expression, which allows it to give the loop a canonical
1808 /// induction variable:
1809 ///
1810 ///   i = 0;
1811 ///   max = n < 1 ? 1 : n;
1812 ///   do {
1813 ///     p[i] = 0.0;
1814 ///   } while (++i != max);
1815 ///
1816 /// Canonical induction variables are necessary because the loop passes
1817 /// are designed around them. The most obvious example of this is the
1818 /// LoopInfo analysis, which doesn't remember trip count values. It
1819 /// expects to be able to rediscover the trip count each time it is
1820 /// needed, and it does this using a simple analysis that only succeeds if
1821 /// the loop has a canonical induction variable.
1822 ///
1823 /// However, when it comes time to generate code, the maximum operation
1824 /// can be quite costly, especially if it's inside of an outer loop.
1825 ///
1826 /// This function solves this problem by detecting this type of loop and
1827 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
1828 /// the instructions for the maximum computation.
1829 ///
1830 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
1831   // Check that the loop matches the pattern we're looking for.
1832   if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
1833       Cond->getPredicate() != CmpInst::ICMP_NE)
1834     return Cond;
1835
1836   SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
1837   if (!Sel || !Sel->hasOneUse()) return Cond;
1838
1839   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1840   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1841     return Cond;
1842   const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
1843
1844   // Add one to the backedge-taken count to get the trip count.
1845   const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
1846   if (IterationCount != SE.getSCEV(Sel)) return Cond;
1847
1848   // Check for a max calculation that matches the pattern. There's no check
1849   // for ICMP_ULE here because the comparison would be with zero, which
1850   // isn't interesting.
1851   CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
1852   const SCEVNAryExpr *Max = 0;
1853   if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
1854     Pred = ICmpInst::ICMP_SLE;
1855     Max = S;
1856   } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
1857     Pred = ICmpInst::ICMP_SLT;
1858     Max = S;
1859   } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
1860     Pred = ICmpInst::ICMP_ULT;
1861     Max = U;
1862   } else {
1863     // No match; bail.
1864     return Cond;
1865   }
1866
1867   // To handle a max with more than two operands, this optimization would
1868   // require additional checking and setup.
1869   if (Max->getNumOperands() != 2)
1870     return Cond;
1871
1872   const SCEV *MaxLHS = Max->getOperand(0);
1873   const SCEV *MaxRHS = Max->getOperand(1);
1874
1875   // ScalarEvolution canonicalizes constants to the left. For < and >, look
1876   // for a comparison with 1. For <= and >=, a comparison with zero.
1877   if (!MaxLHS ||
1878       (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
1879     return Cond;
1880
1881   // Check the relevant induction variable for conformance to
1882   // the pattern.
1883   const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
1884   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
1885   if (!AR || !AR->isAffine() ||
1886       AR->getStart() != One ||
1887       AR->getStepRecurrence(SE) != One)
1888     return Cond;
1889
1890   assert(AR->getLoop() == L &&
1891          "Loop condition operand is an addrec in a different loop!");
1892
1893   // Check the right operand of the select, and remember it, as it will
1894   // be used in the new comparison instruction.
1895   Value *NewRHS = 0;
1896   if (ICmpInst::isTrueWhenEqual(Pred)) {
1897     // Look for n+1, and grab n.
1898     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
1899       if (isa<ConstantInt>(BO->getOperand(1)) &&
1900           cast<ConstantInt>(BO->getOperand(1))->isOne() &&
1901           SE.getSCEV(BO->getOperand(0)) == MaxRHS)
1902         NewRHS = BO->getOperand(0);
1903     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
1904       if (isa<ConstantInt>(BO->getOperand(1)) &&
1905           cast<ConstantInt>(BO->getOperand(1))->isOne() &&
1906           SE.getSCEV(BO->getOperand(0)) == MaxRHS)
1907         NewRHS = BO->getOperand(0);
1908     if (!NewRHS)
1909       return Cond;
1910   } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
1911     NewRHS = Sel->getOperand(1);
1912   else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
1913     NewRHS = Sel->getOperand(2);
1914   else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
1915     NewRHS = SU->getValue();
1916   else
1917     // Max doesn't match expected pattern.
1918     return Cond;
1919
1920   // Determine the new comparison opcode. It may be signed or unsigned,
1921   // and the original comparison may be either equality or inequality.
1922   if (Cond->getPredicate() == CmpInst::ICMP_EQ)
1923     Pred = CmpInst::getInversePredicate(Pred);
1924
1925   // Ok, everything looks ok to change the condition into an SLT or SGE and
1926   // delete the max calculation.
1927   ICmpInst *NewCond =
1928     new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
1929
1930   // Delete the max calculation instructions.
1931   Cond->replaceAllUsesWith(NewCond);
1932   CondUse->setUser(NewCond);
1933   Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
1934   Cond->eraseFromParent();
1935   Sel->eraseFromParent();
1936   if (Cmp->use_empty())
1937     Cmp->eraseFromParent();
1938   return NewCond;
1939 }
1940
1941 /// OptimizeLoopTermCond - Change loop terminating condition to use the
1942 /// postinc iv when possible.
1943 void
1944 LSRInstance::OptimizeLoopTermCond() {
1945   SmallPtrSet<Instruction *, 4> PostIncs;
1946
1947   BasicBlock *LatchBlock = L->getLoopLatch();
1948   SmallVector<BasicBlock*, 8> ExitingBlocks;
1949   L->getExitingBlocks(ExitingBlocks);
1950
1951   for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
1952     BasicBlock *ExitingBlock = ExitingBlocks[i];
1953
1954     // Get the terminating condition for the loop if possible.  If we
1955     // can, we want to change it to use a post-incremented version of its
1956     // induction variable, to allow coalescing the live ranges for the IV into
1957     // one register value.
1958
1959     BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
1960     if (!TermBr)
1961       continue;
1962     // FIXME: Overly conservative, termination condition could be an 'or' etc..
1963     if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
1964       continue;
1965
1966     // Search IVUsesByStride to find Cond's IVUse if there is one.
1967     IVStrideUse *CondUse = 0;
1968     ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
1969     if (!FindIVUserForCond(Cond, CondUse))
1970       continue;
1971
1972     // If the trip count is computed in terms of a max (due to ScalarEvolution
1973     // being unable to find a sufficient guard, for example), change the loop
1974     // comparison to use SLT or ULT instead of NE.
1975     // One consequence of doing this now is that it disrupts the count-down
1976     // optimization. That's not always a bad thing though, because in such
1977     // cases it may still be worthwhile to avoid a max.
1978     Cond = OptimizeMax(Cond, CondUse);
1979
1980     // If this exiting block dominates the latch block, it may also use
1981     // the post-inc value if it won't be shared with other uses.
1982     // Check for dominance.
1983     if (!DT.dominates(ExitingBlock, LatchBlock))
1984       continue;
1985
1986     // Conservatively avoid trying to use the post-inc value in non-latch
1987     // exits if there may be pre-inc users in intervening blocks.
1988     if (LatchBlock != ExitingBlock)
1989       for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
1990         // Test if the use is reachable from the exiting block. This dominator
1991         // query is a conservative approximation of reachability.
1992         if (&*UI != CondUse &&
1993             !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
1994           // Conservatively assume there may be reuse if the quotient of their
1995           // strides could be a legal scale.
1996           const SCEV *A = IU.getStride(*CondUse, L);
1997           const SCEV *B = IU.getStride(*UI, L);
1998           if (!A || !B) continue;
1999           if (SE.getTypeSizeInBits(A->getType()) !=
2000               SE.getTypeSizeInBits(B->getType())) {
2001             if (SE.getTypeSizeInBits(A->getType()) >
2002                 SE.getTypeSizeInBits(B->getType()))
2003               B = SE.getSignExtendExpr(B, A->getType());
2004             else
2005               A = SE.getSignExtendExpr(A, B->getType());
2006           }
2007           if (const SCEVConstant *D =
2008                 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2009             const ConstantInt *C = D->getValue();
2010             // Stride of one or negative one can have reuse with non-addresses.
2011             if (C->isOne() || C->isAllOnesValue())
2012               goto decline_post_inc;
2013             // Avoid weird situations.
2014             if (C->getValue().getMinSignedBits() >= 64 ||
2015                 C->getValue().isMinSignedValue())
2016               goto decline_post_inc;
2017             // Without TLI, assume that any stride might be valid, and so any
2018             // use might be shared.
2019             if (!TLI)
2020               goto decline_post_inc;
2021             // Check for possible scaled-address reuse.
2022             Type *AccessTy = getAccessType(UI->getUser());
2023             TargetLowering::AddrMode AM;
2024             AM.Scale = C->getSExtValue();
2025             if (TLI->isLegalAddressingMode(AM, AccessTy))
2026               goto decline_post_inc;
2027             AM.Scale = -AM.Scale;
2028             if (TLI->isLegalAddressingMode(AM, AccessTy))
2029               goto decline_post_inc;
2030           }
2031         }
2032
2033     DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
2034                  << *Cond << '\n');
2035
2036     // It's possible for the setcc instruction to be anywhere in the loop, and
2037     // possible for it to have multiple users.  If it is not immediately before
2038     // the exiting block branch, move it.
2039     if (&*++BasicBlock::iterator(Cond) != TermBr) {
2040       if (Cond->hasOneUse()) {
2041         Cond->moveBefore(TermBr);
2042       } else {
2043         // Clone the terminating condition and insert into the loopend.
2044         ICmpInst *OldCond = Cond;
2045         Cond = cast<ICmpInst>(Cond->clone());
2046         Cond->setName(L->getHeader()->getName() + ".termcond");
2047         ExitingBlock->getInstList().insert(TermBr, Cond);
2048
2049         // Clone the IVUse, as the old use still exists!
2050         CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2051         TermBr->replaceUsesOfWith(OldCond, Cond);
2052       }
2053     }
2054
2055     // If we get to here, we know that we can transform the setcc instruction to
2056     // use the post-incremented version of the IV, allowing us to coalesce the
2057     // live ranges for the IV correctly.
2058     CondUse->transformToPostInc(L);
2059     Changed = true;
2060
2061     PostIncs.insert(Cond);
2062   decline_post_inc:;
2063   }
2064
2065   // Determine an insertion point for the loop induction variable increment. It
2066   // must dominate all the post-inc comparisons we just set up, and it must
2067   // dominate the loop latch edge.
2068   IVIncInsertPos = L->getLoopLatch()->getTerminator();
2069   for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(),
2070        E = PostIncs.end(); I != E; ++I) {
2071     BasicBlock *BB =
2072       DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
2073                                     (*I)->getParent());
2074     if (BB == (*I)->getParent())
2075       IVIncInsertPos = *I;
2076     else if (BB != IVIncInsertPos->getParent())
2077       IVIncInsertPos = BB->getTerminator();
2078   }
2079 }
2080
2081 /// reconcileNewOffset - Determine if the given use can accommodate a fixup
2082 /// at the given offset and other details. If so, update the use and
2083 /// return true.
2084 bool
2085 LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
2086                                 LSRUse::KindType Kind, Type *AccessTy) {
2087   int64_t NewMinOffset = LU.MinOffset;
2088   int64_t NewMaxOffset = LU.MaxOffset;
2089   Type *NewAccessTy = AccessTy;
2090
2091   // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2092   // something conservative, however this can pessimize in the case that one of
2093   // the uses will have all its uses outside the loop, for example.
2094   if (LU.Kind != Kind)
2095     return false;
2096   // Conservatively assume HasBaseReg is true for now.
2097   if (NewOffset < LU.MinOffset) {
2098     if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg,
2099                           Kind, AccessTy, TLI))
2100       return false;
2101     NewMinOffset = NewOffset;
2102   } else if (NewOffset > LU.MaxOffset) {
2103     if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg,
2104                           Kind, AccessTy, TLI))
2105       return false;
2106     NewMaxOffset = NewOffset;
2107   }
2108   // Check for a mismatched access type, and fall back conservatively as needed.
2109   // TODO: Be less conservative when the type is similar and can use the same
2110   // addressing modes.
2111   if (Kind == LSRUse::Address && AccessTy != LU.AccessTy)
2112     NewAccessTy = Type::getVoidTy(AccessTy->getContext());
2113
2114   // Update the use.
2115   LU.MinOffset = NewMinOffset;
2116   LU.MaxOffset = NewMaxOffset;
2117   LU.AccessTy = NewAccessTy;
2118   if (NewOffset != LU.Offsets.back())
2119     LU.Offsets.push_back(NewOffset);
2120   return true;
2121 }
2122
2123 /// getUse - Return an LSRUse index and an offset value for a fixup which
2124 /// needs the given expression, with the given kind and optional access type.
2125 /// Either reuse an existing use or create a new one, as needed.
2126 std::pair<size_t, int64_t>
2127 LSRInstance::getUse(const SCEV *&Expr,
2128                     LSRUse::KindType Kind, Type *AccessTy) {
2129   const SCEV *Copy = Expr;
2130   int64_t Offset = ExtractImmediate(Expr, SE);
2131
2132   // Basic uses can't accept any offset, for example.
2133   if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) {
2134     Expr = Copy;
2135     Offset = 0;
2136   }
2137
2138   std::pair<UseMapTy::iterator, bool> P =
2139     UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0));
2140   if (!P.second) {
2141     // A use already existed with this base.
2142     size_t LUIdx = P.first->second;
2143     LSRUse &LU = Uses[LUIdx];
2144     if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2145       // Reuse this use.
2146       return std::make_pair(LUIdx, Offset);
2147   }
2148
2149   // Create a new use.
2150   size_t LUIdx = Uses.size();
2151   P.first->second = LUIdx;
2152   Uses.push_back(LSRUse(Kind, AccessTy));
2153   LSRUse &LU = Uses[LUIdx];
2154
2155   // We don't need to track redundant offsets, but we don't need to go out
2156   // of our way here to avoid them.
2157   if (LU.Offsets.empty() || Offset != LU.Offsets.back())
2158     LU.Offsets.push_back(Offset);
2159
2160   LU.MinOffset = Offset;
2161   LU.MaxOffset = Offset;
2162   return std::make_pair(LUIdx, Offset);
2163 }
2164
2165 /// DeleteUse - Delete the given use from the Uses list.
2166 void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2167   if (&LU != &Uses.back())
2168     std::swap(LU, Uses.back());
2169   Uses.pop_back();
2170
2171   // Update RegUses.
2172   RegUses.SwapAndDropUse(LUIdx, Uses.size());
2173 }
2174
2175 /// FindUseWithFormula - Look for a use distinct from OrigLU which is has
2176 /// a formula that has the same registers as the given formula.
2177 LSRUse *
2178 LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2179                                        const LSRUse &OrigLU) {
2180   // Search all uses for the formula. This could be more clever.
2181   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2182     LSRUse &LU = Uses[LUIdx];
2183     // Check whether this use is close enough to OrigLU, to see whether it's
2184     // worthwhile looking through its formulae.
2185     // Ignore ICmpZero uses because they may contain formulae generated by
2186     // GenerateICmpZeroScales, in which case adding fixup offsets may
2187     // be invalid.
2188     if (&LU != &OrigLU &&
2189         LU.Kind != LSRUse::ICmpZero &&
2190         LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2191         LU.WidestFixupType == OrigLU.WidestFixupType &&
2192         LU.HasFormulaWithSameRegs(OrigF)) {
2193       // Scan through this use's formulae.
2194       for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
2195            E = LU.Formulae.end(); I != E; ++I) {
2196         const Formula &F = *I;
2197         // Check to see if this formula has the same registers and symbols
2198         // as OrigF.
2199         if (F.BaseRegs == OrigF.BaseRegs &&
2200             F.ScaledReg == OrigF.ScaledReg &&
2201             F.AM.BaseGV == OrigF.AM.BaseGV &&
2202             F.AM.Scale == OrigF.AM.Scale &&
2203             F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2204           if (F.AM.BaseOffs == 0)
2205             return &LU;
2206           // This is the formula where all the registers and symbols matched;
2207           // there aren't going to be any others. Since we declined it, we
2208           // can skip the rest of the formulae and proceed to the next LSRUse.
2209           break;
2210         }
2211       }
2212     }
2213   }
2214
2215   // Nothing looked good.
2216   return 0;
2217 }
2218
2219 void LSRInstance::CollectInterestingTypesAndFactors() {
2220   SmallSetVector<const SCEV *, 4> Strides;
2221
2222   // Collect interesting types and strides.
2223   SmallVector<const SCEV *, 4> Worklist;
2224   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
2225     const SCEV *Expr = IU.getExpr(*UI);
2226
2227     // Collect interesting types.
2228     Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2229
2230     // Add strides for mentioned loops.
2231     Worklist.push_back(Expr);
2232     do {
2233       const SCEV *S = Worklist.pop_back_val();
2234       if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2235         if (AR->getLoop() == L)
2236           Strides.insert(AR->getStepRecurrence(SE));
2237         Worklist.push_back(AR->getStart());
2238       } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2239         Worklist.append(Add->op_begin(), Add->op_end());
2240       }
2241     } while (!Worklist.empty());
2242   }
2243
2244   // Compute interesting factors from the set of interesting strides.
2245   for (SmallSetVector<const SCEV *, 4>::const_iterator
2246        I = Strides.begin(), E = Strides.end(); I != E; ++I)
2247     for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2248          llvm::next(I); NewStrideIter != E; ++NewStrideIter) {
2249       const SCEV *OldStride = *I;
2250       const SCEV *NewStride = *NewStrideIter;
2251
2252       if (SE.getTypeSizeInBits(OldStride->getType()) !=
2253           SE.getTypeSizeInBits(NewStride->getType())) {
2254         if (SE.getTypeSizeInBits(OldStride->getType()) >
2255             SE.getTypeSizeInBits(NewStride->getType()))
2256           NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2257         else
2258           OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2259       }
2260       if (const SCEVConstant *Factor =
2261             dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2262                                                         SE, true))) {
2263         if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
2264           Factors.insert(Factor->getValue()->getValue().getSExtValue());
2265       } else if (const SCEVConstant *Factor =
2266                    dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2267                                                                NewStride,
2268                                                                SE, true))) {
2269         if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
2270           Factors.insert(Factor->getValue()->getValue().getSExtValue());
2271       }
2272     }
2273
2274   // If all uses use the same type, don't bother looking for truncation-based
2275   // reuse.
2276   if (Types.size() == 1)
2277     Types.clear();
2278
2279   DEBUG(print_factors_and_types(dbgs()));
2280 }
2281
2282 /// findIVOperand - Helper for CollectChains that finds an IV operand (computed
2283 /// by an AddRec in this loop) within [OI,OE) or returns OE. If IVUsers mapped
2284 /// Instructions to IVStrideUses, we could partially skip this.
2285 static User::op_iterator
2286 findIVOperand(User::op_iterator OI, User::op_iterator OE,
2287               Loop *L, ScalarEvolution &SE) {
2288   for(; OI != OE; ++OI) {
2289     if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2290       if (!SE.isSCEVable(Oper->getType()))
2291         continue;
2292
2293       if (const SCEVAddRecExpr *AR =
2294           dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2295         if (AR->getLoop() == L)
2296           break;
2297       }
2298     }
2299   }
2300   return OI;
2301 }
2302
2303 /// getWideOperand - IVChain logic must consistenctly peek base TruncInst
2304 /// operands, so wrap it in a convenient helper.
2305 static Value *getWideOperand(Value *Oper) {
2306   if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2307     return Trunc->getOperand(0);
2308   return Oper;
2309 }
2310
2311 /// isCompatibleIVType - Return true if we allow an IV chain to include both
2312 /// types.
2313 static bool isCompatibleIVType(Value *LVal, Value *RVal) {
2314   Type *LType = LVal->getType();
2315   Type *RType = RVal->getType();
2316   return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());
2317 }
2318
2319 /// getExprBase - Return an approximation of this SCEV expression's "base", or
2320 /// NULL for any constant. Returning the expression itself is
2321 /// conservative. Returning a deeper subexpression is more precise and valid as
2322 /// long as it isn't less complex than another subexpression. For expressions
2323 /// involving multiple unscaled values, we need to return the pointer-type
2324 /// SCEVUnknown. This avoids forming chains across objects, such as:
2325 /// PrevOper==a[i], IVOper==b[i], IVInc==b-a.
2326 ///
2327 /// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2328 /// SCEVUnknown, we simply return the rightmost SCEV operand.
2329 static const SCEV *getExprBase(const SCEV *S) {
2330   switch (S->getSCEVType()) {
2331   default: // uncluding scUnknown.
2332     return S;
2333   case scConstant:
2334     return 0;
2335   case scTruncate:
2336     return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
2337   case scZeroExtend:
2338     return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
2339   case scSignExtend:
2340     return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
2341   case scAddExpr: {
2342     // Skip over scaled operands (scMulExpr) to follow add operands as long as
2343     // there's nothing more complex.
2344     // FIXME: not sure if we want to recognize negation.
2345     const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
2346     for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
2347            E(Add->op_begin()); I != E; ++I) {
2348       const SCEV *SubExpr = *I;
2349       if (SubExpr->getSCEVType() == scAddExpr)
2350         return getExprBase(SubExpr);
2351
2352       if (SubExpr->getSCEVType() != scMulExpr)
2353         return SubExpr;
2354     }
2355     return S; // all operands are scaled, be conservative.
2356   }
2357   case scAddRecExpr:
2358     return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
2359   }
2360 }
2361
2362 /// Return true if the chain increment is profitable to expand into a loop
2363 /// invariant value, which may require its own register. A profitable chain
2364 /// increment will be an offset relative to the same base. We allow such offsets
2365 /// to potentially be used as chain increment as long as it's not obviously
2366 /// expensive to expand using real instructions.
2367 bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
2368                                     const SCEV *IncExpr,
2369                                     ScalarEvolution &SE) {
2370   // Aggressively form chains when -stress-ivchain.
2371   if (StressIVChain)
2372     return true;
2373
2374   // Do not replace a constant offset from IV head with a nonconstant IV
2375   // increment.
2376   if (!isa<SCEVConstant>(IncExpr)) {
2377     const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
2378     if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
2379       return 0;
2380   }
2381
2382   SmallPtrSet<const SCEV*, 8> Processed;
2383   return !isHighCostExpansion(IncExpr, Processed, SE);
2384 }
2385
2386 /// Return true if the number of registers needed for the chain is estimated to
2387 /// be less than the number required for the individual IV users. First prohibit
2388 /// any IV users that keep the IV live across increments (the Users set should
2389 /// be empty). Next count the number and type of increments in the chain.
2390 ///
2391 /// Chaining IVs can lead to considerable code bloat if ISEL doesn't
2392 /// effectively use postinc addressing modes. Only consider it profitable it the
2393 /// increments can be computed in fewer registers when chained.
2394 ///
2395 /// TODO: Consider IVInc free if it's already used in another chains.
2396 static bool
2397 isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
2398                   ScalarEvolution &SE, const TargetLowering *TLI) {
2399   if (StressIVChain)
2400     return true;
2401
2402   if (!Chain.hasIncs())
2403     return false;
2404
2405   if (!Users.empty()) {
2406     DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
2407           for (SmallPtrSet<Instruction*, 4>::const_iterator I = Users.begin(),
2408                  E = Users.end(); I != E; ++I) {
2409             dbgs() << "  " << **I << "\n";
2410           });
2411     return false;
2412   }
2413   assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
2414
2415   // The chain itself may require a register, so intialize cost to 1.
2416   int cost = 1;
2417
2418   // A complete chain likely eliminates the need for keeping the original IV in
2419   // a register. LSR does not currently know how to form a complete chain unless
2420   // the header phi already exists.
2421   if (isa<PHINode>(Chain.tailUserInst())
2422       && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
2423     --cost;
2424   }
2425   const SCEV *LastIncExpr = 0;
2426   unsigned NumConstIncrements = 0;
2427   unsigned NumVarIncrements = 0;
2428   unsigned NumReusedIncrements = 0;
2429   for (IVChain::const_iterator I = Chain.begin(), E = Chain.end();
2430        I != E; ++I) {
2431
2432     if (I->IncExpr->isZero())
2433       continue;
2434
2435     // Incrementing by zero or some constant is neutral. We assume constants can
2436     // be folded into an addressing mode or an add's immediate operand.
2437     if (isa<SCEVConstant>(I->IncExpr)) {
2438       ++NumConstIncrements;
2439       continue;
2440     }
2441
2442     if (I->IncExpr == LastIncExpr)
2443       ++NumReusedIncrements;
2444     else
2445       ++NumVarIncrements;
2446
2447     LastIncExpr = I->IncExpr;
2448   }
2449   // An IV chain with a single increment is handled by LSR's postinc
2450   // uses. However, a chain with multiple increments requires keeping the IV's
2451   // value live longer than it needs to be if chained.
2452   if (NumConstIncrements > 1)
2453     --cost;
2454
2455   // Materializing increment expressions in the preheader that didn't exist in
2456   // the original code may cost a register. For example, sign-extended array
2457   // indices can produce ridiculous increments like this:
2458   // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
2459   cost += NumVarIncrements;
2460
2461   // Reusing variable increments likely saves a register to hold the multiple of
2462   // the stride.
2463   cost -= NumReusedIncrements;
2464
2465   DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
2466                << "\n");
2467
2468   return cost < 0;
2469 }
2470
2471 /// ChainInstruction - Add this IV user to an existing chain or make it the head
2472 /// of a new chain.
2473 void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2474                                    SmallVectorImpl<ChainUsers> &ChainUsersVec) {
2475   // When IVs are used as types of varying widths, they are generally converted
2476   // to a wider type with some uses remaining narrow under a (free) trunc.
2477   Value *const NextIV = getWideOperand(IVOper);
2478   const SCEV *const OperExpr = SE.getSCEV(NextIV);
2479   const SCEV *const OperExprBase = getExprBase(OperExpr);
2480
2481   // Visit all existing chains. Check if its IVOper can be computed as a
2482   // profitable loop invariant increment from the last link in the Chain.
2483   unsigned ChainIdx = 0, NChains = IVChainVec.size();
2484   const SCEV *LastIncExpr = 0;
2485   for (; ChainIdx < NChains; ++ChainIdx) {
2486     IVChain &Chain = IVChainVec[ChainIdx];
2487
2488     // Prune the solution space aggressively by checking that both IV operands
2489     // are expressions that operate on the same unscaled SCEVUnknown. This
2490     // "base" will be canceled by the subsequent getMinusSCEV call. Checking
2491     // first avoids creating extra SCEV expressions.
2492     if (!StressIVChain && Chain.ExprBase != OperExprBase)
2493       continue;
2494
2495     Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
2496     if (!isCompatibleIVType(PrevIV, NextIV))
2497       continue;
2498
2499     // A phi node terminates a chain.
2500     if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
2501       continue;
2502
2503     // The increment must be loop-invariant so it can be kept in a register.
2504     const SCEV *PrevExpr = SE.getSCEV(PrevIV);
2505     const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
2506     if (!SE.isLoopInvariant(IncExpr, L))
2507       continue;
2508
2509     if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
2510       LastIncExpr = IncExpr;
2511       break;
2512     }
2513   }
2514   // If we haven't found a chain, create a new one, unless we hit the max. Don't
2515   // bother for phi nodes, because they must be last in the chain.
2516   if (ChainIdx == NChains) {
2517     if (isa<PHINode>(UserInst))
2518       return;
2519     if (NChains >= MaxChains && !StressIVChain) {
2520       DEBUG(dbgs() << "IV Chain Limit\n");
2521       return;
2522     }
2523     LastIncExpr = OperExpr;
2524     // IVUsers may have skipped over sign/zero extensions. We don't currently
2525     // attempt to form chains involving extensions unless they can be hoisted
2526     // into this loop's AddRec.
2527     if (!isa<SCEVAddRecExpr>(LastIncExpr))
2528       return;
2529     ++NChains;
2530     IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
2531                                  OperExprBase));
2532     ChainUsersVec.resize(NChains);
2533     DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
2534                  << ") IV=" << *LastIncExpr << "\n");
2535   } else {
2536     DEBUG(dbgs() << "IV Chain#" << ChainIdx << "  Inc: (" << *UserInst
2537                  << ") IV+" << *LastIncExpr << "\n");
2538     // Add this IV user to the end of the chain.
2539     IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
2540   }
2541
2542   SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
2543   // This chain's NearUsers become FarUsers.
2544   if (!LastIncExpr->isZero()) {
2545     ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
2546                                             NearUsers.end());
2547     NearUsers.clear();
2548   }
2549
2550   // All other uses of IVOperand become near uses of the chain.
2551   // We currently ignore intermediate values within SCEV expressions, assuming
2552   // they will eventually be used be the current chain, or can be computed
2553   // from one of the chain increments. To be more precise we could
2554   // transitively follow its user and only add leaf IV users to the set.
2555   for (Value::use_iterator UseIter = IVOper->use_begin(),
2556          UseEnd = IVOper->use_end(); UseIter != UseEnd; ++UseIter) {
2557     Instruction *OtherUse = dyn_cast<Instruction>(*UseIter);
2558     if (!OtherUse || OtherUse == UserInst)
2559       continue;
2560     if (SE.isSCEVable(OtherUse->getType())
2561         && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
2562         && IU.isIVUserOrOperand(OtherUse)) {
2563       continue;
2564     }
2565     NearUsers.insert(OtherUse);
2566   }
2567
2568   // Since this user is part of the chain, it's no longer considered a use
2569   // of the chain.
2570   ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
2571 }
2572
2573 /// CollectChains - Populate the vector of Chains.
2574 ///
2575 /// This decreases ILP at the architecture level. Targets with ample registers,
2576 /// multiple memory ports, and no register renaming probably don't want
2577 /// this. However, such targets should probably disable LSR altogether.
2578 ///
2579 /// The job of LSR is to make a reasonable choice of induction variables across
2580 /// the loop. Subsequent passes can easily "unchain" computation exposing more
2581 /// ILP *within the loop* if the target wants it.
2582 ///
2583 /// Finding the best IV chain is potentially a scheduling problem. Since LSR
2584 /// will not reorder memory operations, it will recognize this as a chain, but
2585 /// will generate redundant IV increments. Ideally this would be corrected later
2586 /// by a smart scheduler:
2587 ///        = A[i]
2588 ///        = A[i+x]
2589 /// A[i]   =
2590 /// A[i+x] =
2591 ///
2592 /// TODO: Walk the entire domtree within this loop, not just the path to the
2593 /// loop latch. This will discover chains on side paths, but requires
2594 /// maintaining multiple copies of the Chains state.
2595 void LSRInstance::CollectChains() {
2596   DEBUG(dbgs() << "Collecting IV Chains.\n");
2597   SmallVector<ChainUsers, 8> ChainUsersVec;
2598
2599   SmallVector<BasicBlock *,8> LatchPath;
2600   BasicBlock *LoopHeader = L->getHeader();
2601   for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
2602        Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
2603     LatchPath.push_back(Rung->getBlock());
2604   }
2605   LatchPath.push_back(LoopHeader);
2606
2607   // Walk the instruction stream from the loop header to the loop latch.
2608   for (SmallVectorImpl<BasicBlock *>::reverse_iterator
2609          BBIter = LatchPath.rbegin(), BBEnd = LatchPath.rend();
2610        BBIter != BBEnd; ++BBIter) {
2611     for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end();
2612          I != E; ++I) {
2613       // Skip instructions that weren't seen by IVUsers analysis.
2614       if (isa<PHINode>(I) || !IU.isIVUserOrOperand(I))
2615         continue;
2616
2617       // Ignore users that are part of a SCEV expression. This way we only
2618       // consider leaf IV Users. This effectively rediscovers a portion of
2619       // IVUsers analysis but in program order this time.
2620       if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(I)))
2621         continue;
2622
2623       // Remove this instruction from any NearUsers set it may be in.
2624       for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
2625            ChainIdx < NChains; ++ChainIdx) {
2626         ChainUsersVec[ChainIdx].NearUsers.erase(I);
2627       }
2628       // Search for operands that can be chained.
2629       SmallPtrSet<Instruction*, 4> UniqueOperands;
2630       User::op_iterator IVOpEnd = I->op_end();
2631       User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE);
2632       while (IVOpIter != IVOpEnd) {
2633         Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
2634         if (UniqueOperands.insert(IVOpInst))
2635           ChainInstruction(I, IVOpInst, ChainUsersVec);
2636         IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE);
2637       }
2638     } // Continue walking down the instructions.
2639   } // Continue walking down the domtree.
2640   // Visit phi backedges to determine if the chain can generate the IV postinc.
2641   for (BasicBlock::iterator I = L->getHeader()->begin();
2642        PHINode *PN = dyn_cast<PHINode>(I); ++I) {
2643     if (!SE.isSCEVable(PN->getType()))
2644       continue;
2645
2646     Instruction *IncV =
2647       dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch()));
2648     if (IncV)
2649       ChainInstruction(PN, IncV, ChainUsersVec);
2650   }
2651   // Remove any unprofitable chains.
2652   unsigned ChainIdx = 0;
2653   for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
2654        UsersIdx < NChains; ++UsersIdx) {
2655     if (!isProfitableChain(IVChainVec[UsersIdx],
2656                            ChainUsersVec[UsersIdx].FarUsers, SE, TLI))
2657       continue;
2658     // Preserve the chain at UsesIdx.
2659     if (ChainIdx != UsersIdx)
2660       IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
2661     FinalizeChain(IVChainVec[ChainIdx]);
2662     ++ChainIdx;
2663   }
2664   IVChainVec.resize(ChainIdx);
2665 }
2666
2667 void LSRInstance::FinalizeChain(IVChain &Chain) {
2668   assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
2669   DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
2670
2671   for (IVChain::const_iterator I = Chain.begin(), E = Chain.end();
2672        I != E; ++I) {
2673     DEBUG(dbgs() << "        Inc: " << *I->UserInst << "\n");
2674     User::op_iterator UseI =
2675       std::find(I->UserInst->op_begin(), I->UserInst->op_end(), I->IVOperand);
2676     assert(UseI != I->UserInst->op_end() && "cannot find IV operand");
2677     IVIncSet.insert(UseI);
2678   }
2679 }
2680
2681 /// Return true if the IVInc can be folded into an addressing mode.
2682 static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
2683                              Value *Operand, const TargetLowering *TLI) {
2684   const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
2685   if (!IncConst || !isAddressUse(UserInst, Operand))
2686     return false;
2687
2688   if (IncConst->getValue()->getValue().getMinSignedBits() > 64)
2689     return false;
2690
2691   int64_t IncOffset = IncConst->getValue()->getSExtValue();
2692   if (!isAlwaysFoldable(IncOffset, /*BaseGV=*/0, /*HaseBaseReg=*/false,
2693                        LSRUse::Address, getAccessType(UserInst), TLI))
2694     return false;
2695
2696   return true;
2697 }
2698
2699 /// GenerateIVChains - Generate an add or subtract for each IVInc in a chain to
2700 /// materialize the IV user's operand from the previous IV user's operand.
2701 void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
2702                                   SmallVectorImpl<WeakVH> &DeadInsts) {
2703   // Find the new IVOperand for the head of the chain. It may have been replaced
2704   // by LSR.
2705   const IVInc &Head = Chain.Incs[0];
2706   User::op_iterator IVOpEnd = Head.UserInst->op_end();
2707   User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
2708                                              IVOpEnd, L, SE);
2709   Value *IVSrc = 0;
2710   while (IVOpIter != IVOpEnd) {
2711     IVSrc = getWideOperand(*IVOpIter);
2712
2713     // If this operand computes the expression that the chain needs, we may use
2714     // it. (Check this after setting IVSrc which is used below.)
2715     //
2716     // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
2717     // narrow for the chain, so we can no longer use it. We do allow using a
2718     // wider phi, assuming the LSR checked for free truncation. In that case we
2719     // should already have a truncate on this operand such that
2720     // getSCEV(IVSrc) == IncExpr.
2721     if (SE.getSCEV(*IVOpIter) == Head.IncExpr
2722         || SE.getSCEV(IVSrc) == Head.IncExpr) {
2723       break;
2724     }
2725     IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE);
2726   }
2727   if (IVOpIter == IVOpEnd) {
2728     // Gracefully give up on this chain.
2729     DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
2730     return;
2731   }
2732
2733   DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
2734   Type *IVTy = IVSrc->getType();
2735   Type *IntTy = SE.getEffectiveSCEVType(IVTy);
2736   const SCEV *LeftOverExpr = 0;
2737   for (IVChain::const_iterator IncI = Chain.begin(),
2738          IncE = Chain.end(); IncI != IncE; ++IncI) {
2739
2740     Instruction *InsertPt = IncI->UserInst;
2741     if (isa<PHINode>(InsertPt))
2742       InsertPt = L->getLoopLatch()->getTerminator();
2743
2744     // IVOper will replace the current IV User's operand. IVSrc is the IV
2745     // value currently held in a register.
2746     Value *IVOper = IVSrc;
2747     if (!IncI->IncExpr->isZero()) {
2748       // IncExpr was the result of subtraction of two narrow values, so must
2749       // be signed.
2750       const SCEV *IncExpr = SE.getNoopOrSignExtend(IncI->IncExpr, IntTy);
2751       LeftOverExpr = LeftOverExpr ?
2752         SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
2753     }
2754     if (LeftOverExpr && !LeftOverExpr->isZero()) {
2755       // Expand the IV increment.
2756       Rewriter.clearPostInc();
2757       Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
2758       const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
2759                                              SE.getUnknown(IncV));
2760       IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
2761
2762       // If an IV increment can't be folded, use it as the next IV value.
2763       if (!canFoldIVIncExpr(LeftOverExpr, IncI->UserInst, IncI->IVOperand,
2764                             TLI)) {
2765         assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
2766         IVSrc = IVOper;
2767         LeftOverExpr = 0;
2768       }
2769     }
2770     Type *OperTy = IncI->IVOperand->getType();
2771     if (IVTy != OperTy) {
2772       assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
2773              "cannot extend a chained IV");
2774       IRBuilder<> Builder(InsertPt);
2775       IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
2776     }
2777     IncI->UserInst->replaceUsesOfWith(IncI->IVOperand, IVOper);
2778     DeadInsts.push_back(IncI->IVOperand);
2779   }
2780   // If LSR created a new, wider phi, we may also replace its postinc. We only
2781   // do this if we also found a wide value for the head of the chain.
2782   if (isa<PHINode>(Chain.tailUserInst())) {
2783     for (BasicBlock::iterator I = L->getHeader()->begin();
2784          PHINode *Phi = dyn_cast<PHINode>(I); ++I) {
2785       if (!isCompatibleIVType(Phi, IVSrc))
2786         continue;
2787       Instruction *PostIncV = dyn_cast<Instruction>(
2788         Phi->getIncomingValueForBlock(L->getLoopLatch()));
2789       if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
2790         continue;
2791       Value *IVOper = IVSrc;
2792       Type *PostIncTy = PostIncV->getType();
2793       if (IVTy != PostIncTy) {
2794         assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
2795         IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
2796         Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
2797         IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
2798       }
2799       Phi->replaceUsesOfWith(PostIncV, IVOper);
2800       DeadInsts.push_back(PostIncV);
2801     }
2802   }
2803 }
2804
2805 void LSRInstance::CollectFixupsAndInitialFormulae() {
2806   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
2807     Instruction *UserInst = UI->getUser();
2808     // Skip IV users that are part of profitable IV Chains.
2809     User::op_iterator UseI = std::find(UserInst->op_begin(), UserInst->op_end(),
2810                                        UI->getOperandValToReplace());
2811     assert(UseI != UserInst->op_end() && "cannot find IV operand");
2812     if (IVIncSet.count(UseI))
2813       continue;
2814
2815     // Record the uses.
2816     LSRFixup &LF = getNewFixup();
2817     LF.UserInst = UserInst;
2818     LF.OperandValToReplace = UI->getOperandValToReplace();
2819     LF.PostIncLoops = UI->getPostIncLoops();
2820
2821     LSRUse::KindType Kind = LSRUse::Basic;
2822     Type *AccessTy = 0;
2823     if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
2824       Kind = LSRUse::Address;
2825       AccessTy = getAccessType(LF.UserInst);
2826     }
2827
2828     const SCEV *S = IU.getExpr(*UI);
2829
2830     // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
2831     // (N - i == 0), and this allows (N - i) to be the expression that we work
2832     // with rather than just N or i, so we can consider the register
2833     // requirements for both N and i at the same time. Limiting this code to
2834     // equality icmps is not a problem because all interesting loops use
2835     // equality icmps, thanks to IndVarSimplify.
2836     if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
2837       if (CI->isEquality()) {
2838         // Swap the operands if needed to put the OperandValToReplace on the
2839         // left, for consistency.
2840         Value *NV = CI->getOperand(1);
2841         if (NV == LF.OperandValToReplace) {
2842           CI->setOperand(1, CI->getOperand(0));
2843           CI->setOperand(0, NV);
2844           NV = CI->getOperand(1);
2845           Changed = true;
2846         }
2847
2848         // x == y  -->  x - y == 0
2849         const SCEV *N = SE.getSCEV(NV);
2850         if (SE.isLoopInvariant(N, L) && isSafeToExpand(N)) {
2851           // S is normalized, so normalize N before folding it into S
2852           // to keep the result normalized.
2853           N = TransformForPostIncUse(Normalize, N, CI, 0,
2854                                      LF.PostIncLoops, SE, DT);
2855           Kind = LSRUse::ICmpZero;
2856           S = SE.getMinusSCEV(N, S);
2857         }
2858
2859         // -1 and the negations of all interesting strides (except the negation
2860         // of -1) are now also interesting.
2861         for (size_t i = 0, e = Factors.size(); i != e; ++i)
2862           if (Factors[i] != -1)
2863             Factors.insert(-(uint64_t)Factors[i]);
2864         Factors.insert(-1);
2865       }
2866
2867     // Set up the initial formula for this use.
2868     std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
2869     LF.LUIdx = P.first;
2870     LF.Offset = P.second;
2871     LSRUse &LU = Uses[LF.LUIdx];
2872     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
2873     if (!LU.WidestFixupType ||
2874         SE.getTypeSizeInBits(LU.WidestFixupType) <
2875         SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
2876       LU.WidestFixupType = LF.OperandValToReplace->getType();
2877
2878     // If this is the first use of this LSRUse, give it a formula.
2879     if (LU.Formulae.empty()) {
2880       InsertInitialFormula(S, LU, LF.LUIdx);
2881       CountRegisters(LU.Formulae.back(), LF.LUIdx);
2882     }
2883   }
2884
2885   DEBUG(print_fixups(dbgs()));
2886 }
2887
2888 /// InsertInitialFormula - Insert a formula for the given expression into
2889 /// the given use, separating out loop-variant portions from loop-invariant
2890 /// and loop-computable portions.
2891 void
2892 LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
2893   Formula F;
2894   F.InitialMatch(S, L, SE);
2895   bool Inserted = InsertFormula(LU, LUIdx, F);
2896   assert(Inserted && "Initial formula already exists!"); (void)Inserted;
2897 }
2898
2899 /// InsertSupplementalFormula - Insert a simple single-register formula for
2900 /// the given expression into the given use.
2901 void
2902 LSRInstance::InsertSupplementalFormula(const SCEV *S,
2903                                        LSRUse &LU, size_t LUIdx) {
2904   Formula F;
2905   F.BaseRegs.push_back(S);
2906   F.AM.HasBaseReg = true;
2907   bool Inserted = InsertFormula(LU, LUIdx, F);
2908   assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
2909 }
2910
2911 /// CountRegisters - Note which registers are used by the given formula,
2912 /// updating RegUses.
2913 void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
2914   if (F.ScaledReg)
2915     RegUses.CountRegister(F.ScaledReg, LUIdx);
2916   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
2917        E = F.BaseRegs.end(); I != E; ++I)
2918     RegUses.CountRegister(*I, LUIdx);
2919 }
2920
2921 /// InsertFormula - If the given formula has not yet been inserted, add it to
2922 /// the list, and return true. Return false otherwise.
2923 bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
2924   if (!LU.InsertFormula(F))
2925     return false;
2926
2927   CountRegisters(F, LUIdx);
2928   return true;
2929 }
2930
2931 /// CollectLoopInvariantFixupsAndFormulae - Check for other uses of
2932 /// loop-invariant values which we're tracking. These other uses will pin these
2933 /// values in registers, making them less profitable for elimination.
2934 /// TODO: This currently misses non-constant addrec step registers.
2935 /// TODO: Should this give more weight to users inside the loop?
2936 void
2937 LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
2938   SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
2939   SmallPtrSet<const SCEV *, 8> Inserted;
2940
2941   while (!Worklist.empty()) {
2942     const SCEV *S = Worklist.pop_back_val();
2943
2944     if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
2945       Worklist.append(N->op_begin(), N->op_end());
2946     else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
2947       Worklist.push_back(C->getOperand());
2948     else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
2949       Worklist.push_back(D->getLHS());
2950       Worklist.push_back(D->getRHS());
2951     } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
2952       if (!Inserted.insert(U)) continue;
2953       const Value *V = U->getValue();
2954       if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
2955         // Look for instructions defined outside the loop.
2956         if (L->contains(Inst)) continue;
2957       } else if (isa<UndefValue>(V))
2958         // Undef doesn't have a live range, so it doesn't matter.
2959         continue;
2960       for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
2961            UI != UE; ++UI) {
2962         const Instruction *UserInst = dyn_cast<Instruction>(*UI);
2963         // Ignore non-instructions.
2964         if (!UserInst)
2965           continue;
2966         // Ignore instructions in other functions (as can happen with
2967         // Constants).
2968         if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
2969           continue;
2970         // Ignore instructions not dominated by the loop.
2971         const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
2972           UserInst->getParent() :
2973           cast<PHINode>(UserInst)->getIncomingBlock(
2974             PHINode::getIncomingValueNumForOperand(UI.getOperandNo()));
2975         if (!DT.dominates(L->getHeader(), UseBB))
2976           continue;
2977         // Ignore uses which are part of other SCEV expressions, to avoid
2978         // analyzing them multiple times.
2979         if (SE.isSCEVable(UserInst->getType())) {
2980           const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
2981           // If the user is a no-op, look through to its uses.
2982           if (!isa<SCEVUnknown>(UserS))
2983             continue;
2984           if (UserS == U) {
2985             Worklist.push_back(
2986               SE.getUnknown(const_cast<Instruction *>(UserInst)));
2987             continue;
2988           }
2989         }
2990         // Ignore icmp instructions which are already being analyzed.
2991         if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
2992           unsigned OtherIdx = !UI.getOperandNo();
2993           Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
2994           if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
2995             continue;
2996         }
2997
2998         LSRFixup &LF = getNewFixup();
2999         LF.UserInst = const_cast<Instruction *>(UserInst);
3000         LF.OperandValToReplace = UI.getUse();
3001         std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0);
3002         LF.LUIdx = P.first;
3003         LF.Offset = P.second;
3004         LSRUse &LU = Uses[LF.LUIdx];
3005         LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3006         if (!LU.WidestFixupType ||
3007             SE.getTypeSizeInBits(LU.WidestFixupType) <
3008             SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3009           LU.WidestFixupType = LF.OperandValToReplace->getType();
3010         InsertSupplementalFormula(U, LU, LF.LUIdx);
3011         CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3012         break;
3013       }
3014     }
3015   }
3016 }
3017
3018 /// CollectSubexprs - Split S into subexpressions which can be pulled out into
3019 /// separate registers. If C is non-null, multiply each subexpression by C.
3020 ///
3021 /// Return remainder expression after factoring the subexpressions captured by
3022 /// Ops. If Ops is complete, return NULL.
3023 static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3024                                    SmallVectorImpl<const SCEV *> &Ops,
3025                                    const Loop *L,
3026                                    ScalarEvolution &SE,
3027                                    unsigned Depth = 0) {
3028   // Arbitrarily cap recursion to protect compile time.
3029   if (Depth >= 3)
3030     return S;
3031
3032   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3033     // Break out add operands.
3034     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
3035          I != E; ++I) {
3036       const SCEV *Remainder = CollectSubexprs(*I, C, Ops, L, SE, Depth+1);
3037       if (Remainder)
3038         Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3039     }
3040     return NULL;
3041   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
3042     // Split a non-zero base out of an addrec.
3043     if (AR->getStart()->isZero())
3044       return S;
3045
3046     const SCEV *Remainder = CollectSubexprs(AR->getStart(),
3047                                             C, Ops, L, SE, Depth+1);
3048     // Split the non-zero AddRec unless it is part of a nested recurrence that
3049     // does not pertain to this loop.
3050     if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
3051       Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3052       Remainder = NULL;
3053     }
3054     if (Remainder != AR->getStart()) {
3055       if (!Remainder)
3056         Remainder = SE.getConstant(AR->getType(), 0);
3057       return SE.getAddRecExpr(Remainder,
3058                               AR->getStepRecurrence(SE),
3059                               AR->getLoop(),
3060                               //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3061                               SCEV::FlagAnyWrap);
3062     }
3063   } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
3064     // Break (C * (a + b + c)) into C*a + C*b + C*c.
3065     if (Mul->getNumOperands() != 2)
3066       return S;
3067     if (const SCEVConstant *Op0 =
3068         dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
3069       C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3070       const SCEV *Remainder =
3071         CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
3072       if (Remainder)
3073         Ops.push_back(SE.getMulExpr(C, Remainder));
3074       return NULL;
3075     }
3076   }
3077   return S;
3078 }
3079
3080 /// GenerateReassociations - Split out subexpressions from adds and the bases of
3081 /// addrecs.
3082 void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
3083                                          Formula Base,
3084                                          unsigned Depth) {
3085   // Arbitrarily cap recursion to protect compile time.
3086   if (Depth >= 3) return;
3087
3088   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
3089     const SCEV *BaseReg = Base.BaseRegs[i];
3090
3091     SmallVector<const SCEV *, 8> AddOps;
3092     const SCEV *Remainder = CollectSubexprs(BaseReg, 0, AddOps, L, SE);
3093     if (Remainder)
3094       AddOps.push_back(Remainder);
3095
3096     if (AddOps.size() == 1) continue;
3097
3098     for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
3099          JE = AddOps.end(); J != JE; ++J) {
3100
3101       // Loop-variant "unknown" values are uninteresting; we won't be able to
3102       // do anything meaningful with them.
3103       if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3104         continue;
3105
3106       // Don't pull a constant into a register if the constant could be folded
3107       // into an immediate field.
3108       if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset,
3109                            Base.getNumRegs() > 1,
3110                            LU.Kind, LU.AccessTy, TLI, SE))
3111         continue;
3112
3113       // Collect all operands except *J.
3114       SmallVector<const SCEV *, 8> InnerAddOps
3115         (((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
3116       InnerAddOps.append
3117         (llvm::next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end());
3118
3119       // Don't leave just a constant behind in a register if the constant could
3120       // be folded into an immediate field.
3121       if (InnerAddOps.size() == 1 &&
3122           isAlwaysFoldable(InnerAddOps[0], LU.MinOffset, LU.MaxOffset,
3123                            Base.getNumRegs() > 1,
3124                            LU.Kind, LU.AccessTy, TLI, SE))
3125         continue;
3126
3127       const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3128       if (InnerSum->isZero())
3129         continue;
3130       Formula F = Base;
3131
3132       // Add the remaining pieces of the add back into the new formula.
3133       const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3134       if (TLI && InnerSumSC &&
3135           SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3136           TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
3137                                    InnerSumSC->getValue()->getZExtValue())) {
3138         F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
3139                            InnerSumSC->getValue()->getZExtValue();
3140         F.BaseRegs.erase(F.BaseRegs.begin() + i);
3141       } else
3142         F.BaseRegs[i] = InnerSum;
3143
3144       // Add J as its own register, or an unfolded immediate.
3145       const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3146       if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3147           TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
3148                                    SC->getValue()->getZExtValue()))
3149         F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
3150                            SC->getValue()->getZExtValue();
3151       else
3152         F.BaseRegs.push_back(*J);
3153
3154       if (InsertFormula(LU, LUIdx, F))
3155         // If that formula hadn't been seen before, recurse to find more like
3156         // it.
3157         GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1);
3158     }
3159   }
3160 }
3161
3162 /// GenerateCombinations - Generate a formula consisting of all of the
3163 /// loop-dominating registers added into a single register.
3164 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
3165                                        Formula Base) {
3166   // This method is only interesting on a plurality of registers.
3167   if (Base.BaseRegs.size() <= 1) return;
3168
3169   Formula F = Base;
3170   F.BaseRegs.clear();
3171   SmallVector<const SCEV *, 4> Ops;
3172   for (SmallVectorImpl<const SCEV *>::const_iterator
3173        I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) {
3174     const SCEV *BaseReg = *I;
3175     if (SE.properlyDominates(BaseReg, L->getHeader()) &&
3176         !SE.hasComputableLoopEvolution(BaseReg, L))
3177       Ops.push_back(BaseReg);
3178     else
3179       F.BaseRegs.push_back(BaseReg);
3180   }
3181   if (Ops.size() > 1) {
3182     const SCEV *Sum = SE.getAddExpr(Ops);
3183     // TODO: If Sum is zero, it probably means ScalarEvolution missed an
3184     // opportunity to fold something. For now, just ignore such cases
3185     // rather than proceed with zero in a register.
3186     if (!Sum->isZero()) {
3187       F.BaseRegs.push_back(Sum);
3188       (void)InsertFormula(LU, LUIdx, F);
3189     }
3190   }
3191 }
3192
3193 /// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets.
3194 void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
3195                                           Formula Base) {
3196   // We can't add a symbolic offset if the address already contains one.
3197   if (Base.AM.BaseGV) return;
3198
3199   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
3200     const SCEV *G = Base.BaseRegs[i];
3201     GlobalValue *GV = ExtractSymbol(G, SE);
3202     if (G->isZero() || !GV)
3203       continue;
3204     Formula F = Base;
3205     F.AM.BaseGV = GV;
3206     if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
3207                     LU.Kind, LU.AccessTy, TLI))
3208       continue;
3209     F.BaseRegs[i] = G;
3210     (void)InsertFormula(LU, LUIdx, F);
3211   }
3212 }
3213
3214 /// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
3215 void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
3216                                           Formula Base) {
3217   // TODO: For now, just add the min and max offset, because it usually isn't
3218   // worthwhile looking at everything inbetween.
3219   SmallVector<int64_t, 2> Worklist;
3220   Worklist.push_back(LU.MinOffset);
3221   if (LU.MaxOffset != LU.MinOffset)
3222     Worklist.push_back(LU.MaxOffset);
3223
3224   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
3225     const SCEV *G = Base.BaseRegs[i];
3226
3227     for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(),
3228          E = Worklist.end(); I != E; ++I) {
3229       Formula F = Base;
3230       F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I;
3231       if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I,
3232                      LU.Kind, LU.AccessTy, TLI)) {
3233         // Add the offset to the base register.
3234         const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G);
3235         // If it cancelled out, drop the base register, otherwise update it.
3236         if (NewG->isZero()) {
3237           std::swap(F.BaseRegs[i], F.BaseRegs.back());
3238           F.BaseRegs.pop_back();
3239         } else
3240           F.BaseRegs[i] = NewG;
3241
3242         (void)InsertFormula(LU, LUIdx, F);
3243       }
3244     }
3245
3246     int64_t Imm = ExtractImmediate(G, SE);
3247     if (G->isZero() || Imm == 0)
3248       continue;
3249     Formula F = Base;
3250     F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm;
3251     if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
3252                     LU.Kind, LU.AccessTy, TLI))
3253       continue;
3254     F.BaseRegs[i] = G;
3255     (void)InsertFormula(LU, LUIdx, F);
3256   }
3257 }
3258
3259 /// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up
3260 /// the comparison. For example, x == y -> x*c == y*c.
3261 void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
3262                                          Formula Base) {
3263   if (LU.Kind != LSRUse::ICmpZero) return;
3264
3265   // Determine the integer type for the base formula.
3266   Type *IntTy = Base.getType();
3267   if (!IntTy) return;
3268   if (SE.getTypeSizeInBits(IntTy) > 64) return;
3269
3270   // Don't do this if there is more than one offset.
3271   if (LU.MinOffset != LU.MaxOffset) return;
3272
3273   assert(!Base.AM.BaseGV && "ICmpZero use is not legal!");
3274
3275   // Check each interesting stride.
3276   for (SmallSetVector<int64_t, 8>::const_iterator
3277        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
3278     int64_t Factor = *I;
3279
3280     // Check that the multiplication doesn't overflow.
3281     if (Base.AM.BaseOffs == INT64_MIN && Factor == -1)
3282       continue;
3283     int64_t NewBaseOffs = (uint64_t)Base.AM.BaseOffs * Factor;
3284     if (NewBaseOffs / Factor != Base.AM.BaseOffs)
3285       continue;
3286
3287     // Check that multiplying with the use offset doesn't overflow.
3288     int64_t Offset = LU.MinOffset;
3289     if (Offset == INT64_MIN && Factor == -1)
3290       continue;
3291     Offset = (uint64_t)Offset * Factor;
3292     if (Offset / Factor != LU.MinOffset)
3293       continue;
3294
3295     Formula F = Base;
3296     F.AM.BaseOffs = NewBaseOffs;
3297
3298     // Check that this scale is legal.
3299     if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI))
3300       continue;
3301
3302     // Compensate for the use having MinOffset built into it.
3303     F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset;
3304
3305     const SCEV *FactorS = SE.getConstant(IntTy, Factor);
3306
3307     // Check that multiplying with each base register doesn't overflow.
3308     for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
3309       F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
3310       if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
3311         goto next;
3312     }
3313
3314     // Check that multiplying with the scaled register doesn't overflow.
3315     if (F.ScaledReg) {
3316       F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
3317       if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
3318         continue;
3319     }
3320
3321     // Check that multiplying with the unfolded offset doesn't overflow.
3322     if (F.UnfoldedOffset != 0) {
3323       if (F.UnfoldedOffset == INT64_MIN && Factor == -1)
3324         continue;
3325       F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
3326       if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
3327         continue;
3328     }
3329
3330     // If we make it here and it's legal, add it.
3331     (void)InsertFormula(LU, LUIdx, F);
3332   next:;
3333   }
3334 }
3335
3336 /// GenerateScales - Generate stride factor reuse formulae by making use of
3337 /// scaled-offset address modes, for example.
3338 void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
3339   // Determine the integer type for the base formula.
3340   Type *IntTy = Base.getType();
3341   if (!IntTy) return;
3342
3343   // If this Formula already has a scaled register, we can't add another one.
3344   if (Base.AM.Scale != 0) return;
3345
3346   // Check each interesting stride.
3347   for (SmallSetVector<int64_t, 8>::const_iterator
3348        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
3349     int64_t Factor = *I;
3350
3351     Base.AM.Scale = Factor;
3352     Base.AM.HasBaseReg = Base.BaseRegs.size() > 1;
3353     // Check whether this scale is going to be legal.
3354     if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
3355                     LU.Kind, LU.AccessTy, TLI)) {
3356       // As a special-case, handle special out-of-loop Basic users specially.
3357       // TODO: Reconsider this special case.
3358       if (LU.Kind == LSRUse::Basic &&
3359           isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
3360                      LSRUse::Special, LU.AccessTy, TLI) &&
3361           LU.AllFixupsOutsideLoop)
3362         LU.Kind = LSRUse::Special;
3363       else
3364         continue;
3365     }
3366     // For an ICmpZero, negating a solitary base register won't lead to
3367     // new solutions.
3368     if (LU.Kind == LSRUse::ICmpZero &&
3369         !Base.AM.HasBaseReg && Base.AM.BaseOffs == 0 && !Base.AM.BaseGV)
3370       continue;
3371     // For each addrec base reg, apply the scale, if possible.
3372     for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3373       if (const SCEVAddRecExpr *AR =
3374             dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
3375         const SCEV *FactorS = SE.getConstant(IntTy, Factor);
3376         if (FactorS->isZero())
3377           continue;
3378         // Divide out the factor, ignoring high bits, since we'll be
3379         // scaling the value back up in the end.
3380         if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
3381           // TODO: This could be optimized to avoid all the copying.
3382           Formula F = Base;
3383           F.ScaledReg = Quotient;
3384           F.DeleteBaseReg(F.BaseRegs[i]);
3385           (void)InsertFormula(LU, LUIdx, F);
3386         }
3387       }
3388   }
3389 }
3390
3391 /// GenerateTruncates - Generate reuse formulae from different IV types.
3392 void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
3393   // This requires TargetLowering to tell us which truncates are free.
3394   if (!TLI) return;
3395
3396   // Don't bother truncating symbolic values.
3397   if (Base.AM.BaseGV) return;
3398
3399   // Determine the integer type for the base formula.
3400   Type *DstTy = Base.getType();
3401   if (!DstTy) return;
3402   DstTy = SE.getEffectiveSCEVType(DstTy);
3403
3404   for (SmallSetVector<Type *, 4>::const_iterator
3405        I = Types.begin(), E = Types.end(); I != E; ++I) {
3406     Type *SrcTy = *I;
3407     if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) {
3408       Formula F = Base;
3409
3410       if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
3411       for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
3412            JE = F.BaseRegs.end(); J != JE; ++J)
3413         *J = SE.getAnyExtendExpr(*J, SrcTy);
3414
3415       // TODO: This assumes we've done basic processing on all uses and
3416       // have an idea what the register usage is.
3417       if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
3418         continue;
3419
3420       (void)InsertFormula(LU, LUIdx, F);
3421     }
3422   }
3423 }
3424
3425 namespace {
3426
3427 /// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to
3428 /// defer modifications so that the search phase doesn't have to worry about
3429 /// the data structures moving underneath it.
3430 struct WorkItem {
3431   size_t LUIdx;
3432   int64_t Imm;
3433   const SCEV *OrigReg;
3434
3435   WorkItem(size_t LI, int64_t I, const SCEV *R)
3436     : LUIdx(LI), Imm(I), OrigReg(R) {}
3437
3438   void print(raw_ostream &OS) const;
3439   void dump() const;
3440 };
3441
3442 }
3443
3444 void WorkItem::print(raw_ostream &OS) const {
3445   OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
3446      << " , add offset " << Imm;
3447 }
3448
3449 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3450 void WorkItem::dump() const {
3451   print(errs()); errs() << '\n';
3452 }
3453 #endif
3454
3455 /// GenerateCrossUseConstantOffsets - Look for registers which are a constant
3456 /// distance apart and try to form reuse opportunities between them.
3457 void LSRInstance::GenerateCrossUseConstantOffsets() {
3458   // Group the registers by their value without any added constant offset.
3459   typedef std::map<int64_t, const SCEV *> ImmMapTy;
3460   typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
3461   RegMapTy Map;
3462   DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
3463   SmallVector<const SCEV *, 8> Sequence;
3464   for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
3465        I != E; ++I) {
3466     const SCEV *Reg = *I;
3467     int64_t Imm = ExtractImmediate(Reg, SE);
3468     std::pair<RegMapTy::iterator, bool> Pair =
3469       Map.insert(std::make_pair(Reg, ImmMapTy()));
3470     if (Pair.second)
3471       Sequence.push_back(Reg);
3472     Pair.first->second.insert(std::make_pair(Imm, *I));
3473     UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I);
3474   }
3475
3476   // Now examine each set of registers with the same base value. Build up
3477   // a list of work to do and do the work in a separate step so that we're
3478   // not adding formulae and register counts while we're searching.
3479   SmallVector<WorkItem, 32> WorkItems;
3480   SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
3481   for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
3482        E = Sequence.end(); I != E; ++I) {
3483     const SCEV *Reg = *I;
3484     const ImmMapTy &Imms = Map.find(Reg)->second;
3485
3486     // It's not worthwhile looking for reuse if there's only one offset.
3487     if (Imms.size() == 1)
3488       continue;
3489
3490     DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
3491           for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
3492                J != JE; ++J)
3493             dbgs() << ' ' << J->first;
3494           dbgs() << '\n');
3495
3496     // Examine each offset.
3497     for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
3498          J != JE; ++J) {
3499       const SCEV *OrigReg = J->second;
3500
3501       int64_t JImm = J->first;
3502       const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
3503
3504       if (!isa<SCEVConstant>(OrigReg) &&
3505           UsedByIndicesMap[Reg].count() == 1) {
3506         DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
3507         continue;
3508       }
3509
3510       // Conservatively examine offsets between this orig reg a few selected
3511       // other orig regs.
3512       ImmMapTy::const_iterator OtherImms[] = {
3513         Imms.begin(), prior(Imms.end()),
3514         Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
3515       };
3516       for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
3517         ImmMapTy::const_iterator M = OtherImms[i];
3518         if (M == J || M == JE) continue;
3519
3520         // Compute the difference between the two.
3521         int64_t Imm = (uint64_t)JImm - M->first;
3522         for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
3523              LUIdx = UsedByIndices.find_next(LUIdx))
3524           // Make a memo of this use, offset, and register tuple.
3525           if (UniqueItems.insert(std::make_pair(LUIdx, Imm)))
3526             WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
3527       }
3528     }
3529   }
3530
3531   Map.clear();
3532   Sequence.clear();
3533   UsedByIndicesMap.clear();
3534   UniqueItems.clear();
3535
3536   // Now iterate through the worklist and add new formulae.
3537   for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(),
3538        E = WorkItems.end(); I != E; ++I) {
3539     const WorkItem &WI = *I;
3540     size_t LUIdx = WI.LUIdx;
3541     LSRUse &LU = Uses[LUIdx];
3542     int64_t Imm = WI.Imm;
3543     const SCEV *OrigReg = WI.OrigReg;
3544
3545     Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
3546     const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
3547     unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
3548
3549     // TODO: Use a more targeted data structure.
3550     for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
3551       const Formula &F = LU.Formulae[L];
3552       // Use the immediate in the scaled register.
3553       if (F.ScaledReg == OrigReg) {
3554         int64_t Offs = (uint64_t)F.AM.BaseOffs +
3555                        Imm * (uint64_t)F.AM.Scale;
3556         // Don't create 50 + reg(-50).
3557         if (F.referencesReg(SE.getSCEV(
3558                    ConstantInt::get(IntTy, -(uint64_t)Offs))))
3559           continue;
3560         Formula NewF = F;
3561         NewF.AM.BaseOffs = Offs;
3562         if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
3563                         LU.Kind, LU.AccessTy, TLI))
3564           continue;
3565         NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
3566
3567         // If the new scale is a constant in a register, and adding the constant
3568         // value to the immediate would produce a value closer to zero than the
3569         // immediate itself, then the formula isn't worthwhile.
3570         if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
3571           if (C->getValue()->isNegative() !=
3572                 (NewF.AM.BaseOffs < 0) &&
3573               (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale))
3574                 .ule(abs64(NewF.AM.BaseOffs)))
3575             continue;
3576
3577         // OK, looks good.
3578         (void)InsertFormula(LU, LUIdx, NewF);
3579       } else {
3580         // Use the immediate in a base register.
3581         for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
3582           const SCEV *BaseReg = F.BaseRegs[N];
3583           if (BaseReg != OrigReg)
3584             continue;
3585           Formula NewF = F;
3586           NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
3587           if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
3588                           LU.Kind, LU.AccessTy, TLI)) {
3589             if (!TLI ||
3590                 !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
3591               continue;
3592             NewF = F;
3593             NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
3594           }
3595           NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
3596
3597           // If the new formula has a constant in a register, and adding the
3598           // constant value to the immediate would produce a value closer to
3599           // zero than the immediate itself, then the formula isn't worthwhile.
3600           for (SmallVectorImpl<const SCEV *>::const_iterator
3601                J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end();
3602                J != JE; ++J)
3603             if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J))
3604               if ((C->getValue()->getValue() + NewF.AM.BaseOffs).abs().slt(
3605                    abs64(NewF.AM.BaseOffs)) &&
3606                   (C->getValue()->getValue() +
3607                    NewF.AM.BaseOffs).countTrailingZeros() >=
3608                    CountTrailingZeros_64(NewF.AM.BaseOffs))
3609                 goto skip_formula;
3610
3611           // Ok, looks good.
3612           (void)InsertFormula(LU, LUIdx, NewF);
3613           break;
3614         skip_formula:;
3615         }
3616       }
3617     }
3618   }
3619 }
3620
3621 /// GenerateAllReuseFormulae - Generate formulae for each use.
3622 void
3623 LSRInstance::GenerateAllReuseFormulae() {
3624   // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
3625   // queries are more precise.
3626   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3627     LSRUse &LU = Uses[LUIdx];
3628     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3629       GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
3630     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3631       GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
3632   }
3633   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3634     LSRUse &LU = Uses[LUIdx];
3635     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3636       GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
3637     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3638       GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
3639     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3640       GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
3641     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3642       GenerateScales(LU, LUIdx, LU.Formulae[i]);
3643   }
3644   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3645     LSRUse &LU = Uses[LUIdx];
3646     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3647       GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
3648   }
3649
3650   GenerateCrossUseConstantOffsets();
3651
3652   DEBUG(dbgs() << "\n"
3653                   "After generating reuse formulae:\n";
3654         print_uses(dbgs()));
3655 }
3656
3657 /// If there are multiple formulae with the same set of registers used
3658 /// by other uses, pick the best one and delete the others.
3659 void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
3660   DenseSet<const SCEV *> VisitedRegs;
3661   SmallPtrSet<const SCEV *, 16> Regs;
3662   SmallPtrSet<const SCEV *, 16> LoserRegs;
3663 #ifndef NDEBUG
3664   bool ChangedFormulae = false;
3665 #endif
3666
3667   // Collect the best formula for each unique set of shared registers. This
3668   // is reset for each use.
3669   typedef DenseMap<SmallVector<const SCEV *, 2>, size_t, UniquifierDenseMapInfo>
3670     BestFormulaeTy;
3671   BestFormulaeTy BestFormulae;
3672
3673   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3674     LSRUse &LU = Uses[LUIdx];
3675     DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
3676
3677     bool Any = false;
3678     for (size_t FIdx = 0, NumForms = LU.Formulae.size();
3679          FIdx != NumForms; ++FIdx) {
3680       Formula &F = LU.Formulae[FIdx];
3681
3682       // Some formulas are instant losers. For example, they may depend on
3683       // nonexistent AddRecs from other loops. These need to be filtered
3684       // immediately, otherwise heuristics could choose them over others leading
3685       // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
3686       // avoids the need to recompute this information across formulae using the
3687       // same bad AddRec. Passing LoserRegs is also essential unless we remove
3688       // the corresponding bad register from the Regs set.
3689       Cost CostF;
3690       Regs.clear();
3691       CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT,
3692                         &LoserRegs);
3693       if (CostF.isLoser()) {
3694         // During initial formula generation, undesirable formulae are generated
3695         // by uses within other loops that have some non-trivial address mode or
3696         // use the postinc form of the IV. LSR needs to provide these formulae
3697         // as the basis of rediscovering the desired formula that uses an AddRec
3698         // corresponding to the existing phi. Once all formulae have been
3699         // generated, these initial losers may be pruned.
3700         DEBUG(dbgs() << "  Filtering loser "; F.print(dbgs());
3701               dbgs() << "\n");
3702       }
3703       else {
3704         SmallVector<const SCEV *, 2> Key;
3705         for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(),
3706                JE = F.BaseRegs.end(); J != JE; ++J) {
3707           const SCEV *Reg = *J;
3708           if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
3709             Key.push_back(Reg);
3710         }
3711         if (F.ScaledReg &&
3712             RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
3713           Key.push_back(F.ScaledReg);
3714         // Unstable sort by host order ok, because this is only used for
3715         // uniquifying.
3716         std::sort(Key.begin(), Key.end());
3717
3718         std::pair<BestFormulaeTy::const_iterator, bool> P =
3719           BestFormulae.insert(std::make_pair(Key, FIdx));
3720         if (P.second)
3721           continue;
3722
3723         Formula &Best = LU.Formulae[P.first->second];
3724
3725         Cost CostBest;
3726         Regs.clear();
3727         CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT);
3728         if (CostF < CostBest)
3729           std::swap(F, Best);
3730         DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
3731               dbgs() << "\n"
3732                         "    in favor of formula "; Best.print(dbgs());
3733               dbgs() << '\n');
3734       }
3735 #ifndef NDEBUG
3736       ChangedFormulae = true;
3737 #endif
3738       LU.DeleteFormula(F);
3739       --FIdx;
3740       --NumForms;
3741       Any = true;
3742     }
3743
3744     // Now that we've filtered out some formulae, recompute the Regs set.
3745     if (Any)
3746       LU.RecomputeRegs(LUIdx, RegUses);
3747
3748     // Reset this to prepare for the next use.
3749     BestFormulae.clear();
3750   }
3751
3752   DEBUG(if (ChangedFormulae) {
3753           dbgs() << "\n"
3754                     "After filtering out undesirable candidates:\n";
3755           print_uses(dbgs());
3756         });
3757 }
3758
3759 // This is a rough guess that seems to work fairly well.
3760 static const size_t ComplexityLimit = UINT16_MAX;
3761
3762 /// EstimateSearchSpaceComplexity - Estimate the worst-case number of
3763 /// solutions the solver might have to consider. It almost never considers
3764 /// this many solutions because it prune the search space, but the pruning
3765 /// isn't always sufficient.
3766 size_t LSRInstance::EstimateSearchSpaceComplexity() const {
3767   size_t Power = 1;
3768   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
3769        E = Uses.end(); I != E; ++I) {
3770     size_t FSize = I->Formulae.size();
3771     if (FSize >= ComplexityLimit) {
3772       Power = ComplexityLimit;
3773       break;
3774     }
3775     Power *= FSize;
3776     if (Power >= ComplexityLimit)
3777       break;
3778   }
3779   return Power;
3780 }
3781
3782 /// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset
3783 /// of the registers of another formula, it won't help reduce register
3784 /// pressure (though it may not necessarily hurt register pressure); remove
3785 /// it to simplify the system.
3786 void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
3787   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
3788     DEBUG(dbgs() << "The search space is too complex.\n");
3789
3790     DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
3791                     "which use a superset of registers used by other "
3792                     "formulae.\n");
3793
3794     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3795       LSRUse &LU = Uses[LUIdx];
3796       bool Any = false;
3797       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
3798         Formula &F = LU.Formulae[i];
3799         // Look for a formula with a constant or GV in a register. If the use
3800         // also has a formula with that same value in an immediate field,
3801         // delete the one that uses a register.
3802         for (SmallVectorImpl<const SCEV *>::const_iterator
3803              I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
3804           if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
3805             Formula NewF = F;
3806             NewF.AM.BaseOffs += C->getValue()->getSExtValue();
3807             NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
3808                                 (I - F.BaseRegs.begin()));
3809             if (LU.HasFormulaWithSameRegs(NewF)) {
3810               DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
3811               LU.DeleteFormula(F);
3812               --i;
3813               --e;
3814               Any = true;
3815               break;
3816             }
3817           } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
3818             if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
3819               if (!F.AM.BaseGV) {
3820                 Formula NewF = F;
3821                 NewF.AM.BaseGV = GV;
3822                 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
3823                                     (I - F.BaseRegs.begin()));
3824                 if (LU.HasFormulaWithSameRegs(NewF)) {
3825                   DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
3826                         dbgs() << '\n');
3827                   LU.DeleteFormula(F);
3828                   --i;
3829                   --e;
3830                   Any = true;
3831                   break;
3832                 }
3833               }
3834           }
3835         }
3836       }
3837       if (Any)
3838         LU.RecomputeRegs(LUIdx, RegUses);
3839     }
3840
3841     DEBUG(dbgs() << "After pre-selection:\n";
3842           print_uses(dbgs()));
3843   }
3844 }
3845
3846 /// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers
3847 /// for expressions like A, A+1, A+2, etc., allocate a single register for
3848 /// them.
3849 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
3850   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
3851     DEBUG(dbgs() << "The search space is too complex.\n");
3852
3853     DEBUG(dbgs() << "Narrowing the search space by assuming that uses "
3854                     "separated by a constant offset will use the same "
3855                     "registers.\n");
3856
3857     // This is especially useful for unrolled loops.
3858
3859     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3860       LSRUse &LU = Uses[LUIdx];
3861       for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
3862            E = LU.Formulae.end(); I != E; ++I) {
3863         const Formula &F = *I;
3864         if (F.AM.BaseOffs != 0 && F.AM.Scale == 0) {
3865           if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) {
3866             if (reconcileNewOffset(*LUThatHas, F.AM.BaseOffs,
3867                                    /*HasBaseReg=*/false,
3868                                    LU.Kind, LU.AccessTy)) {
3869               DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs());
3870                     dbgs() << '\n');
3871
3872               LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
3873
3874               // Update the relocs to reference the new use.
3875               for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(),
3876                    E = Fixups.end(); I != E; ++I) {
3877                 LSRFixup &Fixup = *I;
3878                 if (Fixup.LUIdx == LUIdx) {
3879                   Fixup.LUIdx = LUThatHas - &Uses.front();
3880                   Fixup.Offset += F.AM.BaseOffs;
3881                   // Add the new offset to LUThatHas' offset list.
3882                   if (LUThatHas->Offsets.back() != Fixup.Offset) {
3883                     LUThatHas->Offsets.push_back(Fixup.Offset);
3884                     if (Fixup.Offset > LUThatHas->MaxOffset)
3885                       LUThatHas->MaxOffset = Fixup.Offset;
3886                     if (Fixup.Offset < LUThatHas->MinOffset)
3887                       LUThatHas->MinOffset = Fixup.Offset;
3888                   }
3889                   DEBUG(dbgs() << "New fixup has offset "
3890                                << Fixup.Offset << '\n');
3891                 }
3892                 if (Fixup.LUIdx == NumUses-1)
3893                   Fixup.LUIdx = LUIdx;
3894               }
3895
3896               // Delete formulae from the new use which are no longer legal.
3897               bool Any = false;
3898               for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
3899                 Formula &F = LUThatHas->Formulae[i];
3900                 if (!isLegalUse(F.AM,
3901                                 LUThatHas->MinOffset, LUThatHas->MaxOffset,
3902                                 LUThatHas->Kind, LUThatHas->AccessTy, TLI)) {
3903                   DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
3904                         dbgs() << '\n');
3905                   LUThatHas->DeleteFormula(F);
3906                   --i;
3907                   --e;
3908                   Any = true;
3909                 }
3910               }
3911               if (Any)
3912                 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
3913
3914               // Delete the old use.
3915               DeleteUse(LU, LUIdx);
3916               --LUIdx;
3917               --NumUses;
3918               break;
3919             }
3920           }
3921         }
3922       }
3923     }
3924
3925     DEBUG(dbgs() << "After pre-selection:\n";
3926           print_uses(dbgs()));
3927   }
3928 }
3929
3930 /// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call
3931 /// FilterOutUndesirableDedicatedRegisters again, if necessary, now that
3932 /// we've done more filtering, as it may be able to find more formulae to
3933 /// eliminate.
3934 void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
3935   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
3936     DEBUG(dbgs() << "The search space is too complex.\n");
3937
3938     DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
3939                     "undesirable dedicated registers.\n");
3940
3941     FilterOutUndesirableDedicatedRegisters();
3942
3943     DEBUG(dbgs() << "After pre-selection:\n";
3944           print_uses(dbgs()));
3945   }
3946 }
3947
3948 /// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely
3949 /// to be profitable, and then in any use which has any reference to that
3950 /// register, delete all formulae which do not reference that register.
3951 void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
3952   // With all other options exhausted, loop until the system is simple
3953   // enough to handle.
3954   SmallPtrSet<const SCEV *, 4> Taken;
3955   while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
3956     // Ok, we have too many of formulae on our hands to conveniently handle.
3957     // Use a rough heuristic to thin out the list.
3958     DEBUG(dbgs() << "The search space is too complex.\n");
3959
3960     // Pick the register which is used by the most LSRUses, which is likely
3961     // to be a good reuse register candidate.
3962     const SCEV *Best = 0;
3963     unsigned BestNum = 0;
3964     for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
3965          I != E; ++I) {
3966       const SCEV *Reg = *I;
3967       if (Taken.count(Reg))
3968         continue;
3969       if (!Best)
3970         Best = Reg;
3971       else {
3972         unsigned Count = RegUses.getUsedByIndices(Reg).count();
3973         if (Count > BestNum) {
3974           Best = Reg;
3975           BestNum = Count;
3976         }
3977       }
3978     }
3979
3980     DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
3981                  << " will yield profitable reuse.\n");
3982     Taken.insert(Best);
3983
3984     // In any use with formulae which references this register, delete formulae
3985     // which don't reference it.
3986     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3987       LSRUse &LU = Uses[LUIdx];
3988       if (!LU.Regs.count(Best)) continue;
3989
3990       bool Any = false;
3991       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
3992         Formula &F = LU.Formulae[i];
3993         if (!F.referencesReg(Best)) {
3994           DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
3995           LU.DeleteFormula(F);
3996           --e;
3997           --i;
3998           Any = true;
3999           assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
4000           continue;
4001         }
4002       }
4003
4004       if (Any)
4005         LU.RecomputeRegs(LUIdx, RegUses);
4006     }
4007
4008     DEBUG(dbgs() << "After pre-selection:\n";
4009           print_uses(dbgs()));
4010   }
4011 }
4012
4013 /// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of
4014 /// formulae to choose from, use some rough heuristics to prune down the number
4015 /// of formulae. This keeps the main solver from taking an extraordinary amount
4016 /// of time in some worst-case scenarios.
4017 void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
4018   NarrowSearchSpaceByDetectingSupersets();
4019   NarrowSearchSpaceByCollapsingUnrolledCode();
4020   NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
4021   NarrowSearchSpaceByPickingWinnerRegs();
4022 }
4023
4024 /// SolveRecurse - This is the recursive solver.
4025 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
4026                                Cost &SolutionCost,
4027                                SmallVectorImpl<const Formula *> &Workspace,
4028                                const Cost &CurCost,
4029                                const SmallPtrSet<const SCEV *, 16> &CurRegs,
4030                                DenseSet<const SCEV *> &VisitedRegs) const {
4031   // Some ideas:
4032   //  - prune more:
4033   //    - use more aggressive filtering
4034   //    - sort the formula so that the most profitable solutions are found first
4035   //    - sort the uses too
4036   //  - search faster:
4037   //    - don't compute a cost, and then compare. compare while computing a cost
4038   //      and bail early.
4039   //    - track register sets with SmallBitVector
4040
4041   const LSRUse &LU = Uses[Workspace.size()];
4042
4043   // If this use references any register that's already a part of the
4044   // in-progress solution, consider it a requirement that a formula must
4045   // reference that register in order to be considered. This prunes out
4046   // unprofitable searching.
4047   SmallSetVector<const SCEV *, 4> ReqRegs;
4048   for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
4049        E = CurRegs.end(); I != E; ++I)
4050     if (LU.Regs.count(*I))
4051       ReqRegs.insert(*I);
4052
4053   SmallPtrSet<const SCEV *, 16> NewRegs;
4054   Cost NewCost;
4055   for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
4056        E = LU.Formulae.end(); I != E; ++I) {
4057     const Formula &F = *I;
4058
4059     // Ignore formulae which do not use any of the required registers.
4060     bool SatisfiedReqReg = true;
4061     for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(),
4062          JE = ReqRegs.end(); J != JE; ++J) {
4063       const SCEV *Reg = *J;
4064       if ((!F.ScaledReg || F.ScaledReg != Reg) &&
4065           std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) ==
4066           F.BaseRegs.end()) {
4067         SatisfiedReqReg = false;
4068         break;
4069       }
4070     }
4071     if (!SatisfiedReqReg) {
4072       // If none of the formulae satisfied the required registers, then we could
4073       // clear ReqRegs and try again. Currently, we simply give up in this case.
4074       continue;
4075     }
4076
4077     // Evaluate the cost of the current formula. If it's already worse than
4078     // the current best, prune the search at that point.
4079     NewCost = CurCost;
4080     NewRegs = CurRegs;
4081     NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT);
4082     if (NewCost < SolutionCost) {
4083       Workspace.push_back(&F);
4084       if (Workspace.size() != Uses.size()) {
4085         SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
4086                      NewRegs, VisitedRegs);
4087         if (F.getNumRegs() == 1 && Workspace.size() == 1)
4088           VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
4089       } else {
4090         DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
4091               dbgs() << ".\n Regs:";
4092               for (SmallPtrSet<const SCEV *, 16>::const_iterator
4093                    I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I)
4094                 dbgs() << ' ' << **I;
4095               dbgs() << '\n');
4096
4097         SolutionCost = NewCost;
4098         Solution = Workspace;
4099       }
4100       Workspace.pop_back();
4101     }
4102   }
4103 }
4104
4105 /// Solve - Choose one formula from each use. Return the results in the given
4106 /// Solution vector.
4107 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
4108   SmallVector<const Formula *, 8> Workspace;
4109   Cost SolutionCost;
4110   SolutionCost.Loose();
4111   Cost CurCost;
4112   SmallPtrSet<const SCEV *, 16> CurRegs;
4113   DenseSet<const SCEV *> VisitedRegs;
4114   Workspace.reserve(Uses.size());
4115
4116   // SolveRecurse does all the work.
4117   SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
4118                CurRegs, VisitedRegs);
4119   if (Solution.empty()) {
4120     DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
4121     return;
4122   }
4123
4124   // Ok, we've now made all our decisions.
4125   DEBUG(dbgs() << "\n"
4126                   "The chosen solution requires "; SolutionCost.print(dbgs());
4127         dbgs() << ":\n";
4128         for (size_t i = 0, e = Uses.size(); i != e; ++i) {
4129           dbgs() << "  ";
4130           Uses[i].print(dbgs());
4131           dbgs() << "\n"
4132                     "    ";
4133           Solution[i]->print(dbgs());
4134           dbgs() << '\n';
4135         });
4136
4137   assert(Solution.size() == Uses.size() && "Malformed solution!");
4138 }
4139
4140 /// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up
4141 /// the dominator tree far as we can go while still being dominated by the
4142 /// input positions. This helps canonicalize the insert position, which
4143 /// encourages sharing.
4144 BasicBlock::iterator
4145 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
4146                                  const SmallVectorImpl<Instruction *> &Inputs)
4147                                                                          const {
4148   for (;;) {
4149     const Loop *IPLoop = LI.getLoopFor(IP->getParent());
4150     unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
4151
4152     BasicBlock *IDom;
4153     for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
4154       if (!Rung) return IP;
4155       Rung = Rung->getIDom();
4156       if (!Rung) return IP;
4157       IDom = Rung->getBlock();
4158
4159       // Don't climb into a loop though.
4160       const Loop *IDomLoop = LI.getLoopFor(IDom);
4161       unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
4162       if (IDomDepth <= IPLoopDepth &&
4163           (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
4164         break;
4165     }
4166
4167     bool AllDominate = true;
4168     Instruction *BetterPos = 0;
4169     Instruction *Tentative = IDom->getTerminator();
4170     for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
4171          E = Inputs.end(); I != E; ++I) {
4172       Instruction *Inst = *I;
4173       if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
4174         AllDominate = false;
4175         break;
4176       }
4177       // Attempt to find an insert position in the middle of the block,
4178       // instead of at the end, so that it can be used for other expansions.
4179       if (IDom == Inst->getParent() &&
4180           (!BetterPos || !DT.dominates(Inst, BetterPos)))
4181         BetterPos = llvm::next(BasicBlock::iterator(Inst));
4182     }
4183     if (!AllDominate)
4184       break;
4185     if (BetterPos)
4186       IP = BetterPos;
4187     else
4188       IP = Tentative;
4189   }
4190
4191   return IP;
4192 }
4193
4194 /// AdjustInsertPositionForExpand - Determine an input position which will be
4195 /// dominated by the operands and which will dominate the result.
4196 BasicBlock::iterator
4197 LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
4198                                            const LSRFixup &LF,
4199                                            const LSRUse &LU,
4200                                            SCEVExpander &Rewriter) const {
4201   // Collect some instructions which must be dominated by the
4202   // expanding replacement. These must be dominated by any operands that
4203   // will be required in the expansion.
4204   SmallVector<Instruction *, 4> Inputs;
4205   if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
4206     Inputs.push_back(I);
4207   if (LU.Kind == LSRUse::ICmpZero)
4208     if (Instruction *I =
4209           dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
4210       Inputs.push_back(I);
4211   if (LF.PostIncLoops.count(L)) {
4212     if (LF.isUseFullyOutsideLoop(L))
4213       Inputs.push_back(L->getLoopLatch()->getTerminator());
4214     else
4215       Inputs.push_back(IVIncInsertPos);
4216   }
4217   // The expansion must also be dominated by the increment positions of any
4218   // loops it for which it is using post-inc mode.
4219   for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(),
4220        E = LF.PostIncLoops.end(); I != E; ++I) {
4221     const Loop *PIL = *I;
4222     if (PIL == L) continue;
4223
4224     // Be dominated by the loop exit.
4225     SmallVector<BasicBlock *, 4> ExitingBlocks;
4226     PIL->getExitingBlocks(ExitingBlocks);
4227     if (!ExitingBlocks.empty()) {
4228       BasicBlock *BB = ExitingBlocks[0];
4229       for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
4230         BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
4231       Inputs.push_back(BB->getTerminator());
4232     }
4233   }
4234
4235   assert(!isa<PHINode>(LowestIP) && !isa<LandingPadInst>(LowestIP)
4236          && !isa<DbgInfoIntrinsic>(LowestIP) &&
4237          "Insertion point must be a normal instruction");
4238
4239   // Then, climb up the immediate dominator tree as far as we can go while
4240   // still being dominated by the input positions.
4241   BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
4242
4243   // Don't insert instructions before PHI nodes.
4244   while (isa<PHINode>(IP)) ++IP;
4245
4246   // Ignore landingpad instructions.
4247   while (isa<LandingPadInst>(IP)) ++IP;
4248
4249   // Ignore debug intrinsics.
4250   while (isa<DbgInfoIntrinsic>(IP)) ++IP;
4251
4252   // Set IP below instructions recently inserted by SCEVExpander. This keeps the
4253   // IP consistent across expansions and allows the previously inserted
4254   // instructions to be reused by subsequent expansion.
4255   while (Rewriter.isInsertedInstruction(IP) && IP != LowestIP) ++IP;
4256
4257   return IP;
4258 }
4259
4260 /// Expand - Emit instructions for the leading candidate expression for this
4261 /// LSRUse (this is called "expanding").
4262 Value *LSRInstance::Expand(const LSRFixup &LF,
4263                            const Formula &F,
4264                            BasicBlock::iterator IP,
4265                            SCEVExpander &Rewriter,
4266                            SmallVectorImpl<WeakVH> &DeadInsts) const {
4267   const LSRUse &LU = Uses[LF.LUIdx];
4268
4269   // Determine an input position which will be dominated by the operands and
4270   // which will dominate the result.
4271   IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter);
4272
4273   // Inform the Rewriter if we have a post-increment use, so that it can
4274   // perform an advantageous expansion.
4275   Rewriter.setPostInc(LF.PostIncLoops);
4276
4277   // This is the type that the user actually needs.
4278   Type *OpTy = LF.OperandValToReplace->getType();
4279   // This will be the type that we'll initially expand to.
4280   Type *Ty = F.getType();
4281   if (!Ty)
4282     // No type known; just expand directly to the ultimate type.
4283     Ty = OpTy;
4284   else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
4285     // Expand directly to the ultimate type if it's the right size.
4286     Ty = OpTy;
4287   // This is the type to do integer arithmetic in.
4288   Type *IntTy = SE.getEffectiveSCEVType(Ty);
4289
4290   // Build up a list of operands to add together to form the full base.
4291   SmallVector<const SCEV *, 8> Ops;
4292
4293   // Expand the BaseRegs portion.
4294   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
4295        E = F.BaseRegs.end(); I != E; ++I) {
4296     const SCEV *Reg = *I;
4297     assert(!Reg->isZero() && "Zero allocated in a base register!");
4298
4299     // If we're expanding for a post-inc user, make the post-inc adjustment.
4300     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
4301     Reg = TransformForPostIncUse(Denormalize, Reg,
4302                                  LF.UserInst, LF.OperandValToReplace,
4303                                  Loops, SE, DT);
4304
4305     Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));
4306   }
4307
4308   // Expand the ScaledReg portion.
4309   Value *ICmpScaledV = 0;
4310   if (F.AM.Scale != 0) {
4311     const SCEV *ScaledS = F.ScaledReg;
4312
4313     // If we're expanding for a post-inc user, make the post-inc adjustment.
4314     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
4315     ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
4316                                      LF.UserInst, LF.OperandValToReplace,
4317                                      Loops, SE, DT);
4318
4319     if (LU.Kind == LSRUse::ICmpZero) {
4320       // An interesting way of "folding" with an icmp is to use a negated
4321       // scale, which we'll implement by inserting it into the other operand
4322       // of the icmp.
4323       assert(F.AM.Scale == -1 &&
4324              "The only scale supported by ICmpZero uses is -1!");
4325       ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP);
4326     } else {
4327       // Otherwise just expand the scaled register and an explicit scale,
4328       // which is expected to be matched as part of the address.
4329
4330       // Flush the operand list to suppress SCEVExpander hoisting address modes.
4331       if (!Ops.empty() && LU.Kind == LSRUse::Address) {
4332         Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
4333         Ops.clear();
4334         Ops.push_back(SE.getUnknown(FullV));
4335       }
4336       ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
4337       ScaledS = SE.getMulExpr(ScaledS,
4338                               SE.getConstant(ScaledS->getType(), F.AM.Scale));
4339       Ops.push_back(ScaledS);
4340     }
4341   }
4342
4343   // Expand the GV portion.
4344   if (F.AM.BaseGV) {
4345     // Flush the operand list to suppress SCEVExpander hoisting.
4346     if (!Ops.empty()) {
4347       Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
4348       Ops.clear();
4349       Ops.push_back(SE.getUnknown(FullV));
4350     }
4351     Ops.push_back(SE.getUnknown(F.AM.BaseGV));
4352   }
4353
4354   // Flush the operand list to suppress SCEVExpander hoisting of both folded and
4355   // unfolded offsets. LSR assumes they both live next to their uses.
4356   if (!Ops.empty()) {
4357     Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
4358     Ops.clear();
4359     Ops.push_back(SE.getUnknown(FullV));
4360   }
4361
4362   // Expand the immediate portion.
4363   int64_t Offset = (uint64_t)F.AM.BaseOffs + LF.Offset;
4364   if (Offset != 0) {
4365     if (LU.Kind == LSRUse::ICmpZero) {
4366       // The other interesting way of "folding" with an ICmpZero is to use a
4367       // negated immediate.
4368       if (!ICmpScaledV)
4369         ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
4370       else {
4371         Ops.push_back(SE.getUnknown(ICmpScaledV));
4372         ICmpScaledV = ConstantInt::get(IntTy, Offset);
4373       }
4374     } else {
4375       // Just add the immediate values. These again are expected to be matched
4376       // as part of the address.
4377       Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
4378     }
4379   }
4380
4381   // Expand the unfolded offset portion.
4382   int64_t UnfoldedOffset = F.UnfoldedOffset;
4383   if (UnfoldedOffset != 0) {
4384     // Just add the immediate values.
4385     Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
4386                                                        UnfoldedOffset)));
4387   }
4388
4389   // Emit instructions summing all the operands.
4390   const SCEV *FullS = Ops.empty() ?
4391                       SE.getConstant(IntTy, 0) :
4392                       SE.getAddExpr(Ops);
4393   Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);
4394
4395   // We're done expanding now, so reset the rewriter.
4396   Rewriter.clearPostInc();
4397
4398   // An ICmpZero Formula represents an ICmp which we're handling as a
4399   // comparison against zero. Now that we've expanded an expression for that
4400   // form, update the ICmp's other operand.
4401   if (LU.Kind == LSRUse::ICmpZero) {
4402     ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
4403     DeadInsts.push_back(CI->getOperand(1));
4404     assert(!F.AM.BaseGV && "ICmp does not support folding a global value and "
4405                            "a scale at the same time!");
4406     if (F.AM.Scale == -1) {
4407       if (ICmpScaledV->getType() != OpTy) {
4408         Instruction *Cast =
4409           CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
4410                                                    OpTy, false),
4411                            ICmpScaledV, OpTy, "tmp", CI);
4412         ICmpScaledV = Cast;
4413       }
4414       CI->setOperand(1, ICmpScaledV);
4415     } else {
4416       assert(F.AM.Scale == 0 &&
4417              "ICmp does not support folding a global value and "
4418              "a scale at the same time!");
4419       Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
4420                                            -(uint64_t)Offset);
4421       if (C->getType() != OpTy)
4422         C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
4423                                                           OpTy, false),
4424                                   C, OpTy);
4425
4426       CI->setOperand(1, C);
4427     }
4428   }
4429
4430   return FullV;
4431 }
4432
4433 /// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use
4434 /// of their operands effectively happens in their predecessor blocks, so the
4435 /// expression may need to be expanded in multiple places.
4436 void LSRInstance::RewriteForPHI(PHINode *PN,
4437                                 const LSRFixup &LF,
4438                                 const Formula &F,
4439                                 SCEVExpander &Rewriter,
4440                                 SmallVectorImpl<WeakVH> &DeadInsts,
4441                                 Pass *P) const {
4442   DenseMap<BasicBlock *, Value *> Inserted;
4443   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
4444     if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
4445       BasicBlock *BB = PN->getIncomingBlock(i);
4446
4447       // If this is a critical edge, split the edge so that we do not insert
4448       // the code on all predecessor/successor paths.  We do this unless this
4449       // is the canonical backedge for this loop, which complicates post-inc
4450       // users.
4451       if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
4452           !isa<IndirectBrInst>(BB->getTerminator())) {
4453         BasicBlock *Parent = PN->getParent();
4454         Loop *PNLoop = LI.getLoopFor(Parent);
4455         if (!PNLoop || Parent != PNLoop->getHeader()) {
4456           // Split the critical edge.
4457           BasicBlock *NewBB = 0;
4458           if (!Parent->isLandingPad()) {
4459             NewBB = SplitCriticalEdge(BB, Parent, P,
4460                                       /*MergeIdenticalEdges=*/true,
4461                                       /*DontDeleteUselessPhis=*/true);
4462           } else {
4463             SmallVector<BasicBlock*, 2> NewBBs;
4464             SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs);
4465             NewBB = NewBBs[0];
4466           }
4467           // If NewBB==NULL, then SplitCriticalEdge refused to split because all
4468           // phi predecessors are identical. The simple thing to do is skip
4469           // splitting in this case rather than complicate the API.
4470           if (NewBB) {
4471             // If PN is outside of the loop and BB is in the loop, we want to
4472             // move the block to be immediately before the PHI block, not
4473             // immediately after BB.
4474             if (L->contains(BB) && !L->contains(PN))
4475               NewBB->moveBefore(PN->getParent());
4476
4477             // Splitting the edge can reduce the number of PHI entries we have.
4478             e = PN->getNumIncomingValues();
4479             BB = NewBB;
4480             i = PN->getBasicBlockIndex(BB);
4481           }
4482         }
4483       }
4484
4485       std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
4486         Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
4487       if (!Pair.second)
4488         PN->setIncomingValue(i, Pair.first->second);
4489       else {
4490         Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts);
4491
4492         // If this is reuse-by-noop-cast, insert the noop cast.
4493         Type *OpTy = LF.OperandValToReplace->getType();
4494         if (FullV->getType() != OpTy)
4495           FullV =
4496             CastInst::Create(CastInst::getCastOpcode(FullV, false,
4497                                                      OpTy, false),
4498                              FullV, LF.OperandValToReplace->getType(),
4499                              "tmp", BB->getTerminator());
4500
4501         PN->setIncomingValue(i, FullV);
4502         Pair.first->second = FullV;
4503       }
4504     }
4505 }
4506
4507 /// Rewrite - Emit instructions for the leading candidate expression for this
4508 /// LSRUse (this is called "expanding"), and update the UserInst to reference
4509 /// the newly expanded value.
4510 void LSRInstance::Rewrite(const LSRFixup &LF,
4511                           const Formula &F,
4512                           SCEVExpander &Rewriter,
4513                           SmallVectorImpl<WeakVH> &DeadInsts,
4514                           Pass *P) const {
4515   // First, find an insertion point that dominates UserInst. For PHI nodes,
4516   // find the nearest block which dominates all the relevant uses.
4517   if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
4518     RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P);
4519   } else {
4520     Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts);
4521
4522     // If this is reuse-by-noop-cast, insert the noop cast.
4523     Type *OpTy = LF.OperandValToReplace->getType();
4524     if (FullV->getType() != OpTy) {
4525       Instruction *Cast =
4526         CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
4527                          FullV, OpTy, "tmp", LF.UserInst);
4528       FullV = Cast;
4529     }
4530
4531     // Update the user. ICmpZero is handled specially here (for now) because
4532     // Expand may have updated one of the operands of the icmp already, and
4533     // its new value may happen to be equal to LF.OperandValToReplace, in
4534     // which case doing replaceUsesOfWith leads to replacing both operands
4535     // with the same value. TODO: Reorganize this.
4536     if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
4537       LF.UserInst->setOperand(0, FullV);
4538     else
4539       LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
4540   }
4541
4542   DeadInsts.push_back(LF.OperandValToReplace);
4543 }
4544
4545 /// ImplementSolution - Rewrite all the fixup locations with new values,
4546 /// following the chosen solution.
4547 void
4548 LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
4549                                Pass *P) {
4550   // Keep track of instructions we may have made dead, so that
4551   // we can remove them after we are done working.
4552   SmallVector<WeakVH, 16> DeadInsts;
4553
4554   SCEVExpander Rewriter(SE, "lsr");
4555 #ifndef NDEBUG
4556   Rewriter.setDebugType(DEBUG_TYPE);
4557 #endif
4558   Rewriter.disableCanonicalMode();
4559   Rewriter.enableLSRMode();
4560   Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
4561
4562   // Mark phi nodes that terminate chains so the expander tries to reuse them.
4563   for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(),
4564          ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) {
4565     if (PHINode *PN = dyn_cast<PHINode>(ChainI->tailUserInst()))
4566       Rewriter.setChainedPhi(PN);
4567   }
4568
4569   // Expand the new value definitions and update the users.
4570   for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
4571        E = Fixups.end(); I != E; ++I) {
4572     const LSRFixup &Fixup = *I;
4573
4574     Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P);
4575
4576     Changed = true;
4577   }
4578
4579   for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(),
4580          ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) {
4581     GenerateIVChain(*ChainI, Rewriter, DeadInsts);
4582     Changed = true;
4583   }
4584   // Clean up after ourselves. This must be done before deleting any
4585   // instructions.
4586   Rewriter.clear();
4587
4588   Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
4589 }
4590
4591 LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
4592   : IU(P->getAnalysis<IVUsers>()),
4593     SE(P->getAnalysis<ScalarEvolution>()),
4594     DT(P->getAnalysis<DominatorTree>()),
4595     LI(P->getAnalysis<LoopInfo>()),
4596     TLI(tli), L(l), Changed(false), IVIncInsertPos(0) {
4597
4598   // If LoopSimplify form is not available, stay out of trouble.
4599   if (!L->isLoopSimplifyForm())
4600     return;
4601
4602   // If there's no interesting work to be done, bail early.
4603   if (IU.empty()) return;
4604
4605   // If there's too much analysis to be done, bail early. We won't be able to
4606   // model the problem anyway.
4607   unsigned NumUsers = 0;
4608   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
4609     if (++NumUsers > MaxIVUsers) {
4610       DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << *L
4611             << "\n");
4612       return;
4613     }
4614   }
4615
4616 #ifndef NDEBUG
4617   // All dominating loops must have preheaders, or SCEVExpander may not be able
4618   // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
4619   //
4620   // IVUsers analysis should only create users that are dominated by simple loop
4621   // headers. Since this loop should dominate all of its users, its user list
4622   // should be empty if this loop itself is not within a simple loop nest.
4623   for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
4624        Rung; Rung = Rung->getIDom()) {
4625     BasicBlock *BB = Rung->getBlock();
4626     const Loop *DomLoop = LI.getLoopFor(BB);
4627     if (DomLoop && DomLoop->getHeader() == BB) {
4628       assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
4629     }
4630   }
4631 #endif // DEBUG
4632
4633   DEBUG(dbgs() << "\nLSR on loop ";
4634         WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
4635         dbgs() << ":\n");
4636
4637   // First, perform some low-level loop optimizations.
4638   OptimizeShadowIV();
4639   OptimizeLoopTermCond();
4640
4641   // If loop preparation eliminates all interesting IV users, bail.
4642   if (IU.empty()) return;
4643
4644   // Skip nested loops until we can model them better with formulae.
4645   if (!L->empty()) {
4646     DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
4647     return;
4648   }
4649
4650   // Start collecting data and preparing for the solver.
4651   CollectChains();
4652   CollectInterestingTypesAndFactors();
4653   CollectFixupsAndInitialFormulae();
4654   CollectLoopInvariantFixupsAndFormulae();
4655
4656   assert(!Uses.empty() && "IVUsers reported at least one use");
4657   DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
4658         print_uses(dbgs()));
4659
4660   // Now use the reuse data to generate a bunch of interesting ways
4661   // to formulate the values needed for the uses.
4662   GenerateAllReuseFormulae();
4663
4664   FilterOutUndesirableDedicatedRegisters();
4665   NarrowSearchSpaceUsingHeuristics();
4666
4667   SmallVector<const Formula *, 8> Solution;
4668   Solve(Solution);
4669
4670   // Release memory that is no longer needed.
4671   Factors.clear();
4672   Types.clear();
4673   RegUses.clear();
4674
4675   if (Solution.empty())
4676     return;
4677
4678 #ifndef NDEBUG
4679   // Formulae should be legal.
4680   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
4681        E = Uses.end(); I != E; ++I) {
4682      const LSRUse &LU = *I;
4683      for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
4684           JE = LU.Formulae.end(); J != JE; ++J)
4685         assert(isLegalUse(J->AM, LU.MinOffset, LU.MaxOffset,
4686                           LU.Kind, LU.AccessTy, TLI) &&
4687                "Illegal formula generated!");
4688   };
4689 #endif
4690
4691   // Now that we've decided what we want, make it so.
4692   ImplementSolution(Solution, P);
4693 }
4694
4695 void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
4696   if (Factors.empty() && Types.empty()) return;
4697
4698   OS << "LSR has identified the following interesting factors and types: ";
4699   bool First = true;
4700
4701   for (SmallSetVector<int64_t, 8>::const_iterator
4702        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
4703     if (!First) OS << ", ";
4704     First = false;
4705     OS << '*' << *I;
4706   }
4707
4708   for (SmallSetVector<Type *, 4>::const_iterator
4709        I = Types.begin(), E = Types.end(); I != E; ++I) {
4710     if (!First) OS << ", ";
4711     First = false;
4712     OS << '(' << **I << ')';
4713   }
4714   OS << '\n';
4715 }
4716
4717 void LSRInstance::print_fixups(raw_ostream &OS) const {
4718   OS << "LSR is examining the following fixup sites:\n";
4719   for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
4720        E = Fixups.end(); I != E; ++I) {
4721     dbgs() << "  ";
4722     I->print(OS);
4723     OS << '\n';
4724   }
4725 }
4726
4727 void LSRInstance::print_uses(raw_ostream &OS) const {
4728   OS << "LSR is examining the following uses:\n";
4729   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
4730        E = Uses.end(); I != E; ++I) {
4731     const LSRUse &LU = *I;
4732     dbgs() << "  ";
4733     LU.print(OS);
4734     OS << '\n';
4735     for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
4736          JE = LU.Formulae.end(); J != JE; ++J) {
4737       OS << "    ";
4738       J->print(OS);
4739       OS << '\n';
4740     }
4741   }
4742 }
4743
4744 void LSRInstance::print(raw_ostream &OS) const {
4745   print_factors_and_types(OS);
4746   print_fixups(OS);
4747   print_uses(OS);
4748 }
4749
4750 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4751 void LSRInstance::dump() const {
4752   print(errs()); errs() << '\n';
4753 }
4754 #endif
4755
4756 namespace {
4757
4758 class LoopStrengthReduce : public LoopPass {
4759   /// TLI - Keep a pointer of a TargetLowering to consult for determining
4760   /// transformation profitability.
4761   const TargetLowering *const TLI;
4762
4763 public:
4764   static char ID; // Pass ID, replacement for typeid
4765   explicit LoopStrengthReduce(const TargetLowering *tli = 0);
4766
4767 private:
4768   bool runOnLoop(Loop *L, LPPassManager &LPM);
4769   void getAnalysisUsage(AnalysisUsage &AU) const;
4770 };
4771
4772 }
4773
4774 char LoopStrengthReduce::ID = 0;
4775 INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
4776                 "Loop Strength Reduction", false, false)
4777 INITIALIZE_PASS_DEPENDENCY(DominatorTree)
4778 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
4779 INITIALIZE_PASS_DEPENDENCY(IVUsers)
4780 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
4781 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
4782 INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
4783                 "Loop Strength Reduction", false, false)
4784
4785
4786 Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
4787   return new LoopStrengthReduce(TLI);
4788 }
4789
4790 LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli)
4791   : LoopPass(ID), TLI(tli) {
4792     initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
4793   }
4794
4795 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
4796   // We split critical edges, so we change the CFG.  However, we do update
4797   // many analyses if they are around.
4798   AU.addPreservedID(LoopSimplifyID);
4799
4800   AU.addRequired<LoopInfo>();
4801   AU.addPreserved<LoopInfo>();
4802   AU.addRequiredID(LoopSimplifyID);
4803   AU.addRequired<DominatorTree>();
4804   AU.addPreserved<DominatorTree>();
4805   AU.addRequired<ScalarEvolution>();
4806   AU.addPreserved<ScalarEvolution>();
4807   // Requiring LoopSimplify a second time here prevents IVUsers from running
4808   // twice, since LoopSimplify was invalidated by running ScalarEvolution.
4809   AU.addRequiredID(LoopSimplifyID);
4810   AU.addRequired<IVUsers>();
4811   AU.addPreserved<IVUsers>();
4812 }
4813
4814 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
4815   bool Changed = false;
4816
4817   // Run the main LSR transformation.
4818   Changed |= LSRInstance(TLI, L, this).getChanged();
4819
4820   // Remove any extra phis created by processing inner loops.
4821   Changed |= DeleteDeadPHIs(L->getHeader());
4822   if (EnablePhiElim) {
4823     SmallVector<WeakVH, 16> DeadInsts;
4824     SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), "lsr");
4825 #ifndef NDEBUG
4826     Rewriter.setDebugType(DEBUG_TYPE);
4827 #endif
4828     unsigned numFolded = Rewriter.
4829       replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), DeadInsts, TLI);
4830     if (numFolded) {
4831       Changed = true;
4832       DeleteTriviallyDeadInstructions(DeadInsts);
4833       DeleteDeadPHIs(L->getHeader());
4834     }
4835   }
4836   return Changed;
4837 }