lib/Transforms/Scalar/LoopStrengthReduce.cpp

   1 //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This transformation analyzes and transforms the induction variables (and
  11 // computations derived from them) into forms suitable for efficient execution
  12 // on the target.
  13 //
  14 // This pass performs a strength reduction on array references inside loops that
  15 // have as one or more of their components the loop induction variable, it
  16 // rewrites expressions to take advantage of scaled-index addressing modes
  17 // available on the target, and it performs a variety of other optimizations
  18 // related to loop induction variables.
  19 //
  20 // Terminology note: this code has a lot of handling for "post-increment" or
  21 // "post-inc" users. This is not talking about post-increment addressing modes;
  22 // it is instead talking about code like this:
  23 //
  24 //   %i = phi [ 0, %entry ], [ %i.next, %latch ]
  25 //   ...
  26 //   %i.next = add %i, 1
  27 //   %c = icmp eq %i.next, %n
  28 //
  29 // The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
  30 // it's useful to think about these as the same register, with some uses using
   31 // the value of the register before the add and some using it after. In this
  32 // example, the icmp is a post-increment user, since it uses %i.next, which is
  33 // the value of the induction variable after the increment. The other common
  34 // case of post-increment users is users outside the loop.
  35 //
  36 // TODO: More sophistication in the way Formulae are generated and filtered.
  37 //
  38 // TODO: Handle multiple loops at a time.
  39 //
  40 // TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr
  41 //       instead of a GlobalValue?
  42 //
  43 // TODO: When truncation is free, truncate ICmp users' operands to make it a
  44 //       smaller encoding (on x86 at least).
  45 //
  46 // TODO: When a negated register is used by an add (such as in a list of
  47 //       multiple base registers, or as the increment expression in an addrec),
  48 //       we may not actually need both reg and (-1 * reg) in registers; the
  49 //       negation can be implemented by using a sub instead of an add. The
  50 //       lack of support for taking this into consideration when making
  51 //       register pressure decisions is partly worked around by the "Special"
  52 //       use kind.
  53 //
  54 //===----------------------------------------------------------------------===//
  55
  56 #define DEBUG_TYPE "loop-reduce"
  57 #include "llvm/Transforms/Scalar.h"
  58 #include "llvm/Constants.h"
  59 #include "llvm/Instructions.h"
  60 #include "llvm/IntrinsicInst.h"
  61 #include "llvm/DerivedTypes.h"
  62 #include "llvm/Analysis/IVUsers.h"
  63 #include "llvm/Analysis/Dominators.h"
  64 #include "llvm/Analysis/LoopPass.h"
  65 #include "llvm/Analysis/ScalarEvolutionExpander.h"
  66 #include "llvm/Assembly/Writer.h"
  67 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  68 #include "llvm/Transforms/Utils/Local.h"
  69 #include "llvm/ADT/SmallBitVector.h"
  70 #include "llvm/ADT/SetVector.h"
  71 #include "llvm/ADT/DenseSet.h"
  72 #include "llvm/Support/Debug.h"
  73 #include "llvm/Support/CommandLine.h"
  74 #include "llvm/Support/ValueHandle.h"
  75 #include "llvm/Support/raw_ostream.h"
  76 #include "llvm/Target/TargetLowering.h"
  77 #include <algorithm>
  78 using namespace llvm;
  79
  80 static cl::opt<bool> EnableNested(
  81   "enable-lsr-nested", cl::Hidden, cl::desc("Enable LSR on nested loops"));
  82
  83 static cl::opt<bool> EnableRetry(
  84   "enable-lsr-retry", cl::Hidden, cl::desc("Enable LSR retry"));
  85
  86 // Temporary flag to cleanup congruent phis after LSR phi expansion.
  87 // It's currently disabled until we can determine whether it's truly useful or
  88 // not. The flag should be removed after the v3.0 release.
  89 // This is now needed for ivchains.
  90 static cl::opt<bool> EnablePhiElim(
  91   "enable-lsr-phielim", cl::Hidden, cl::init(true),
  92   cl::desc("Enable LSR phi elimination"));
  93
  94 namespace {
  95
  96 /// RegSortData - This class holds data which is used to order reuse candidates.
  97 class RegSortData {
  98 public:
  99   /// UsedByIndices - This represents the set of LSRUse indices which reference
 100   /// a particular register.
 101   SmallBitVector UsedByIndices;
 102
 103   RegSortData() {}
 104
 105   void print(raw_ostream &OS) const;
 106   void dump() const;
 107 };
 108
 109 }
 110
 111 void RegSortData::print(raw_ostream &OS) const {
 112   OS << "[NumUses=" << UsedByIndices.count() << ']';
 113 }
 114
 115 void RegSortData::dump() const {
 116   print(errs()); errs() << '\n';
 117 }
 118
 119 namespace {
 120
 121 /// RegUseTracker - Map register candidates to information about how they are
 122 /// used.
 123 class RegUseTracker {
 124   typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
 125
 126   RegUsesTy RegUsesMap;
 127   SmallVector<const SCEV *, 16> RegSequence;
 128
 129 public:
 130   void CountRegister(const SCEV *Reg, size_t LUIdx);
 131   void DropRegister(const SCEV *Reg, size_t LUIdx);
 132   void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx);
 133
 134   bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
 135
 136   const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
 137
 138   void clear();
 139
 140   typedef SmallVectorImpl<const SCEV *>::iterator iterator;
 141   typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
 142   iterator begin() { return RegSequence.begin(); }
 143   iterator end()   { return RegSequence.end(); }
 144   const_iterator begin() const { return RegSequence.begin(); }
 145   const_iterator end() const   { return RegSequence.end(); }
 146 };
 147
 148 }
 149
 150 void
 151 RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) {
 152   std::pair<RegUsesTy::iterator, bool> Pair =
 153     RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
 154   RegSortData &RSD = Pair.first->second;
 155   if (Pair.second)
 156     RegSequence.push_back(Reg);
 157   RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
 158   RSD.UsedByIndices.set(LUIdx);
 159 }
 160
 161 void
 162 RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) {
 163   RegUsesTy::iterator It = RegUsesMap.find(Reg);
 164   assert(It != RegUsesMap.end());
 165   RegSortData &RSD = It->second;
 166   assert(RSD.UsedByIndices.size() > LUIdx);
 167   RSD.UsedByIndices.reset(LUIdx);
 168 }
 169
 170 void
 171 RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
 172   assert(LUIdx <= LastLUIdx);
 173
 174   // Update RegUses. The data structure is not optimized for this purpose;
 175   // we must iterate through it and update each of the bit vectors.
 176   for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end();
 177        I != E; ++I) {
 178     SmallBitVector &UsedByIndices = I->second.UsedByIndices;
 179     if (LUIdx < UsedByIndices.size())
 180       UsedByIndices[LUIdx] =
 181         LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0;
 182     UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
 183   }
 184 }
 185
 186 bool
 187 RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
 188   RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
 189   if (I == RegUsesMap.end())
 190     return false;
 191   const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
 192   int i = UsedByIndices.find_first();
 193   if (i == -1) return false;
 194   if ((size_t)i != LUIdx) return true;
 195   return UsedByIndices.find_next(i) != -1;
 196 }
 197
 198 const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
 199   RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
 200   assert(I != RegUsesMap.end() && "Unknown register!");
 201   return I->second.UsedByIndices;
 202 }
 203
 204 void RegUseTracker::clear() {
 205   RegUsesMap.clear();
 206   RegSequence.clear();
 207 }
 208
 209 namespace {
 210
 211 /// Formula - This class holds information that describes a formula for
 212 /// computing satisfying a use. It may include broken-out immediates and scaled
 213 /// registers.
 214 struct Formula {
 215   /// AM - This is used to represent complex addressing, as well as other kinds
 216   /// of interesting uses.
 217   TargetLowering::AddrMode AM;
 218
 219   /// BaseRegs - The list of "base" registers for this use. When this is
 220   /// non-empty, AM.HasBaseReg should be set to true.
 221   SmallVector<const SCEV *, 2> BaseRegs;
 222
 223   /// ScaledReg - The 'scaled' register for this use. This should be non-null
 224   /// when AM.Scale is not zero.
 225   const SCEV *ScaledReg;
 226
 227   /// UnfoldedOffset - An additional constant offset which added near the
 228   /// use. This requires a temporary register, but the offset itself can
 229   /// live in an add immediate field rather than a register.
 230   int64_t UnfoldedOffset;
 231
 232   Formula() : ScaledReg(0), UnfoldedOffset(0) {}
 233
 234   void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
 235
 236   unsigned getNumRegs() const;
 237   Type *getType() const;
 238
 239   void DeleteBaseReg(const SCEV *&S);
 240
 241   bool referencesReg(const SCEV *S) const;
 242   bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
 243                                   const RegUseTracker &RegUses) const;
 244
 245   void print(raw_ostream &OS) const;
 246   void dump() const;
 247 };
 248
 249 }
 250
 251 /// DoInitialMatch - Recursion helper for InitialMatch.
 252 static void DoInitialMatch(const SCEV *S, Loop *L,
 253                            SmallVectorImpl<const SCEV *> &Good,
 254                            SmallVectorImpl<const SCEV *> &Bad,
 255                            ScalarEvolution &SE) {
 256   // Collect expressions which properly dominate the loop header.
 257   if (SE.properlyDominates(S, L->getHeader())) {
 258     Good.push_back(S);
 259     return;
 260   }
 261
 262   // Look at add operands.
 263   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 264     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
 265          I != E; ++I)
 266       DoInitialMatch(*I, L, Good, Bad, SE);
 267     return;
 268   }
 269
 270   // Look at addrec operands.
 271   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
 272     if (!AR->getStart()->isZero()) {
 273       DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
 274       DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
 275                                       AR->getStepRecurrence(SE),
 276                                       // FIXME: AR->getNoWrapFlags()
 277                                       AR->getLoop(), SCEV::FlagAnyWrap),
 278                      L, Good, Bad, SE);
 279       return;
 280     }
 281
 282   // Handle a multiplication by -1 (negation) if it didn't fold.
 283   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
 284     if (Mul->getOperand(0)->isAllOnesValue()) {
 285       SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
 286       const SCEV *NewMul = SE.getMulExpr(Ops);
 287
 288       SmallVector<const SCEV *, 4> MyGood;
 289       SmallVector<const SCEV *, 4> MyBad;
 290       DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
 291       const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
 292         SE.getEffectiveSCEVType(NewMul->getType())));
 293       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(),
 294            E = MyGood.end(); I != E; ++I)
 295         Good.push_back(SE.getMulExpr(NegOne, *I));
 296       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(),
 297            E = MyBad.end(); I != E; ++I)
 298         Bad.push_back(SE.getMulExpr(NegOne, *I));
 299       return;
 300     }
 301
 302   // Ok, we can't do anything interesting. Just stuff the whole thing into a
 303   // register and hope for the best.
 304   Bad.push_back(S);
 305 }
 306
 307 /// InitialMatch - Incorporate loop-variant parts of S into this Formula,
 308 /// attempting to keep all loop-invariant and loop-computable values in a
 309 /// single base register.
 310 void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
 311   SmallVector<const SCEV *, 4> Good;
 312   SmallVector<const SCEV *, 4> Bad;
 313   DoInitialMatch(S, L, Good, Bad, SE);
 314   if (!Good.empty()) {
 315     const SCEV *Sum = SE.getAddExpr(Good);
 316     if (!Sum->isZero())
 317       BaseRegs.push_back(Sum);
 318     AM.HasBaseReg = true;
 319   }
 320   if (!Bad.empty()) {
 321     const SCEV *Sum = SE.getAddExpr(Bad);
 322     if (!Sum->isZero())
 323       BaseRegs.push_back(Sum);
 324     AM.HasBaseReg = true;
 325   }
 326 }
 327
 328 /// getNumRegs - Return the total number of register operands used by this
 329 /// formula. This does not include register uses implied by non-constant
 330 /// addrec strides.
 331 unsigned Formula::getNumRegs() const {
 332   return !!ScaledReg + BaseRegs.size();
 333 }
 334
 335 /// getType - Return the type of this formula, if it has one, or null
 336 /// otherwise. This type is meaningless except for the bit size.
 337 Type *Formula::getType() const {
 338   return !BaseRegs.empty() ? BaseRegs.front()->getType() :
 339          ScaledReg ? ScaledReg->getType() :
 340          AM.BaseGV ? AM.BaseGV->getType() :
 341          0;
 342 }
 343
 344 /// DeleteBaseReg - Delete the given base reg from the BaseRegs list.
 345 void Formula::DeleteBaseReg(const SCEV *&S) {
 346   if (&S != &BaseRegs.back())
 347     std::swap(S, BaseRegs.back());
 348   BaseRegs.pop_back();
 349 }
 350
 351 /// referencesReg - Test if this formula references the given register.
 352 bool Formula::referencesReg(const SCEV *S) const {
 353   return S == ScaledReg ||
 354          std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
 355 }
 356
 357 /// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers
 358 /// which are used by uses other than the use with the given index.
 359 bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
 360                                          const RegUseTracker &RegUses) const {
 361   if (ScaledReg)
 362     if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
 363       return true;
 364   for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
 365        E = BaseRegs.end(); I != E; ++I)
 366     if (RegUses.isRegUsedByUsesOtherThan(*I, LUIdx))
 367       return true;
 368   return false;
 369 }
 370
 371 void Formula::print(raw_ostream &OS) const {
 372   bool First = true;
 373   if (AM.BaseGV) {
 374     if (!First) OS << " + "; else First = false;
 375     WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false);
 376   }
 377   if (AM.BaseOffs != 0) {
 378     if (!First) OS << " + "; else First = false;
 379     OS << AM.BaseOffs;
 380   }
 381   for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
 382        E = BaseRegs.end(); I != E; ++I) {
 383     if (!First) OS << " + "; else First = false;
 384     OS << "reg(" << **I << ')';
 385   }
 386   if (AM.HasBaseReg && BaseRegs.empty()) {
 387     if (!First) OS << " + "; else First = false;
 388     OS << "**error: HasBaseReg**";
 389   } else if (!AM.HasBaseReg && !BaseRegs.empty()) {
 390     if (!First) OS << " + "; else First = false;
 391     OS << "**error: !HasBaseReg**";
 392   }
 393   if (AM.Scale != 0) {
 394     if (!First) OS << " + "; else First = false;
 395     OS << AM.Scale << "*reg(";
 396     if (ScaledReg)
 397       OS << *ScaledReg;
 398     else
 399       OS << "<unknown>";
 400     OS << ')';
 401   }
 402   if (UnfoldedOffset != 0) {
 403     if (!First) OS << " + "; else First = false;
 404     OS << "imm(" << UnfoldedOffset << ')';
 405   }
 406 }
 407
 408 void Formula::dump() const {
 409   print(errs()); errs() << '\n';
 410 }
 411
 412 /// isAddRecSExtable - Return true if the given addrec can be sign-extended
 413 /// without changing its value.
 414 static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
 415   Type *WideTy =
 416     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
 417   return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
 418 }
 419
 420 /// isAddSExtable - Return true if the given add can be sign-extended
 421 /// without changing its value.
 422 static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
 423   Type *WideTy =
 424     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
 425   return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
 426 }
 427
 428 /// isMulSExtable - Return true if the given mul can be sign-extended
 429 /// without changing its value.
 430 static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
 431   Type *WideTy =
 432     IntegerType::get(SE.getContext(),
 433                      SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
 434   return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
 435 }
 436
 437 /// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
 438 /// and if the remainder is known to be zero,  or null otherwise. If
 439 /// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified
 440 /// to Y, ignoring that the multiplication may overflow, which is useful when
 441 /// the result will be used in a context where the most significant bits are
 442 /// ignored.
 443 static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
 444                                 ScalarEvolution &SE,
 445                                 bool IgnoreSignificantBits = false) {
 446   // Handle the trivial case, which works for any SCEV type.
 447   if (LHS == RHS)
 448     return SE.getConstant(LHS->getType(), 1);
 449
 450   // Handle a few RHS special cases.
 451   const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
 452   if (RC) {
 453     const APInt &RA = RC->getValue()->getValue();
 454     // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
 455     // some folding.
 456     if (RA.isAllOnesValue())
 457       return SE.getMulExpr(LHS, RC);
 458     // Handle x /s 1 as x.
 459     if (RA == 1)
 460       return LHS;
 461   }
 462
 463   // Check for a division of a constant by a constant.
 464   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
 465     if (!RC)
 466       return 0;
 467     const APInt &LA = C->getValue()->getValue();
 468     const APInt &RA = RC->getValue()->getValue();
 469     if (LA.srem(RA) != 0)
 470       return 0;
 471     return SE.getConstant(LA.sdiv(RA));
 472   }
 473
 474   // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
 475   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
 476     if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
 477       const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
 478                                       IgnoreSignificantBits);
 479       if (!Step) return 0;
 480       const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
 481                                        IgnoreSignificantBits);
 482       if (!Start) return 0;
 483       // FlagNW is independent of the start value, step direction, and is
 484       // preserved with smaller magnitude steps.
 485       // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
 486       return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
 487     }
 488     return 0;
 489   }
 490
 491   // Distribute the sdiv over add operands, if the add doesn't overflow.
 492   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
 493     if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
 494       SmallVector<const SCEV *, 8> Ops;
 495       for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
 496            I != E; ++I) {
 497         const SCEV *Op = getExactSDiv(*I, RHS, SE,
 498                                       IgnoreSignificantBits);
 499         if (!Op) return 0;
 500         Ops.push_back(Op);
 501       }
 502       return SE.getAddExpr(Ops);
 503     }
 504     return 0;
 505   }
 506
 507   // Check for a multiply operand that we can pull RHS out of.
 508   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
 509     if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
 510       SmallVector<const SCEV *, 4> Ops;
 511       bool Found = false;
 512       for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
 513            I != E; ++I) {
 514         const SCEV *S = *I;
 515         if (!Found)
 516           if (const SCEV *Q = getExactSDiv(S, RHS, SE,
 517                                            IgnoreSignificantBits)) {
 518             S = Q;
 519             Found = true;
 520           }
 521         Ops.push_back(S);
 522       }
 523       return Found ? SE.getMulExpr(Ops) : 0;
 524     }
 525     return 0;
 526   }
 527
 528   // Otherwise we don't know.
 529   return 0;
 530 }
 531
 532 /// ExtractImmediate - If S involves the addition of a constant integer value,
 533 /// return that integer value, and mutate S to point to a new SCEV with that
 534 /// value excluded.
 535 static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
 536   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
 537     if (C->getValue()->getValue().getMinSignedBits() <= 64) {
 538       S = SE.getConstant(C->getType(), 0);
 539       return C->getValue()->getSExtValue();
 540     }
 541   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 542     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
 543     int64_t Result = ExtractImmediate(NewOps.front(), SE);
 544     if (Result != 0)
 545       S = SE.getAddExpr(NewOps);
 546     return Result;
 547   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
 548     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
 549     int64_t Result = ExtractImmediate(NewOps.front(), SE);
 550     if (Result != 0)
 551       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
 552                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
 553                            SCEV::FlagAnyWrap);
 554     return Result;
 555   }
 556   return 0;
 557 }
 558
 559 /// ExtractSymbol - If S involves the addition of a GlobalValue address,
 560 /// return that symbol, and mutate S to point to a new SCEV with that
 561 /// value excluded.
 562 static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
 563   if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
 564     if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
 565       S = SE.getConstant(GV->getType(), 0);
 566       return GV;
 567     }
 568   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 569     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
 570     GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
 571     if (Result)
 572       S = SE.getAddExpr(NewOps);
 573     return Result;
 574   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
 575     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
 576     GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
 577     if (Result)
 578       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
 579                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
 580                            SCEV::FlagAnyWrap);
 581     return Result;
 582   }
 583   return 0;
 584 }
 585
 586 /// isAddressUse - Returns true if the specified instruction is using the
 587 /// specified value as an address.
 588 static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
 589   bool isAddress = isa<LoadInst>(Inst);
 590   if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
 591     if (SI->getOperand(1) == OperandVal)
 592       isAddress = true;
 593   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
 594     // Addressing modes can also be folded into prefetches and a variety
 595     // of intrinsics.
 596     switch (II->getIntrinsicID()) {
 597       default: break;
 598       case Intrinsic::prefetch:
 599       case Intrinsic::x86_sse_storeu_ps:
 600       case Intrinsic::x86_sse2_storeu_pd:
 601       case Intrinsic::x86_sse2_storeu_dq:
 602       case Intrinsic::x86_sse2_storel_dq:
 603         if (II->getArgOperand(0) == OperandVal)
 604           isAddress = true;
 605         break;
 606     }
 607   }
 608   return isAddress;
 609 }
 610
 611 /// getAccessType - Return the type of the memory being accessed.
 612 static Type *getAccessType(const Instruction *Inst) {
 613   Type *AccessTy = Inst->getType();
 614   if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
 615     AccessTy = SI->getOperand(0)->getType();
 616   else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
 617     // Addressing modes can also be folded into prefetches and a variety
 618     // of intrinsics.
 619     switch (II->getIntrinsicID()) {
 620     default: break;
 621     case Intrinsic::x86_sse_storeu_ps:
 622     case Intrinsic::x86_sse2_storeu_pd:
 623     case Intrinsic::x86_sse2_storeu_dq:
 624     case Intrinsic::x86_sse2_storel_dq:
 625       AccessTy = II->getArgOperand(0)->getType();
 626       break;
 627     }
 628   }
 629
 630   // All pointers have the same requirements, so canonicalize them to an
 631   // arbitrary pointer type to minimize variation.
 632   if (PointerType *PTy = dyn_cast<PointerType>(AccessTy))
 633     AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
 634                                 PTy->getAddressSpace());
 635
 636   return AccessTy;
 637 }
 638
 639 /// isExistingPhi - Return true if this AddRec is already a phi in its loop.
 640 static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
 641   for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
 642        PHINode *PN = dyn_cast<PHINode>(I); ++I) {
 643     if (SE.isSCEVable(PN->getType()) &&
 644         (SE.getEffectiveSCEVType(PN->getType()) ==
 645          SE.getEffectiveSCEVType(AR->getType())) &&
 646         SE.getSCEV(PN) == AR)
 647       return true;
 648   }
 649   return false;
 650 }
 651
 652 /// DeleteTriviallyDeadInstructions - If any of the instructions is the
 653 /// specified set are trivially dead, delete them and see if this makes any of
 654 /// their operands subsequently dead.
 655 static bool
 656 DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
 657   bool Changed = false;
 658
 659   while (!DeadInsts.empty()) {
 660     Instruction *I = dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val());
 661
 662     if (I == 0 || !isInstructionTriviallyDead(I))
 663       continue;
 664
 665     for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
 666       if (Instruction *U = dyn_cast<Instruction>(*OI)) {
 667         *OI = 0;
 668         if (U->use_empty())
 669           DeadInsts.push_back(U);
 670       }
 671
 672     I->eraseFromParent();
 673     Changed = true;
 674   }
 675
 676   return Changed;
 677 }
 678
 679 namespace {
 680
 681 /// Cost - This class is used to measure and compare candidate formulae.
 682 class Cost {
 683   /// TODO: Some of these could be merged. Also, a lexical ordering
 684   /// isn't always optimal.
 685   unsigned NumRegs;
 686   unsigned AddRecCost;
 687   unsigned NumIVMuls;
 688   unsigned NumBaseAdds;
 689   unsigned ImmCost;
 690   unsigned SetupCost;
 691
 692 public:
 693   Cost()
 694     : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
 695       SetupCost(0) {}
 696
 697   bool operator<(const Cost &Other) const;
 698
 699   void Loose();
 700
 701 #ifndef NDEBUG
 702   // Once any of the metrics loses, they must all remain losers.
 703   bool isValid() {
 704     return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
 705              | ImmCost | SetupCost) != ~0u)
 706       || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
 707            & ImmCost & SetupCost) == ~0u);
 708   }
 709 #endif
 710
 711   bool isLoser() {
 712     assert(isValid() && "invalid cost");
 713     return NumRegs == ~0u;
 714   }
 715
 716   void RateFormula(const Formula &F,
 717                    SmallPtrSet<const SCEV *, 16> &Regs,
 718                    const DenseSet<const SCEV *> &VisitedRegs,
 719                    const Loop *L,
 720                    const SmallVectorImpl<int64_t> &Offsets,
 721                    ScalarEvolution &SE, DominatorTree &DT,
 722                    SmallPtrSet<const SCEV *, 16> *LoserRegs = 0);
 723
 724   void print(raw_ostream &OS) const;
 725   void dump() const;
 726
 727 private:
 728   void RateRegister(const SCEV *Reg,
 729                     SmallPtrSet<const SCEV *, 16> &Regs,
 730                     const Loop *L,
 731                     ScalarEvolution &SE, DominatorTree &DT);
 732   void RatePrimaryRegister(const SCEV *Reg,
 733                            SmallPtrSet<const SCEV *, 16> &Regs,
 734                            const Loop *L,
 735                            ScalarEvolution &SE, DominatorTree &DT,
 736                            SmallPtrSet<const SCEV *, 16> *LoserRegs);
 737 };
 738
 739 }
 740
 741 /// RateRegister - Tally up interesting quantities from the given register.
 742 void Cost::RateRegister(const SCEV *Reg,
 743                         SmallPtrSet<const SCEV *, 16> &Regs,
 744                         const Loop *L,
 745                         ScalarEvolution &SE, DominatorTree &DT) {
 746   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
 747     if (AR->getLoop() == L)
 748       AddRecCost += 1; /// TODO: This should be a function of the stride.
 749
 750     // If this is an addrec for another loop, don't second-guess its addrec phi
 751     // nodes. LSR isn't currently smart enough to reason about more than one
 752     // loop at a time. LSR has either already run on inner loops, will not run
 753     // on other loops, and cannot be expected to change sibling loops. If the
 754     // AddRec exists, consider it's register free and leave it alone. Otherwise,
 755     // do not consider this formula at all.
 756     else if (!EnableNested || L->contains(AR->getLoop()) ||
 757              (!AR->getLoop()->contains(L) &&
 758               DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) {
 759       if (isExistingPhi(AR, SE))
 760         return;
 761
 762       // For !EnableNested, never rewrite IVs in other loops.
 763       if (!EnableNested) {
 764         Loose();
 765         return;
 766       }
 767       // If this isn't one of the addrecs that the loop already has, it
 768       // would require a costly new phi and add. TODO: This isn't
 769       // precisely modeled right now.
 770       ++NumBaseAdds;
 771       if (!Regs.count(AR->getStart())) {
 772         RateRegister(AR->getStart(), Regs, L, SE, DT);
 773         if (isLoser())
 774           return;
 775       }
 776     }
 777
 778     // Add the step value register, if it needs one.
 779     // TODO: The non-affine case isn't precisely modeled here.
 780     if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
 781       if (!Regs.count(AR->getOperand(1))) {
 782         RateRegister(AR->getOperand(1), Regs, L, SE, DT);
 783         if (isLoser())
 784           return;
 785       }
 786     }
 787   }
 788   ++NumRegs;
 789
 790   // Rough heuristic; favor registers which don't require extra setup
 791   // instructions in the preheader.
 792   if (!isa<SCEVUnknown>(Reg) &&
 793       !isa<SCEVConstant>(Reg) &&
 794       !(isa<SCEVAddRecExpr>(Reg) &&
 795         (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
 796          isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
 797     ++SetupCost;
 798
 799     NumIVMuls += isa<SCEVMulExpr>(Reg) &&
 800                  SE.hasComputableLoopEvolution(Reg, L);
 801 }
 802
 803 /// RatePrimaryRegister - Record this register in the set. If we haven't seen it
 804 /// before, rate it. Optional LoserRegs provides a way to declare any formula
 805 /// that refers to one of those regs an instant loser.
 806 void Cost::RatePrimaryRegister(const SCEV *Reg,
 807                                SmallPtrSet<const SCEV *, 16> &Regs,
 808                                const Loop *L,
 809                                ScalarEvolution &SE, DominatorTree &DT,
 810                                SmallPtrSet<const SCEV *, 16> *LoserRegs) {
 811   if (LoserRegs && LoserRegs->count(Reg)) {
 812     Loose();
 813     return;
 814   }
 815   if (Regs.insert(Reg)) {
 816     RateRegister(Reg, Regs, L, SE, DT);
 817     if (isLoser())
 818       LoserRegs->insert(Reg);
 819   }
 820 }
 821
 822 void Cost::RateFormula(const Formula &F,
 823                        SmallPtrSet<const SCEV *, 16> &Regs,
 824                        const DenseSet<const SCEV *> &VisitedRegs,
 825                        const Loop *L,
 826                        const SmallVectorImpl<int64_t> &Offsets,
 827                        ScalarEvolution &SE, DominatorTree &DT,
 828                        SmallPtrSet<const SCEV *, 16> *LoserRegs) {
 829   // Tally up the registers.
 830   if (const SCEV *ScaledReg = F.ScaledReg) {
 831     if (VisitedRegs.count(ScaledReg)) {
 832       Loose();
 833       return;
 834     }
 835     RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
 836     if (isLoser())
 837       return;
 838   }
 839   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
 840        E = F.BaseRegs.end(); I != E; ++I) {
 841     const SCEV *BaseReg = *I;
 842     if (VisitedRegs.count(BaseReg)) {
 843       Loose();
 844       return;
 845     }
 846     RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
 847     if (isLoser())
 848       return;
 849   }
 850
 851   // Determine how many (unfolded) adds we'll need inside the loop.
 852   size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
 853   if (NumBaseParts > 1)
 854     NumBaseAdds += NumBaseParts - 1;
 855
 856   // Tally up the non-zero immediates.
 857   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
 858        E = Offsets.end(); I != E; ++I) {
 859     int64_t Offset = (uint64_t)*I + F.AM.BaseOffs;
 860     if (F.AM.BaseGV)
 861       ImmCost += 64; // Handle symbolic values conservatively.
 862                      // TODO: This should probably be the pointer size.
 863     else if (Offset != 0)
 864       ImmCost += APInt(64, Offset, true).getMinSignedBits();
 865   }
 866   assert(isValid() && "invalid cost");
 867 }
 868
 869 /// Loose - Set this cost to a losing value.
 870 void Cost::Loose() {
 871   NumRegs = ~0u;
 872   AddRecCost = ~0u;
 873   NumIVMuls = ~0u;
 874   NumBaseAdds = ~0u;
 875   ImmCost = ~0u;
 876   SetupCost = ~0u;
 877 }
 878
 879 /// operator< - Choose the lower cost.
 880 bool Cost::operator<(const Cost &Other) const {
 881   if (NumRegs != Other.NumRegs)
 882     return NumRegs < Other.NumRegs;
 883   if (AddRecCost != Other.AddRecCost)
 884     return AddRecCost < Other.AddRecCost;
 885   if (NumIVMuls != Other.NumIVMuls)
 886     return NumIVMuls < Other.NumIVMuls;
 887   if (NumBaseAdds != Other.NumBaseAdds)
 888     return NumBaseAdds < Other.NumBaseAdds;
 889   if (ImmCost != Other.ImmCost)
 890     return ImmCost < Other.ImmCost;
 891   if (SetupCost != Other.SetupCost)
 892     return SetupCost < Other.SetupCost;
 893   return false;
 894 }
 895
 896 void Cost::print(raw_ostream &OS) const {
 897   OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
 898   if (AddRecCost != 0)
 899     OS << ", with addrec cost " << AddRecCost;
 900   if (NumIVMuls != 0)
 901     OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
 902   if (NumBaseAdds != 0)
 903     OS << ", plus " << NumBaseAdds << " base add"
 904        << (NumBaseAdds == 1 ? "" : "s");
 905   if (ImmCost != 0)
 906     OS << ", plus " << ImmCost << " imm cost";
 907   if (SetupCost != 0)
 908     OS << ", plus " << SetupCost << " setup cost";
 909 }
 910
 911 void Cost::dump() const {
 912   print(errs()); errs() << '\n';
 913 }
 914
 915 namespace {
 916
/// LSRFixup - An operand value in an instruction which is to be replaced
/// with some equivalent, possibly strength-reduced, replacement.
struct LSRFixup {
  /// UserInst - The instruction which will be updated.
  Instruction *UserInst;

  /// OperandValToReplace - The operand of the instruction which will
  /// be replaced. The operand may be used more than once; every instance
  /// will be replaced.
  Value *OperandValToReplace;

  /// PostIncLoops - The set of loops associated with this fixup's
  /// post-increment uses. NOTE(review): presumably the loops for which this
  /// user sees the post-incremented value of an induction variable, and empty
  /// when it is not a post-inc user -- confirm against the code that fills it.
  PostIncLoopSet PostIncLoops;

  /// LUIdx - The index of the LSRUse describing the expression which
  /// this fixup needs, minus an offset (below). The default constructor sets
  /// it to ~size_t(0), which print() treats as "not assigned".
  size_t LUIdx;

  /// Offset - A constant offset to be added to the LSRUse expression.
  /// This allows multiple fixups to share the same LSRUse with different
  /// offsets, for example in an unrolled loop.
  int64_t Offset;

  /// isUseFullyOutsideLoop - Test whether this fixup always uses its value
  /// outside of the given loop.
  bool isUseFullyOutsideLoop(const Loop *L) const;

  LSRFixup();

  /// print/dump - Debugging output; dump() prints to the error stream.
  void print(raw_ostream &OS) const;
  void dump() const;
};
 949
 950 }
 951
 952 LSRFixup::LSRFixup()
 953   : UserInst(0), OperandValToReplace(0), LUIdx(~size_t(0)), Offset(0) {}
 954
 955 /// isUseFullyOutsideLoop - Test whether this fixup always uses its
 956 /// value outside of the given loop.
 957 bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
 958   // PHI nodes use their value in their incoming blocks.
 959   if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
 960     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
 961       if (PN->getIncomingValue(i) == OperandValToReplace &&
 962           L->contains(PN->getIncomingBlock(i)))
 963         return false;
 964     return true;
 965   }
 966
 967   return !L->contains(UserInst);
 968 }
 969
 970 void LSRFixup::print(raw_ostream &OS) const {
 971   OS << "UserInst=";
 972   // Store is common and interesting enough to be worth special-casing.
 973   if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
 974     OS << "store ";
 975     WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false);
 976   } else if (UserInst->getType()->isVoidTy())
 977     OS << UserInst->getOpcodeName();
 978   else
 979     WriteAsOperand(OS, UserInst, /*PrintType=*/false);
 980
 981   OS << ", OperandValToReplace=";
 982   WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false);
 983
 984   for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(),
 985        E = PostIncLoops.end(); I != E; ++I) {
 986     OS << ", PostIncLoop=";
 987     WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false);
 988   }
 989
 990   if (LUIdx != ~size_t(0))
 991     OS << ", LUIdx=" << LUIdx;
 992
 993   if (Offset != 0)
 994     OS << ", Offset=" << Offset;
 995 }
 996
 997 void LSRFixup::dump() const {
 998   print(errs()); errs() << '\n';
 999 }
1000
1001 namespace {
1002
1003 /// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding
1004 /// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*.
1005 struct UniquifierDenseMapInfo {
1006   static SmallVector<const SCEV *, 2> getEmptyKey() {
1007     SmallVector<const SCEV *, 2> V;
1008     V.push_back(reinterpret_cast<const SCEV *>(-1));
1009     return V;
1010   }
1011
1012   static SmallVector<const SCEV *, 2> getTombstoneKey() {
1013     SmallVector<const SCEV *, 2> V;
1014     V.push_back(reinterpret_cast<const SCEV *>(-2));
1015     return V;
1016   }
1017
1018   static unsigned getHashValue(const SmallVector<const SCEV *, 2> &V) {
1019     unsigned Result = 0;
1020     for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(),
1021          E = V.end(); I != E; ++I)
1022       Result ^= DenseMapInfo<const SCEV *>::getHashValue(*I);
1023     return Result;
1024   }
1025
1026   static bool isEqual(const SmallVector<const SCEV *, 2> &LHS,
1027                       const SmallVector<const SCEV *, 2> &RHS) {
1028     return LHS == RHS;
1029   }
1030 };
1031
/// LSRUse - This class holds the state that LSR keeps for each use in
/// IVUsers, as well as uses invented by LSR itself. It includes information
/// about what kinds of things can be folded into the user, information about
/// the user itself, and information about how the use may be satisfied.
/// TODO: Represent multiple users of the same expression in common?
class LSRUse {
  /// Uniquifier - The sorted register lists of the formulae inserted so far;
  /// InsertFormula and HasFormulaWithSameRegs use it to detect duplicates.
  DenseSet<SmallVector<const SCEV *, 2>, UniquifierDenseMapInfo> Uniquifier;

public:
  /// KindType - An enum for a kind of use, indicating what types of
  /// scaled and immediate operands it might support.
  enum KindType {
    Basic,   ///< A normal use, with no folding.
    Special, ///< A special case of basic, allowing -1 scales.
    Address, ///< An address use; folding according to TargetLowering
    ICmpZero ///< An equality icmp with both operands folded into one.
    // TODO: Add a generic icmp too?
  };

  /// Kind - What kind of use this is.
  KindType Kind;

  /// AccessTy - The type passed to the target's addressing-mode query for
  /// Address uses; print() dereferences it for Address uses.
  Type *AccessTy;

  /// Offsets - The constant offsets recorded for this use, with
  /// MinOffset/MaxOffset bounding them. The constructor starts the bounds at
  /// INT64_MAX/INT64_MIN, i.e. an empty range.
  SmallVector<int64_t, 8> Offsets;
  int64_t MinOffset;
  int64_t MaxOffset;

  /// AllFixupsOutsideLoop - This records whether all of the fixups using this
  /// LSRUse are outside of the loop, in which case some special-case heuristics
  /// may be used.
  bool AllFixupsOutsideLoop;

  /// WidestFixupType - This records the widest use type for any fixup using
  /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different
  /// max fixup widths to be equivalent, because the narrower one may be relying
  /// on the implicit truncation to truncate away bogus bits.
  Type *WidestFixupType;

  /// Formulae - A list of ways to build a value that can satisfy this user.
  /// After the list is populated, one of these is selected heuristically and
  /// used to formulate a replacement for OperandValToReplace in UserInst.
  SmallVector<Formula, 12> Formulae;

  /// Regs - The set of register candidates used by all formulae in this LSRUse.
  SmallPtrSet<const SCEV *, 4> Regs;

  LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T),
                                      MinOffset(INT64_MAX),
                                      MaxOffset(INT64_MIN),
                                      AllFixupsOutsideLoop(true),
                                      WidestFixupType(0) {}

  /// HasFormulaWithSameRegs - Test whether a formula using the same set of
  /// registers as F has already been inserted.
  bool HasFormulaWithSameRegs(const Formula &F) const;
  /// InsertFormula - Add F unless a formula with the same registers exists;
  /// returns true if it was added.
  bool InsertFormula(const Formula &F);
  /// DeleteFormula - Remove F, which must be an element of Formulae.
  void DeleteFormula(Formula &F);
  /// RecomputeRegs - Rebuild Regs from the remaining formulae and drop
  /// registers that are no longer used from the tracker.
  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);

  /// print/dump - Debugging output; dump() prints to the error stream.
  void print(raw_ostream &OS) const;
  void dump() const;
};
1091
1092 }
1093
1094 /// HasFormula - Test whether this use as a formula which has the same
1095 /// registers as the given formula.
1096 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1097   SmallVector<const SCEV *, 2> Key = F.BaseRegs;
1098   if (F.ScaledReg) Key.push_back(F.ScaledReg);
1099   // Unstable sort by host order ok, because this is only used for uniquifying.
1100   std::sort(Key.begin(), Key.end());
1101   return Uniquifier.count(Key);
1102 }
1103
1104 /// InsertFormula - If the given formula has not yet been inserted, add it to
1105 /// the list, and return true. Return false otherwise.
1106 bool LSRUse::InsertFormula(const Formula &F) {
1107   SmallVector<const SCEV *, 2> Key = F.BaseRegs;
1108   if (F.ScaledReg) Key.push_back(F.ScaledReg);
1109   // Unstable sort by host order ok, because this is only used for uniquifying.
1110   std::sort(Key.begin(), Key.end());
1111
1112   if (!Uniquifier.insert(Key).second)
1113     return false;
1114
1115   // Using a register to hold the value of 0 is not profitable.
1116   assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1117          "Zero allocated in a scaled register!");
1118 #ifndef NDEBUG
1119   for (SmallVectorImpl<const SCEV *>::const_iterator I =
1120        F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I)
1121     assert(!(*I)->isZero() && "Zero allocated in a base register!");
1122 #endif
1123
1124   // Add the formula to the list.
1125   Formulae.push_back(F);
1126
1127   // Record registers now being used by this use.
1128   Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1129
1130   return true;
1131 }
1132
1133 /// DeleteFormula - Remove the given formula from this use's list.
1134 void LSRUse::DeleteFormula(Formula &F) {
1135   if (&F != &Formulae.back())
1136     std::swap(F, Formulae.back());
1137   Formulae.pop_back();
1138 }
1139
1140 /// RecomputeRegs - Recompute the Regs field, and update RegUses.
1141 void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1142   // Now that we've filtered out some formulae, recompute the Regs set.
1143   SmallPtrSet<const SCEV *, 4> OldRegs = Regs;
1144   Regs.clear();
1145   for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(),
1146        E = Formulae.end(); I != E; ++I) {
1147     const Formula &F = *I;
1148     if (F.ScaledReg) Regs.insert(F.ScaledReg);
1149     Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1150   }
1151
1152   // Update the RegTracker.
1153   for (SmallPtrSet<const SCEV *, 4>::iterator I = OldRegs.begin(),
1154        E = OldRegs.end(); I != E; ++I)
1155     if (!Regs.count(*I))
1156       RegUses.DropRegister(*I, LUIdx);
1157 }
1158
1159 void LSRUse::print(raw_ostream &OS) const {
1160   OS << "LSR Use: Kind=";
1161   switch (Kind) {
1162   case Basic:    OS << "Basic"; break;
1163   case Special:  OS << "Special"; break;
1164   case ICmpZero: OS << "ICmpZero"; break;
1165   case Address:
1166     OS << "Address of ";
1167     if (AccessTy->isPointerTy())
1168       OS << "pointer"; // the full pointer type could be really verbose
1169     else
1170       OS << *AccessTy;
1171   }
1172
1173   OS << ", Offsets={";
1174   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
1175        E = Offsets.end(); I != E; ++I) {
1176     OS << *I;
1177     if (llvm::next(I) != E)
1178       OS << ',';
1179   }
1180   OS << '}';
1181
1182   if (AllFixupsOutsideLoop)
1183     OS << ", all-fixups-outside-loop";
1184
1185   if (WidestFixupType)
1186     OS << ", widest fixup type: " << *WidestFixupType;
1187 }
1188
1189 void LSRUse::dump() const {
1190   print(errs()); errs() << '\n';
1191 }
1192
1193 /// isLegalUse - Test whether the use described by AM is "legal", meaning it can
1194 /// be completely folded into the user instruction at isel time. This includes
1195 /// address-mode folding and special icmp tricks.
1196 static bool isLegalUse(const TargetLowering::AddrMode &AM,
1197                        LSRUse::KindType Kind, Type *AccessTy,
1198                        const TargetLowering *TLI) {
1199   switch (Kind) {
1200   case LSRUse::Address:
1201     // If we have low-level target information, ask the target if it can
1202     // completely fold this address.
1203     if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy);
1204
1205     // Otherwise, just guess that reg+reg addressing is legal.
1206     return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1;
1207
1208   case LSRUse::ICmpZero:
1209     // There's not even a target hook for querying whether it would be legal to
1210     // fold a GV into an ICmp.
1211     if (AM.BaseGV)
1212       return false;
1213
1214     // ICmp only has two operands; don't allow more than two non-trivial parts.
1215     if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0)
1216       return false;
1217
1218     // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1219     // putting the scaled register in the other operand of the icmp.
1220     if (AM.Scale != 0 && AM.Scale != -1)
1221       return false;
1222
1223     // If we have low-level target information, ask the target if it can fold an
1224     // integer immediate on an icmp.
1225     if (AM.BaseOffs != 0) {
1226       if (TLI) return TLI->isLegalICmpImmediate(-(uint64_t)AM.BaseOffs);
1227       return false;
1228     }
1229
1230     return true;
1231
1232   case LSRUse::Basic:
1233     // Only handle single-register values.
1234     return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0;
1235
1236   case LSRUse::Special:
1237     // Only handle -1 scales, or no scale.
1238     return AM.Scale == 0 || AM.Scale == -1;
1239   }
1240
1241   return false;
1242 }
1243
1244 static bool isLegalUse(TargetLowering::AddrMode AM,
1245                        int64_t MinOffset, int64_t MaxOffset,
1246                        LSRUse::KindType Kind, Type *AccessTy,
1247                        const TargetLowering *TLI) {
1248   // Check for overflow.
1249   if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) !=
1250       (MinOffset > 0))
1251     return false;
1252   AM.BaseOffs = (uint64_t)AM.BaseOffs + MinOffset;
1253   if (isLegalUse(AM, Kind, AccessTy, TLI)) {
1254     AM.BaseOffs = (uint64_t)AM.BaseOffs - MinOffset;
1255     // Check for overflow.
1256     if (((int64_t)((uint64_t)AM.BaseOffs + MaxOffset) > AM.BaseOffs) !=
1257         (MaxOffset > 0))
1258       return false;
1259     AM.BaseOffs = (uint64_t)AM.BaseOffs + MaxOffset;
1260     return isLegalUse(AM, Kind, AccessTy, TLI);
1261   }
1262   return false;
1263 }
1264
1265 static bool isAlwaysFoldable(int64_t BaseOffs,
1266                              GlobalValue *BaseGV,
1267                              bool HasBaseReg,
1268                              LSRUse::KindType Kind, Type *AccessTy,
1269                              const TargetLowering *TLI) {
1270   // Fast-path: zero is always foldable.
1271   if (BaseOffs == 0 && !BaseGV) return true;
1272
1273   // Conservatively, create an address with an immediate and a
1274   // base and a scale.
1275   TargetLowering::AddrMode AM;
1276   AM.BaseOffs = BaseOffs;
1277   AM.BaseGV = BaseGV;
1278   AM.HasBaseReg = HasBaseReg;
1279   AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1280
1281   // Canonicalize a scale of 1 to a base register if the formula doesn't
1282   // already have a base register.
1283   if (!AM.HasBaseReg && AM.Scale == 1) {
1284     AM.Scale = 0;
1285     AM.HasBaseReg = true;
1286   }
1287
1288   return isLegalUse(AM, Kind, AccessTy, TLI);
1289 }
1290
1291 static bool isAlwaysFoldable(const SCEV *S,
1292                              int64_t MinOffset, int64_t MaxOffset,
1293                              bool HasBaseReg,
1294                              LSRUse::KindType Kind, Type *AccessTy,
1295                              const TargetLowering *TLI,
1296                              ScalarEvolution &SE) {
1297   // Fast-path: zero is always foldable.
1298   if (S->isZero()) return true;
1299
1300   // Conservatively, create an address with an immediate and a
1301   // base and a scale.
1302   int64_t BaseOffs = ExtractImmediate(S, SE);
1303   GlobalValue *BaseGV = ExtractSymbol(S, SE);
1304
1305   // If there's anything else involved, it's not foldable.
1306   if (!S->isZero()) return false;
1307
1308   // Fast-path: zero is always foldable.
1309   if (BaseOffs == 0 && !BaseGV) return true;
1310
1311   // Conservatively, create an address with an immediate and a
1312   // base and a scale.
1313   TargetLowering::AddrMode AM;
1314   AM.BaseOffs = BaseOffs;
1315   AM.BaseGV = BaseGV;
1316   AM.HasBaseReg = HasBaseReg;
1317   AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1318
1319   return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI);
1320 }
1321
1322 namespace {
1323
1324 /// UseMapDenseMapInfo - A DenseMapInfo implementation for holding
1325 /// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind.
1326 struct UseMapDenseMapInfo {
1327   static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() {
1328     return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic);
1329   }
1330
1331   static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() {
1332     return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic);
1333   }
1334
1335   static unsigned
1336   getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) {
1337     unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first);
1338     Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second));
1339     return Result;
1340   }
1341
1342   static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS,
1343                       const std::pair<const SCEV *, LSRUse::KindType> &RHS) {
1344     return LHS == RHS;
1345   }
1346 };
1347
/// LSRInstance - This class holds state for the main loop strength reduction
/// logic.
class LSRInstance {
  // Analyses and context this instance works with. TLI may be null; code that
  // uses it checks for that first.
  IVUsers &IU;
  ScalarEvolution &SE;
  DominatorTree &DT;
  LoopInfo &LI;
  const TargetLowering *const TLI;
  Loop *const L;

  /// Changed - Set to true once the IR has been modified; reported through
  /// getChanged().
  bool Changed;

  /// IVIncInsertPos - This is the insert position that the current loop's
  /// induction variable increment should be placed. In simple loops, this is
  /// the latch block's terminator. But in more complicated cases, this is a
  /// position which will dominate all the in-loop post-increment users.
  Instruction *IVIncInsertPos;

  /// Factors - Interesting factors between use strides.
  SmallSetVector<int64_t, 8> Factors;

  /// Types - Interesting use types, to facilitate truncation reuse.
  SmallSetVector<Type *, 4> Types;

  /// Fixups - The list of operands which are to be replaced.
  SmallVector<LSRFixup, 16> Fixups;

  /// Uses - The list of interesting uses.
  SmallVector<LSRUse, 16> Uses;

  /// RegUses - Track which uses use which register candidates.
  RegUseTracker RegUses;

  // Optimizations applied to the loop's induction variables and terminating
  // condition.
  void OptimizeShadowIV();
  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
  void OptimizeLoopTermCond();

  // Collection of the uses, fixups and initial formulae to work on.
  void CollectInterestingTypesAndFactors();
  void CollectFixupsAndInitialFormulae();

  /// getNewFixup - Append a default-constructed fixup to Fixups and return a
  /// reference to it. The reference is into the Fixups vector, so it is only
  /// valid until the vector next grows.
  LSRFixup &getNewFixup() {
    Fixups.push_back(LSRFixup());
    return Fixups.back();
  }

  // Support for sharing of LSRUses between LSRFixups.
  typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>,
                   size_t,
                   UseMapDenseMapInfo> UseMapTy;
  UseMapTy UseMap;

  bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
                          LSRUse::KindType Kind, Type *AccessTy);

  std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
                                    LSRUse::KindType Kind,
                                    Type *AccessTy);

  void DeleteUse(LSRUse &LU, size_t LUIdx);

  LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);

  // Formula insertion and register bookkeeping.
  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void CountRegisters(const Formula &F, size_t LUIdx);
  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);

  void CollectLoopInvariantFixupsAndFormulae();

  // Generators that derive additional candidate formulae from a base formula.
  void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
                              unsigned Depth = 0);
  void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateCrossUseConstantOffsets();
  void GenerateAllReuseFormulae();

  void FilterOutUndesirableDedicatedRegisters();

  // Search-space estimation and pruning.
  size_t EstimateSearchSpaceComplexity() const;
  void NarrowSearchSpaceByDetectingSupersets();
  void NarrowSearchSpaceByCollapsingUnrolledCode();
  void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
  void NarrowSearchSpaceByPickingWinnerRegs();
  void NarrowSearchSpaceUsingHeuristics();

  // Selection of one formula per use.
  void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                    Cost &SolutionCost,
                    SmallVectorImpl<const Formula *> &Workspace,
                    const Cost &CurCost,
                    const SmallPtrSet<const SCEV *, 16> &CurRegs,
                    DenseSet<const SCEV *> &VisitedRegs) const;
  void Solve(SmallVectorImpl<const Formula *> &Solution) const;

  // Expansion of the chosen formulae back into IR.
  BasicBlock::iterator
    HoistInsertPosition(BasicBlock::iterator IP,
                        const SmallVectorImpl<Instruction *> &Inputs) const;
  BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
                                                     const LSRFixup &LF,
                                                     const LSRUse &LU) const;

  Value *Expand(const LSRFixup &LF,
                const Formula &F,
                BasicBlock::iterator IP,
                SCEVExpander &Rewriter,
                SmallVectorImpl<WeakVH> &DeadInsts) const;
  void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
                     const Formula &F,
                     SCEVExpander &Rewriter,
                     SmallVectorImpl<WeakVH> &DeadInsts,
                     Pass *P) const;
  void Rewrite(const LSRFixup &LF,
               const Formula &F,
               SCEVExpander &Rewriter,
               SmallVectorImpl<WeakVH> &DeadInsts,
               Pass *P) const;
  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
                         Pass *P);

public:
  LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);

  /// getChanged - Return whether this instance modified the IR.
  bool getChanged() const { return Changed; }

  // Debugging output.
  void print_factors_and_types(raw_ostream &OS) const;
  void print_fixups(raw_ostream &OS) const;
  void print_uses(raw_ostream &OS) const;
  void print(raw_ostream &OS) const;
  void dump() const;
};
1481
1482 }
1483
/// OptimizeShadowIV - If IV is used in a int-to-float cast
/// inside the loop then try to eliminate the cast operation.
///
/// At most one cast is rewritten per call: the scan stops after the first
/// successful transformation. Nothing is done when the loop's backedge-taken
/// count cannot be computed.
void LSRInstance::OptimizeShadowIV() {
  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
    return;

  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
       UI != E; /* empty */) {
    // Advance the iterator before examining the candidate; the candidate's
    // user instruction may be erased below.
    IVUsers::const_iterator CandidateUI = UI;
    ++UI;
    Instruction *ShadowUse = CandidateUI->getUser();
    Type *DestTy = NULL;
    bool IsSigned = false;

    /* If shadow use is a int->float cast then insert a second IV
       to eliminate this cast.

         for (unsigned i = 0; i < n; ++i)
           foo((double)i);

       is transformed into

         double d = 0.0;
         for (unsigned i = 0; i < n; ++i, ++d)
           foo(d);
    */
    if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
      IsSigned = false;
      DestTy = UCast->getDestTy();
    }
    else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
      IsSigned = true;
      DestTy = SCast->getDestTy();
    }
    // Not an int->float cast; nothing to do for this user.
    if (!DestTy) continue;

    if (TLI) {
      // If target does not support DestTy natively then do not apply
      // this transformation.
      EVT DVT = TLI->getValueType(DestTy);
      if (!TLI->isTypeLegal(DVT)) continue;
    }

    // The cast must be applied directly to a two-input PHI.
    PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
    if (!PH) continue;
    if (PH->getNumIncomingValues() != 2) continue;

    // Skip destination types without a known mantissa width, and source
    // integer types wider than that mantissa.
    Type *SrcTy = PH->getType();
    int Mantissa = DestTy->getFPMantissaWidth();
    if (Mantissa == -1) continue;
    if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
      continue;

    // Work out which PHI input comes from the preheader (Entry) and treat the
    // other one as the latch input.
    unsigned Entry, Latch;
    if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
      Entry = 0;
      Latch = 1;
    } else {
      Entry = 1;
      Latch = 0;
    }

    // The initial value must be a constant integer; convert it to FP using
    // the signedness of the cast.
    ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
    if (!Init) continue;
    Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
                                        (double)Init->getSExtValue() :
                                        (double)Init->getZExtValue());

    // The latch value must be an add or sub.
    BinaryOperator *Incr =
      dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
    if (!Incr) continue;
    if (Incr->getOpcode() != Instruction::Add
        && Incr->getOpcode() != Instruction::Sub)
      continue;

    /* Initialize new IV, double d = 0.0 in above example. */
    // One operand of the increment must be the PHI itself; the other must be
    // a constant integer step.
    ConstantInt *C = NULL;
    if (Incr->getOperand(0) == PH)
      C = dyn_cast<ConstantInt>(Incr->getOperand(1));
    else if (Incr->getOperand(1) == PH)
      C = dyn_cast<ConstantInt>(Incr->getOperand(0));
    else
      continue;

    if (!C) continue;

    // Ignore negative constants, as the code below doesn't handle them
    // correctly. TODO: Remove this restriction.
    if (!C->getValue().isStrictlyPositive()) continue;

    /* Add new PHINode. */
    PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);

    /* create new increment. '++d' in above example. */
    Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
    BinaryOperator *NewIncr =
      BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
                               Instruction::FAdd : Instruction::FSub,
                             NewPH, CFP, "IV.S.next.", Incr);

    // Wire the new PHI to the same predecessor blocks as the integer PHI.
    NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
    NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));

    /* Remove cast operation */
    ShadowUse->replaceAllUsesWith(NewPH);
    ShadowUse->eraseFromParent();
    Changed = true;
    // Stop after the first successful rewrite.
    break;
  }
}
1595
1596 /// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
1597 /// set the IV user and stride information and return true, otherwise return
1598 /// false.
1599 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
1600   for (IVUsers::iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
1601     if (UI->getUser() == Cond) {
1602       // NOTE: we could handle setcc instructions with multiple uses here, but
1603       // InstCombine does it as well for simple uses, it's not clear that it
1604       // occurs enough in real life to handle.
1605       CondUse = UI;
1606       return true;
1607     }
1608   return false;
1609 }
1610
1611 /// OptimizeMax - Rewrite the loop's terminating condition if it uses
1612 /// a max computation.
1613 ///
1614 /// This is a narrow solution to a specific, but acute, problem. For loops
1615 /// like this:
1616 ///
1617 ///   i = 0;
1618 ///   do {
1619 ///     p[i] = 0.0;
1620 ///   } while (++i < n);
1621 ///
1622 /// the trip count isn't just 'n', because 'n' might not be positive. And
1623 /// unfortunately this can come up even for loops where the user didn't use
1624 /// a C do-while loop. For example, seemingly well-behaved top-test loops
1625 /// will commonly be lowered like this:
1626 //
1627 ///   if (n > 0) {
1628 ///     i = 0;
1629 ///     do {
1630 ///       p[i] = 0.0;
1631 ///     } while (++i < n);
1632 ///   }
1633 ///
1634 /// and then it's possible for subsequent optimization to obscure the if
1635 /// test in such a way that indvars can't find it.
1636 ///
1637 /// When indvars can't find the if test in loops like this, it creates a
1638 /// max expression, which allows it to give the loop a canonical
1639 /// induction variable:
1640 ///
1641 ///   i = 0;
1642 ///   max = n < 1 ? 1 : n;
1643 ///   do {
1644 ///     p[i] = 0.0;
1645 ///   } while (++i != max);
1646 ///
1647 /// Canonical induction variables are necessary because the loop passes
1648 /// are designed around them. The most obvious example of this is the
1649 /// LoopInfo analysis, which doesn't remember trip count values. It
1650 /// expects to be able to rediscover the trip count each time it is
1651 /// needed, and it does this using a simple analysis that only succeeds if
1652 /// the loop has a canonical induction variable.
1653 ///
1654 /// However, when it comes time to generate code, the maximum operation
1655 /// can be quite costly, especially if it's inside of an outer loop.
1656 ///
1657 /// This function solves this problem by detecting this type of loop and
1658 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
1659 /// the instructions for the maximum computation.
1660 ///
1661 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
1662   // Check that the loop matches the pattern we're looking for.
1663   if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
1664       Cond->getPredicate() != CmpInst::ICMP_NE)
1665     return Cond;
1666
1667   SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
1668   if (!Sel || !Sel->hasOneUse()) return Cond;
1669
1670   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1671   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1672     return Cond;
1673   const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
1674
1675   // Add one to the backedge-taken count to get the trip count.
1676   const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
1677   if (IterationCount != SE.getSCEV(Sel)) return Cond;
1678
1679   // Check for a max calculation that matches the pattern. There's no check
1680   // for ICMP_ULE here because the comparison would be with zero, which
1681   // isn't interesting.
1682   CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
1683   const SCEVNAryExpr *Max = 0;
1684   if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
1685     Pred = ICmpInst::ICMP_SLE;
1686     Max = S;
1687   } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
1688     Pred = ICmpInst::ICMP_SLT;
1689     Max = S;
1690   } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
1691     Pred = ICmpInst::ICMP_ULT;
1692     Max = U;
1693   } else {
1694     // No match; bail.
1695     return Cond;
1696   }
1697
1698   // To handle a max with more than two operands, this optimization would
1699   // require additional checking and setup.
1700   if (Max->getNumOperands() != 2)
1701     return Cond;
1702
1703   const SCEV *MaxLHS = Max->getOperand(0);
1704   const SCEV *MaxRHS = Max->getOperand(1);
1705
1706   // ScalarEvolution canonicalizes constants to the left. For < and >, look
1707   // for a comparison with 1. For <= and >=, a comparison with zero.
1708   if (!MaxLHS ||
1709       (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
1710     return Cond;
1711
1712   // Check the relevant induction variable for conformance to
1713   // the pattern.
1714   const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
1715   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
1716   if (!AR || !AR->isAffine() ||
1717       AR->getStart() != One ||
1718       AR->getStepRecurrence(SE) != One)
1719     return Cond;
1720
1721   assert(AR->getLoop() == L &&
1722          "Loop condition operand is an addrec in a different loop!");
1723
1724   // Check the right operand of the select, and remember it, as it will
1725   // be used in the new comparison instruction.
1726   Value *NewRHS = 0;
1727   if (ICmpInst::isTrueWhenEqual(Pred)) {
1728     // Look for n+1, and grab n.
1729     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
1730       if (isa<ConstantInt>(BO->getOperand(1)) &&
1731           cast<ConstantInt>(BO->getOperand(1))->isOne() &&
1732           SE.getSCEV(BO->getOperand(0)) == MaxRHS)
1733         NewRHS = BO->getOperand(0);
1734     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
1735       if (isa<ConstantInt>(BO->getOperand(1)) &&
1736           cast<ConstantInt>(BO->getOperand(1))->isOne() &&
1737           SE.getSCEV(BO->getOperand(0)) == MaxRHS)
1738         NewRHS = BO->getOperand(0);
1739     if (!NewRHS)
1740       return Cond;
1741   } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
1742     NewRHS = Sel->getOperand(1);
1743   else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
1744     NewRHS = Sel->getOperand(2);
1745   else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
1746     NewRHS = SU->getValue();
1747   else
1748     // Max doesn't match expected pattern.
1749     return Cond;
1750
1751   // Determine the new comparison opcode. It may be signed or unsigned,
1752   // and the original comparison may be either equality or inequality.
1753   if (Cond->getPredicate() == CmpInst::ICMP_EQ)
1754     Pred = CmpInst::getInversePredicate(Pred);
1755
1756   // Ok, everything looks ok to change the condition into an SLT or SGE and
1757   // delete the max calculation.
1758   ICmpInst *NewCond =
1759     new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
1760
1761   // Delete the max calculation instructions.
1762   Cond->replaceAllUsesWith(NewCond);
1763   CondUse->setUser(NewCond);
1764   Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
1765   Cond->eraseFromParent();
1766   Sel->eraseFromParent();
1767   if (Cmp->use_empty())
1768     Cmp->eraseFromParent();
1769   return NewCond;
1770 }
1771
1772 /// OptimizeLoopTermCond - Change loop terminating condition to use the
1773 /// postinc iv when possible.
1774 void
1775 LSRInstance::OptimizeLoopTermCond() {
1776   SmallPtrSet<Instruction *, 4> PostIncs;
1777
1778   BasicBlock *LatchBlock = L->getLoopLatch();
1779   SmallVector<BasicBlock*, 8> ExitingBlocks;
1780   L->getExitingBlocks(ExitingBlocks);
1781
1782   for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
1783     BasicBlock *ExitingBlock = ExitingBlocks[i];
1784
1785     // Get the terminating condition for the loop if possible.  If we
1786     // can, we want to change it to use a post-incremented version of its
1787     // induction variable, to allow coalescing the live ranges for the IV into
1788     // one register value.
1789
1790     BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
1791     if (!TermBr)
1792       continue;
1793     // FIXME: Overly conservative, termination condition could be an 'or' etc..
1794     if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
1795       continue;
1796
1797     // Search IVUsesByStride to find Cond's IVUse if there is one.
1798     IVStrideUse *CondUse = 0;
1799     ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
1800     if (!FindIVUserForCond(Cond, CondUse))
1801       continue;
1802
1803     // If the trip count is computed in terms of a max (due to ScalarEvolution
1804     // being unable to find a sufficient guard, for example), change the loop
1805     // comparison to use SLT or ULT instead of NE.
1806     // One consequence of doing this now is that it disrupts the count-down
1807     // optimization. That's not always a bad thing though, because in such
1808     // cases it may still be worthwhile to avoid a max.
1809     Cond = OptimizeMax(Cond, CondUse);
1810
1811     // If this exiting block dominates the latch block, it may also use
1812     // the post-inc value if it won't be shared with other uses.
1813     // Check for dominance.
1814     if (!DT.dominates(ExitingBlock, LatchBlock))
1815       continue;
1816
1817     // Conservatively avoid trying to use the post-inc value in non-latch
1818     // exits if there may be pre-inc users in intervening blocks.
1819     if (LatchBlock != ExitingBlock)
1820       for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
1821         // Test if the use is reachable from the exiting block. This dominator
1822         // query is a conservative approximation of reachability.
1823         if (&*UI != CondUse &&
1824             !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
1825           // Conservatively assume there may be reuse if the quotient of their
1826           // strides could be a legal scale.
1827           const SCEV *A = IU.getStride(*CondUse, L);
1828           const SCEV *B = IU.getStride(*UI, L);
1829           if (!A || !B) continue;
1830           if (SE.getTypeSizeInBits(A->getType()) !=
1831               SE.getTypeSizeInBits(B->getType())) {
1832             if (SE.getTypeSizeInBits(A->getType()) >
1833                 SE.getTypeSizeInBits(B->getType()))
1834               B = SE.getSignExtendExpr(B, A->getType());
1835             else
1836               A = SE.getSignExtendExpr(A, B->getType());
1837           }
1838           if (const SCEVConstant *D =
1839                 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
1840             const ConstantInt *C = D->getValue();
1841             // Stride of one or negative one can have reuse with non-addresses.
1842             if (C->isOne() || C->isAllOnesValue())
1843               goto decline_post_inc;
1844             // Avoid weird situations.
1845             if (C->getValue().getMinSignedBits() >= 64 ||
1846                 C->getValue().isMinSignedValue())
1847               goto decline_post_inc;
1848             // Without TLI, assume that any stride might be valid, and so any
1849             // use might be shared.
1850             if (!TLI)
1851               goto decline_post_inc;
1852             // Check for possible scaled-address reuse.
1853             Type *AccessTy = getAccessType(UI->getUser());
1854             TargetLowering::AddrMode AM;
1855             AM.Scale = C->getSExtValue();
1856             if (TLI->isLegalAddressingMode(AM, AccessTy))
1857               goto decline_post_inc;
1858             AM.Scale = -AM.Scale;
1859             if (TLI->isLegalAddressingMode(AM, AccessTy))
1860               goto decline_post_inc;
1861           }
1862         }
1863
1864     DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
1865                  << *Cond << '\n');
1866
1867     // It's possible for the setcc instruction to be anywhere in the loop, and
1868     // possible for it to have multiple users.  If it is not immediately before
1869     // the exiting block branch, move it.
1870     if (&*++BasicBlock::iterator(Cond) != TermBr) {
1871       if (Cond->hasOneUse()) {
1872         Cond->moveBefore(TermBr);
1873       } else {
1874         // Clone the terminating condition and insert into the loopend.
1875         ICmpInst *OldCond = Cond;
1876         Cond = cast<ICmpInst>(Cond->clone());
1877         Cond->setName(L->getHeader()->getName() + ".termcond");
1878         ExitingBlock->getInstList().insert(TermBr, Cond);
1879
1880         // Clone the IVUse, as the old use still exists!
1881         CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
1882         TermBr->replaceUsesOfWith(OldCond, Cond);
1883       }
1884     }
1885
1886     // If we get to here, we know that we can transform the setcc instruction to
1887     // use the post-incremented version of the IV, allowing us to coalesce the
1888     // live ranges for the IV correctly.
1889     CondUse->transformToPostInc(L);
1890     Changed = true;
1891
1892     PostIncs.insert(Cond);
1893   decline_post_inc:;
1894   }
1895
1896   // Determine an insertion point for the loop induction variable increment. It
1897   // must dominate all the post-inc comparisons we just set up, and it must
1898   // dominate the loop latch edge.
1899   IVIncInsertPos = L->getLoopLatch()->getTerminator();
1900   for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(),
1901        E = PostIncs.end(); I != E; ++I) {
1902     BasicBlock *BB =
1903       DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
1904                                     (*I)->getParent());
1905     if (BB == (*I)->getParent())
1906       IVIncInsertPos = *I;
1907     else if (BB != IVIncInsertPos->getParent())
1908       IVIncInsertPos = BB->getTerminator();
1909   }
1910 }
1911
1912 /// reconcileNewOffset - Determine if the given use can accommodate a fixup
1913 /// at the given offset and other details. If so, update the use and
1914 /// return true.
1915 bool
1916 LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
1917                                 LSRUse::KindType Kind, Type *AccessTy) {
1918   int64_t NewMinOffset = LU.MinOffset;
1919   int64_t NewMaxOffset = LU.MaxOffset;
1920   Type *NewAccessTy = AccessTy;
1921
1922   // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
1923   // something conservative, however this can pessimize in the case that one of
1924   // the uses will have all its uses outside the loop, for example.
1925   if (LU.Kind != Kind)
1926     return false;
1927   // Conservatively assume HasBaseReg is true for now.
1928   if (NewOffset < LU.MinOffset) {
1929     if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg,
1930                           Kind, AccessTy, TLI))
1931       return false;
1932     NewMinOffset = NewOffset;
1933   } else if (NewOffset > LU.MaxOffset) {
1934     if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg,
1935                           Kind, AccessTy, TLI))
1936       return false;
1937     NewMaxOffset = NewOffset;
1938   }
1939   // Check for a mismatched access type, and fall back conservatively as needed.
1940   // TODO: Be less conservative when the type is similar and can use the same
1941   // addressing modes.
1942   if (Kind == LSRUse::Address && AccessTy != LU.AccessTy)
1943     NewAccessTy = Type::getVoidTy(AccessTy->getContext());
1944
1945   // Update the use.
1946   LU.MinOffset = NewMinOffset;
1947   LU.MaxOffset = NewMaxOffset;
1948   LU.AccessTy = NewAccessTy;
1949   if (NewOffset != LU.Offsets.back())
1950     LU.Offsets.push_back(NewOffset);
1951   return true;
1952 }
1953
1954 /// getUse - Return an LSRUse index and an offset value for a fixup which
1955 /// needs the given expression, with the given kind and optional access type.
1956 /// Either reuse an existing use or create a new one, as needed.
1957 std::pair<size_t, int64_t>
1958 LSRInstance::getUse(const SCEV *&Expr,
1959                     LSRUse::KindType Kind, Type *AccessTy) {
1960   const SCEV *Copy = Expr;
1961   int64_t Offset = ExtractImmediate(Expr, SE);
1962
1963   // Basic uses can't accept any offset, for example.
1964   if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) {
1965     Expr = Copy;
1966     Offset = 0;
1967   }
1968
1969   std::pair<UseMapTy::iterator, bool> P =
1970     UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0));
1971   if (!P.second) {
1972     // A use already existed with this base.
1973     size_t LUIdx = P.first->second;
1974     LSRUse &LU = Uses[LUIdx];
1975     if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
1976       // Reuse this use.
1977       return std::make_pair(LUIdx, Offset);
1978   }
1979
1980   // Create a new use.
1981   size_t LUIdx = Uses.size();
1982   P.first->second = LUIdx;
1983   Uses.push_back(LSRUse(Kind, AccessTy));
1984   LSRUse &LU = Uses[LUIdx];
1985
1986   // We don't need to track redundant offsets, but we don't need to go out
1987   // of our way here to avoid them.
1988   if (LU.Offsets.empty() || Offset != LU.Offsets.back())
1989     LU.Offsets.push_back(Offset);
1990
1991   LU.MinOffset = Offset;
1992   LU.MaxOffset = Offset;
1993   return std::make_pair(LUIdx, Offset);
1994 }
1995
1996 /// DeleteUse - Delete the given use from the Uses list.
1997 void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
1998   if (&LU != &Uses.back())
1999     std::swap(LU, Uses.back());
2000   Uses.pop_back();
2001
2002   // Update RegUses.
2003   RegUses.SwapAndDropUse(LUIdx, Uses.size());
2004 }
2005
2006 /// FindUseWithFormula - Look for a use distinct from OrigLU which is has
2007 /// a formula that has the same registers as the given formula.
2008 LSRUse *
2009 LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2010                                        const LSRUse &OrigLU) {
2011   // Search all uses for the formula. This could be more clever.
2012   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2013     LSRUse &LU = Uses[LUIdx];
2014     // Check whether this use is close enough to OrigLU, to see whether it's
2015     // worthwhile looking through its formulae.
2016     // Ignore ICmpZero uses because they may contain formulae generated by
2017     // GenerateICmpZeroScales, in which case adding fixup offsets may
2018     // be invalid.
2019     if (&LU != &OrigLU &&
2020         LU.Kind != LSRUse::ICmpZero &&
2021         LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2022         LU.WidestFixupType == OrigLU.WidestFixupType &&
2023         LU.HasFormulaWithSameRegs(OrigF)) {
2024       // Scan through this use's formulae.
2025       for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
2026            E = LU.Formulae.end(); I != E; ++I) {
2027         const Formula &F = *I;
2028         // Check to see if this formula has the same registers and symbols
2029         // as OrigF.
2030         if (F.BaseRegs == OrigF.BaseRegs &&
2031             F.ScaledReg == OrigF.ScaledReg &&
2032             F.AM.BaseGV == OrigF.AM.BaseGV &&
2033             F.AM.Scale == OrigF.AM.Scale &&
2034             F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2035           if (F.AM.BaseOffs == 0)
2036             return &LU;
2037           // This is the formula where all the registers and symbols matched;
2038           // there aren't going to be any others. Since we declined it, we
2039           // can skip the rest of the formulae and procede to the next LSRUse.
2040           break;
2041         }
2042       }
2043     }
2044   }
2045
2046   // Nothing looked good.
2047   return 0;
2048 }
2049
2050 void LSRInstance::CollectInterestingTypesAndFactors() {
2051   SmallSetVector<const SCEV *, 4> Strides;
2052
2053   // Collect interesting types and strides.
2054   SmallVector<const SCEV *, 4> Worklist;
2055   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
2056     const SCEV *Expr = IU.getExpr(*UI);
2057
2058     // Collect interesting types.
2059     Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2060
2061     // Add strides for mentioned loops.
2062     Worklist.push_back(Expr);
2063     do {
2064       const SCEV *S = Worklist.pop_back_val();
2065       if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2066         if (EnableNested || AR->getLoop() == L)
2067           Strides.insert(AR->getStepRecurrence(SE));
2068         Worklist.push_back(AR->getStart());
2069       } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2070         Worklist.append(Add->op_begin(), Add->op_end());
2071       }
2072     } while (!Worklist.empty());
2073   }
2074
2075   // Compute interesting factors from the set of interesting strides.
2076   for (SmallSetVector<const SCEV *, 4>::const_iterator
2077        I = Strides.begin(), E = Strides.end(); I != E; ++I)
2078     for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2079          llvm::next(I); NewStrideIter != E; ++NewStrideIter) {
2080       const SCEV *OldStride = *I;
2081       const SCEV *NewStride = *NewStrideIter;
2082
2083       if (SE.getTypeSizeInBits(OldStride->getType()) !=
2084           SE.getTypeSizeInBits(NewStride->getType())) {
2085         if (SE.getTypeSizeInBits(OldStride->getType()) >
2086             SE.getTypeSizeInBits(NewStride->getType()))
2087           NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2088         else
2089           OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2090       }
2091       if (const SCEVConstant *Factor =
2092             dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2093                                                         SE, true))) {
2094         if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
2095           Factors.insert(Factor->getValue()->getValue().getSExtValue());
2096       } else if (const SCEVConstant *Factor =
2097                    dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2098                                                                NewStride,
2099                                                                SE, true))) {
2100         if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
2101           Factors.insert(Factor->getValue()->getValue().getSExtValue());
2102       }
2103     }
2104
2105   // If all uses use the same type, don't bother looking for truncation-based
2106   // reuse.
2107   if (Types.size() == 1)
2108     Types.clear();
2109
2110   DEBUG(print_factors_and_types(dbgs()));
2111 }
2112
2113 void LSRInstance::CollectFixupsAndInitialFormulae() {
2114   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
2115     // Record the uses.
2116     LSRFixup &LF = getNewFixup();
2117     LF.UserInst = UI->getUser();
2118     LF.OperandValToReplace = UI->getOperandValToReplace();
2119     LF.PostIncLoops = UI->getPostIncLoops();
2120
2121     LSRUse::KindType Kind = LSRUse::Basic;
2122     Type *AccessTy = 0;
2123     if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
2124       Kind = LSRUse::Address;
2125       AccessTy = getAccessType(LF.UserInst);
2126     }
2127
2128     const SCEV *S = IU.getExpr(*UI);
2129
2130     // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
2131     // (N - i == 0), and this allows (N - i) to be the expression that we work
2132     // with rather than just N or i, so we can consider the register
2133     // requirements for both N and i at the same time. Limiting this code to
2134     // equality icmps is not a problem because all interesting loops use
2135     // equality icmps, thanks to IndVarSimplify.
2136     if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
2137       if (CI->isEquality()) {
2138         // Swap the operands if needed to put the OperandValToReplace on the
2139         // left, for consistency.
2140         Value *NV = CI->getOperand(1);
2141         if (NV == LF.OperandValToReplace) {
2142           CI->setOperand(1, CI->getOperand(0));
2143           CI->setOperand(0, NV);
2144           NV = CI->getOperand(1);
2145           Changed = true;
2146         }
2147
2148         // x == y  -->  x - y == 0
2149         const SCEV *N = SE.getSCEV(NV);
2150         if (SE.isLoopInvariant(N, L)) {
2151           // S is normalized, so normalize N before folding it into S
2152           // to keep the result normalized.
2153           N = TransformForPostIncUse(Normalize, N, CI, 0,
2154                                      LF.PostIncLoops, SE, DT);
2155           Kind = LSRUse::ICmpZero;
2156           S = SE.getMinusSCEV(N, S);
2157         }
2158
2159         // -1 and the negations of all interesting strides (except the negation
2160         // of -1) are now also interesting.
2161         for (size_t i = 0, e = Factors.size(); i != e; ++i)
2162           if (Factors[i] != -1)
2163             Factors.insert(-(uint64_t)Factors[i]);
2164         Factors.insert(-1);
2165       }
2166
2167     // Set up the initial formula for this use.
2168     std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
2169     LF.LUIdx = P.first;
2170     LF.Offset = P.second;
2171     LSRUse &LU = Uses[LF.LUIdx];
2172     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
2173     if (!LU.WidestFixupType ||
2174         SE.getTypeSizeInBits(LU.WidestFixupType) <
2175         SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
2176       LU.WidestFixupType = LF.OperandValToReplace->getType();
2177
2178     // If this is the first use of this LSRUse, give it a formula.
2179     if (LU.Formulae.empty()) {
2180       InsertInitialFormula(S, LU, LF.LUIdx);
2181       CountRegisters(LU.Formulae.back(), LF.LUIdx);
2182     }
2183   }
2184
2185   DEBUG(print_fixups(dbgs()));
2186 }
2187
2188 /// InsertInitialFormula - Insert a formula for the given expression into
2189 /// the given use, separating out loop-variant portions from loop-invariant
2190 /// and loop-computable portions.
2191 void
2192 LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
2193   Formula F;
2194   F.InitialMatch(S, L, SE);
2195   bool Inserted = InsertFormula(LU, LUIdx, F);
2196   assert(Inserted && "Initial formula already exists!"); (void)Inserted;
2197 }
2198
2199 /// InsertSupplementalFormula - Insert a simple single-register formula for
2200 /// the given expression into the given use.
2201 void
2202 LSRInstance::InsertSupplementalFormula(const SCEV *S,
2203                                        LSRUse &LU, size_t LUIdx) {
2204   Formula F;
2205   F.BaseRegs.push_back(S);
2206   F.AM.HasBaseReg = true;
2207   bool Inserted = InsertFormula(LU, LUIdx, F);
2208   assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
2209 }
2210
2211 /// CountRegisters - Note which registers are used by the given formula,
2212 /// updating RegUses.
2213 void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
2214   if (F.ScaledReg)
2215     RegUses.CountRegister(F.ScaledReg, LUIdx);
2216   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
2217        E = F.BaseRegs.end(); I != E; ++I)
2218     RegUses.CountRegister(*I, LUIdx);
2219 }
2220
2221 /// InsertFormula - If the given formula has not yet been inserted, add it to
2222 /// the list, and return true. Return false otherwise.
2223 bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
2224   if (!LU.InsertFormula(F))
2225     return false;
2226
2227   CountRegisters(F, LUIdx);
2228   return true;
2229 }
2230
2231 /// CollectLoopInvariantFixupsAndFormulae - Check for other uses of
2232 /// loop-invariant values which we're tracking. These other uses will pin these
2233 /// values in registers, making them less profitable for elimination.
2234 /// TODO: This currently misses non-constant addrec step registers.
2235 /// TODO: Should this give more weight to users inside the loop?
2236 void
2237 LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
2238   SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
2239   SmallPtrSet<const SCEV *, 8> Inserted;
2240
2241   while (!Worklist.empty()) {
2242     const SCEV *S = Worklist.pop_back_val();
2243
2244     if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
2245       Worklist.append(N->op_begin(), N->op_end());
2246     else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
2247       Worklist.push_back(C->getOperand());
2248     else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
2249       Worklist.push_back(D->getLHS());
2250       Worklist.push_back(D->getRHS());
2251     } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
2252       if (!Inserted.insert(U)) continue;
2253       const Value *V = U->getValue();
2254       if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
2255         // Look for instructions defined outside the loop.
2256         if (L->contains(Inst)) continue;
2257       } else if (isa<UndefValue>(V))
2258         // Undef doesn't have a live range, so it doesn't matter.
2259         continue;
2260       for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
2261            UI != UE; ++UI) {
2262         const Instruction *UserInst = dyn_cast<Instruction>(*UI);
2263         // Ignore non-instructions.
2264         if (!UserInst)
2265           continue;
2266         // Ignore instructions in other functions (as can happen with
2267         // Constants).
2268         if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
2269           continue;
2270         // Ignore instructions not dominated by the loop.
2271         const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
2272           UserInst->getParent() :
2273           cast<PHINode>(UserInst)->getIncomingBlock(
2274             PHINode::getIncomingValueNumForOperand(UI.getOperandNo()));
2275         if (!DT.dominates(L->getHeader(), UseBB))
2276           continue;
2277         // Ignore uses which are part of other SCEV expressions, to avoid
2278         // analyzing them multiple times.
2279         if (SE.isSCEVable(UserInst->getType())) {
2280           const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
2281           // If the user is a no-op, look through to its uses.
2282           if (!isa<SCEVUnknown>(UserS))
2283             continue;
2284           if (UserS == U) {
2285             Worklist.push_back(
2286               SE.getUnknown(const_cast<Instruction *>(UserInst)));
2287             continue;
2288           }
2289         }
2290         // Ignore icmp instructions which are already being analyzed.
2291         if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
2292           unsigned OtherIdx = !UI.getOperandNo();
2293           Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
2294           if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
2295             continue;
2296         }
2297
2298         LSRFixup &LF = getNewFixup();
2299         LF.UserInst = const_cast<Instruction *>(UserInst);
2300         LF.OperandValToReplace = UI.getUse();
2301         std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0);
2302         LF.LUIdx = P.first;
2303         LF.Offset = P.second;
2304         LSRUse &LU = Uses[LF.LUIdx];
2305         LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
2306         if (!LU.WidestFixupType ||
2307             SE.getTypeSizeInBits(LU.WidestFixupType) <
2308             SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
2309           LU.WidestFixupType = LF.OperandValToReplace->getType();
2310         InsertSupplementalFormula(U, LU, LF.LUIdx);
2311         CountRegisters(LU.Formulae.back(), Uses.size() - 1);
2312         break;
2313       }
2314     }
2315   }
2316 }
2317
2318 /// CollectSubexprs - Split S into subexpressions which can be pulled out into
2319 /// separate registers. If C is non-null, multiply each subexpression by C.
2320 static void CollectSubexprs(const SCEV *S, const SCEVConstant *C,
2321                             SmallVectorImpl<const SCEV *> &Ops,
2322                             const Loop *L,
2323                             ScalarEvolution &SE) {
2324   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2325     // Break out add operands.
2326     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
2327          I != E; ++I)
2328       CollectSubexprs(*I, C, Ops, L, SE);
2329     return;
2330   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2331     // Split a non-zero base out of an addrec.
2332     if (!AR->getStart()->isZero()) {
2333       CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
2334                                        AR->getStepRecurrence(SE),
2335                                        AR->getLoop(),
2336                                        //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
2337                                        SCEV::FlagAnyWrap),
2338                       C, Ops, L, SE);
2339       CollectSubexprs(AR->getStart(), C, Ops, L, SE);
2340       return;
2341     }
2342   } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
2343     // Break (C * (a + b + c)) into C*a + C*b + C*c.
2344     if (Mul->getNumOperands() == 2)
2345       if (const SCEVConstant *Op0 =
2346             dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
2347         CollectSubexprs(Mul->getOperand(1),
2348                         C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0,
2349                         Ops, L, SE);
2350         return;
2351       }
2352   }
2353
2354   // Otherwise use the value itself, optionally with a scale applied.
2355   Ops.push_back(C ? SE.getMulExpr(C, S) : S);
2356 }
2357
2358 /// GenerateReassociations - Split out subexpressions from adds and the bases of
2359 /// addrecs.
2360 void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
2361                                          Formula Base,
2362                                          unsigned Depth) {
2363   // Arbitrarily cap recursion to protect compile time.
2364   if (Depth >= 3) return;
2365
2366   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
2367     const SCEV *BaseReg = Base.BaseRegs[i];
2368
2369     SmallVector<const SCEV *, 8> AddOps;
2370     CollectSubexprs(BaseReg, 0, AddOps, L, SE);
2371
2372     if (AddOps.size() == 1) continue;
2373
2374     for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
2375          JE = AddOps.end(); J != JE; ++J) {
2376
2377       // Loop-variant "unknown" values are uninteresting; we won't be able to
2378       // do anything meaningful with them.
2379       if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
2380         continue;
2381
2382       // Don't pull a constant into a register if the constant could be folded
2383       // into an immediate field.
2384       if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset,
2385                            Base.getNumRegs() > 1,
2386                            LU.Kind, LU.AccessTy, TLI, SE))
2387         continue;
2388
2389       // Collect all operands except *J.
2390       SmallVector<const SCEV *, 8> InnerAddOps
2391         (((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
2392       InnerAddOps.append
2393         (llvm::next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end());
2394
2395       // Don't leave just a constant behind in a register if the constant could
2396       // be folded into an immediate field.
2397       if (InnerAddOps.size() == 1 &&
2398           isAlwaysFoldable(InnerAddOps[0], LU.MinOffset, LU.MaxOffset,
2399                            Base.getNumRegs() > 1,
2400                            LU.Kind, LU.AccessTy, TLI, SE))
2401         continue;
2402
2403       const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
2404       if (InnerSum->isZero())
2405         continue;
2406       Formula F = Base;
2407
2408       // Add the remaining pieces of the add back into the new formula.
2409       const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
2410       if (TLI && InnerSumSC &&
2411           SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
2412           TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
2413                                    InnerSumSC->getValue()->getZExtValue())) {
2414         F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
2415                            InnerSumSC->getValue()->getZExtValue();
2416         F.BaseRegs.erase(F.BaseRegs.begin() + i);
2417       } else
2418         F.BaseRegs[i] = InnerSum;
2419
2420       // Add J as its own register, or an unfolded immediate.
2421       const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
2422       if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
2423           TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
2424                                    SC->getValue()->getZExtValue()))
2425         F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
2426                            SC->getValue()->getZExtValue();
2427       else
2428         F.BaseRegs.push_back(*J);
2429
2430       if (InsertFormula(LU, LUIdx, F))
2431         // If that formula hadn't been seen before, recurse to find more like
2432         // it.
2433         GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1);
2434     }
2435   }
2436 }
2437
2438 /// GenerateCombinations - Generate a formula consisting of all of the
2439 /// loop-dominating registers added into a single register.
2440 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
2441                                        Formula Base) {
2442   // This method is only interesting on a plurality of registers.
2443   if (Base.BaseRegs.size() <= 1) return;
2444
2445   Formula F = Base;
2446   F.BaseRegs.clear();
2447   SmallVector<const SCEV *, 4> Ops;
2448   for (SmallVectorImpl<const SCEV *>::const_iterator
2449        I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) {
2450     const SCEV *BaseReg = *I;
2451     if (SE.properlyDominates(BaseReg, L->getHeader()) &&
2452         !SE.hasComputableLoopEvolution(BaseReg, L))
2453       Ops.push_back(BaseReg);
2454     else
2455       F.BaseRegs.push_back(BaseReg);
2456   }
2457   if (Ops.size() > 1) {
2458     const SCEV *Sum = SE.getAddExpr(Ops);
2459     // TODO: If Sum is zero, it probably means ScalarEvolution missed an
2460     // opportunity to fold something. For now, just ignore such cases
2461     // rather than proceed with zero in a register.
2462     if (!Sum->isZero()) {
2463       F.BaseRegs.push_back(Sum);
2464       (void)InsertFormula(LU, LUIdx, F);
2465     }
2466   }
2467 }
2468
2469 /// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets.
2470 void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
2471                                           Formula Base) {
2472   // We can't add a symbolic offset if the address already contains one.
2473   if (Base.AM.BaseGV) return;
2474
2475   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
2476     const SCEV *G = Base.BaseRegs[i];
2477     GlobalValue *GV = ExtractSymbol(G, SE);
2478     if (G->isZero() || !GV)
2479       continue;
2480     Formula F = Base;
2481     F.AM.BaseGV = GV;
2482     if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
2483                     LU.Kind, LU.AccessTy, TLI))
2484       continue;
2485     F.BaseRegs[i] = G;
2486     (void)InsertFormula(LU, LUIdx, F);
2487   }
2488 }
2489
2490 /// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
2491 void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
2492                                           Formula Base) {
2493   // TODO: For now, just add the min and max offset, because it usually isn't
2494   // worthwhile looking at everything inbetween.
2495   SmallVector<int64_t, 2> Worklist;
2496   Worklist.push_back(LU.MinOffset);
2497   if (LU.MaxOffset != LU.MinOffset)
2498     Worklist.push_back(LU.MaxOffset);
2499
2500   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
2501     const SCEV *G = Base.BaseRegs[i];
2502
2503     for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(),
2504          E = Worklist.end(); I != E; ++I) {
2505       Formula F = Base;
2506       F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I;
2507       if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I,
2508                      LU.Kind, LU.AccessTy, TLI)) {
2509         // Add the offset to the base register.
2510         const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G);
2511         // If it cancelled out, drop the base register, otherwise update it.
2512         if (NewG->isZero()) {
2513           std::swap(F.BaseRegs[i], F.BaseRegs.back());
2514           F.BaseRegs.pop_back();
2515         } else
2516           F.BaseRegs[i] = NewG;
2517
2518         (void)InsertFormula(LU, LUIdx, F);
2519       }
2520     }
2521
2522     int64_t Imm = ExtractImmediate(G, SE);
2523     if (G->isZero() || Imm == 0)
2524       continue;
2525     Formula F = Base;
2526     F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm;
2527     if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
2528                     LU.Kind, LU.AccessTy, TLI))
2529       continue;
2530     F.BaseRegs[i] = G;
2531     (void)InsertFormula(LU, LUIdx, F);
2532   }
2533 }
2534
2535 /// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up
2536 /// the comparison. For example, x == y -> x*c == y*c.
2537 void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
2538                                          Formula Base) {
2539   if (LU.Kind != LSRUse::ICmpZero) return;
2540
2541   // Determine the integer type for the base formula.
2542   Type *IntTy = Base.getType();
2543   if (!IntTy) return;
2544   if (SE.getTypeSizeInBits(IntTy) > 64) return;
2545
2546   // Don't do this if there is more than one offset.
2547   if (LU.MinOffset != LU.MaxOffset) return;
2548
2549   assert(!Base.AM.BaseGV && "ICmpZero use is not legal!");
2550
2551   // Check each interesting stride.
2552   for (SmallSetVector<int64_t, 8>::const_iterator
2553        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
2554     int64_t Factor = *I;
2555
2556     // Check that the multiplication doesn't overflow.
2557     if (Base.AM.BaseOffs == INT64_MIN && Factor == -1)
2558       continue;
2559     int64_t NewBaseOffs = (uint64_t)Base.AM.BaseOffs * Factor;
2560     if (NewBaseOffs / Factor != Base.AM.BaseOffs)
2561       continue;
2562
2563     // Check that multiplying with the use offset doesn't overflow.
2564     int64_t Offset = LU.MinOffset;
2565     if (Offset == INT64_MIN && Factor == -1)
2566       continue;
2567     Offset = (uint64_t)Offset * Factor;
2568     if (Offset / Factor != LU.MinOffset)
2569       continue;
2570
2571     Formula F = Base;
2572     F.AM.BaseOffs = NewBaseOffs;
2573
2574     // Check that this scale is legal.
2575     if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI))
2576       continue;
2577
2578     // Compensate for the use having MinOffset built into it.
2579     F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset;
2580
2581     const SCEV *FactorS = SE.getConstant(IntTy, Factor);
2582
2583     // Check that multiplying with each base register doesn't overflow.
2584     for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
2585       F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
2586       if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
2587         goto next;
2588     }
2589
2590     // Check that multiplying with the scaled register doesn't overflow.
2591     if (F.ScaledReg) {
2592       F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
2593       if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
2594         continue;
2595     }
2596
2597     // Check that multiplying with the unfolded offset doesn't overflow.
2598     if (F.UnfoldedOffset != 0) {
2599       if (F.UnfoldedOffset == INT64_MIN && Factor == -1)
2600         continue;
2601       F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
2602       if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
2603         continue;
2604     }
2605
2606     // If we make it here and it's legal, add it.
2607     (void)InsertFormula(LU, LUIdx, F);
2608   next:;
2609   }
2610 }
2611
2612 /// GenerateScales - Generate stride factor reuse formulae by making use of
2613 /// scaled-offset address modes, for example.
2614 void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
2615   // Determine the integer type for the base formula.
2616   Type *IntTy = Base.getType();
2617   if (!IntTy) return;
2618
2619   // If this Formula already has a scaled register, we can't add another one.
2620   if (Base.AM.Scale != 0) return;
2621
2622   // Check each interesting stride.
2623   for (SmallSetVector<int64_t, 8>::const_iterator
2624        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
2625     int64_t Factor = *I;
2626
2627     Base.AM.Scale = Factor;
2628     Base.AM.HasBaseReg = Base.BaseRegs.size() > 1;
2629     // Check whether this scale is going to be legal.
2630     if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
2631                     LU.Kind, LU.AccessTy, TLI)) {
2632       // As a special-case, handle special out-of-loop Basic users specially.
2633       // TODO: Reconsider this special case.
2634       if (LU.Kind == LSRUse::Basic &&
2635           isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
2636                      LSRUse::Special, LU.AccessTy, TLI) &&
2637           LU.AllFixupsOutsideLoop)
2638         LU.Kind = LSRUse::Special;
2639       else
2640         continue;
2641     }
2642     // For an ICmpZero, negating a solitary base register won't lead to
2643     // new solutions.
2644     if (LU.Kind == LSRUse::ICmpZero &&
2645         !Base.AM.HasBaseReg && Base.AM.BaseOffs == 0 && !Base.AM.BaseGV)
2646       continue;
2647     // For each addrec base reg, apply the scale, if possible.
2648     for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
2649       if (const SCEVAddRecExpr *AR =
2650             dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
2651         const SCEV *FactorS = SE.getConstant(IntTy, Factor);
2652         if (FactorS->isZero())
2653           continue;
2654         // Divide out the factor, ignoring high bits, since we'll be
2655         // scaling the value back up in the end.
2656         if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
2657           // TODO: This could be optimized to avoid all the copying.
2658           Formula F = Base;
2659           F.ScaledReg = Quotient;
2660           F.DeleteBaseReg(F.BaseRegs[i]);
2661           (void)InsertFormula(LU, LUIdx, F);
2662         }
2663       }
2664   }
2665 }
2666
2667 /// GenerateTruncates - Generate reuse formulae from different IV types.
2668 void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
2669   // This requires TargetLowering to tell us which truncates are free.
2670   if (!TLI) return;
2671
2672   // Don't bother truncating symbolic values.
2673   if (Base.AM.BaseGV) return;
2674
2675   // Determine the integer type for the base formula.
2676   Type *DstTy = Base.getType();
2677   if (!DstTy) return;
2678   DstTy = SE.getEffectiveSCEVType(DstTy);
2679
2680   for (SmallSetVector<Type *, 4>::const_iterator
2681        I = Types.begin(), E = Types.end(); I != E; ++I) {
2682     Type *SrcTy = *I;
2683     if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) {
2684       Formula F = Base;
2685
2686       if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
2687       for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
2688            JE = F.BaseRegs.end(); J != JE; ++J)
2689         *J = SE.getAnyExtendExpr(*J, SrcTy);
2690
2691       // TODO: This assumes we've done basic processing on all uses and
2692       // have an idea what the register usage is.
2693       if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
2694         continue;
2695
2696       (void)InsertFormula(LU, LUIdx, F);
2697     }
2698   }
2699 }
2700
2701 namespace {
2702
2703 /// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to
2704 /// defer modifications so that the search phase doesn't have to worry about
2705 /// the data structures moving underneath it.
2706 struct WorkItem {
2707   size_t LUIdx;
2708   int64_t Imm;
2709   const SCEV *OrigReg;
2710
2711   WorkItem(size_t LI, int64_t I, const SCEV *R)
2712     : LUIdx(LI), Imm(I), OrigReg(R) {}
2713
2714   void print(raw_ostream &OS) const;
2715   void dump() const;
2716 };
2717
2718 }
2719
2720 void WorkItem::print(raw_ostream &OS) const {
2721   OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
2722      << " , add offset " << Imm;
2723 }
2724
2725 void WorkItem::dump() const {
2726   print(errs()); errs() << '\n';
2727 }
2728
2729 /// GenerateCrossUseConstantOffsets - Look for registers which are a constant
2730 /// distance apart and try to form reuse opportunities between them.
2731 void LSRInstance::GenerateCrossUseConstantOffsets() {
2732   // Group the registers by their value without any added constant offset.
2733   typedef std::map<int64_t, const SCEV *> ImmMapTy;
2734   typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
2735   RegMapTy Map;
2736   DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
2737   SmallVector<const SCEV *, 8> Sequence;
2738   for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
2739        I != E; ++I) {
2740     const SCEV *Reg = *I;
2741     int64_t Imm = ExtractImmediate(Reg, SE);
2742     std::pair<RegMapTy::iterator, bool> Pair =
2743       Map.insert(std::make_pair(Reg, ImmMapTy()));
2744     if (Pair.second)
2745       Sequence.push_back(Reg);
2746     Pair.first->second.insert(std::make_pair(Imm, *I));
2747     UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I);
2748   }
2749
2750   // Now examine each set of registers with the same base value. Build up
2751   // a list of work to do and do the work in a separate step so that we're
2752   // not adding formulae and register counts while we're searching.
2753   SmallVector<WorkItem, 32> WorkItems;
2754   SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
2755   for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
2756        E = Sequence.end(); I != E; ++I) {
2757     const SCEV *Reg = *I;
2758     const ImmMapTy &Imms = Map.find(Reg)->second;
2759
2760     // It's not worthwhile looking for reuse if there's only one offset.
2761     if (Imms.size() == 1)
2762       continue;
2763
2764     DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
2765           for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
2766                J != JE; ++J)
2767             dbgs() << ' ' << J->first;
2768           dbgs() << '\n');
2769
2770     // Examine each offset.
2771     for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
2772          J != JE; ++J) {
2773       const SCEV *OrigReg = J->second;
2774
2775       int64_t JImm = J->first;
2776       const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
2777
2778       if (!isa<SCEVConstant>(OrigReg) &&
2779           UsedByIndicesMap[Reg].count() == 1) {
2780         DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
2781         continue;
2782       }
2783
2784       // Conservatively examine offsets between this orig reg a few selected
2785       // other orig regs.
2786       ImmMapTy::const_iterator OtherImms[] = {
2787         Imms.begin(), prior(Imms.end()),
2788         Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
2789       };
2790       for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
2791         ImmMapTy::const_iterator M = OtherImms[i];
2792         if (M == J || M == JE) continue;
2793
2794         // Compute the difference between the two.
2795         int64_t Imm = (uint64_t)JImm - M->first;
2796         for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
2797              LUIdx = UsedByIndices.find_next(LUIdx))
2798           // Make a memo of this use, offset, and register tuple.
2799           if (UniqueItems.insert(std::make_pair(LUIdx, Imm)))
2800             WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
2801       }
2802     }
2803   }
2804
2805   Map.clear();
2806   Sequence.clear();
2807   UsedByIndicesMap.clear();
2808   UniqueItems.clear();
2809
2810   // Now iterate through the worklist and add new formulae.
2811   for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(),
2812        E = WorkItems.end(); I != E; ++I) {
2813     const WorkItem &WI = *I;
2814     size_t LUIdx = WI.LUIdx;
2815     LSRUse &LU = Uses[LUIdx];
2816     int64_t Imm = WI.Imm;
2817     const SCEV *OrigReg = WI.OrigReg;
2818
2819     Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
2820     const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
2821     unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
2822
2823     // TODO: Use a more targeted data structure.
2824     for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
2825       const Formula &F = LU.Formulae[L];
2826       // Use the immediate in the scaled register.
2827       if (F.ScaledReg == OrigReg) {
2828         int64_t Offs = (uint64_t)F.AM.BaseOffs +
2829                        Imm * (uint64_t)F.AM.Scale;
2830         // Don't create 50 + reg(-50).
2831         if (F.referencesReg(SE.getSCEV(
2832                    ConstantInt::get(IntTy, -(uint64_t)Offs))))
2833           continue;
2834         Formula NewF = F;
2835         NewF.AM.BaseOffs = Offs;
2836         if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
2837                         LU.Kind, LU.AccessTy, TLI))
2838           continue;
2839         NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
2840
2841         // If the new scale is a constant in a register, and adding the constant
2842         // value to the immediate would produce a value closer to zero than the
2843         // immediate itself, then the formula isn't worthwhile.
2844         if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
2845           if (C->getValue()->isNegative() !=
2846                 (NewF.AM.BaseOffs < 0) &&
2847               (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale))
2848                 .ule(abs64(NewF.AM.BaseOffs)))
2849             continue;
2850
2851         // OK, looks good.
2852         (void)InsertFormula(LU, LUIdx, NewF);
2853       } else {
2854         // Use the immediate in a base register.
2855         for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
2856           const SCEV *BaseReg = F.BaseRegs[N];
2857           if (BaseReg != OrigReg)
2858             continue;
2859           Formula NewF = F;
2860           NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
2861           if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
2862                           LU.Kind, LU.AccessTy, TLI)) {
2863             if (!TLI ||
2864                 !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
2865               continue;
2866             NewF = F;
2867             NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
2868           }
2869           NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
2870
2871           // If the new formula has a constant in a register, and adding the
2872           // constant value to the immediate would produce a value closer to
2873           // zero than the immediate itself, then the formula isn't worthwhile.
2874           for (SmallVectorImpl<const SCEV *>::const_iterator
2875                J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end();
2876                J != JE; ++J)
2877             if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J))
2878               if ((C->getValue()->getValue() + NewF.AM.BaseOffs).abs().slt(
2879                    abs64(NewF.AM.BaseOffs)) &&
2880                   (C->getValue()->getValue() +
2881                    NewF.AM.BaseOffs).countTrailingZeros() >=
2882                    CountTrailingZeros_64(NewF.AM.BaseOffs))
2883                 goto skip_formula;
2884
2885           // Ok, looks good.
2886           (void)InsertFormula(LU, LUIdx, NewF);
2887           break;
2888         skip_formula:;
2889         }
2890       }
2891     }
2892   }
2893 }
2894
2895 /// GenerateAllReuseFormulae - Generate formulae for each use.
2896 void
2897 LSRInstance::GenerateAllReuseFormulae() {
2898   // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
2899   // queries are more precise.
2900   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2901     LSRUse &LU = Uses[LUIdx];
2902     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2903       GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
2904     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2905       GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
2906   }
2907   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2908     LSRUse &LU = Uses[LUIdx];
2909     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2910       GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
2911     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2912       GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
2913     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2914       GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
2915     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2916       GenerateScales(LU, LUIdx, LU.Formulae[i]);
2917   }
2918   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2919     LSRUse &LU = Uses[LUIdx];
2920     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2921       GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
2922   }
2923
2924   GenerateCrossUseConstantOffsets();
2925
2926   DEBUG(dbgs() << "\n"
2927                   "After generating reuse formulae:\n";
2928         print_uses(dbgs()));
2929 }
2930
2931 /// If there are multiple formulae with the same set of registers used
2932 /// by other uses, pick the best one and delete the others.
2933 void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
2934   DenseSet<const SCEV *> VisitedRegs;
2935   SmallPtrSet<const SCEV *, 16> Regs;
2936   SmallPtrSet<const SCEV *, 16> LoserRegs;
2937 #ifndef NDEBUG
2938   bool ChangedFormulae = false;
2939 #endif
2940
2941   // Collect the best formula for each unique set of shared registers. This
2942   // is reset for each use.
2943   typedef DenseMap<SmallVector<const SCEV *, 2>, size_t, UniquifierDenseMapInfo>
2944     BestFormulaeTy;
2945   BestFormulaeTy BestFormulae;
2946
2947   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2948     LSRUse &LU = Uses[LUIdx];
2949     DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
2950
2951     bool Any = false;
2952     for (size_t FIdx = 0, NumForms = LU.Formulae.size();
2953          FIdx != NumForms; ++FIdx) {
2954       Formula &F = LU.Formulae[FIdx];
2955
2956       // Some formulas are instant losers. For example, they may depend on
2957       // nonexistent AddRecs from other loops. These need to be filtered
2958       // immediately, otherwise heuristics could choose them over others leading
2959       // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
2960       // avoids the need to recompute this information across formulae using the
2961       // same bad AddRec. Passing LoserRegs is also essential unless we remove
2962       // the corresponding bad register from the Regs set.
2963       Cost CostF;
2964       Regs.clear();
2965       CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT,
2966                         &LoserRegs);
2967       if (CostF.isLoser()) {
2968         // During initial formula generation, undesirable formulae are generated
2969         // by uses within other loops that have some non-trivial address mode or
2970         // use the postinc form of the IV. LSR needs to provide these formulae
2971         // as the basis of rediscovering the desired formula that uses an AddRec
2972         // corresponding to the existing phi. Once all formulae have been
2973         // generated, these initial losers may be pruned.
2974         DEBUG(dbgs() << "  Filtering loser "; F.print(dbgs());
2975               dbgs() << "\n");
2976       }
2977       else {
2978         SmallVector<const SCEV *, 2> Key;
2979         for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(),
2980                JE = F.BaseRegs.end(); J != JE; ++J) {
2981           const SCEV *Reg = *J;
2982           if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
2983             Key.push_back(Reg);
2984         }
2985         if (F.ScaledReg &&
2986             RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
2987           Key.push_back(F.ScaledReg);
2988         // Unstable sort by host order ok, because this is only used for
2989         // uniquifying.
2990         std::sort(Key.begin(), Key.end());
2991
2992         std::pair<BestFormulaeTy::const_iterator, bool> P =
2993           BestFormulae.insert(std::make_pair(Key, FIdx));
2994         if (P.second)
2995           continue;
2996
2997         Formula &Best = LU.Formulae[P.first->second];
2998
2999         Cost CostBest;
3000         Regs.clear();
3001         CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT);
3002         if (CostF < CostBest)
3003           std::swap(F, Best);
3004         DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
3005               dbgs() << "\n"
3006                         "    in favor of formula "; Best.print(dbgs());
3007               dbgs() << '\n');
3008       }
3009 #ifndef NDEBUG
3010       ChangedFormulae = true;
3011 #endif
3012       LU.DeleteFormula(F);
3013       --FIdx;
3014       --NumForms;
3015       Any = true;
3016     }
3017
3018     // Now that we've filtered out some formulae, recompute the Regs set.
3019     if (Any)
3020       LU.RecomputeRegs(LUIdx, RegUses);
3021
3022     // Reset this to prepare for the next use.
3023     BestFormulae.clear();
3024   }
3025
3026   DEBUG(if (ChangedFormulae) {
3027           dbgs() << "\n"
3028                     "After filtering out undesirable candidates:\n";
3029           print_uses(dbgs());
3030         });
3031 }
3032
3033 // This is a rough guess that seems to work fairly well.
3034 static const size_t ComplexityLimit = UINT16_MAX;
3035
3036 /// EstimateSearchSpaceComplexity - Estimate the worst-case number of
3037 /// solutions the solver might have to consider. It almost never considers
3038 /// this many solutions because it prune the search space, but the pruning
3039 /// isn't always sufficient.
3040 size_t LSRInstance::EstimateSearchSpaceComplexity() const {
3041   size_t Power = 1;
3042   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
3043        E = Uses.end(); I != E; ++I) {
3044     size_t FSize = I->Formulae.size();
3045     if (FSize >= ComplexityLimit) {
3046       Power = ComplexityLimit;
3047       break;
3048     }
3049     Power *= FSize;
3050     if (Power >= ComplexityLimit)
3051       break;
3052   }
3053   return Power;
3054 }
3055
3056 /// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset
3057 /// of the registers of another formula, it won't help reduce register
3058 /// pressure (though it may not necessarily hurt register pressure); remove
3059 /// it to simplify the system.
3060 void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
3061   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
3062     DEBUG(dbgs() << "The search space is too complex.\n");
3063
3064     DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
3065                     "which use a superset of registers used by other "
3066                     "formulae.\n");
3067
3068     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3069       LSRUse &LU = Uses[LUIdx];
3070       bool Any = false;
3071       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
3072         Formula &F = LU.Formulae[i];
3073         // Look for a formula with a constant or GV in a register. If the use
3074         // also has a formula with that same value in an immediate field,
3075         // delete the one that uses a register.
3076         for (SmallVectorImpl<const SCEV *>::const_iterator
3077              I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
3078           if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
3079             Formula NewF = F;
3080             NewF.AM.BaseOffs += C->getValue()->getSExtValue();
3081             NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
3082                                 (I - F.BaseRegs.begin()));
3083             if (LU.HasFormulaWithSameRegs(NewF)) {
3084               DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
3085               LU.DeleteFormula(F);
3086               --i;
3087               --e;
3088               Any = true;
3089               break;
3090             }
3091           } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
3092             if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
3093               if (!F.AM.BaseGV) {
3094                 Formula NewF = F;
3095                 NewF.AM.BaseGV = GV;
3096                 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
3097                                     (I - F.BaseRegs.begin()));
3098                 if (LU.HasFormulaWithSameRegs(NewF)) {
3099                   DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
3100                         dbgs() << '\n');
3101                   LU.DeleteFormula(F);
3102                   --i;
3103                   --e;
3104                   Any = true;
3105                   break;
3106                 }
3107               }
3108           }
3109         }
3110       }
3111       if (Any)
3112         LU.RecomputeRegs(LUIdx, RegUses);
3113     }
3114
3115     DEBUG(dbgs() << "After pre-selection:\n";
3116           print_uses(dbgs()));
3117   }
3118 }
3119
3120 /// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers
3121 /// for expressions like A, A+1, A+2, etc., allocate a single register for
3122 /// them.
3123 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
3124   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
3125     DEBUG(dbgs() << "The search space is too complex.\n");
3126
3127     DEBUG(dbgs() << "Narrowing the search space by assuming that uses "
3128                     "separated by a constant offset will use the same "
3129                     "registers.\n");
3130
3131     // This is especially useful for unrolled loops.
3132
3133     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3134       LSRUse &LU = Uses[LUIdx];
3135       for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
3136            E = LU.Formulae.end(); I != E; ++I) {
3137         const Formula &F = *I;
3138         if (F.AM.BaseOffs != 0 && F.AM.Scale == 0) {
3139           if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) {
3140             if (reconcileNewOffset(*LUThatHas, F.AM.BaseOffs,
3141                                    /*HasBaseReg=*/false,
3142                                    LU.Kind, LU.AccessTy)) {
3143               DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs());
3144                     dbgs() << '\n');
3145
3146               LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
3147
3148               // Update the relocs to reference the new use.
3149               for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(),
3150                    E = Fixups.end(); I != E; ++I) {
3151                 LSRFixup &Fixup = *I;
3152                 if (Fixup.LUIdx == LUIdx) {
3153                   Fixup.LUIdx = LUThatHas - &Uses.front();
3154                   Fixup.Offset += F.AM.BaseOffs;
3155                   // Add the new offset to LUThatHas' offset list.
3156                   if (LUThatHas->Offsets.back() != Fixup.Offset) {
3157                     LUThatHas->Offsets.push_back(Fixup.Offset);
3158                     if (Fixup.Offset > LUThatHas->MaxOffset)
3159                       LUThatHas->MaxOffset = Fixup.Offset;
3160                     if (Fixup.Offset < LUThatHas->MinOffset)
3161                       LUThatHas->MinOffset = Fixup.Offset;
3162                   }
3163                   DEBUG(dbgs() << "New fixup has offset "
3164                                << Fixup.Offset << '\n');
3165                 }
3166                 if (Fixup.LUIdx == NumUses-1)
3167                   Fixup.LUIdx = LUIdx;
3168               }
3169
3170               // Delete formulae from the new use which are no longer legal.
3171               bool Any = false;
3172               for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
3173                 Formula &F = LUThatHas->Formulae[i];
3174                 if (!isLegalUse(F.AM,
3175                                 LUThatHas->MinOffset, LUThatHas->MaxOffset,
3176                                 LUThatHas->Kind, LUThatHas->AccessTy, TLI)) {
3177                   DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
3178                         dbgs() << '\n');
3179                   LUThatHas->DeleteFormula(F);
3180                   --i;
3181                   --e;
3182                   Any = true;
3183                 }
3184               }
3185               if (Any)
3186                 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
3187
3188               // Delete the old use.
3189               DeleteUse(LU, LUIdx);
3190               --LUIdx;
3191               --NumUses;
3192               break;
3193             }
3194           }
3195         }
3196       }
3197     }
3198
3199     DEBUG(dbgs() << "After pre-selection:\n";
3200           print_uses(dbgs()));
3201   }
3202 }
3203
3204 /// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call
3205 /// FilterOutUndesirableDedicatedRegisters again, if necessary, now that
3206 /// we've done more filtering, as it may be able to find more formulae to
3207 /// eliminate.
3208 void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
3209   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
3210     DEBUG(dbgs() << "The search space is too complex.\n");
3211
3212     DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
3213                     "undesirable dedicated registers.\n");
3214
3215     FilterOutUndesirableDedicatedRegisters();
3216
3217     DEBUG(dbgs() << "After pre-selection:\n";
3218           print_uses(dbgs()));
3219   }
3220 }
3221
3222 /// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely
3223 /// to be profitable, and then in any use which has any reference to that
3224 /// register, delete all formulae which do not reference that register.
3225 void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
3226   // With all other options exhausted, loop until the system is simple
3227   // enough to handle.
3228   SmallPtrSet<const SCEV *, 4> Taken;
3229   while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
3230     // Ok, we have too many of formulae on our hands to conveniently handle.
3231     // Use a rough heuristic to thin out the list.
3232     DEBUG(dbgs() << "The search space is too complex.\n");
3233
3234     // Pick the register which is used by the most LSRUses, which is likely
3235     // to be a good reuse register candidate.
3236     const SCEV *Best = 0;
3237     unsigned BestNum = 0;
3238     for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
3239          I != E; ++I) {
3240       const SCEV *Reg = *I;
3241       if (Taken.count(Reg))
3242         continue;
3243       if (!Best)
3244         Best = Reg;
3245       else {
3246         unsigned Count = RegUses.getUsedByIndices(Reg).count();
3247         if (Count > BestNum) {
3248           Best = Reg;
3249           BestNum = Count;
3250         }
3251       }
3252     }
3253
3254     DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
3255                  << " will yield profitable reuse.\n");
3256     Taken.insert(Best);
3257
3258     // In any use with formulae which references this register, delete formulae
3259     // which don't reference it.
3260     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3261       LSRUse &LU = Uses[LUIdx];
3262       if (!LU.Regs.count(Best)) continue;
3263
3264       bool Any = false;
3265       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
3266         Formula &F = LU.Formulae[i];
3267         if (!F.referencesReg(Best)) {
3268           DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
3269           LU.DeleteFormula(F);
3270           --e;
3271           --i;
3272           Any = true;
3273           assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
3274           continue;
3275         }
3276       }
3277
3278       if (Any)
3279         LU.RecomputeRegs(LUIdx, RegUses);
3280     }
3281
3282     DEBUG(dbgs() << "After pre-selection:\n";
3283           print_uses(dbgs()));
3284   }
3285 }
3286
3287 /// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of
3288 /// formulae to choose from, use some rough heuristics to prune down the number
3289 /// of formulae. This keeps the main solver from taking an extraordinary amount
3290 /// of time in some worst-case scenarios.
3291 void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
3292   NarrowSearchSpaceByDetectingSupersets();
3293   NarrowSearchSpaceByCollapsingUnrolledCode();
3294   NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
3295   NarrowSearchSpaceByPickingWinnerRegs();
3296 }
3297
3298 /// SolveRecurse - This is the recursive solver.
3299 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
3300                                Cost &SolutionCost,
3301                                SmallVectorImpl<const Formula *> &Workspace,
3302                                const Cost &CurCost,
3303                                const SmallPtrSet<const SCEV *, 16> &CurRegs,
3304                                DenseSet<const SCEV *> &VisitedRegs) const {
3305   // Some ideas:
3306   //  - prune more:
3307   //    - use more aggressive filtering
3308   //    - sort the formula so that the most profitable solutions are found first
3309   //    - sort the uses too
3310   //  - search faster:
3311   //    - don't compute a cost, and then compare. compare while computing a cost
3312   //      and bail early.
3313   //    - track register sets with SmallBitVector
3314
3315   const LSRUse &LU = Uses[Workspace.size()];
3316
3317   // If this use references any register that's already a part of the
3318   // in-progress solution, consider it a requirement that a formula must
3319   // reference that register in order to be considered. This prunes out
3320   // unprofitable searching.
3321   SmallSetVector<const SCEV *, 4> ReqRegs;
3322   for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
3323        E = CurRegs.end(); I != E; ++I)
3324     if (LU.Regs.count(*I))
3325       ReqRegs.insert(*I);
3326
3327   bool AnySatisfiedReqRegs = false;
3328   SmallPtrSet<const SCEV *, 16> NewRegs;
3329   Cost NewCost;
3330 retry:
3331   for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
3332        E = LU.Formulae.end(); I != E; ++I) {
3333     const Formula &F = *I;
3334
3335     // Ignore formulae which do not use any of the required registers.
3336     for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(),
3337          JE = ReqRegs.end(); J != JE; ++J) {
3338       const SCEV *Reg = *J;
3339       if ((!F.ScaledReg || F.ScaledReg != Reg) &&
3340           std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) ==
3341           F.BaseRegs.end())
3342         goto skip;
3343     }
3344     AnySatisfiedReqRegs = true;
3345
3346     // Evaluate the cost of the current formula. If it's already worse than
3347     // the current best, prune the search at that point.
3348     NewCost = CurCost;
3349     NewRegs = CurRegs;
3350     NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT);
3351     if (NewCost < SolutionCost) {
3352       Workspace.push_back(&F);
3353       if (Workspace.size() != Uses.size()) {
3354         SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
3355                      NewRegs, VisitedRegs);
3356         if (F.getNumRegs() == 1 && Workspace.size() == 1)
3357           VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
3358       } else {
3359         DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
3360               dbgs() << ". Regs:";
3361               for (SmallPtrSet<const SCEV *, 16>::const_iterator
3362                    I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I)
3363                 dbgs() << ' ' << **I;
3364               dbgs() << '\n');
3365
3366         SolutionCost = NewCost;
3367         Solution = Workspace;
3368       }
3369       Workspace.pop_back();
3370     }
3371   skip:;
3372   }
3373
3374   if (!EnableRetry && !AnySatisfiedReqRegs)
3375     return;
3376
3377   // If none of the formulae had all of the required registers, relax the
3378   // constraint so that we don't exclude all formulae.
3379   if (!AnySatisfiedReqRegs) {
3380     assert(!ReqRegs.empty() && "Solver failed even without required registers");
3381     ReqRegs.clear();
3382     goto retry;
3383   }
3384 }
3385
3386 /// Solve - Choose one formula from each use. Return the results in the given
3387 /// Solution vector.
3388 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
3389   SmallVector<const Formula *, 8> Workspace;
3390   Cost SolutionCost;
3391   SolutionCost.Loose();
3392   Cost CurCost;
3393   SmallPtrSet<const SCEV *, 16> CurRegs;
3394   DenseSet<const SCEV *> VisitedRegs;
3395   Workspace.reserve(Uses.size());
3396
3397   // SolveRecurse does all the work.
3398   SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
3399                CurRegs, VisitedRegs);
3400   if (Solution.empty()) {
3401     DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
3402     return;
3403   }
3404
3405   // Ok, we've now made all our decisions.
3406   DEBUG(dbgs() << "\n"
3407                   "The chosen solution requires "; SolutionCost.print(dbgs());
3408         dbgs() << ":\n";
3409         for (size_t i = 0, e = Uses.size(); i != e; ++i) {
3410           dbgs() << "  ";
3411           Uses[i].print(dbgs());
3412           dbgs() << "\n"
3413                     "    ";
3414           Solution[i]->print(dbgs());
3415           dbgs() << '\n';
3416         });
3417
3418   assert(Solution.size() == Uses.size() && "Malformed solution!");
3419 }
3420
3421 /// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up
3422 /// the dominator tree far as we can go while still being dominated by the
3423 /// input positions. This helps canonicalize the insert position, which
3424 /// encourages sharing.
3425 BasicBlock::iterator
3426 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
3427                                  const SmallVectorImpl<Instruction *> &Inputs)
3428                                                                          const {
3429   for (;;) {
3430     const Loop *IPLoop = LI.getLoopFor(IP->getParent());
3431     unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
3432
3433     BasicBlock *IDom;
3434     for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
3435       if (!Rung) return IP;
3436       Rung = Rung->getIDom();
3437       if (!Rung) return IP;
3438       IDom = Rung->getBlock();
3439
3440       // Don't climb into a loop though.
3441       const Loop *IDomLoop = LI.getLoopFor(IDom);
3442       unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
3443       if (IDomDepth <= IPLoopDepth &&
3444           (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
3445         break;
3446     }
3447
3448     bool AllDominate = true;
3449     Instruction *BetterPos = 0;
3450     Instruction *Tentative = IDom->getTerminator();
3451     for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
3452          E = Inputs.end(); I != E; ++I) {
3453       Instruction *Inst = *I;
3454       if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
3455         AllDominate = false;
3456         break;
3457       }
3458       // Attempt to find an insert position in the middle of the block,
3459       // instead of at the end, so that it can be used for other expansions.
3460       if (IDom == Inst->getParent() &&
3461           (!BetterPos || DT.dominates(BetterPos, Inst)))
3462         BetterPos = llvm::next(BasicBlock::iterator(Inst));
3463     }
3464     if (!AllDominate)
3465       break;
3466     if (BetterPos)
3467       IP = BetterPos;
3468     else
3469       IP = Tentative;
3470   }
3471
3472   return IP;
3473 }
3474
3475 /// AdjustInsertPositionForExpand - Determine an input position which will be
3476 /// dominated by the operands and which will dominate the result.
3477 BasicBlock::iterator
3478 LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP,
3479                                            const LSRFixup &LF,
3480                                            const LSRUse &LU) const {
3481   // Collect some instructions which must be dominated by the
3482   // expanding replacement. These must be dominated by any operands that
3483   // will be required in the expansion.
3484   SmallVector<Instruction *, 4> Inputs;
3485   if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
3486     Inputs.push_back(I);
3487   if (LU.Kind == LSRUse::ICmpZero)
3488     if (Instruction *I =
3489           dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
3490       Inputs.push_back(I);
3491   if (LF.PostIncLoops.count(L)) {
3492     if (LF.isUseFullyOutsideLoop(L))
3493       Inputs.push_back(L->getLoopLatch()->getTerminator());
3494     else
3495       Inputs.push_back(IVIncInsertPos);
3496   }
3497   // The expansion must also be dominated by the increment positions of any
3498   // loops it for which it is using post-inc mode.
3499   for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(),
3500        E = LF.PostIncLoops.end(); I != E; ++I) {
3501     const Loop *PIL = *I;
3502     if (PIL == L) continue;
3503
3504     // Be dominated by the loop exit.
3505     SmallVector<BasicBlock *, 4> ExitingBlocks;
3506     PIL->getExitingBlocks(ExitingBlocks);
3507     if (!ExitingBlocks.empty()) {
3508       BasicBlock *BB = ExitingBlocks[0];
3509       for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
3510         BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
3511       Inputs.push_back(BB->getTerminator());
3512     }
3513   }
3514
3515   // Then, climb up the immediate dominator tree as far as we can go while
3516   // still being dominated by the input positions.
3517   IP = HoistInsertPosition(IP, Inputs);
3518
3519   // Don't insert instructions before PHI nodes.
3520   while (isa<PHINode>(IP)) ++IP;
3521
3522   // Ignore landingpad instructions.
3523   while (isa<LandingPadInst>(IP)) ++IP;
3524
3525   // Ignore debug intrinsics.
3526   while (isa<DbgInfoIntrinsic>(IP)) ++IP;
3527
3528   return IP;
3529 }
3530
3531 /// Expand - Emit instructions for the leading candidate expression for this
3532 /// LSRUse (this is called "expanding").
3533 Value *LSRInstance::Expand(const LSRFixup &LF,
3534                            const Formula &F,
3535                            BasicBlock::iterator IP,
3536                            SCEVExpander &Rewriter,
3537                            SmallVectorImpl<WeakVH> &DeadInsts) const {
3538   const LSRUse &LU = Uses[LF.LUIdx];
3539
3540   // Determine an input position which will be dominated by the operands and
3541   // which will dominate the result.
3542   IP = AdjustInsertPositionForExpand(IP, LF, LU);
3543
3544   // Inform the Rewriter if we have a post-increment use, so that it can
3545   // perform an advantageous expansion.
3546   Rewriter.setPostInc(LF.PostIncLoops);
3547
3548   // This is the type that the user actually needs.
3549   Type *OpTy = LF.OperandValToReplace->getType();
3550   // This will be the type that we'll initially expand to.
3551   Type *Ty = F.getType();
3552   if (!Ty)
3553     // No type known; just expand directly to the ultimate type.
3554     Ty = OpTy;
3555   else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
3556     // Expand directly to the ultimate type if it's the right size.
3557     Ty = OpTy;
3558   // This is the type to do integer arithmetic in.
3559   Type *IntTy = SE.getEffectiveSCEVType(Ty);
3560
3561   // Build up a list of operands to add together to form the full base.
3562   SmallVector<const SCEV *, 8> Ops;
3563
3564   // Expand the BaseRegs portion.
3565   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
3566        E = F.BaseRegs.end(); I != E; ++I) {
3567     const SCEV *Reg = *I;
3568     assert(!Reg->isZero() && "Zero allocated in a base register!");
3569
3570     // If we're expanding for a post-inc user, make the post-inc adjustment.
3571     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
3572     Reg = TransformForPostIncUse(Denormalize, Reg,
3573                                  LF.UserInst, LF.OperandValToReplace,
3574                                  Loops, SE, DT);
3575
3576     Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));
3577   }
3578
3579   // Flush the operand list to suppress SCEVExpander hoisting.
3580   if (!Ops.empty()) {
3581     Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
3582     Ops.clear();
3583     Ops.push_back(SE.getUnknown(FullV));
3584   }
3585
3586   // Expand the ScaledReg portion.
3587   Value *ICmpScaledV = 0;
3588   if (F.AM.Scale != 0) {
3589     const SCEV *ScaledS = F.ScaledReg;
3590
3591     // If we're expanding for a post-inc user, make the post-inc adjustment.
3592     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
3593     ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
3594                                      LF.UserInst, LF.OperandValToReplace,
3595                                      Loops, SE, DT);
3596
3597     if (LU.Kind == LSRUse::ICmpZero) {
3598       // An interesting way of "folding" with an icmp is to use a negated
3599       // scale, which we'll implement by inserting it into the other operand
3600       // of the icmp.
3601       assert(F.AM.Scale == -1 &&
3602              "The only scale supported by ICmpZero uses is -1!");
3603       ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP);
3604     } else {
3605       // Otherwise just expand the scaled register and an explicit scale,
3606       // which is expected to be matched as part of the address.
3607       ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
3608       ScaledS = SE.getMulExpr(ScaledS,
3609                               SE.getConstant(ScaledS->getType(), F.AM.Scale));
3610       Ops.push_back(ScaledS);
3611
3612       // Flush the operand list to suppress SCEVExpander hoisting.
3613       Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
3614       Ops.clear();
3615       Ops.push_back(SE.getUnknown(FullV));
3616     }
3617   }
3618
3619   // Expand the GV portion.
3620   if (F.AM.BaseGV) {
3621     Ops.push_back(SE.getUnknown(F.AM.BaseGV));
3622
3623     // Flush the operand list to suppress SCEVExpander hoisting.
3624     Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
3625     Ops.clear();
3626     Ops.push_back(SE.getUnknown(FullV));
3627   }
3628
3629   // Expand the immediate portion.
3630   int64_t Offset = (uint64_t)F.AM.BaseOffs + LF.Offset;
3631   if (Offset != 0) {
3632     if (LU.Kind == LSRUse::ICmpZero) {
3633       // The other interesting way of "folding" with an ICmpZero is to use a
3634       // negated immediate.
3635       if (!ICmpScaledV)
3636         ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
3637       else {
3638         Ops.push_back(SE.getUnknown(ICmpScaledV));
3639         ICmpScaledV = ConstantInt::get(IntTy, Offset);
3640       }
3641     } else {
3642       // Just add the immediate values. These again are expected to be matched
3643       // as part of the address.
3644       Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
3645     }
3646   }
3647
3648   // Expand the unfolded offset portion.
3649   int64_t UnfoldedOffset = F.UnfoldedOffset;
3650   if (UnfoldedOffset != 0) {
3651     // Just add the immediate values.
3652     Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
3653                                                        UnfoldedOffset)));
3654   }
3655
3656   // Emit instructions summing all the operands.
3657   const SCEV *FullS = Ops.empty() ?
3658                       SE.getConstant(IntTy, 0) :
3659                       SE.getAddExpr(Ops);
3660   Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);
3661
3662   // We're done expanding now, so reset the rewriter.
3663   Rewriter.clearPostInc();
3664
3665   // An ICmpZero Formula represents an ICmp which we're handling as a
3666   // comparison against zero. Now that we've expanded an expression for that
3667   // form, update the ICmp's other operand.
3668   if (LU.Kind == LSRUse::ICmpZero) {
3669     ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
3670     DeadInsts.push_back(CI->getOperand(1));
3671     assert(!F.AM.BaseGV && "ICmp does not support folding a global value and "
3672                            "a scale at the same time!");
3673     if (F.AM.Scale == -1) {
3674       if (ICmpScaledV->getType() != OpTy) {
3675         Instruction *Cast =
3676           CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
3677                                                    OpTy, false),
3678                            ICmpScaledV, OpTy, "tmp", CI);
3679         ICmpScaledV = Cast;
3680       }
3681       CI->setOperand(1, ICmpScaledV);
3682     } else {
3683       assert(F.AM.Scale == 0 &&
3684              "ICmp does not support folding a global value and "
3685              "a scale at the same time!");
3686       Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
3687                                            -(uint64_t)Offset);
3688       if (C->getType() != OpTy)
3689         C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
3690                                                           OpTy, false),
3691                                   C, OpTy);
3692
3693       CI->setOperand(1, C);
3694     }
3695   }
3696
3697   return FullV;
3698 }
3699
3700 /// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use
3701 /// of their operands effectively happens in their predecessor blocks, so the
3702 /// expression may need to be expanded in multiple places.
3703 void LSRInstance::RewriteForPHI(PHINode *PN,
3704                                 const LSRFixup &LF,
3705                                 const Formula &F,
3706                                 SCEVExpander &Rewriter,
3707                                 SmallVectorImpl<WeakVH> &DeadInsts,
3708                                 Pass *P) const {
3709   DenseMap<BasicBlock *, Value *> Inserted;
3710   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
3711     if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
3712       BasicBlock *BB = PN->getIncomingBlock(i);
3713
3714       // If this is a critical edge, split the edge so that we do not insert
3715       // the code on all predecessor/successor paths.  We do this unless this
3716       // is the canonical backedge for this loop, which complicates post-inc
3717       // users.
3718       if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
3719           !isa<IndirectBrInst>(BB->getTerminator())) {
3720         BasicBlock *Parent = PN->getParent();
3721         Loop *PNLoop = LI.getLoopFor(Parent);
3722         if (!PNLoop || Parent != PNLoop->getHeader()) {
3723           // Split the critical edge.
3724           BasicBlock *NewBB = 0;
3725           if (!Parent->isLandingPad()) {
3726             NewBB = SplitCriticalEdge(BB, Parent, P,
3727                                       /*MergeIdenticalEdges=*/true,
3728                                       /*DontDeleteUselessPhis=*/true);
3729           } else {
3730             SmallVector<BasicBlock*, 2> NewBBs;
3731             SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs);
3732             NewBB = NewBBs[0];
3733           }
3734
3735           // If PN is outside of the loop and BB is in the loop, we want to
3736           // move the block to be immediately before the PHI block, not
3737           // immediately after BB.
3738           if (L->contains(BB) && !L->contains(PN))
3739             NewBB->moveBefore(PN->getParent());
3740
3741           // Splitting the edge can reduce the number of PHI entries we have.
3742           e = PN->getNumIncomingValues();
3743           BB = NewBB;
3744           i = PN->getBasicBlockIndex(BB);
3745         }
3746       }
3747
3748       std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
3749         Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
3750       if (!Pair.second)
3751         PN->setIncomingValue(i, Pair.first->second);
3752       else {
3753         Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts);
3754
3755         // If this is reuse-by-noop-cast, insert the noop cast.
3756         Type *OpTy = LF.OperandValToReplace->getType();
3757         if (FullV->getType() != OpTy)
3758           FullV =
3759             CastInst::Create(CastInst::getCastOpcode(FullV, false,
3760                                                      OpTy, false),
3761                              FullV, LF.OperandValToReplace->getType(),
3762                              "tmp", BB->getTerminator());
3763
3764         PN->setIncomingValue(i, FullV);
3765         Pair.first->second = FullV;
3766       }
3767     }
3768 }
3769
3770 /// Rewrite - Emit instructions for the leading candidate expression for this
3771 /// LSRUse (this is called "expanding"), and update the UserInst to reference
3772 /// the newly expanded value.
3773 void LSRInstance::Rewrite(const LSRFixup &LF,
3774                           const Formula &F,
3775                           SCEVExpander &Rewriter,
3776                           SmallVectorImpl<WeakVH> &DeadInsts,
3777                           Pass *P) const {
3778   // First, find an insertion point that dominates UserInst. For PHI nodes,
3779   // find the nearest block which dominates all the relevant uses.
3780   if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
3781     RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P);
3782   } else {
3783     Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts);
3784
3785     // If this is reuse-by-noop-cast, insert the noop cast.
3786     Type *OpTy = LF.OperandValToReplace->getType();
3787     if (FullV->getType() != OpTy) {
3788       Instruction *Cast =
3789         CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
3790                          FullV, OpTy, "tmp", LF.UserInst);
3791       FullV = Cast;
3792     }
3793
3794     // Update the user. ICmpZero is handled specially here (for now) because
3795     // Expand may have updated one of the operands of the icmp already, and
3796     // its new value may happen to be equal to LF.OperandValToReplace, in
3797     // which case doing replaceUsesOfWith leads to replacing both operands
3798     // with the same value. TODO: Reorganize this.
3799     if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
3800       LF.UserInst->setOperand(0, FullV);
3801     else
3802       LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
3803   }
3804
3805   DeadInsts.push_back(LF.OperandValToReplace);
3806 }
3807
3808 /// ImplementSolution - Rewrite all the fixup locations with new values,
3809 /// following the chosen solution.
3810 void
3811 LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
3812                                Pass *P) {
3813   // Keep track of instructions we may have made dead, so that
3814   // we can remove them after we are done working.
3815   SmallVector<WeakVH, 16> DeadInsts;
3816
3817   SCEVExpander Rewriter(SE, "lsr");
3818   Rewriter.disableCanonicalMode();
3819   Rewriter.enableLSRMode();
3820   Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
3821
3822   // Expand the new value definitions and update the users.
3823   for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
3824        E = Fixups.end(); I != E; ++I) {
3825     const LSRFixup &Fixup = *I;
3826
3827     Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P);
3828
3829     Changed = true;
3830   }
3831
3832   // Clean up after ourselves. This must be done before deleting any
3833   // instructions.
3834   Rewriter.clear();
3835
3836   Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
3837 }
3838
3839 LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
3840   : IU(P->getAnalysis<IVUsers>()),
3841     SE(P->getAnalysis<ScalarEvolution>()),
3842     DT(P->getAnalysis<DominatorTree>()),
3843     LI(P->getAnalysis<LoopInfo>()),
3844     TLI(tli), L(l), Changed(false), IVIncInsertPos(0) {
3845
3846   // If LoopSimplify form is not available, stay out of trouble.
3847   if (!L->isLoopSimplifyForm())
3848     return;
3849
3850   // All outer loops must have preheaders, or SCEVExpander may not be able to
3851   // materialize an AddRecExpr whose Start is an outer AddRecExpr.
3852   for (const Loop *OuterLoop = L; (OuterLoop = OuterLoop->getParentLoop());) {
3853     if (!OuterLoop->getLoopPreheader())
3854       return;
3855   }
3856   // If there's no interesting work to be done, bail early.
3857   if (IU.empty()) return;
3858
3859   DEBUG(dbgs() << "\nLSR on loop ";
3860         WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
3861         dbgs() << ":\n");
3862
3863   // First, perform some low-level loop optimizations.
3864   OptimizeShadowIV();
3865   OptimizeLoopTermCond();
3866
3867   // If loop preparation eliminates all interesting IV users, bail.
3868   if (IU.empty()) return;
3869
3870   // Skip nested loops until we can model them better with formulae.
3871   if (!EnableNested && !L->empty()) {
3872     DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
3873     return;
3874   }
3875
3876   // Start collecting data and preparing for the solver.
3877   CollectInterestingTypesAndFactors();
3878   CollectFixupsAndInitialFormulae();
3879   CollectLoopInvariantFixupsAndFormulae();
3880
3881   DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
3882         print_uses(dbgs()));
3883
3884   // Now use the reuse data to generate a bunch of interesting ways
3885   // to formulate the values needed for the uses.
3886   GenerateAllReuseFormulae();
3887
3888   FilterOutUndesirableDedicatedRegisters();
3889   NarrowSearchSpaceUsingHeuristics();
3890
3891   SmallVector<const Formula *, 8> Solution;
3892   Solve(Solution);
3893
3894   // Release memory that is no longer needed.
3895   Factors.clear();
3896   Types.clear();
3897   RegUses.clear();
3898
3899   if (Solution.empty())
3900     return;
3901
3902 #ifndef NDEBUG
3903   // Formulae should be legal.
3904   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
3905        E = Uses.end(); I != E; ++I) {
3906      const LSRUse &LU = *I;
3907      for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
3908           JE = LU.Formulae.end(); J != JE; ++J)
3909         assert(isLegalUse(J->AM, LU.MinOffset, LU.MaxOffset,
3910                           LU.Kind, LU.AccessTy, TLI) &&
3911                "Illegal formula generated!");
3912   };
3913 #endif
3914
3915   // Now that we've decided what we want, make it so.
3916   ImplementSolution(Solution, P);
3917 }
3918
3919 void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
3920   if (Factors.empty() && Types.empty()) return;
3921
3922   OS << "LSR has identified the following interesting factors and types: ";
3923   bool First = true;
3924
3925   for (SmallSetVector<int64_t, 8>::const_iterator
3926        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
3927     if (!First) OS << ", ";
3928     First = false;
3929     OS << '*' << *I;
3930   }
3931
3932   for (SmallSetVector<Type *, 4>::const_iterator
3933        I = Types.begin(), E = Types.end(); I != E; ++I) {
3934     if (!First) OS << ", ";
3935     First = false;
3936     OS << '(' << **I << ')';
3937   }
3938   OS << '\n';
3939 }
3940
3941 void LSRInstance::print_fixups(raw_ostream &OS) const {
3942   OS << "LSR is examining the following fixup sites:\n";
3943   for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
3944        E = Fixups.end(); I != E; ++I) {
3945     dbgs() << "  ";
3946     I->print(OS);
3947     OS << '\n';
3948   }
3949 }
3950
3951 void LSRInstance::print_uses(raw_ostream &OS) const {
3952   OS << "LSR is examining the following uses:\n";
3953   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
3954        E = Uses.end(); I != E; ++I) {
3955     const LSRUse &LU = *I;
3956     dbgs() << "  ";
3957     LU.print(OS);
3958     OS << '\n';
3959     for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
3960          JE = LU.Formulae.end(); J != JE; ++J) {
3961       OS << "    ";
3962       J->print(OS);
3963       OS << '\n';
3964     }
3965   }
3966 }
3967
3968 void LSRInstance::print(raw_ostream &OS) const {
3969   print_factors_and_types(OS);
3970   print_fixups(OS);
3971   print_uses(OS);
3972 }
3973
3974 void LSRInstance::dump() const {
3975   print(errs()); errs() << '\n';
3976 }
3977
3978 namespace {
3979
3980 class LoopStrengthReduce : public LoopPass {
3981   /// TLI - Keep a pointer of a TargetLowering to consult for determining
3982   /// transformation profitability.
3983   const TargetLowering *const TLI;
3984
3985 public:
3986   static char ID; // Pass ID, replacement for typeid
3987   explicit LoopStrengthReduce(const TargetLowering *tli = 0);
3988
3989 private:
3990   bool runOnLoop(Loop *L, LPPassManager &LPM);
3991   void getAnalysisUsage(AnalysisUsage &AU) const;
3992 };
3993
3994 }
3995
3996 char LoopStrengthReduce::ID = 0;
3997 INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
3998                 "Loop Strength Reduction", false, false)
3999 INITIALIZE_PASS_DEPENDENCY(DominatorTree)
4000 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
4001 INITIALIZE_PASS_DEPENDENCY(IVUsers)
4002 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
4003 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
4004 INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
4005                 "Loop Strength Reduction", false, false)
4006
4007
4008 Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
4009   return new LoopStrengthReduce(TLI);
4010 }
4011
4012 LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli)
4013   : LoopPass(ID), TLI(tli) {
4014     initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
4015   }
4016
4017 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
4018   // We split critical edges, so we change the CFG.  However, we do update
4019   // many analyses if they are around.
4020   AU.addPreservedID(LoopSimplifyID);
4021
4022   AU.addRequired<LoopInfo>();
4023   AU.addPreserved<LoopInfo>();
4024   AU.addRequiredID(LoopSimplifyID);
4025   AU.addRequired<DominatorTree>();
4026   AU.addPreserved<DominatorTree>();
4027   AU.addRequired<ScalarEvolution>();
4028   AU.addPreserved<ScalarEvolution>();
4029   // Requiring LoopSimplify a second time here prevents IVUsers from running
4030   // twice, since LoopSimplify was invalidated by running ScalarEvolution.
4031   AU.addRequiredID(LoopSimplifyID);
4032   AU.addRequired<IVUsers>();
4033   AU.addPreserved<IVUsers>();
4034 }
4035
4036 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
4037   bool Changed = false;
4038
4039   // Run the main LSR transformation.
4040   Changed |= LSRInstance(TLI, L, this).getChanged();
4041
4042   // Remove any extra phis created by processing inner loops.
4043   Changed |= DeleteDeadPHIs(L->getHeader());
4044   if (EnablePhiElim) {
4045     SmallVector<WeakVH, 16> DeadInsts;
4046     SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), "lsr");
4047 #ifndef NDEBUG
4048     Rewriter.setDebugType(DEBUG_TYPE);
4049 #endif
4050     unsigned numFolded = Rewriter.
4051       replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), DeadInsts, TLI);
4052     if (numFolded) {
4053       Changed = true;
4054       DeleteTriviallyDeadInstructions(DeadInsts);
4055       DeleteDeadPHIs(L->getHeader());
4056     }
4057   }
4058   return Changed;
4059 }