Fix the other half of the alignment changing issue by making sure that the

[oota-llvm.git] / lib / Transforms / Scalar / LoopStrengthReduce.cpp
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp

index f1dbbf95292abd68f6461a7d68152ceb66009113..a3cef7a82807ad677af121c8f92e9eb3bcb0898b 100644 (file)
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -161,9 +161,10 @@ RegUseTracker::DropUse(size_t LUIdx) {
  
  bool
  RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
-  if (!RegUsesMap.count(Reg)) return false;
-  const SmallBitVector &UsedByIndices =
-    RegUsesMap.find(Reg)->second.UsedByIndices;
+  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
+  if (I == RegUsesMap.end())
+    return false;
+  const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
    int i = UsedByIndices.find_first();
    if (i == -1) return false;
    if ((size_t)i != LUIdx) return true;
@@ -392,12 +393,13 @@ static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
    return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
  }
  
-/// isMulSExtable - Return true if the given add can be sign-extended
+/// isMulSExtable - Return true if the given mul can be sign-extended
  /// without changing its value.
-static bool isMulSExtable(const SCEVMulExpr *A, ScalarEvolution &SE) {
+static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
    const Type *WideTy =
-    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
-  return isa<SCEVMulExpr>(SE.getSignExtendExpr(A, WideTy));
+    IntegerType::get(SE.getContext(),
+                     SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
+  return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
  }
  
  /// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
@@ -413,33 +415,42 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
    if (LHS == RHS)
      return SE.getConstant(LHS->getType(), 1);
  
-  // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do some
-  // folding.
-  if (RHS->isAllOnesValue())
-    return SE.getMulExpr(LHS, RHS);
+  // Handle a few RHS special cases.
+  const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
+  if (RC) {
+    const APInt &RA = RC->getValue()->getValue();
+    // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
+    // some folding.
+    if (RA.isAllOnesValue())
+      return SE.getMulExpr(LHS, RC);
+    // Handle x /s 1 as x.
+    if (RA == 1)
+      return LHS;
+  }
  
    // Check for a division of a constant by a constant.
    if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
-    const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
      if (!RC)
        return 0;
-    if (C->getValue()->getValue().srem(RC->getValue()->getValue()) != 0)
+    const APInt &LA = C->getValue()->getValue();
+    const APInt &RA = RC->getValue()->getValue();
+    if (LA.srem(RA) != 0)
        return 0;
-    return SE.getConstant(C->getValue()->getValue()
-               .sdiv(RC->getValue()->getValue()));
+    return SE.getConstant(LA.sdiv(RA));
    }
  
    // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
      if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
-      const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
-                                       IgnoreSignificantBits);
-      if (!Start) return 0;
        const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
                                        IgnoreSignificantBits);
        if (!Step) return 0;
+      const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
+                                       IgnoreSignificantBits);
+      if (!Start) return 0;
        return SE.getAddRecExpr(Start, Step, AR->getLoop());
      }
+    return 0;
    }
  
    // Distribute the sdiv over add operands, if the add doesn't overflow.
@@ -455,10 +466,11 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
        }
        return SE.getAddExpr(Ops);
      }
+    return 0;
    }
  
    // Check for a multiply operand that we can pull RHS out of.
-  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS))
+  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
      if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
        SmallVector<const SCEV *, 4> Ops;
        bool Found = false;
@@ -475,6 +487,8 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
        }
        return Found ? SE.getMulExpr(Ops) : 0;
      }
+    return 0;
+  }
  
    // Otherwise we don't know.
    return 0;
@@ -492,12 +506,14 @@ static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
    } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
      SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
      int64_t Result = ExtractImmediate(NewOps.front(), SE);
-    S = SE.getAddExpr(NewOps);
+    if (Result != 0)
+      S = SE.getAddExpr(NewOps);
      return Result;
    } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
      SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
      int64_t Result = ExtractImmediate(NewOps.front(), SE);
-    S = SE.getAddRecExpr(NewOps, AR->getLoop());
+    if (Result != 0)
+      S = SE.getAddRecExpr(NewOps, AR->getLoop());
      return Result;
    }
    return 0;
@@ -515,12 +531,14 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
    } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
      SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
      GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
-    S = SE.getAddExpr(NewOps);
+    if (Result)
+      S = SE.getAddExpr(NewOps);
      return Result;
    } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
      SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
      GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
-    S = SE.getAddRecExpr(NewOps, AR->getLoop());
+    if (Result)
+      S = SE.getAddRecExpr(NewOps, AR->getLoop());
      return Result;
    }
    return 0;
@@ -546,7 +564,7 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
        case Intrinsic::x86_sse2_storeu_pd:
        case Intrinsic::x86_sse2_storeu_dq:
        case Intrinsic::x86_sse2_storel_dq:
-        if (II->getOperand(1) == OperandVal)
+        if (II->getArgOperand(0) == OperandVal)
            isAddress = true;
          break;
      }
@@ -568,7 +586,7 @@ static const Type *getAccessType(const Instruction *Inst) {
      case Intrinsic::x86_sse2_storeu_pd:
      case Intrinsic::x86_sse2_storeu_dq:
      case Intrinsic::x86_sse2_storel_dq:
-      AccessTy = II->getOperand(1)->getType();
+      AccessTy = II->getArgOperand(0)->getType();
        break;
      }
    }
@@ -590,7 +608,7 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
    bool Changed = false;
  
    while (!DeadInsts.empty()) {
-    Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
+    Instruction *I = dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val());
  
      if (I == 0 || !isInstructionTriviallyDead(I))
        continue;
@@ -627,8 +645,6 @@ public:
      : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
        SetupCost(0) {}
  
-  unsigned getNumRegs() const { return NumRegs; }
-
    bool operator<(const Cost &Other) const;
  
    void Loose();
@@ -952,6 +968,12 @@ public:
    /// may be used.
    bool AllFixupsOutsideLoop;
  
+  /// WidestFixupType - This records the widest use type for any fixup using
+  /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different
+  /// max fixup widths to be equivalent, because the narrower one may be relying
+  /// on the implicit truncation to truncate away bogus bits.
+  const Type *WidestFixupType;
+
    /// Formulae - A list of ways to build a value that can satisfy this user.
    /// After the list is populated, one of these is selected heuristically and
    /// used to formulate a replacement for OperandValToReplace in UserInst.
@@ -963,19 +985,20 @@ public:
    LSRUse(KindType K, const Type *T) : Kind(K), AccessTy(T),
                                        MinOffset(INT64_MAX),
                                        MaxOffset(INT64_MIN),
-                                      AllFixupsOutsideLoop(true) {}
+                                      AllFixupsOutsideLoop(true),
+                                      WidestFixupType(0) {}
  
    bool HasFormulaWithSameRegs(const Formula &F) const;
    bool InsertFormula(const Formula &F);
    void DeleteFormula(Formula &F);
    void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
  
-  void check() const;
-
    void print(raw_ostream &OS) const;
    void dump() const;
  };
  
+}
+
  /// HasFormula - Test whether this use as a formula which has the same
  /// registers as the given formula.
  bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
@@ -1061,13 +1084,16 @@ void LSRUse::print(raw_ostream &OS) const {
    for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
         E = Offsets.end(); I != E; ++I) {
      OS << *I;
-    if (next(I) != E)
+    if (llvm::next(I) != E)
        OS << ',';
    }
    OS << '}';
  
    if (AllFixupsOutsideLoop)
      OS << ", all-fixups-outside-loop";
+
+  if (WidestFixupType)
+    OS << ", widest fixup type: " << *WidestFixupType;
  }
  
  void LSRUse::dump() const {
@@ -1203,6 +1229,32 @@ static bool isAlwaysFoldable(const SCEV *S,
    return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI);
  }
  
+namespace {
+
+/// UseMapDenseMapInfo - A DenseMapInfo implementation for holding
+/// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind.
+struct UseMapDenseMapInfo {
+  static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() {
+    return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic);
+  }
+
+  static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() {
+    return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic);
+  }
+
+  static unsigned
+  getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) {
+    unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first);
+    Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second));
+    return Result;
+  }
+
+  static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS,
+                      const std::pair<const SCEV *, LSRUse::KindType> &RHS) {
+    return LHS == RHS;
+  }
+};
+
  /// FormulaSorter - This class implements an ordering for formulae which sorts
  /// the by their standalone cost.
  class FormulaSorter {
@@ -1275,7 +1327,9 @@ class LSRInstance {
    }
  
    // Support for sharing of LSRUses between LSRFixups.
-  typedef DenseMap<const SCEV *, size_t> UseMapTy;
+  typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>,
+                   size_t,
+                   UseMapDenseMapInfo> UseMapTy;
    UseMapTy UseMap;
  
    bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
@@ -1311,6 +1365,10 @@ public:
    void FilterOutUndesirableDedicatedRegisters();
  
    size_t EstimateSearchSpaceComplexity() const;
+  void NarrowSearchSpaceByDetectingSupersets();
+  void NarrowSearchSpaceByCollapsingUnrolledCode();
+  void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+  void NarrowSearchSpaceByPickingWinnerRegs();
    void NarrowSearchSpaceUsingHeuristics();
  
    void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
@@ -1544,7 +1602,7 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
    const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
  
    // Add one to the backedge-taken count to get the trip count.
-  const SCEV *IterationCount = SE.getAddExpr(BackedgeTakenCount, One);
+  const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
    if (IterationCount != SE.getSCEV(Sel)) return Cond;
  
    // Check for a max calculation that matches the pattern. There's no check
@@ -1613,8 +1671,11 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
      NewRHS = Sel->getOperand(1);
    else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
      NewRHS = Sel->getOperand(2);
+  else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
+    NewRHS = SU->getValue();
    else
-    llvm_unreachable("Max doesn't match expected pattern!");
+    // Max doesn't match expected pattern.
+    return Cond;
  
    // Determine the new comparison opcode. It may be signed or unsigned,
    // and the original comparison may be either equality or inequality.
@@ -1705,13 +1766,13 @@ LSRInstance::OptimizeLoopTermCond() {
            }
            if (const SCEVConstant *D =
                  dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
+            const ConstantInt *C = D->getValue();
              // Stride of one or negative one can have reuse with non-addresses.
-            if (D->getValue()->isOne() ||
-                D->getValue()->isAllOnesValue())
+            if (C->isOne() || C->isAllOnesValue())
                goto decline_post_inc;
              // Avoid weird situations.
-            if (D->getValue()->getValue().getMinSignedBits() >= 64 ||
-                D->getValue()->getValue().isMinSignedValue())
+            if (C->getValue().getMinSignedBits() >= 64 ||
+                C->getValue().isMinSignedValue())
                goto decline_post_inc;
              // Without TLI, assume that any stride might be valid, and so any
              // use might be shared.
@@ -1720,7 +1781,7 @@ LSRInstance::OptimizeLoopTermCond() {
              // Check for possible scaled-address reuse.
              const Type *AccessTy = getAccessType(UI->getUser());
              TargetLowering::AddrMode AM;
-            AM.Scale = D->getValue()->getSExtValue();
+            AM.Scale = C->getSExtValue();
              if (TLI->isLegalAddressingMode(AM, AccessTy))
                goto decline_post_inc;
              AM.Scale = -AM.Scale;
@@ -1805,6 +1866,8 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
      NewMaxOffset = NewOffset;
    }
    // Check for a mismatched access type, and fall back conservatively as needed.
+  // TODO: Be less conservative when the type is similar and can use the same
+  // addressing modes.
    if (Kind == LSRUse::Address && AccessTy != LU.AccessTy)
      NewAccessTy = Type::getVoidTy(AccessTy->getContext());
  
@@ -1833,7 +1896,7 @@ LSRInstance::getUse(const SCEV *&Expr,
    }
  
    std::pair<UseMapTy::iterator, bool> P =
-    UseMap.insert(std::make_pair(Expr, 0));
+    UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0));
    if (!P.second) {
      // A use already existed with this base.
      size_t LUIdx = P.first->second;
@@ -1871,32 +1934,41 @@ void LSRInstance::DeleteUse(LSRUse &LU) {
  LSRUse *
  LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
                                         const LSRUse &OrigLU) {
-  // Search all uses for the formula. This could be more clever. Ignore
-  // ICmpZero uses because they may contain formulae generated by
-  // GenerateICmpZeroScales, in which case adding fixup offsets may
-  // be invalid.
+  // Search all uses for the formula. This could be more clever.
    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
      LSRUse &LU = Uses[LUIdx];
+    // Check whether this use is close enough to OrigLU, to see whether it's
+    // worthwhile looking through its formulae.
+    // Ignore ICmpZero uses because they may contain formulae generated by
+    // GenerateICmpZeroScales, in which case adding fixup offsets may
+    // be invalid.
      if (&LU != &OrigLU &&
          LU.Kind != LSRUse::ICmpZero &&
          LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
+        LU.WidestFixupType == OrigLU.WidestFixupType &&
          LU.HasFormulaWithSameRegs(OrigF)) {
+      // Scan through this use's formulae.
        for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
             E = LU.Formulae.end(); I != E; ++I) {
          const Formula &F = *I;
+        // Check to see if this formula has the same registers and symbols
+        // as OrigF.
          if (F.BaseRegs == OrigF.BaseRegs &&
              F.ScaledReg == OrigF.ScaledReg &&
              F.AM.BaseGV == OrigF.AM.BaseGV &&
-            F.AM.Scale == OrigF.AM.Scale &&
-            LU.Kind) {
+            F.AM.Scale == OrigF.AM.Scale) {
            if (F.AM.BaseOffs == 0)
              return &LU;
+          // This is the formula where all the registers and symbols matched;
+          // there aren't going to be any others. Since we declined it, we
+          // can skip the rest of the formulae and proceed to the next LSRUse.
            break;
          }
        }
      }
    }
  
+  // Nothing looked good.
    return 0;
  }
  
@@ -1919,7 +1991,7 @@ void LSRInstance::CollectInterestingTypesAndFactors() {
          Strides.insert(AR->getStepRecurrence(SE));
          Worklist.push_back(AR->getStart());
        } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
-        Worklist.insert(Worklist.end(), Add->op_begin(), Add->op_end());
+        Worklist.append(Add->op_begin(), Add->op_end());
        }
      } while (!Worklist.empty());
    }
@@ -1928,7 +2000,7 @@ void LSRInstance::CollectInterestingTypesAndFactors() {
    for (SmallSetVector<const SCEV *, 4>::const_iterator
         I = Strides.begin(), E = Strides.end(); I != E; ++I)
      for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
-         next(I); NewStrideIter != E; ++NewStrideIter) {
+         llvm::next(I); NewStrideIter != E; ++NewStrideIter) {
        const SCEV *OldStride = *I;
        const SCEV *NewStride = *NewStrideIter;
  
@@ -2018,6 +2090,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
      LF.Offset = P.second;
      LSRUse &LU = Uses[LF.LUIdx];
      LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+    if (!LU.WidestFixupType ||
+        SE.getTypeSizeInBits(LU.WidestFixupType) <
+        SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
+      LU.WidestFixupType = LF.OperandValToReplace->getType();
  
      // If this is the first use of this LSRUse, give it a formula.
      if (LU.Formulae.empty()) {
@@ -2086,7 +2162,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
      const SCEV *S = Worklist.pop_back_val();
  
      if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
-      Worklist.insert(Worklist.end(), N->op_begin(), N->op_end());
+      Worklist.append(N->op_begin(), N->op_end());
      else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
        Worklist.push_back(C->getOperand());
      else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
@@ -2095,8 +2171,12 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
      } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
        if (!Inserted.insert(U)) continue;
        const Value *V = U->getValue();
-      if (const Instruction *Inst = dyn_cast<Instruction>(V))
+      if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
+        // Look for instructions defined outside the loop.
          if (L->contains(Inst)) continue;
+      } else if (isa<UndefValue>(V))
+        // Undef doesn't have a live range, so it doesn't matter.
+        continue;
        for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
             UI != UE; ++UI) {
          const Instruction *UserInst = dyn_cast<Instruction>(*UI);
@@ -2143,6 +2223,10 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
          LF.Offset = P.second;
          LSRUse &LU = Uses[LF.LUIdx];
          LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+        if (!LU.WidestFixupType ||
+            SE.getTypeSizeInBits(LU.WidestFixupType) <
+            SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
+          LU.WidestFixupType = LF.OperandValToReplace->getType();
          InsertSupplementalFormula(U, LU, LF.LUIdx);
          CountRegisters(LU.Formulae.back(), Uses.size() - 1);
          break;
@@ -2155,20 +2239,22 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
  /// separate registers. If C is non-null, multiply each subexpression by C.
  static void CollectSubexprs(const SCEV *S, const SCEVConstant *C,
                              SmallVectorImpl<const SCEV *> &Ops,
+                            const Loop *L,
                              ScalarEvolution &SE) {
    if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
      // Break out add operands.
      for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
           I != E; ++I)
-      CollectSubexprs(*I, C, Ops, SE);
+      CollectSubexprs(*I, C, Ops, L, SE);
      return;
    } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
      // Split a non-zero base out of an addrec.
      if (!AR->getStart()->isZero()) {
        CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
                                         AR->getStepRecurrence(SE),
-                                       AR->getLoop()), C, Ops, SE);
-      CollectSubexprs(AR->getStart(), C, Ops, SE);
+                                       AR->getLoop()),
+                      C, Ops, L, SE);
+      CollectSubexprs(AR->getStart(), C, Ops, L, SE);
        return;
      }
    } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
@@ -2178,12 +2264,12 @@ static void CollectSubexprs(const SCEV *S, const SCEVConstant *C,
              dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
          CollectSubexprs(Mul->getOperand(1),
                          C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0,
-                        Ops, SE);
+                        Ops, L, SE);
          return;
        }
    }
  
-  // Otherwise use the value itself.
+  // Otherwise use the value itself, optionally with a scale applied.
    Ops.push_back(C ? SE.getMulExpr(C, S) : S);
  }
  
@@ -2199,11 +2285,18 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
      const SCEV *BaseReg = Base.BaseRegs[i];
  
      SmallVector<const SCEV *, 8> AddOps;
-    CollectSubexprs(BaseReg, 0, AddOps, SE);
+    CollectSubexprs(BaseReg, 0, AddOps, L, SE);
+
      if (AddOps.size() == 1) continue;
  
      for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
           JE = AddOps.end(); J != JE; ++J) {
+
+      // Loop-variant "unknown" values are uninteresting; we won't be able to
+      // do anything meaningful with them.
+      if (isa<SCEVUnknown>(*J) && !(*J)->isLoopInvariant(L))
+        continue;
+
        // Don't pull a constant into a register if the constant could be folded
        // into an immediate field.
        if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset,
@@ -2212,11 +2305,10 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
          continue;
  
        // Collect all operands except *J.
-      SmallVector<const SCEV *, 8> InnerAddOps;
-      for (SmallVectorImpl<const SCEV *>::const_iterator K = AddOps.begin(),
-           KE = AddOps.end(); K != KE; ++K)
-        if (K != J)
-          InnerAddOps.push_back(*K);
+      SmallVector<const SCEV *, 8> InnerAddOps
+        (((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
+      InnerAddOps.append
+        (llvm::next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end());
  
        // Don't leave just a constant behind in a register if the constant could
        // be folded into an immediate field.
@@ -2297,7 +2389,7 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
                                            Formula Base) {
    // TODO: For now, just add the min and max offset, because it usually isn't
    // worthwhile looking at everything inbetween.
-  SmallVector<int64_t, 4> Worklist;
+  SmallVector<int64_t, 2> Worklist;
    Worklist.push_back(LU.MinOffset);
    if (LU.MaxOffset != LU.MinOffset)
      Worklist.push_back(LU.MaxOffset);
@@ -2311,7 +2403,14 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
        F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I;
        if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I,
                       LU.Kind, LU.AccessTy, TLI)) {
-        F.BaseRegs[i] = SE.getAddExpr(G, SE.getConstant(G->getType(), *I));
+        // Add the offset to the base register.
+        const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G);
+        // If it cancelled out, drop the base register, otherwise update it.
+        if (NewG->isZero()) {
+          std::swap(F.BaseRegs[i], F.BaseRegs.back());
+          F.BaseRegs.pop_back();
+        } else
+          F.BaseRegs[i] = NewG;
  
          (void)InsertFormula(LU, LUIdx, F);
        }
@@ -2350,13 +2449,12 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
    for (SmallSetVector<int64_t, 8>::const_iterator
         I = Factors.begin(), E = Factors.end(); I != E; ++I) {
      int64_t Factor = *I;
-    Formula F = Base;
  
      // Check that the multiplication doesn't overflow.
-    if (F.AM.BaseOffs == INT64_MIN && Factor == -1)
+    if (Base.AM.BaseOffs == INT64_MIN && Factor == -1)
        continue;
-    F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs * Factor;
-    if (F.AM.BaseOffs / Factor != Base.AM.BaseOffs)
+    int64_t NewBaseOffs = (uint64_t)Base.AM.BaseOffs * Factor;
+    if (NewBaseOffs / Factor != Base.AM.BaseOffs)
        continue;
  
      // Check that multiplying with the use offset doesn't overflow.
@@ -2367,6 +2465,9 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
      if (Offset / Factor != LU.MinOffset)
        continue;
  
+    Formula F = Base;
+    F.AM.BaseOffs = NewBaseOffs;
+
      // Check that this scale is legal.
      if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI))
        continue;
@@ -2609,7 +2710,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
  
      // TODO: Use a more targeted data structure.
      for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
-      Formula F = LU.Formulae[L];
+      const Formula &F = LU.Formulae[L];
        // Use the immediate in the scaled register.
        if (F.ScaledReg == OrigReg) {
          int64_t Offs = (uint64_t)F.AM.BaseOffs +
@@ -2704,6 +2805,10 @@ LSRInstance::GenerateAllReuseFormulae() {
    }
  
    GenerateCrossUseConstantOffsets();
+
+  DEBUG(dbgs() << "\n"
+                  "After generating reuse formulae:\n";
+        print_uses(dbgs()));
  }
  
  /// If their are multiple formulae with the same set of registers used
@@ -2802,11 +2907,11 @@ size_t LSRInstance::EstimateSearchSpaceComplexity() const {
    return Power;
  }
  
-/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of
-/// formulae to choose from, use some rough heuristics to prune down the number
-/// of formulae. This keeps the main solver from taking an extraordinary amount
-/// of time in some worst-case scenarios.
-void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
+/// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset
+/// of the registers of another formula, it won't help reduce register
+/// pressure (though it may not necessarily hurt register pressure); remove
+/// it to simplify the system.
+void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
    if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
      DEBUG(dbgs() << "The search space is too complex.\n");
  
@@ -2864,7 +2969,12 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
      DEBUG(dbgs() << "After pre-selection:\n";
            print_uses(dbgs()));
    }
+}
  
+/// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers
+/// for expressions like A, A+1, A+2, etc., allocate a single register for
+/// them.
+void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
    if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
      DEBUG(dbgs() << "The search space is too complex.\n");
  
@@ -2914,7 +3024,7 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
                  if (Fixup.LUIdx == LUIdx) {
                    Fixup.LUIdx = LUThatHas - &Uses.front();
                    Fixup.Offset += F.AM.BaseOffs;
-                  DEBUG(errs() << "New fixup has offset "
+                  DEBUG(dbgs() << "New fixup has offset "
                                 << Fixup.Offset << '\n');
                  }
                  if (Fixup.LUIdx == NumUses-1)
@@ -2935,7 +3045,30 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
      DEBUG(dbgs() << "After pre-selection:\n";
            print_uses(dbgs()));
    }
+}
+
+/// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call 
+/// FilterOutUndesirableDedicatedRegisters again, if necessary, now that
+/// we've done more filtering, as it may be able to find more formulae to
+/// eliminate.
+void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
+  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+    DEBUG(dbgs() << "The search space is too complex.\n");
  
+    DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
+                    "undesirable dedicated registers.\n");
+
+    FilterOutUndesirableDedicatedRegisters();
+
+    DEBUG(dbgs() << "After pre-selection:\n";
+          print_uses(dbgs()));
+  }
+}
+
+/// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely
+/// to be profitable, and then in any use which has any reference to that
+/// register, delete all formulae which do not reference that register.
+void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
    // With all other options exhausted, loop until the system is simple
    // enough to handle.
    SmallPtrSet<const SCEV *, 4> Taken;
@@ -2997,6 +3130,17 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
    }
  }
  
+/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of
+/// formulae to choose from, use some rough heuristics to prune down the number
+/// of formulae. This keeps the main solver from taking an extraordinary amount
+/// of time in some worst-case scenarios.
+void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
+  NarrowSearchSpaceByDetectingSupersets();
+  NarrowSearchSpaceByCollapsingUnrolledCode();
+  NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+  NarrowSearchSpaceByPickingWinnerRegs();
+}
+
  /// SolveRecurse - This is the recursive solver.
  void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                                 Cost &SolutionCost,
@@ -3109,6 +3253,8 @@ void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
            Solution[i]->print(dbgs());
            dbgs() << '\n';
          });
+
+  assert(Solution.size() == Uses.size() && "Malformed solution!");
  }
  
  /// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up
@@ -3125,7 +3271,7 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
  
      BasicBlock *IDom;
      for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
-      assert(Rung && "Block has no DomTreeNode!");
+      if (!Rung) return IP;
        Rung = Rung->getIDom();
        if (!Rung) return IP;
        IDom = Rung->getBlock();
@@ -3538,16 +3684,11 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
    // to formulate the values needed for the uses.
    GenerateAllReuseFormulae();
  
-  DEBUG(dbgs() << "\n"
-                  "After generating reuse formulae:\n";
-        print_uses(dbgs()));
-
    FilterOutUndesirableDedicatedRegisters();
    NarrowSearchSpaceUsingHeuristics();
  
    SmallVector<const Formula *, 8> Solution;
    Solve(Solution);
-  assert(Solution.size() == Uses.size() && "Malformed solution!");
  
    // Release memory that is no longer needed.
    Factors.clear();
@@ -3597,9 +3738,8 @@ void LSRInstance::print_fixups(raw_ostream &OS) const {
    OS << "LSR is examining the following fixup sites:\n";
    for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
         E = Fixups.end(); I != E; ++I) {
-    const LSRFixup &LF = *I;
      dbgs() << "  ";
-    LF.print(OS);
+    I->print(OS);
      OS << '\n';
    }
  }
@@ -3650,15 +3790,15 @@ private:
  }
  
  char LoopStrengthReduce::ID = 0;
-static RegisterPass<LoopStrengthReduce>
-X("loop-reduce", "Loop Strength Reduction");
+INITIALIZE_PASS(LoopStrengthReduce, "loop-reduce",
+                "Loop Strength Reduction", false, false);
  
  Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
    return new LoopStrengthReduce(TLI);
  }
  
  LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli)
-  : LoopPass(&ID), TLI(tli) {}
+  : LoopPass(ID), TLI(tli) {}
  
  void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
    // We split critical edges, so we change the CFG.  However, we do update