[SROA] Split the alignment computation complete for the memcpy rewriting

[oota-llvm.git] / lib / Transforms / Scalar / SROA.cpp
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp

index 5c55143a9b19db4e8941f6fb01e459bbdf01a7f1..859e3a79b7d5bd85b9deb59a2e10eb913201ea8c 100644 (file)
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -29,7 +29,6 @@
  #include "llvm/ADT/SetVector.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Dominators.h"
  #include "llvm/Analysis/Loads.h"
  #include "llvm/Analysis/PtrUseVisitor.h"
  #include "llvm/Analysis/ValueTracking.h"
@@ -38,6 +37,7 @@
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Instructions.h"
@@ -51,10 +51,17 @@
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/ErrorHandling.h"
  #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TimeValue.h"
  #include "llvm/Support/raw_ostream.h"
  #include "llvm/Transforms/Utils/Local.h"
  #include "llvm/Transforms/Utils/PromoteMemToReg.h"
  #include "llvm/Transforms/Utils/SSAUpdater.h"
+
+#if __cplusplus >= 201103L && !defined(NDEBUG)
+// We only use this for a debug check in C++11
+#include <random>
+#endif
+
  using namespace llvm;
  
  STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
@@ -73,6 +80,16 @@ STATISTIC(NumVectorized, "Number of vectorized aggregates");
  static cl::opt<bool>
  ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden);
  
+/// Hidden option to enable randomly shuffling the slices to help uncover
+/// instability in their order.
+static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices",
+                                             cl::init(false), cl::Hidden);
+
+/// Hidden option to experiment with completely strict handling of inbounds
+/// GEPs.
+static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds",
+                                        cl::init(false), cl::Hidden);
+
  namespace {
  /// \brief A custom IRBuilder inserter which prefixes all names if they are
  /// preserved.
@@ -244,8 +261,8 @@ public:
    void printUse(raw_ostream &OS, const_iterator I,
                  StringRef Indent = "  ") const;
    void print(raw_ostream &OS) const;
-  void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const;
-  void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const;
+  void dump(const_iterator I) const;
+  void dump() const;
  #endif
  
  private:
@@ -339,7 +356,7 @@ private:
                   bool IsSplittable = false) {
      // Completely skip uses which have a zero size or start either before or
      // past the end of the allocation.
-    if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) {
+    if (Size == 0 || Offset.uge(AllocSize)) {
        DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
                     << " which has zero size or starts outside of the "
                     << AllocSize << " byte alloca:\n"
@@ -380,6 +397,43 @@ private:
      if (GEPI.use_empty())
        return markAsDead(GEPI);
  
+    if (SROAStrictInbounds && GEPI.isInBounds()) {
+      // FIXME: This is a manually un-factored variant of the basic code inside
+      // of GEPs with checking of the inbounds invariant specified in the
+      // langref in a very strict sense. If we ever want to enable
+      // SROAStrictInbounds, this code should be factored cleanly into
+      // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
+      // by writing out the code here where we have the underlying allocation
+      // size readily available.
+      APInt GEPOffset = Offset;
+      for (gep_type_iterator GTI = gep_type_begin(GEPI),
+                             GTE = gep_type_end(GEPI);
+           GTI != GTE; ++GTI) {
+        ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
+        if (!OpC)
+          break;
+
+        // Handle a struct index, which adds its field offset to the pointer.
+        if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+          unsigned ElementIdx = OpC->getZExtValue();
+          const StructLayout *SL = DL.getStructLayout(STy);
+          GEPOffset +=
+              APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
+        } else {
+          // For array or vector indices, scale the index by the size of the type.
+          APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
+          GEPOffset += Index * APInt(Offset.getBitWidth(),
+                                     DL.getTypeAllocSize(GTI.getIndexedType()));
+        }
+
+        // If this index has computed an intermediate pointer which is not
+        // inbounds, then the result of the GEP is a poison value and we can
+        // delete it and all uses.
+        if (GEPOffset.ugt(AllocSize))
+          return markAsDead(GEPI);
+      }
+    }
+
      return Base::visitGetElementPtrInst(GEPI);
    }
  
@@ -426,8 +480,7 @@ private:
      // risk of overflow.
      // FIXME: We should instead consider the pointer to have escaped if this
      // function is being instrumented for addressing bugs or race conditions.
-    if (Offset.isNegative() || Size > AllocSize ||
-        Offset.ugt(AllocSize - Size)) {
+    if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
        DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
                     << " which extends past the end of the " << AllocSize
                     << " byte alloca:\n"
@@ -446,7 +499,7 @@ private:
      assert(II.getRawDest() == *U && "Pointer use is not the destination?");
      ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
      if ((Length && Length->getValue() == 0) ||
-        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
+        (IsOffsetKnown && Offset.uge(AllocSize)))
        // Zero-length mem transfer intrinsics can be ignored entirely.
        return markAsDead(II);
  
@@ -461,14 +514,30 @@ private:
  
    void visitMemTransferInst(MemTransferInst &II) {
      ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
-    if ((Length && Length->getValue() == 0) ||
-        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
+    if (Length && Length->getValue() == 0)
        // Zero-length mem transfer intrinsics can be ignored entirely.
        return markAsDead(II);
  
+    // Because we can visit these intrinsics twice, also check to see if the
+    // first time marked this instruction as dead. If so, skip it.
+    if (VisitedDeadInsts.count(&II))
+      return;
+
      if (!IsOffsetKnown)
        return PI.setAborted(&II);
  
+    // This side of the transfer is completely out-of-bounds, and so we can
+    // nuke the entire transfer. However, we also need to nuke the other side
+    // if already added to our partitions.
+    // FIXME: Yet another place we really should bypass this when
+    // instrumenting for ASan.
+    if (Offset.uge(AllocSize)) {
+      SmallDenseMap<Instruction *, unsigned>::iterator MTPI = MemTransferSliceMap.find(&II);
+      if (MTPI != MemTransferSliceMap.end())
+        S.Slices[MTPI->second].kill();
+      return markAsDead(II);
+    }
+
      uint64_t RawOffset = Offset.getLimitedValue();
      uint64_t Size = Length ? Length->getLimitedValue()
                             : AllocSize - RawOffset;
@@ -597,8 +666,7 @@ private:
      // themselves which should be replaced with undef.
      // FIXME: This should instead be escaped in the event we're instrumenting
      // for address sanitization.
-    if ((Offset.isNegative() && (-Offset).uge(PHISize)) ||
-        (!Offset.isNegative() && Offset.uge(AllocSize))) {
+    if (Offset.uge(AllocSize)) {
        S.DeadOperands.push_back(U);
        return;
      }
@@ -638,8 +706,7 @@ private:
      // themselves which should be replaced with undef.
      // FIXME: This should instead be escaped in the event we're instrumenting
      // for address sanitization.
-    if ((Offset.isNegative() && Offset.uge(SelectSize)) ||
-        (!Offset.isNegative() && Offset.uge(AllocSize))) {
+    if (Offset.uge(AllocSize)) {
        S.DeadOperands.push_back(U);
        return;
      }
@@ -674,6 +741,13 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
                                std::mem_fun_ref(&Slice::isDead)),
                 Slices.end());
  
+#if __cplusplus >= 201103L && !defined(NDEBUG)
+  if (SROARandomShuffleSlices) {
+    std::mt19937 MT(static_cast<unsigned>(sys::TimeValue::now().msec()));
+    std::shuffle(Slices.begin(), Slices.end(), MT);
+  }
+#endif
+
    // Sort the uses. This arranges for the offsets to be in ascending order,
    // and the sizes to be in descending order.
    std::sort(Slices.begin(), Slices.end());
@@ -712,8 +786,10 @@ void AllocaSlices::print(raw_ostream &OS) const {
      print(OS, I);
  }
  
-void AllocaSlices::dump(const_iterator I) const { print(dbgs(), I); }
-void AllocaSlices::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
+  print(dbgs(), I);
+}
+LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
  
  #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  
@@ -733,9 +809,9 @@ class AllocaPromoter : public LoadAndStorePromoter {
    SmallVector<DbgValueInst *, 4> DVIs;
  
  public:
-  AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
+  AllocaPromoter(const SmallVectorImpl<Instruction *> &Insts, SSAUpdater &S,
                   AllocaInst &AI, DIBuilder &DIB)
-    : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}
+      : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}
  
    void run(const SmallVectorImpl<Instruction*> &Insts) {
      // Retain the debug information attached to the alloca for use when
@@ -762,9 +838,30 @@ public:
  
    virtual bool isInstInList(Instruction *I,
                              const SmallVectorImpl<Instruction*> &Insts) const {
+    Value *Ptr;
      if (LoadInst *LI = dyn_cast<LoadInst>(I))
-      return LI->getOperand(0) == &AI;
-    return cast<StoreInst>(I)->getPointerOperand() == &AI;
+      Ptr = LI->getOperand(0);
+    else
+      Ptr = cast<StoreInst>(I)->getPointerOperand();
+
+    // Only used to detect cycles, which will be rare and quickly found as
+    // we're walking up a chain of defs rather than down through uses.
+    SmallPtrSet<Value *, 4> Visited;
+
+    do {
+      if (Ptr == &AI)
+        return true;
+
+      if (BitCastInst *BCI = dyn_cast<BitCastInst>(Ptr))
+        Ptr = BCI->getOperand(0);
+      else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr))
+        Ptr = GEPI->getPointerOperand();
+      else
+        return false;
+
+    } while (Visited.insert(Ptr));
+
+    return false;
    }
  
    virtual void updateDebugInfo(Instruction *Inst) const {
@@ -894,6 +991,7 @@ private:
                          ArrayRef<AllocaSlices::iterator> SplitUses);
    bool splitAlloca(AllocaInst &AI, AllocaSlices &S);
    bool runOnAlloca(AllocaInst &AI);
+  void clobberUse(Use &U);
    void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas);
    bool promoteAllocas(Function &F);
  };
@@ -907,7 +1005,7 @@ FunctionPass *llvm::createSROAPass(bool RequiresDomTree) {
  
  INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates",
                        false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
  INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates",
                      false, false)
  
@@ -917,6 +1015,11 @@ static Type *findCommonType(AllocaSlices::const_iterator B,
                              AllocaSlices::const_iterator E,
                              uint64_t EndOffset) {
    Type *Ty = 0;
+  bool TyIsCommon = true;
+  IntegerType *ITy = 0;
+
+  // Note that we need to look at *every* alloca slice's Use to ensure we
+  // always get consistent results regardless of the order of slices.
    for (AllocaSlices::const_iterator I = B; I != E; ++I) {
      Use *U = I->getUse();
      if (isa<IntrinsicInst>(*U->getUser()))
@@ -925,33 +1028,34 @@ static Type *findCommonType(AllocaSlices::const_iterator B,
        continue;
  
      Type *UserTy = 0;
-    if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser()))
+    if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
        UserTy = LI->getType();
-    else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser()))
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
        UserTy = SI->getValueOperand()->getType();
+    }
+
+    if (!UserTy || (Ty && Ty != UserTy))
+      TyIsCommon = false; // Give up on anything but an iN type.
      else
-      return 0; // Bail if we have weird uses.
+      Ty = UserTy;
  
-    if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) {
+    if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
        // If the type is larger than the partition, skip it. We only encounter
        // this for split integer operations where we want to use the type of the
-      // entity causing the split.
-      if (ITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
+      // entity causing the split. Also skip if the type is not a byte width
+      // multiple.
+      if (UserITy->getBitWidth() % 8 != 0 ||
+          UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
          continue;
  
-      // If we have found an integer type use covering the alloca, use that
-      // regardless of the other types, as integers are often used for a
-      // "bucket
-      // of bits" type.
-      return ITy;
+      // Track the largest bitwidth integer type used in this way in case there
+      // is no common type.
+      if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
+        ITy = UserITy;
      }
-
-    if (Ty && Ty != UserTy)
-      return 0;
-
-    Ty = UserTy;
    }
-  return Ty;
+
+  return TyIsCommon ? Ty : ITy;
  }
  
  /// PHI instructions that use an alloca and are subsequently loaded can be
@@ -1158,7 +1262,7 @@ static void speculateSelectInstLoads(SelectInst &SI) {
  /// This will return the BasePtr if that is valid, or build a new GEP
  /// instruction using the IRBuilder if GEP-ing is needed.
  static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
-                       SmallVectorImpl<Value *> &Indices) {
+                       SmallVectorImpl<Value *> &Indices, Twine NamePrefix) {
    if (Indices.empty())
      return BasePtr;
  
@@ -1167,7 +1271,7 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
    if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
      return BasePtr;
  
-  return IRB.CreateInBoundsGEP(BasePtr, Indices, "idx");
+  return IRB.CreateInBoundsGEP(BasePtr, Indices, NamePrefix + "sroa_idx");
  }
  
  /// \brief Get a natural GEP off of the BasePtr walking through Ty toward
@@ -1181,9 +1285,10 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
  /// indicated by Indices to have the correct offset.
  static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
                                      Value *BasePtr, Type *Ty, Type *TargetTy,
-                                    SmallVectorImpl<Value *> &Indices) {
+                                    SmallVectorImpl<Value *> &Indices,
+                                    Twine NamePrefix) {
    if (Ty == TargetTy)
-    return buildGEP(IRB, BasePtr, Indices);
+    return buildGEP(IRB, BasePtr, Indices, NamePrefix);
  
    // See if we can descend into a struct and locate a field with the correct
    // type.
@@ -1210,7 +1315,7 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
    if (ElementTy != TargetTy)
      Indices.erase(Indices.end() - NumLayers, Indices.end());
  
-  return buildGEP(IRB, BasePtr, Indices);
+  return buildGEP(IRB, BasePtr, Indices, NamePrefix);
  }
  
  /// \brief Recursively compute indices for a natural GEP.
@@ -1220,9 +1325,10 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
  static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
                                         Value *Ptr, Type *Ty, APInt &Offset,
                                         Type *TargetTy,
-                                       SmallVectorImpl<Value *> &Indices) {
+                                       SmallVectorImpl<Value *> &Indices,
+                                       Twine NamePrefix) {
    if (Offset == 0)
-    return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices);
+    return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices, NamePrefix);
  
    // We can't recurse through pointer types.
    if (Ty->isPointerTy())
@@ -1242,7 +1348,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
      Offset -= NumSkippedElements * ElementSize;
      Indices.push_back(IRB.getInt(NumSkippedElements));
      return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(),
-                                    Offset, TargetTy, Indices);
+                                    Offset, TargetTy, Indices, NamePrefix);
    }
  
    if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
@@ -1255,7 +1361,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
      Offset -= NumSkippedElements * ElementSize;
      Indices.push_back(IRB.getInt(NumSkippedElements));
      return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
-                                    Indices);
+                                    Indices, NamePrefix);
    }
  
    StructType *STy = dyn_cast<StructType>(Ty);
@@ -1274,7 +1380,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
  
    Indices.push_back(IRB.getInt32(Index));
    return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
-                                  Indices);
+                                  Indices, NamePrefix);
  }
  
  /// \brief Get a natural GEP from a base pointer to a particular offset and
@@ -1289,7 +1395,8 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
  /// If no natural GEP can be constructed, this function returns null.
  static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
                                        Value *Ptr, APInt Offset, Type *TargetTy,
-                                      SmallVectorImpl<Value *> &Indices) {
+                                      SmallVectorImpl<Value *> &Indices,
+                                      Twine NamePrefix) {
    PointerType *Ty = cast<PointerType>(Ptr->getType());
  
    // Don't consider any GEPs through an i8* as natural unless the TargetTy is
@@ -1308,7 +1415,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
    Offset -= NumSkippedElements * ElementSize;
    Indices.push_back(IRB.getInt(NumSkippedElements));
    return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
-                                  Indices);
+                                  Indices, NamePrefix);
  }
  
  /// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
@@ -1326,8 +1433,9 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
  /// properties. The algorithm tries to fold as many constant indices into
  /// a single GEP as possible, thus making each GEP more independent of the
  /// surrounding code.
-static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL,
-                             Value *Ptr, APInt Offset, Type *PointerTy) {
+static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
+                             APInt Offset, Type *PointerTy,
+                             Twine NamePrefix) {
    // Even though we don't look through PHI nodes, we could be called on an
    // instruction in an unreachable block, which may be on a cycle.
    SmallPtrSet<Value *, 4> Visited;
@@ -1361,7 +1469,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL,
      // See if we can perform a natural GEP here.
      Indices.clear();
      if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
-                                           Indices)) {
+                                           Indices, NamePrefix)) {
        if (P->getType() == PointerTy) {
          // Zap any offset pointer that we ended up computing in previous rounds.
          if (OffsetPtr && OffsetPtr->use_empty())
@@ -1396,19 +1504,19 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL,
    if (!OffsetPtr) {
      if (!Int8Ptr) {
        Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(),
-                                  "raw_cast");
+                                  NamePrefix + "sroa_raw_cast");
        Int8PtrOffset = Offset;
      }
  
      OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr :
        IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
-                            "raw_idx");
+                            NamePrefix + "sroa_raw_idx");
    }
    Ptr = OffsetPtr;
  
    // On the off chance we were targeting i8*, guard the bitcast here.
    if (Ptr->getType() != PointerTy)
-    Ptr = IRB.CreateBitCast(Ptr, PointerTy, "cast");
+    Ptr = IRB.CreateBitCast(Ptr, PointerTy, NamePrefix + "sroa_cast");
  
    return Ptr;
  }
@@ -1431,6 +1539,10 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
    if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
      return false;
  
+  // We can convert pointers to integers and vice-versa. Same for vectors
+  // of pointers and integers.
+  OldTy = OldTy->getScalarType();
+  NewTy = NewTy->getScalarType();
    if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
      if (NewTy->isPointerTy() && OldTy->isPointerTy())
        return true;
@@ -1449,21 +1561,53 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
  /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
  /// two types for viability with this routine.
  static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
-                           Type *Ty) {
-  assert(canConvertValue(DL, V->getType(), Ty) &&
-         "Value not convertable to type");
-  if (V->getType() == Ty)
+                           Type *NewTy) {
+  Type *OldTy = V->getType();
+  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
+
+  if (OldTy == NewTy)
      return V;
-  if (IntegerType *OldITy = dyn_cast<IntegerType>(V->getType()))
-    if (IntegerType *NewITy = dyn_cast<IntegerType>(Ty))
+
+  if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy))
+    if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy))
        if (NewITy->getBitWidth() > OldITy->getBitWidth())
          return IRB.CreateZExt(V, NewITy);
-  if (V->getType()->isIntegerTy() && Ty->isPointerTy())
-    return IRB.CreateIntToPtr(V, Ty);
-  if (V->getType()->isPointerTy() && Ty->isIntegerTy())
-    return IRB.CreatePtrToInt(V, Ty);
  
-  return IRB.CreateBitCast(V, Ty);
+  // See if we need inttoptr for this type pair. A cast involving both scalars
+  // and vectors requires an additional bitcast.
+  if (OldTy->getScalarType()->isIntegerTy() &&
+      NewTy->getScalarType()->isPointerTy()) {
+    // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
+    if (OldTy->isVectorTy() && !NewTy->isVectorTy())
+      return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+                                NewTy);
+
+    // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
+    if (!OldTy->isVectorTy() && NewTy->isVectorTy())
+      return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+                                NewTy);
+
+    return IRB.CreateIntToPtr(V, NewTy);
+  }
+
+  // See if we need ptrtoint for this type pair. A cast involving both scalars
+  // and vectors requires an additional bitcast.
+  if (OldTy->getScalarType()->isPointerTy() &&
+      NewTy->getScalarType()->isIntegerTy()) {
+    // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
+    if (OldTy->isVectorTy() && !NewTy->isVectorTy())
+      return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+                               NewTy);
+
+    // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
+    if (!OldTy->isVectorTy() && NewTy->isVectorTy())
+      return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+                               NewTy);
+
+    return IRB.CreatePtrToInt(V, NewTy);
+  }
+
+  return IRB.CreateBitCast(V, NewTy);
  }
  
  /// \brief Test whether the given slice use can be promoted to a vector.
@@ -1865,16 +2009,22 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
    // integer type will be stored here for easy access during rewriting.
    IntegerType *IntTy;
  
-  // The offset of the slice currently being rewritten.
+  // The original offset of the slice currently being rewritten relative to
+  // the original alloca.
    uint64_t BeginOffset, EndOffset;
+  // The new offsets of the slice currently being rewritten relative to the
+  // original alloca.
+  uint64_t NewBeginOffset, NewEndOffset;
+
+  uint64_t SliceSize;
    bool IsSplittable;
    bool IsSplit;
    Use *OldUse;
    Instruction *OldPtr;
  
-  // Output members carrying state about the result of visiting and rewriting
-  // the slice of the alloca.
-  bool IsUsedByRewrittenSpeculatableInstructions;
+  // Track post-rewrite users which are PHI nodes and Selects.
+  SmallPtrSetImpl<PHINode *> &PHIUsers;
+  SmallPtrSetImpl<SelectInst *> &SelectUsers;
  
    // Utility IR builder, whose name prefix is setup for each visited use, and
    // the insertion point is set to point to the user.
@@ -1883,11 +2033,14 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
  public:
    AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &S, SROA &Pass,
                        AllocaInst &OldAI, AllocaInst &NewAI,
-                      uint64_t NewBeginOffset, uint64_t NewEndOffset,
-                      bool IsVectorPromotable = false,
-                      bool IsIntegerPromotable = false)
+                      uint64_t NewAllocaBeginOffset,
+                      uint64_t NewAllocaEndOffset, bool IsVectorPromotable,
+                      bool IsIntegerPromotable,
+                      SmallPtrSetImpl<PHINode *> &PHIUsers,
+                      SmallPtrSetImpl<SelectInst *> &SelectUsers)
        : DL(DL), S(S), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
-        NewAllocaBeginOffset(NewBeginOffset), NewAllocaEndOffset(NewEndOffset),
+        NewAllocaBeginOffset(NewAllocaBeginOffset),
+        NewAllocaEndOffset(NewAllocaEndOffset),
          NewAllocaTy(NewAI.getAllocatedType()),
          VecTy(IsVectorPromotable ? cast<VectorType>(NewAllocaTy) : 0),
          ElementTy(VecTy ? VecTy->getElementType() : 0),
@@ -1898,7 +2051,7 @@ public:
                          DL.getTypeSizeInBits(NewAI.getAllocatedType()))
                    : 0),
          BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
-        OldPtr(), IsUsedByRewrittenSpeculatableInstructions(false),
+        OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
          IRB(NewAI.getContext(), ConstantFolder()) {
      if (VecTy) {
        assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
@@ -1917,6 +2070,14 @@ public:
      IsSplit =
          BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
  
+    // Compute the intersecting offset range.
+    assert(BeginOffset < NewAllocaEndOffset);
+    assert(EndOffset > NewAllocaBeginOffset);
+    NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
+    NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
+
+    SliceSize = NewEndOffset - NewBeginOffset;
+
      OldUse = I->getUse();
      OldPtr = cast<Instruction>(OldUse->get());
  
@@ -1931,20 +2092,6 @@ public:
      return CanSROA;
    }
  
-  /// \brief Query whether this slice is used by speculatable instructions after
-  /// rewriting.
-  ///
-  /// These instructions (PHIs and Selects currently) require the alloca slice
-  /// to run back through the rewriter. Thus, they are promotable, but not on
-  /// this iteration. This is distinct from a slice which is unpromotable for
-  /// some other reason, in which case we don't even want to perform the
-  /// speculation. This can be querried at any time and reflects whether (at
-  /// that point) a visit call has rewritten a speculatable instruction on the
-  /// current slice.
-  bool isUsedByRewrittenSpeculatableInstructions() const {
-    return IsUsedByRewrittenSpeculatableInstructions;
-  }
-
  private:
    // Make sure the other visit overloads are visible.
    using Base::visit;
@@ -1955,30 +2102,53 @@ private:
      llvm_unreachable("No rewrite rule for this instruction!");
    }
  
-  Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, uint64_t Offset,
-                              Type *PointerTy) {
-    assert(Offset >= NewAllocaBeginOffset);
-    return getAdjustedPtr(IRB, DL, &NewAI, APInt(DL.getPointerSizeInBits(),
-                                                 Offset - NewAllocaBeginOffset),
-                          PointerTy);
+  Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
+    // Note that the offset computation can use BeginOffset or NewBeginOffset
+    // interchangeably for unsplit slices.
+    assert(IsSplit || BeginOffset == NewBeginOffset);
+    uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+
+#ifndef NDEBUG
+    StringRef OldName = OldPtr->getName();
+    // Skip through the last '.sroa.' component of the name.
+    size_t LastSROAPrefix = OldName.rfind(".sroa.");
+    if (LastSROAPrefix != StringRef::npos) {
+      OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
+      // Look for an SROA slice index.
+      size_t IndexEnd = OldName.find_first_not_of("0123456789");
+      if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
+        // Strip the index and look for the offset.
+        OldName = OldName.substr(IndexEnd + 1);
+        size_t OffsetEnd = OldName.find_first_not_of("0123456789");
+        if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
+          // Strip the offset.
+          OldName = OldName.substr(OffsetEnd + 1);
+      }
+    }
+    // Strip any SROA suffixes as well.
+    OldName = OldName.substr(0, OldName.find(".sroa_"));
+#endif
+
+    return getAdjustedPtr(IRB, DL, &NewAI,
+                          APInt(DL.getPointerSizeInBits(), Offset), PointerTy,
+#ifndef NDEBUG
+                          Twine(OldName) + "."
+#else
+                          Twine()
+#endif
+                          );
    }
  
-  /// \brief Compute suitable alignment to access an offset into the new alloca.
-  unsigned getOffsetAlign(uint64_t Offset) {
+  /// \brief Compute suitable alignment to access this slice of the *new* alloca.
+  ///
+  /// You can optionally pass a type to this routine and if that type's ABI
+  /// alignment is itself suitable, this will return zero.
+  unsigned getSliceAlign(Type *Ty = 0) {
      unsigned NewAIAlign = NewAI.getAlignment();
      if (!NewAIAlign)
        NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType());
-    return MinAlign(NewAIAlign, Offset);
-  }
-
-  /// \brief Compute suitable alignment to access a type at an offset of the
-  /// new alloca.
-  ///
-  /// \returns zero if the type's ABI alignment is a suitable alignment,
-  /// otherwise returns the maximal suitable alignment.
-  unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) {
-    unsigned Align = getOffsetAlign(Offset);
-    return Align == DL.getABITypeAlignment(Ty) ? 0 : Align;
+    unsigned Align = MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset);
+    return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align;
    }
  
    unsigned getIndex(uint64_t Offset) {
@@ -1996,8 +2166,7 @@ private:
        Pass.DeadInsts.insert(I);
    }
  
-  Value *rewriteVectorizedLoadInst(uint64_t NewBeginOffset,
-                                   uint64_t NewEndOffset) {
+  Value *rewriteVectorizedLoadInst() {
      unsigned BeginIndex = getIndex(NewBeginOffset);
      unsigned EndIndex = getIndex(NewEndOffset);
      assert(EndIndex > BeginIndex && "Empty vector!");
@@ -2007,8 +2176,7 @@ private:
      return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
    }
  
-  Value *rewriteIntegerLoad(LoadInst &LI, uint64_t NewBeginOffset,
-                            uint64_t NewEndOffset) {
+  Value *rewriteIntegerLoad(LoadInst &LI) {
      assert(IntTy && "We cannot insert an integer to the alloca");
      assert(!LI.isVolatile());
      Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
@@ -2027,32 +2195,23 @@ private:
      Value *OldOp = LI.getOperand(0);
      assert(OldOp == OldPtr);
  
-    // Compute the intersecting offset range.
-    assert(BeginOffset < NewAllocaEndOffset);
-    assert(EndOffset > NewAllocaBeginOffset);
-    uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
-    uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
-
-    uint64_t Size = NewEndOffset - NewBeginOffset;
-
-    Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), Size * 8)
+    Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
                               : LI.getType();
      bool IsPtrAdjusted = false;
      Value *V;
      if (VecTy) {
-      V = rewriteVectorizedLoadInst(NewBeginOffset, NewEndOffset);
+      V = rewriteVectorizedLoadInst();
      } else if (IntTy && LI.getType()->isIntegerTy()) {
-      V = rewriteIntegerLoad(LI, NewBeginOffset, NewEndOffset);
+      V = rewriteIntegerLoad(LI);
      } else if (NewBeginOffset == NewAllocaBeginOffset &&
                 canConvertValue(DL, NewAllocaTy, LI.getType())) {
        V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                LI.isVolatile(), "load");
+                                LI.isVolatile(), LI.getName());
      } else {
        Type *LTy = TargetTy->getPointerTo();
-      V = IRB.CreateAlignedLoad(
-          getAdjustedAllocaPtr(IRB, NewBeginOffset, LTy),
-          getOffsetTypeAlign(TargetTy, NewBeginOffset - NewAllocaBeginOffset),
-          LI.isVolatile(), "load");
+      V = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
+                                getSliceAlign(TargetTy), LI.isVolatile(),
+                                LI.getName());
        IsPtrAdjusted = true;
      }
      V = convertValue(DL, IRB, V, TargetTy);
@@ -2061,7 +2220,7 @@ private:
        assert(!LI.isVolatile());
        assert(LI.getType()->isIntegerTy() &&
               "Only integer type loads and stores are split");
-      assert(Size < DL.getTypeStoreSize(LI.getType()) &&
+      assert(SliceSize < DL.getTypeStoreSize(LI.getType()) &&
               "Split load isn't smaller than original load");
        assert(LI.getType()->getIntegerBitWidth() ==
               DL.getTypeStoreSizeInBits(LI.getType()) &&
@@ -2089,9 +2248,7 @@ private:
      return !LI.isVolatile() && !IsPtrAdjusted;
    }
  
-  bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
-                                  uint64_t NewBeginOffset,
-                                  uint64_t NewEndOffset) {
+  bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) {
      if (V->getType() != VecTy) {
        unsigned BeginIndex = getIndex(NewBeginOffset);
        unsigned EndIndex = getIndex(NewEndOffset);
@@ -2117,8 +2274,7 @@ private:
      return true;
    }
  
-  bool rewriteIntegerStore(Value *V, StoreInst &SI,
-                           uint64_t NewBeginOffset, uint64_t NewEndOffset) {
+  bool rewriteIntegerStore(Value *V, StoreInst &SI) {
      assert(IntTy && "We cannot extract an integer from the alloca");
      assert(!SI.isVolatile());
      if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
@@ -2151,30 +2307,22 @@ private:
        if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
          Pass.PostPromotionWorklist.insert(AI);
  
-    // Compute the intersecting offset range.
-    assert(BeginOffset < NewAllocaEndOffset);
-    assert(EndOffset > NewAllocaBeginOffset);
-    uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
-    uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
-
-    uint64_t Size = NewEndOffset - NewBeginOffset;
-    if (Size < DL.getTypeStoreSize(V->getType())) {
+    if (SliceSize < DL.getTypeStoreSize(V->getType())) {
        assert(!SI.isVolatile());
        assert(V->getType()->isIntegerTy() &&
               "Only integer type loads and stores are split");
        assert(V->getType()->getIntegerBitWidth() ==
               DL.getTypeStoreSizeInBits(V->getType()) &&
               "Non-byte-multiple bit width");
-      IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8);
+      IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
        V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset,
                           "extract");
      }
  
      if (VecTy)
-      return rewriteVectorizedStoreInst(V, SI, OldOp, NewBeginOffset,
-                                        NewEndOffset);
+      return rewriteVectorizedStoreInst(V, SI, OldOp);
      if (IntTy && V->getType()->isIntegerTy())
-      return rewriteIntegerStore(V, SI, NewBeginOffset, NewEndOffset);
+      return rewriteIntegerStore(V, SI);
  
      StoreInst *NewSI;
      if (NewBeginOffset == NewAllocaBeginOffset &&
@@ -2184,12 +2332,9 @@ private:
        NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
                                       SI.isVolatile());
      } else {
-      Value *NewPtr = getAdjustedAllocaPtr(IRB, NewBeginOffset,
-                                           V->getType()->getPointerTo());
-      NewSI = IRB.CreateAlignedStore(
-          V, NewPtr, getOffsetTypeAlign(
-                         V->getType(), NewBeginOffset - NewAllocaBeginOffset),
-          SI.isVolatile());
+      Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo());
+      NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
+                                     SI.isVolatile());
      }
      (void)NewSI;
      Pass.DeadInsts.insert(&SI);
@@ -2241,11 +2386,10 @@ private:
      // pointer to the new alloca.
      if (!isa<Constant>(II.getLength())) {
        assert(!IsSplit);
-      assert(BeginOffset >= NewAllocaBeginOffset);
-      II.setDest(
-          getAdjustedAllocaPtr(IRB, BeginOffset, II.getRawDest()->getType()));
+      assert(NewBeginOffset == BeginOffset);
+      II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
        Type *CstTy = II.getAlignmentCst()->getType();
-      II.setAlignment(ConstantInt::get(CstTy, getOffsetAlign(BeginOffset)));
+      II.setAlignment(ConstantInt::get(CstTy, getSliceAlign()));
  
        deleteIfTriviallyDead(OldPtr);
        return false;
@@ -2257,13 +2401,6 @@ private:
      Type *AllocaTy = NewAI.getAllocatedType();
      Type *ScalarTy = AllocaTy->getScalarType();
  
-    // Compute the intersecting offset range.
-    assert(BeginOffset < NewAllocaEndOffset);
-    assert(EndOffset > NewAllocaBeginOffset);
-    uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
-    uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
-    uint64_t SliceOffset = NewBeginOffset - NewAllocaBeginOffset;
-
      // If this doesn't map cleanly onto the alloca type, and that type isn't
      // a single value type, just emit a memset.
      if (!VecTy && !IntTy &&
@@ -2275,8 +2412,8 @@ private:
        Type *SizeTy = II.getLength()->getType();
        Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
        CallInst *New = IRB.CreateMemSet(
-          getAdjustedAllocaPtr(IRB, NewBeginOffset, II.getRawDest()->getType()),
-          II.getValue(), Size, getOffsetAlign(SliceOffset), II.isVolatile());
+          getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
+          getSliceAlign(), II.isVolatile());
        (void)New;
        DEBUG(dbgs() << "          to: " << *New << "\n");
        return false;
@@ -2353,25 +2490,11 @@ private:
  
      DEBUG(dbgs() << "    original: " << II << "\n");
  
-    // Compute the intersecting offset range.
-    assert(BeginOffset < NewAllocaEndOffset);
-    assert(EndOffset > NewAllocaBeginOffset);
-    uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
-    uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
+    bool IsDest = &II.getRawDestUse() == OldUse;
+    assert((IsDest && II.getRawDest() == OldPtr) ||
+           (!IsDest && II.getRawSource() == OldPtr));
  
-    assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr);
-    bool IsDest = II.getRawDest() == OldPtr;
-
-    // Compute the relative offset within the transfer.
-    unsigned IntPtrWidth = DL.getPointerSizeInBits();
-    APInt RelOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
-
-    unsigned Align = II.getAlignment();
-    uint64_t SliceOffset = NewBeginOffset - NewAllocaBeginOffset;
-    if (Align > 1)
-      Align =
-          MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(),
-                   MinAlign(II.getAlignment(), getOffsetAlign(SliceOffset)));
+    unsigned SliceAlign = getSliceAlign();
  
      // For unsplit intrinsics, we simply modify the source and destination
      // pointers in place. This isn't just an optimization, it is a matter of
@@ -2381,19 +2504,20 @@ private:
      // memcpy, and so simply updating the pointers is necessary for us to
      // update both source and dest of a single call.
      if (!IsSplittable) {
-      Value *OldOp = IsDest ? II.getRawDest() : II.getRawSource();
+      Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
        if (IsDest)
-        II.setDest(
-            getAdjustedAllocaPtr(IRB, BeginOffset, II.getRawDest()->getType()));
+        II.setDest(AdjustedPtr);
        else
-        II.setSource(getAdjustedAllocaPtr(IRB, BeginOffset,
-                                          II.getRawSource()->getType()));
+        II.setSource(AdjustedPtr);
  
-      Type *CstTy = II.getAlignmentCst()->getType();
-      II.setAlignment(ConstantInt::get(CstTy, Align));
+      if (II.getAlignment() > SliceAlign) {
+        Type *CstTy = II.getAlignmentCst()->getType();
+        II.setAlignment(
+            ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign)));
+      }
  
        DEBUG(dbgs() << "          to: " << II << "\n");
-      deleteIfTriviallyDead(OldOp);
+      deleteIfTriviallyDead(OldPtr);
        return false;
      }
      // For split transfer intrinsics we have an incredibly useful assurance:
@@ -2429,37 +2553,38 @@ private:
      // alloca that should be re-examined after rewriting this instruction.
      Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
      if (AllocaInst *AI
-          = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets()))
+          = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
+      assert(AI != &OldAI && AI != &NewAI &&
+             "Splittable transfers cannot reach the same alloca on both ends.");
        Pass.Worklist.insert(AI);
+    }
+
+    // Compute the relative offset for the other pointer within the transfer.
+    unsigned IntPtrWidth = DL.getPointerSizeInBits();
+    APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
+    unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1,
+                                   OtherOffset.zextOrTrunc(64).getZExtValue());
  
      if (EmitMemCpy) {
-      Type *OtherPtrTy = IsDest ? II.getRawSource()->getType()
-                                : II.getRawDest()->getType();
+      Type *OtherPtrTy = OtherPtr->getType();
  
        // Compute the other pointer, folding as much as possible to produce
        // a single, simple GEP in most cases.
-      OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, RelOffset, OtherPtrTy);
+      OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+                                OtherPtr->getName() + ".");
  
-      Value *OurPtr = getAdjustedAllocaPtr(
-          IRB, NewBeginOffset,
-          IsDest ? II.getRawDest()->getType() : II.getRawSource()->getType());
+      Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
        Type *SizeTy = II.getLength()->getType();
        Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
  
-      CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr,
-                                       IsDest ? OtherPtr : OurPtr,
-                                       Size, Align, II.isVolatile());
+      CallInst *New = IRB.CreateMemCpy(
+          IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size,
+          MinAlign(SliceAlign, OtherAlign), II.isVolatile());
        (void)New;
        DEBUG(dbgs() << "          to: " << *New << "\n");
        return false;
      }
  
-    // Note that we clamp the alignment to 1 here as a 0 alignment for a memcpy
-    // is equivalent to 1, but that isn't true if we end up rewriting this as
-    // a load or store.
-    if (!Align)
-      Align = 1;
-
      bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
                           NewEndOffset == NewAllocaEndOffset;
      uint64_t Size = NewEndOffset - NewBeginOffset;
@@ -2481,10 +2606,15 @@ private:
        OtherPtrTy = SubIntTy->getPointerTo();
      }
  
-    Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, RelOffset, OtherPtrTy);
+    Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+                                   OtherPtr->getName() + ".");
+    unsigned SrcAlign = OtherAlign;
      Value *DstPtr = &NewAI;
-    if (!IsDest)
+    unsigned DstAlign = SliceAlign;
+    if (!IsDest) {
        std::swap(SrcPtr, DstPtr);
+      std::swap(SrcAlign, DstAlign);
+    }
  
      Value *Src;
      if (VecTy && !IsWholeAlloca && !IsDest) {
@@ -2498,7 +2628,7 @@ private:
        uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
        Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
      } else {
-      Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(),
+      Src = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(),
                                    "copyload");
      }
  
@@ -2516,7 +2646,7 @@ private:
      }
  
      StoreInst *Store = cast<StoreInst>(
-      IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile()));
+        IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
      (void)Store;
      DEBUG(dbgs() << "          to: " << *Store << "\n");
      return !II.isVolatile();
@@ -2528,20 +2658,13 @@ private:
      DEBUG(dbgs() << "    original: " << II << "\n");
      assert(II.getArgOperand(1) == OldPtr);
  
-    // Compute the intersecting offset range.
-    assert(BeginOffset < NewAllocaEndOffset);
-    assert(EndOffset > NewAllocaBeginOffset);
-    uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
-    uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
-
      // Record this instruction for deletion.
      Pass.DeadInsts.insert(&II);
  
      ConstantInt *Size
        = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
                           NewEndOffset - NewBeginOffset);
-    Value *Ptr =
-        getAdjustedAllocaPtr(IRB, NewBeginOffset, II.getArgOperand(1)->getType());
+    Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
      Value *New;
      if (II.getIntrinsicID() == Intrinsic::lifetime_start)
        New = IRB.CreateLifetimeStart(Ptr, Size);
@@ -2562,28 +2685,22 @@ private:
      // as local as possible to the PHI. To do that, we re-use the location of
      // the old pointer, which necessarily must be in the right position to
      // dominate the PHI.
-    IRBuilderTy PtrBuilder(OldPtr);
-    PtrBuilder.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) +
-                             ".");
+    IRBuilderTy PtrBuilder(IRB);
+    PtrBuilder.SetInsertPoint(OldPtr);
+    PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc());
  
-    Value *NewPtr =
-        getAdjustedAllocaPtr(PtrBuilder, BeginOffset, OldPtr->getType());
+    Value *NewPtr = getNewAllocaSlicePtr(PtrBuilder, OldPtr->getType());
      // Replace the operands which were using the old pointer.
      std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
  
      DEBUG(dbgs() << "          to: " << PN << "\n");
      deleteIfTriviallyDead(OldPtr);
  
-    // Check whether we can speculate this PHI node, and if so remember that
-    // fact and queue it up for another iteration after the speculation
-    // occurs.
-    if (isSafePHIToSpeculate(PN, &DL)) {
-      Pass.SpeculatablePHIs.insert(&PN);
-      IsUsedByRewrittenSpeculatableInstructions = true;
-      return true;
-    }
-
-    return false; // PHIs can't be promoted on their own.
+    // PHIs can't be promoted on their own, but often can be speculated. We
+    // check the speculation outside of the rewriter so that we see the
+    // fully-rewritten alloca.
+    PHIUsers.insert(&PN);
+    return true;
    }
  
    bool visitSelectInst(SelectInst &SI) {
@@ -2593,7 +2710,7 @@ private:
      assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
      assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
  
-    Value *NewPtr = getAdjustedAllocaPtr(IRB, BeginOffset, OldPtr->getType());
+    Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
      // Replace the operands which were using the old pointer.
      if (SI.getOperand(1) == OldPtr)
        SI.setOperand(1, NewPtr);
@@ -2603,16 +2720,11 @@ private:
      DEBUG(dbgs() << "          to: " << SI << "\n");
      deleteIfTriviallyDead(OldPtr);
  
-    // Check whether we can speculate this select instruction, and if so
-    // remember that fact and queue it up for another iteration after the
-    // speculation occurs.
-    if (isSafeSelectToSpeculate(SI, &DL)) {
-      Pass.SpeculatableSelects.insert(&SI);
-      IsUsedByRewrittenSpeculatableInstructions = true;
-      return true;
-    }
-
-    return false; // Selects can't be promoted on their own.
+    // Selects can't be promoted on their own, but often can be speculated. We
+    // check the speculation outside of the rewriter so that we see the
+    // fully-rewritten alloca.
+    SelectUsers.insert(&SI);
+    return true;
    }
  
  };
@@ -3048,17 +3160,17 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S,
                 << "[" << BeginOffset << "," << EndOffset << ") to: " << *NewAI
                 << "\n");
  
-  // Track the high watermark on several worklists that are only relevant for
+  // Track the high watermark on the worklist as it is only relevant for
    // promoted allocas. We will reset it to this point if the alloca is not in
    // fact scheduled for promotion.
    unsigned PPWOldSize = PostPromotionWorklist.size();
-  unsigned SPOldSize = SpeculatablePHIs.size();
-  unsigned SSOldSize = SpeculatableSelects.size();
    unsigned NumUses = 0;
+  SmallPtrSet<PHINode *, 8> PHIUsers;
+  SmallPtrSet<SelectInst *, 8> SelectUsers;
  
    AllocaSliceRewriter Rewriter(*DL, S, *this, AI, *NewAI, BeginOffset,
                                 EndOffset, IsVectorPromotable,
-                               IsIntegerPromotable);
+                               IsIntegerPromotable, PHIUsers, SelectUsers);
    bool Promotable = true;
    for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(),
                                                          SUE = SplitUses.end();
@@ -3079,33 +3191,55 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S,
    MaxUsesPerAllocaPartition =
        std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition);
  
-  if (Promotable && !Rewriter.isUsedByRewrittenSpeculatableInstructions()) {
-    DEBUG(dbgs() << "  and queuing for promotion\n");
-    PromotableAllocas.push_back(NewAI);
-  } else if (NewAI != &AI ||
-             (Promotable &&
-              Rewriter.isUsedByRewrittenSpeculatableInstructions())) {
+  // Now that we've processed all the slices in the new partition, check if any
+  // PHIs or Selects would block promotion.
+  for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(),
+                                            E = PHIUsers.end();
+       I != E; ++I)
+    if (!isSafePHIToSpeculate(**I, DL)) {
+      Promotable = false;
+      PHIUsers.clear();
+      SelectUsers.clear();
+      break;
+    }
+  for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(),
+                                               E = SelectUsers.end();
+       I != E; ++I)
+    if (!isSafeSelectToSpeculate(**I, DL)) {
+      Promotable = false;
+      PHIUsers.clear();
+      SelectUsers.clear();
+      break;
+    }
+
+  if (Promotable) {
+    if (PHIUsers.empty() && SelectUsers.empty()) {
+      // Promote the alloca.
+      PromotableAllocas.push_back(NewAI);
+    } else {
+      // If we have either PHIs or Selects to speculate, add them to those
+      // worklists and re-queue the new alloca so that we promote it on the
+      // next iteration.
+      for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(),
+                                                E = PHIUsers.end();
+           I != E; ++I)
+        SpeculatablePHIs.insert(*I);
+      for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(),
+                                                   E = SelectUsers.end();
+           I != E; ++I)
+        SpeculatableSelects.insert(*I);
+      Worklist.insert(NewAI);
+    }
+  } else {
      // If we can't promote the alloca, iterate on it to check for new
      // refinements exposed by splitting the current alloca. Don't iterate on an
      // alloca which didn't actually change and didn't get promoted.
-    //
-    // Alternatively, if we could promote the alloca but have speculatable
-    // instructions then we will speculate them after finishing our processing
-    // of the original alloca. Mark the new one for re-visiting in the next
-    // iteration so the speculated operations can be rewritten.
-    //
-    // FIXME: We should actually track whether the rewriter changed anything.
-    Worklist.insert(NewAI);
-  }
-
-  // Drop any post-promotion work items if promotion didn't happen.
-  if (!Promotable) {
+    if (NewAI != &AI)
+      Worklist.insert(NewAI);
+
+    // Drop any post-promotion work items if promotion didn't happen.
      while (PostPromotionWorklist.size() > PPWOldSize)
        PostPromotionWorklist.pop_back();
-    while (SpeculatablePHIs.size() > SPOldSize)
-      SpeculatablePHIs.pop_back();
-    while (SpeculatableSelects.size() > SSOldSize)
-      SpeculatableSelects.pop_back();
    }
  
    return true;
@@ -3260,6 +3394,21 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &S) {
    return Changed;
  }
  
+/// \brief Overwrite the value held by a use with undef and, if that leaves the
+/// previously used instruction trivially dead, queue it in DeadInsts.
+void SROA::clobberUse(Use &U) {
+  // Remember what the use referred to before it is overwritten below.
+  Value *PrevValue = U;
+  U = UndefValue::get(PrevValue->getType());
+
+  // Only an instruction that became trivially dead is queued for deletion.
+  Instruction *PrevInst = dyn_cast<Instruction>(PrevValue);
+  if (!PrevInst || !isInstructionTriviallyDead(PrevInst))
+    return;
+  // Collect it so that the uses of any alloca end up being minimal.
+  DeadInsts.insert(PrevInst);
+}
+
  /// \brief Analyze an alloca for SROA.
  ///
  /// This analyzes the alloca to ensure we can reason about it, builds
@@ -3297,21 +3446,23 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
    for (AllocaSlices::dead_user_iterator DI = S.dead_user_begin(),
                                          DE = S.dead_user_end();
         DI != DE; ++DI) {
-    Changed = true;
+    // Free up everything used by this instruction.
+    for (User::op_iterator DOI = (*DI)->op_begin(), DOE = (*DI)->op_end();
+         DOI != DOE; ++DOI)
+      clobberUse(*DOI);
+
+    // Now replace the uses of this instruction.
      (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType()));
+
+    // And mark it for deletion.
      DeadInsts.insert(*DI);
+    Changed = true;
    }
    for (AllocaSlices::dead_op_iterator DO = S.dead_op_begin(),
                                        DE = S.dead_op_end();
         DO != DE; ++DO) {
-    Value *OldV = **DO;
-    // Clobber the use with an undef value.
-    **DO = UndefValue::get(OldV->getType());
-    if (Instruction *OldI = dyn_cast<Instruction>(OldV))
-      if (isInstructionTriviallyDead(OldI)) {
-        Changed = true;
-        DeadInsts.insert(OldI);
-      }
+    clobberUse(**DO);
+    Changed = true;
    }
  
    // No slices to split. Leave the dead alloca for a later pass to clean up.
@@ -3364,12 +3515,12 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
  }
  
  static void enqueueUsersInWorklist(Instruction &I, // Queue the users of I.
-                                   SmallVectorImpl<Use *> &UseWorklist,
-                                   SmallPtrSet<Use *, 8> &VisitedUses) {
+                                   SmallVectorImpl<Instruction *> &Worklist,
+                                   SmallPtrSet<Instruction *, 8> &Visited) {
    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
         ++UI) // Walk every use of I.
-    if (VisitedUses.insert(&UI.getUse()))
-      UseWorklist.push_back(&UI.getUse());
+    if (Visited.insert(cast<Instruction>(*UI))) // Users must be instructions.
+      Worklist.push_back(cast<Instruction>(*UI)); // Pushed only if insert() is true.
  }
  
  /// \brief Promote the allocas, using the best available technique.
@@ -3388,7 +3539,7 @@ bool SROA::promoteAllocas(Function &F) {
  
    if (DT && !ForceSSAUpdater) {
      DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
-    PromoteMemToReg(PromotableAllocas, *DT, DL);
+    PromoteMemToReg(PromotableAllocas, *DT);
      PromotableAllocas.clear();
      return true;
    }
@@ -3396,29 +3547,29 @@ bool SROA::promoteAllocas(Function &F) {
    DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
    SSAUpdater SSA;
    DIBuilder DIB(*F.getParent());
-  SmallVector<Instruction*, 64> Insts;
+  SmallVector<Instruction *, 64> Insts;
  
    // We need a worklist to walk the uses of each alloca.
-  SmallVector<Use *, 8> UseWorklist;
-  SmallPtrSet<Use *, 8> VisitedUses;
+  SmallVector<Instruction *, 8> Worklist;
+  SmallPtrSet<Instruction *, 8> Visited;
    SmallVector<Instruction *, 32> DeadInsts;
  
    for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) {
      AllocaInst *AI = PromotableAllocas[Idx];
-    UseWorklist.clear();
-    VisitedUses.clear();
+    Insts.clear();
+    Worklist.clear();
+    Visited.clear();
  
-    enqueueUsersInWorklist(*AI, UseWorklist, VisitedUses);
+    enqueueUsersInWorklist(*AI, Worklist, Visited);
  
-    while (!UseWorklist.empty()) {
-      Use *U = UseWorklist.pop_back_val();
-      Instruction &I = *cast<Instruction>(U->getUser());
+    while (!Worklist.empty()) {
+      Instruction *I = Worklist.pop_back_val();
  
        // FIXME: Currently the SSAUpdater infrastructure doesn't reason about
        // lifetime intrinsics and so we strip them (and the bitcasts+GEPs
        // leading to them) here. Eventually it should use them to optimize the
        // scalar values produced.
-      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
          assert(II->getIntrinsicID() == Intrinsic::lifetime_start ||
                 II->getIntrinsicID() == Intrinsic::lifetime_end);
          II->eraseFromParent();
@@ -3428,12 +3579,12 @@ bool SROA::promoteAllocas(Function &F) {
        // Push the loads and stores we find onto the list. SROA will already
        // have validated that all loads and stores are viable candidates for
        // promotion.
-      if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
          assert(LI->getType() == AI->getAllocatedType());
          Insts.push_back(LI);
          continue;
        }
-      if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
          assert(SI->getValueOperand()->getType() == AI->getAllocatedType());
          Insts.push_back(SI);
          continue;
@@ -3442,11 +3593,10 @@ bool SROA::promoteAllocas(Function &F) {
        // For everything else, we know that only no-op bitcasts and GEPs will
        // make it this far, just recurse through them and recall them for later
        // removal.
-      DeadInsts.push_back(&I);
-      enqueueUsersInWorklist(I, UseWorklist, VisitedUses);
+      DeadInsts.push_back(I);
+      enqueueUsersInWorklist(*I, Worklist, Visited);
      }
      AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts);
-    Insts.clear();
      while (!DeadInsts.empty())
        DeadInsts.pop_back_val()->eraseFromParent();
      AI->eraseFromParent();
@@ -3471,14 +3621,20 @@ namespace {
  }
  
  bool SROA::runOnFunction(Function &F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
    DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
    C = &F.getContext();
-  DL = getAnalysisIfAvailable<DataLayout>();
-  if (!DL) {
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  if (!DLP) {
      DEBUG(dbgs() << "  Skipping SROA -- no target data!\n");
      return false;
    }
-  DT = getAnalysisIfAvailable<DominatorTree>();
+  DL = &DLP->getDataLayout();
+  DominatorTreeWrapperPass *DTWP =
+      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DT = DTWP ? &DTWP->getDomTree() : 0;
  
    BasicBlock &EntryBB = F.getEntryBlock();
    for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end());
@@ -3520,6 +3676,6 @@ bool SROA::runOnFunction(Function &F) {
  
  void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
    if (RequiresDomTree) // Require the dominator tree only when this flag is set.
-    AU.addRequired<DominatorTree>();
+    AU.addRequired<DominatorTreeWrapperPass>();
    AU.setPreservesCFG(); // Always report the CFG as preserved.
  }