Use the new script to sort the includes of every file under lib.

[oota-llvm.git] / lib / Transforms / Scalar / SROA.cpp
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp

index 7e206f0bcec77033a619e5d55f064015f0b3c291..cb9838ef67443ed1dc79e482ebaf04db0cacc840 100644 (file)
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -25,33 +25,33 @@
  
  #define DEBUG_TYPE "sroa"
  #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ValueTracking.h"
  #include "llvm/Constants.h"
  #include "llvm/DIBuilder.h"
+#include "llvm/DataLayout.h"
  #include "llvm/DebugInfo.h"
  #include "llvm/DerivedTypes.h"
  #include "llvm/Function.h"
  #include "llvm/IRBuilder.h"
+#include "llvm/InstVisitor.h"
  #include "llvm/Instructions.h"
  #include "llvm/IntrinsicInst.h"
  #include "llvm/LLVMContext.h"
  #include "llvm/Module.h"
  #include "llvm/Operator.h"
  #include "llvm/Pass.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/ValueTracking.h"
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/ErrorHandling.h"
  #include "llvm/Support/GetElementPtrTypeIterator.h"
-#include "llvm/Support/InstVisitor.h"
  #include "llvm/Support/MathExtras.h"
  #include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
  #include "llvm/Transforms/Utils/Local.h"
  #include "llvm/Transforms/Utils/PromoteMemToReg.h"
  #include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -132,9 +132,6 @@ public:
      ///
      /// We flag partitions as splittable when they are formed entirely due to
      /// accesses by trivially splittable operations such as memset and memcpy.
-    ///
-    /// FIXME: At some point we should consider loads and stores of FCAs to be
-    /// splittable and eagerly split them into scalar values.
      bool IsSplittable;
  
      /// \brief Test whether a partition has been marked as dead.
@@ -337,7 +334,7 @@ private:
    class UseBuilder;
    friend class AllocaPartitioning::UseBuilder;
  
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    /// \brief Handle to alloca instruction to simplify method interfaces.
    AllocaInst &AI;
  #endif
@@ -544,33 +541,25 @@ private:
  
    void insertUse(Instruction &I, int64_t Offset, uint64_t Size,
                   bool IsSplittable = false) {
-    // Completely skip uses which have a zero size or don't overlap the
-    // allocation.
-    if (Size == 0 ||
-        (Offset >= 0 && (uint64_t)Offset >= AllocSize) ||
-        (Offset < 0 && (uint64_t)-Offset >= Size)) {
+    // Completely skip uses which have a zero size or start either before or
+    // past the end of the allocation.
+    if (Size == 0 || Offset < 0 || (uint64_t)Offset >= AllocSize) {
        DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
-                   << " which starts past the end of the " << AllocSize
-                   << " byte alloca:\n"
+                   << " which has zero size or starts outside of the "
+                   << AllocSize << " byte alloca:\n"
                     << "    alloca: " << P.AI << "\n"
                     << "       use: " << I << "\n");
        return;
      }
  
-    // Clamp the start to the beginning of the allocation.
-    if (Offset < 0) {
-      DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
-                   << " to start at the beginning of the alloca:\n"
-                   << "    alloca: " << P.AI << "\n"
-                   << "       use: " << I << "\n");
-      Size -= (uint64_t)-Offset;
-      Offset = 0;
-    }
-
      uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size;
  
      // Clamp the end offset to the end of the allocation. Note that this is
      // formulated to handle even the case where "BeginOffset + Size" overflows.
+    // NOTE! This may appear superficially to be something we could ignore
+    // entirely, but that is not so! There may be PHI-node uses where some
+    // instructions are dead but not others. We can't completely ignore the
+    // PHI node, and so have to record at least the information here.
      assert(AllocSize >= BeginOffset); // Established above.
      if (Size > AllocSize - BeginOffset) {
        DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
@@ -584,7 +573,8 @@ private:
      P.Partitions.push_back(New);
    }
  
-  bool handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset) {
+  bool handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset,
+                         bool IsVolatile) {
      uint64_t Size = TD.getTypeStoreSize(Ty);
  
      // If this memory access can be shown to *statically* extend outside the
@@ -605,7 +595,14 @@ private:
        return true;
      }
  
-    insertUse(I, Offset, Size);
+    // We allow splitting of loads and stores where the type is an integer type
+    // and which cover the entire alloca. Such integer loads and stores
+    // often require decomposition into fine grained loads and stores.
+    bool IsSplittable = false;
+    if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
+      IsSplittable = !IsVolatile && ITy->getBitWidth() == AllocSize*8;
+
+    insertUse(I, Offset, Size, IsSplittable);
      return true;
    }
  
@@ -626,7 +623,7 @@ private:
    bool visitLoadInst(LoadInst &LI) {
      assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
             "All simple FCA loads should have been pre-split");
-    return handleLoadOrStore(LI.getType(), LI, Offset);
+    return handleLoadOrStore(LI.getType(), LI, Offset, LI.isVolatile());
    }
  
    bool visitStoreInst(StoreInst &SI) {
@@ -636,7 +633,7 @@ private:
  
      assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
             "All simple FCA stores should have been pre-split");
-    return handleLoadOrStore(ValOp->getType(), SI, Offset);
+    return handleLoadOrStore(ValOp->getType(), SI, Offset, SI.isVolatile());
    }
  
  
@@ -872,16 +869,9 @@ private:
    void insertUse(Instruction &User, int64_t Offset, uint64_t Size) {
      // If the use has a zero size or extends outside of the allocation, record
      // it as a dead use for elimination later.
-    if (Size == 0 || (uint64_t)Offset >= AllocSize ||
-        (Offset < 0 && (uint64_t)-Offset >= Size))
+    if (Size == 0 || Offset < 0 || (uint64_t)Offset >= AllocSize)
        return markAsDead(User);
  
-    // Clamp the start to the beginning of the allocation.
-    if (Offset < 0) {
-      Size -= (uint64_t)-Offset;
-      Offset = 0;
-    }
-
      uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size;
  
      // Clamp the end offset to the end of the allocation. Note that this is
@@ -1117,7 +1107,7 @@ void AllocaPartitioning::splitAndMergePartitions() {
  
  AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI)
      :
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
        AI(AI),
  #endif
        PointerEscapingInstr(0) {
@@ -1175,6 +1165,21 @@ Type *AllocaPartitioning::getCommonType(iterator I) const {
        UserTy = LI->getType();
      } else if (StoreInst *SI = dyn_cast<StoreInst>(UI->U->getUser())) {
        UserTy = SI->getValueOperand()->getType();
+    } else {
+      return 0; // Bail if we have weird uses.
+    }
+
+    if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) {
+      // If the type is larger than the partition, skip it. We only encounter
+      // this for split integer operations where we want to use the type of the
+      // entity causing the split.
+      if (ITy->getBitWidth() > (I->EndOffset - I->BeginOffset)*8)
+        continue;
+
+      // If we have found an integer type use covering the alloca, use that
+      // regardless of the other types, as integers are often used for a "bucket
+      // of bits" type.
+      return ITy;
      }
  
      if (Ty && Ty != UserTy)
@@ -1362,11 +1367,7 @@ class SROA : public FunctionPass {
    /// \brief A collection of instructions to delete.
    /// We try to batch deletions to simplify code and make things a bit more
    /// efficient.
-  SmallVector<Instruction *, 8> DeadInsts;
-
-  /// \brief A set to prevent repeatedly marking an instruction split into many
-  /// uses as dead. Only used to guard insertion into DeadInsts.
-  SmallPtrSet<Instruction *, 4> DeadSplitInsts;
+  SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts;
  
    /// \brief Post-promotion worklist.
    ///
@@ -1553,7 +1554,7 @@ private:
      do {
        LoadInst *LI = Loads.pop_back_val();
        LI->replaceAllUsesWith(NewPN);
-      Pass.DeadInsts.push_back(LI);
+      Pass.DeadInsts.insert(LI);
      } while (!Loads.empty());
  
      // Inject loads into all of the pred blocks.
@@ -1697,7 +1698,7 @@ private:
  
        DEBUG(dbgs() << "          speculated to: " << *V << "\n");
        LI->replaceAllUsesWith(V);
-      Pass.DeadInsts.push_back(LI);
+      Pass.DeadInsts.insert(LI);
      }
    }
  };
@@ -1784,7 +1785,9 @@ static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD,
        break;
      if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) {
        ElementTy = SeqTy->getElementType();
-      Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(), 0)));
+      // Note that we use the default address space as this index is over an
+      // array or a vector, not a pointer.
+      Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0)));
      } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
        if (STy->element_begin() == STy->element_end())
          break; // Nothing left to descend into.
@@ -1825,7 +1828,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD,
      if (ElementSizeInBits % 8)
        return 0; // GEPs over non-multiple of 8 size vector elements are invalid.
      APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
-    APInt NumSkippedElements = Offset.udiv(ElementSize);
+    APInt NumSkippedElements = Offset.sdiv(ElementSize);
      if (NumSkippedElements.ugt(VecTy->getNumElements()))
        return 0;
      Offset -= NumSkippedElements * ElementSize;
@@ -1837,7 +1840,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD,
    if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
      Type *ElementTy = ArrTy->getElementType();
      APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
-    APInt NumSkippedElements = Offset.udiv(ElementSize);
+    APInt NumSkippedElements = Offset.sdiv(ElementSize);
      if (NumSkippedElements.ugt(ArrTy->getNumElements()))
        return 0;
  
@@ -1893,7 +1896,7 @@ static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD,
    APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
    if (ElementSize == 0)
      return 0; // Zero-length arrays can't help us build a natural GEP.
-  APInt NumSkippedElements = Offset.udiv(ElementSize);
+  APInt NumSkippedElements = Offset.sdiv(ElementSize);
  
    Offset -= NumSkippedElements * ElementSize;
    Indices.push_back(IRB.getInt(NumSkippedElements));
@@ -2004,6 +2007,51 @@ static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD,
    return Ptr;
  }
  
+/// \brief Test whether we can convert a value from the old to the new type.
+///
+/// This predicate should be used to guard calls to convertValue in order to
+/// ensure that we only try to convert viable values. The strategy is that we
+/// will peel off single element struct and array wrappings to get to an
+/// underlying value, and convert that value.
+static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
+  if (OldTy == NewTy)
+    return true;
+  if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy))
+    return false;
+  if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
+    return false;
+
+  if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
+    if (NewTy->isPointerTy() && OldTy->isPointerTy())
+      return true;
+    if (NewTy->isIntegerTy() || OldTy->isIntegerTy())
+      return true;
+    return false;
+  }
+
+  return true;
+}
+
+/// \brief Generic routine to convert an SSA value to a value of a different
+/// type.
+///
+/// This will try various different casting techniques, such as bitcasts,
+/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
+/// two types for viability with this routine.
+static Value *convertValue(const DataLayout &DL, IRBuilder<> &IRB, Value *V,
+                           Type *Ty) {
+  assert(canConvertValue(DL, V->getType(), Ty) &&
+         "Value not convertable to type");
+  if (V->getType() == Ty)
+    return V;
+  if (V->getType()->isIntegerTy() && Ty->isPointerTy())
+    return IRB.CreateIntToPtr(V, Ty);
+  if (V->getType()->isPointerTy() && Ty->isIntegerTy())
+    return IRB.CreatePtrToInt(V, Ty);
+
+  return IRB.CreateBitCast(V, Ty);
+}
+
  /// \brief Test whether the given alloca partition can be promoted to a vector.
  ///
  /// This is a quick test to check whether we can rewrite a particular alloca
@@ -2049,11 +2097,11 @@ static bool isVectorPromotionViable(const DataLayout &TD,
          EndIndex > Ty->getNumElements())
        return false;
  
-    // FIXME: We should build shuffle vector instructions to handle
-    // non-element-sized accesses.
-    if ((EndOffset - BeginOffset) != ElementSize &&
-        (EndOffset - BeginOffset) != VecSize)
-      return false;
+    assert(EndIndex > BeginIndex && "Empty vector!");
+    uint64_t NumElements = EndIndex - BeginIndex;
+    Type *PartitionTy
+      = (NumElements == 1) ? Ty->getElementType()
+                           : VectorType::get(Ty->getElementType(), NumElements);
  
      if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
        if (MI->isVolatile())
@@ -2067,55 +2115,102 @@ static bool isVectorPromotionViable(const DataLayout &TD,
      } else if (I->U->get()->getType()->getPointerElementType()->isStructTy()) {
        // Disable vector promotion when there are loads or stores of an FCA.
        return false;
-    } else if (!isa<LoadInst>(I->U->getUser()) &&
-               !isa<StoreInst>(I->U->getUser())) {
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
+      if (LI->isVolatile())
+        return false;
+      if (!canConvertValue(TD, PartitionTy, LI->getType()))
+        return false;
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) {
+      if (SI->isVolatile())
+        return false;
+      if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy))
+        return false;
+    } else {
        return false;
      }
    }
    return true;
  }
  
-/// \brief Test whether the given alloca partition can be promoted to an int.
+/// \brief Test whether the given alloca partition's integer operations can be
+/// widened to promotable ones.
  ///
-/// This is a quick test to check whether we can rewrite a particular alloca
-/// partition (and its newly formed alloca) into an integer alloca suitable for
-/// promotion to an SSA value. We only can ensure this for a limited set of
-/// operations, and we don't want to do the rewrites unless we are confident
-/// that the result will be promotable, so we have an early test here.
-static bool isIntegerPromotionViable(const DataLayout &TD,
-                                     Type *AllocaTy,
-                                     uint64_t AllocBeginOffset,
-                                     AllocaPartitioning &P,
-                                     AllocaPartitioning::const_use_iterator I,
-                                     AllocaPartitioning::const_use_iterator E) {
-  IntegerType *Ty = dyn_cast<IntegerType>(AllocaTy);
-  if (!Ty || 8*TD.getTypeStoreSize(Ty) != Ty->getBitWidth())
+/// This is a quick test to check whether we can rewrite the integer loads and
+/// stores to a particular alloca into wider loads and stores and be able to
+/// promote the resulting alloca.
+static bool isIntegerWideningViable(const DataLayout &TD,
+                                    Type *AllocaTy,
+                                    uint64_t AllocBeginOffset,
+                                    AllocaPartitioning &P,
+                                    AllocaPartitioning::const_use_iterator I,
+                                    AllocaPartitioning::const_use_iterator E) {
+  uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy);
+  // Don't create integer types larger than the maximum bitwidth.
+  if (SizeInBits > IntegerType::MAX_INT_BITS)
+    return false;
+
+  // Don't try to handle allocas with bit-padding.
+  if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy))
      return false;
  
+  // We need to ensure that an integer type with the appropriate bitwidth can
+  // be converted to the alloca type, whatever that is. We don't want to force
+  // the alloca itself to have an integer type if there is a more suitable one.
+  Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
+  if (!canConvertValue(TD, AllocaTy, IntTy) ||
+      !canConvertValue(TD, IntTy, AllocaTy))
+    return false;
+
+  uint64_t Size = TD.getTypeStoreSize(AllocaTy);
+
    // Check the uses to ensure the uses are (likely) promoteable integer uses.
    // Also ensure that the alloca has a covering load or store. We don't want
-  // promote because of some other unsplittable entry (which we may make
-  // splittable later) and lose the ability to promote each element access.
+  // to widen the integer operotains only to fail to promote due to some other
+  // unsplittable entry (which we may make splittable later).
    bool WholeAllocaOp = false;
    for (; I != E; ++I) {
      if (!I->U)
        continue; // Skip dead use.
  
+    uint64_t RelBegin = I->BeginOffset - AllocBeginOffset;
+    uint64_t RelEnd = I->EndOffset - AllocBeginOffset;
+
      // We can't reasonably handle cases where the load or store extends past
      // the end of the aloca's type and into its padding.
-    if ((I->EndOffset - AllocBeginOffset) > TD.getTypeStoreSize(Ty))
+    if (RelEnd > Size)
        return false;
  
      if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
-      if (LI->isVolatile() || !LI->getType()->isIntegerTy())
+      if (LI->isVolatile())
          return false;
-      if (LI->getType() == Ty)
+      if (RelBegin == 0 && RelEnd == Size)
          WholeAllocaOp = true;
+      if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
+        if (ITy->getBitWidth() < TD.getTypeStoreSize(ITy))
+          return false;
+        continue;
+      }
+      // Non-integer loads need to be convertible from the alloca type so that
+      // they are promotable.
+      if (RelBegin != 0 || RelEnd != Size ||
+          !canConvertValue(TD, AllocaTy, LI->getType()))
+        return false;
      } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) {
-      if (SI->isVolatile() || !SI->getValueOperand()->getType()->isIntegerTy())
+      Type *ValueTy = SI->getValueOperand()->getType();
+      if (SI->isVolatile())
          return false;
-      if (SI->getValueOperand()->getType() == Ty)
+      if (RelBegin == 0 && RelEnd == Size)
          WholeAllocaOp = true;
+      if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
+        if (ITy->getBitWidth() < TD.getTypeStoreSize(ITy))
+          return false;
+        continue;
+      }
+      // Non-integer stores need to be convertible to the alloca type so that
+      // they are promotable.
+      if (RelBegin != 0 || RelEnd != Size ||
+          !canConvertValue(TD, ValueTy, AllocaTy))
+        return false;
      } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
        if (MI->isVolatile())
          return false;
@@ -2125,6 +2220,10 @@ static bool isIntegerPromotionViable(const DataLayout &TD,
          if (!MTO.IsSplittable)
            return false;
        }
+    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->U->getUser())) {
+      if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+          II->getIntrinsicID() != Intrinsic::lifetime_end)
+        return false;
      } else {
        return false;
      }
@@ -2132,6 +2231,60 @@ static bool isIntegerPromotionViable(const DataLayout &TD,
    return WholeAllocaOp;
  }
  
+static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V,
+                             IntegerType *Ty, uint64_t Offset,
+                             const Twine &Name) {
+  DEBUG(dbgs() << "       start: " << *V << "\n");
+  IntegerType *IntTy = cast<IntegerType>(V->getType());
+  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+         "Element extends past full value");
+  uint64_t ShAmt = 8*Offset;
+  if (DL.isBigEndian())
+    ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+  if (ShAmt) {
+    V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
+    DEBUG(dbgs() << "     shifted: " << *V << "\n");
+  }
+  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+         "Cannot extract to a larger integer!");
+  if (Ty != IntTy) {
+    V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
+    DEBUG(dbgs() << "     trunced: " << *V << "\n");
+  }
+  return V;
+}
+
+static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old,
+                            Value *V, uint64_t Offset, const Twine &Name) {
+  IntegerType *IntTy = cast<IntegerType>(Old->getType());
+  IntegerType *Ty = cast<IntegerType>(V->getType());
+  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+         "Cannot insert a larger integer!");
+  DEBUG(dbgs() << "       start: " << *V << "\n");
+  if (Ty != IntTy) {
+    V = IRB.CreateZExt(V, IntTy, Name + ".ext");
+    DEBUG(dbgs() << "    extended: " << *V << "\n");
+  }
+  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+         "Element store outside of alloca store");
+  uint64_t ShAmt = 8*Offset;
+  if (DL.isBigEndian())
+    ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+  if (ShAmt) {
+    V = IRB.CreateShl(V, ShAmt, Name + ".shift");
+    DEBUG(dbgs() << "     shifted: " << *V << "\n");
+  }
+
+  if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
+    APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
+    Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
+    DEBUG(dbgs() << "      masked: " << *Old << "\n");
+    V = IRB.CreateOr(Old, V, Name + ".insert");
+    DEBUG(dbgs() << "    inserted: " << *V << "\n");
+  }
+  return V;
+}
+
  namespace {
  /// \brief Visitor to rewrite instructions using a partition of an alloca to
  /// use a new alloca.
@@ -2165,10 +2318,10 @@ class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter,
    uint64_t ElementSize;
  
    // This is a convenience and flag variable that will be null unless the new
-  // alloca has a promotion-targeted integer type due to passing
-  // isIntegerPromotionViable above. If it is non-null does, the desired
+  // alloca's integer operations should be widened to this integer type due to
+  // passing isIntegerWideningViable above. If it is non-null, the desired
    // integer type will be stored here for easy access during rewriting.
-  IntegerType *IntPromotionTy;
+  IntegerType *IntTy;
  
    // The offset of the partition user currently being rewritten.
    uint64_t BeginOffset, EndOffset;
@@ -2188,7 +2341,7 @@ public:
        NewAllocaBeginOffset(NewBeginOffset),
        NewAllocaEndOffset(NewEndOffset),
        NewAllocaTy(NewAI.getAllocatedType()),
-      VecTy(), ElementTy(), ElementSize(), IntPromotionTy(),
+      VecTy(), ElementTy(), ElementSize(), IntTy(),
        BeginOffset(), EndOffset() {
    }
  
@@ -2204,9 +2357,10 @@ public:
        assert((VecTy->getScalarSizeInBits() % 8) == 0 &&
               "Only multiple-of-8 sized vector elements are viable");
        ElementSize = VecTy->getScalarSizeInBits() / 8;
-    } else if (isIntegerPromotionViable(TD, NewAI.getAllocatedType(),
-                                        NewAllocaBeginOffset, P, I, E)) {
-      IntPromotionTy = cast<IntegerType>(NewAI.getAllocatedType());
+    } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(),
+                                       NewAllocaBeginOffset, P, I, E)) {
+      IntTy = Type::getIntNTy(NewAI.getContext(),
+                              TD.getTypeSizeInBits(NewAI.getAllocatedType()));
      }
      bool CanSROA = true;
      for (; I != E; ++I) {
@@ -2225,6 +2379,10 @@ public:
        ElementTy = 0;
        ElementSize = 0;
      }
+    if (IntTy) {
+      assert(CanSROA);
+      IntTy = 0;
+    }
      return CanSROA;
    }
  
@@ -2277,128 +2435,58 @@ private:
      return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset);
    }
  
-  ConstantInt *getIndex(IRBuilder<> &IRB, uint64_t Offset) {
+  unsigned getIndex(uint64_t Offset) {
      assert(VecTy && "Can only call getIndex when rewriting a vector");
      uint64_t RelOffset = Offset - NewAllocaBeginOffset;
      assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
      uint32_t Index = RelOffset / ElementSize;
      assert(Index * ElementSize == RelOffset);
-    return IRB.getInt32(Index);
-  }
-
-  Value *extractInteger(IRBuilder<> &IRB, IntegerType *TargetTy,
-                        uint64_t Offset) {
-    assert(IntPromotionTy && "Alloca is not an integer we can extract from");
-    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                     getName(".load"));
-    assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset");
-    uint64_t RelOffset = Offset - NewAllocaBeginOffset;
-    assert(TD.getTypeStoreSize(TargetTy) + RelOffset <=
-           TD.getTypeStoreSize(IntPromotionTy) &&
-           "Element load outside of alloca store");
-    uint64_t ShAmt = 8*RelOffset;
-    if (TD.isBigEndian())
-      ShAmt = 8*(TD.getTypeStoreSize(IntPromotionTy) -
-                 TD.getTypeStoreSize(TargetTy) - RelOffset);
-    if (ShAmt)
-      V = IRB.CreateLShr(V, ShAmt, getName(".shift"));
-    if (TargetTy != IntPromotionTy) {
-      assert(TargetTy->getBitWidth() < IntPromotionTy->getBitWidth() &&
-             "Cannot extract to a larger integer!");
-      V = IRB.CreateTrunc(V, TargetTy, getName(".trunc"));
-    }
-    return V;
-  }
-
-  StoreInst *insertInteger(IRBuilder<> &IRB, Value *V, uint64_t Offset) {
-    IntegerType *Ty = cast<IntegerType>(V->getType());
-    if (Ty == IntPromotionTy)
-      return IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
-
-    assert(Ty->getBitWidth() < IntPromotionTy->getBitWidth() &&
-           "Cannot insert a larger integer!");
-    V = IRB.CreateZExt(V, IntPromotionTy, getName(".ext"));
-    assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset");
-    uint64_t RelOffset = Offset - NewAllocaBeginOffset;
-    assert(TD.getTypeStoreSize(Ty) + RelOffset <=
-           TD.getTypeStoreSize(IntPromotionTy) &&
-           "Element store outside of alloca store");
-    uint64_t ShAmt = 8*RelOffset;
-    if (TD.isBigEndian())
-      ShAmt = 8*(TD.getTypeStoreSize(IntPromotionTy) - TD.getTypeStoreSize(Ty)
-                 - RelOffset);
-    if (ShAmt)
-      V = IRB.CreateShl(V, ShAmt, getName(".shift"));
-
-    APInt Mask = ~Ty->getMask().zext(IntPromotionTy->getBitWidth()).shl(ShAmt);
-    Value *Old = IRB.CreateAnd(IRB.CreateAlignedLoad(&NewAI,
-                                                     NewAI.getAlignment(),
-                                                     getName(".oldload")),
-                               Mask, getName(".mask"));
-    return IRB.CreateAlignedStore(IRB.CreateOr(Old, V, getName(".insert")),
-                                  &NewAI, NewAI.getAlignment());
+    return Index;
    }
  
    void deleteIfTriviallyDead(Value *V) {
      Instruction *I = cast<Instruction>(V);
      if (isInstructionTriviallyDead(I))
-      Pass.DeadInsts.push_back(I);
-  }
-
-  /// \brief Test whether we can convert a value from the old to the new type.
-  ///
-  /// This predicate should be used to guard calls to convertValue in order to
-  /// ensure that we only try to convert viable values. The strategy is that we
-  /// will peel off single element struct and array wrappings to get to an
-  /// underlying value, and convert that value.
-  bool canConvertValue(Type *OldTy, Type *NewTy) {
-    if (OldTy == NewTy)
-      return true;
-    if (TD.getTypeSizeInBits(NewTy) != TD.getTypeSizeInBits(OldTy))
-      return false;
-    if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
-      return false;
-    return true;
-  }
-
-  Value *convertValue(IRBuilder<> &IRB, Value *V, Type *Ty) {
-    assert(canConvertValue(V->getType(), Ty) && "Value not convertable to type");
-    if (V->getType()->isIntegerTy() && Ty->isPointerTy())
-      return IRB.CreateIntToPtr(V, Ty);
-    if (V->getType()->isPointerTy() && Ty->isIntegerTy())
-      return IRB.CreatePtrToInt(V, Ty);
-
-    return IRB.CreateBitCast(V, Ty);
+      Pass.DeadInsts.insert(I);
    }
  
-  bool rewriteVectorizedLoadInst(IRBuilder<> &IRB, LoadInst &LI, Value *OldOp) {
-    Value *Result;
-    if (LI.getType() == VecTy->getElementType() ||
-        BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
-      Result = IRB.CreateExtractElement(
-        IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")),
-        getIndex(IRB, BeginOffset), getName(".extract"));
-    } else {
-      Result = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+  Value *rewriteVectorizedLoadInst(IRBuilder<> &IRB, LoadInst &LI, Value *OldOp) {
+    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
                                       getName(".load"));
+    unsigned BeginIndex = getIndex(BeginOffset);
+    unsigned EndIndex = getIndex(EndOffset);
+    assert(EndIndex > BeginIndex && "Empty vector!");
+    unsigned NumElements = EndIndex - BeginIndex;
+    assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+    if (NumElements == 1) {
+      V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
+                                   getName(".extract"));
+      DEBUG(dbgs() << "     extract: " << *V << "\n");
+    } else if (NumElements < VecTy->getNumElements()) {
+      SmallVector<Constant*, 8> Mask;
+      Mask.reserve(NumElements);
+      for (unsigned i = BeginIndex; i != EndIndex; ++i)
+        Mask.push_back(IRB.getInt32(i));
+      V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+                                  ConstantVector::get(Mask),
+                                  getName(".extract"));
+      DEBUG(dbgs() << "     shuffle: " << *V << "\n");
      }
-    if (Result->getType() != LI.getType())
-      Result = convertValue(IRB, Result, LI.getType());
-    LI.replaceAllUsesWith(Result);
-    Pass.DeadInsts.push_back(&LI);
-
-    DEBUG(dbgs() << "          to: " << *Result << "\n");
-    return true;
+    return V;
    }
  
-  bool rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) {
+  Value *rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) {
+    assert(IntTy && "We cannot insert an integer to the alloca");
      assert(!LI.isVolatile());
-    Value *Result = extractInteger(IRB, cast<IntegerType>(LI.getType()),
-                                   BeginOffset);
-    LI.replaceAllUsesWith(Result);
-    Pass.DeadInsts.push_back(&LI);
-    DEBUG(dbgs() << "          to: " << *Result << "\n");
-    return true;
+    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                     getName(".load"));
+    V = convertValue(TD, IRB, V, IntTy);
+    assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+    uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+    if (Offset > 0 || EndOffset < NewAllocaEndOffset)
+      V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset,
+                         getName(".extract"));
+    return V;
    }
  
    bool visitLoadInst(LoadInst &LI) {
@@ -2407,59 +2495,152 @@ private:
      assert(OldOp == OldPtr);
      IRBuilder<> IRB(&LI);
  
-    if (VecTy)
-      return rewriteVectorizedLoadInst(IRB, LI, OldOp);
-    if (IntPromotionTy)
-      return rewriteIntegerLoad(IRB, LI);
+    uint64_t Size = EndOffset - BeginOffset;
+    bool IsSplitIntLoad = Size < TD.getTypeStoreSize(LI.getType());
  
-    if (BeginOffset == NewAllocaBeginOffset &&
-        canConvertValue(NewAllocaTy, LI.getType())) {
-      Value *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                           LI.isVolatile(), getName(".load"));
-      Value *NewV = convertValue(IRB, NewLI, LI.getType());
-      LI.replaceAllUsesWith(NewV);
-      Pass.DeadInsts.push_back(&LI);
-
-      DEBUG(dbgs() << "          to: " << *NewLI << "\n");
-      return !LI.isVolatile();
+    // If this memory access can be shown to *statically* extend outside the
+    // bounds of the original allocation it's behavior is undefined. Rather
+    // than trying to transform it, just replace it with undef.
+    // FIXME: We should do something more clever for functions being
+    // instrumented by asan.
+    // FIXME: Eventually, once ASan and friends can flush out bugs here, this
+    // should be transformed to a load of null making it unreachable.
+    uint64_t OldAllocSize = TD.getTypeAllocSize(OldAI.getAllocatedType());
+    if (TD.getTypeStoreSize(LI.getType()) > OldAllocSize) {
+      LI.replaceAllUsesWith(UndefValue::get(LI.getType()));
+      Pass.DeadInsts.insert(&LI);
+      deleteIfTriviallyDead(OldOp);
+      DEBUG(dbgs() << "          to: undef!!\n");
+      return true;
      }
  
-    Value *NewPtr = getAdjustedAllocaPtr(IRB,
-                                         LI.getPointerOperand()->getType());
-    LI.setOperand(0, NewPtr);
-    LI.setAlignment(getPartitionTypeAlign(LI.getType()));
-    DEBUG(dbgs() << "          to: " << LI << "\n");
+    Type *TargetTy = IsSplitIntLoad ? Type::getIntNTy(LI.getContext(), Size * 8)
+                                    : LI.getType();
+    bool IsPtrAdjusted = false;
+    Value *V;
+    if (VecTy) {
+      V = rewriteVectorizedLoadInst(IRB, LI, OldOp);
+    } else if (IntTy && LI.getType()->isIntegerTy()) {
+      V = rewriteIntegerLoad(IRB, LI);
+    } else if (BeginOffset == NewAllocaBeginOffset &&
+               canConvertValue(TD, NewAllocaTy, LI.getType())) {
+      V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                LI.isVolatile(), getName(".load"));
+    } else {
+      Type *LTy = TargetTy->getPointerTo();
+      V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy),
+                                getPartitionTypeAlign(TargetTy),
+                                LI.isVolatile(), getName(".load"));
+      IsPtrAdjusted = true;
+    }
+    V = convertValue(TD, IRB, V, TargetTy);
+
+    if (IsSplitIntLoad) {
+      assert(!LI.isVolatile());
+      assert(LI.getType()->isIntegerTy() &&
+             "Only integer type loads and stores are split");
+      assert(LI.getType()->getIntegerBitWidth() ==
+             TD.getTypeStoreSizeInBits(LI.getType()) &&
+             "Non-byte-multiple bit width");
+      assert(LI.getType()->getIntegerBitWidth() ==
+             TD.getTypeAllocSizeInBits(OldAI.getAllocatedType()) &&
+             "Only alloca-wide loads can be split and recomposed");
+      // Move the insertion point just past the load so that we can refer to it.
+      IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI)));
+      // Create a placeholder value with the same type as LI to use as the
+      // basis for the new value. This allows us to replace the uses of LI with
+      // the computed value, and then replace the placeholder with LI, leaving
+      // LI only used for this computation.
+      Value *Placeholder
+        = new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
+      V = insertInteger(TD, IRB, Placeholder, V, BeginOffset,
+                        getName(".insert"));
+      LI.replaceAllUsesWith(V);
+      Placeholder->replaceAllUsesWith(&LI);
+      delete Placeholder;
+    } else {
+      LI.replaceAllUsesWith(V);
+    }
  
+    Pass.DeadInsts.insert(&LI);
      deleteIfTriviallyDead(OldOp);
-    return NewPtr == &NewAI && !LI.isVolatile();
-  }
-
-  bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, StoreInst &SI,
-                                  Value *OldOp) {
-    Value *V = SI.getValueOperand();
-    if (V->getType() == ElementTy ||
-        BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
-      if (V->getType() != ElementTy)
-        V = convertValue(IRB, V, ElementTy);
+    DEBUG(dbgs() << "          to: " << *V << "\n");
+    return !LI.isVolatile() && !IsPtrAdjusted;
+  }
+
+  bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, Value *V,
+                                  StoreInst &SI, Value *OldOp) {
+    unsigned BeginIndex = getIndex(BeginOffset);
+    unsigned EndIndex = getIndex(EndOffset);
+    assert(EndIndex > BeginIndex && "Empty vector!");
+    unsigned NumElements = EndIndex - BeginIndex;
+    assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+    Type *PartitionTy
+      = (NumElements == 1) ? ElementTy
+                           : VectorType::get(ElementTy, NumElements);
+    if (V->getType() != PartitionTy)
+      V = convertValue(TD, IRB, V, PartitionTy);
+    if (NumElements < VecTy->getNumElements()) {
+      // We need to mix in the existing elements.
        LoadInst *LI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
                                             getName(".load"));
-      V = IRB.CreateInsertElement(LI, V, getIndex(IRB, BeginOffset),
-                                  getName(".insert"));
-    } else if (V->getType() != VecTy) {
-      V = convertValue(IRB, V, VecTy);
+      if (NumElements == 1) {
+        V = IRB.CreateInsertElement(LI, V, IRB.getInt32(BeginIndex),
+                                    getName(".insert"));
+        DEBUG(dbgs() <<  "     insert: " << *V << "\n");
+      } else {
+        // When inserting a smaller vector into the larger to store, we first
+        // use a shuffle vector to widen it with undef elements, and then
+        // a second shuffle vector to select between the loaded vector and the
+        // incoming vector.
+        SmallVector<Constant*, 8> Mask;
+        Mask.reserve(VecTy->getNumElements());
+        for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+          if (i >= BeginIndex && i < EndIndex)
+            Mask.push_back(IRB.getInt32(i - BeginIndex));
+          else
+            Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
+        V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+                                    ConstantVector::get(Mask),
+                                    getName(".expand"));
+        DEBUG(dbgs() << "    shuffle1: " << *V << "\n");
+
+        Mask.clear();
+        for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+          if (i >= BeginIndex && i < EndIndex)
+            Mask.push_back(IRB.getInt32(i));
+          else
+            Mask.push_back(IRB.getInt32(i + VecTy->getNumElements()));
+        V = IRB.CreateShuffleVector(V, LI, ConstantVector::get(Mask),
+                                    getName("insert"));
+        DEBUG(dbgs() << "    shuffle2: " << *V << "\n");
+      }
+    } else {
+      V = convertValue(TD, IRB, V, VecTy);
      }
      StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
-    Pass.DeadInsts.push_back(&SI);
+    Pass.DeadInsts.insert(&SI);
  
      (void)Store;
      DEBUG(dbgs() << "          to: " << *Store << "\n");
      return true;
    }
  
-  bool rewriteIntegerStore(IRBuilder<> &IRB, StoreInst &SI) {
+  bool rewriteIntegerStore(IRBuilder<> &IRB, Value *V, StoreInst &SI) {
+    assert(IntTy && "We cannot extract an integer from the alloca");
      assert(!SI.isVolatile());
-    StoreInst *Store = insertInteger(IRB, SI.getValueOperand(), BeginOffset);
-    Pass.DeadInsts.push_back(&SI);
+    if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
+      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                         getName(".oldload"));
+      Old = convertValue(TD, IRB, Old, IntTy);
+      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+      V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset,
+                        getName(".insert"));
+    }
+    V = convertValue(TD, IRB, V, NewAllocaTy);
+    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+    Pass.DeadInsts.insert(&SI);
      (void)Store;
      DEBUG(dbgs() << "          to: " << *Store << "\n");
      return true;
@@ -2471,40 +2652,53 @@ private:
      assert(OldOp == OldPtr);
      IRBuilder<> IRB(&SI);
  
-    if (VecTy)
-      return rewriteVectorizedStoreInst(IRB, SI, OldOp);
-    if (IntPromotionTy)
-      return rewriteIntegerStore(IRB, SI);
-
-    Type *ValueTy = SI.getValueOperand()->getType();
+    Value *V = SI.getValueOperand();
  
      // Strip all inbounds GEPs and pointer casts to try to dig out any root
      // alloca that should be re-examined after promoting this alloca.
-    if (ValueTy->isPointerTy())
-      if (AllocaInst *AI = dyn_cast<AllocaInst>(SI.getValueOperand()
-                                                  ->stripInBoundsOffsets()))
+    if (V->getType()->isPointerTy())
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
          Pass.PostPromotionWorklist.insert(AI);
  
-    if (BeginOffset == NewAllocaBeginOffset &&
-        canConvertValue(ValueTy, NewAllocaTy)) {
-      Value *NewV = convertValue(IRB, SI.getValueOperand(), NewAllocaTy);
-      StoreInst *NewSI = IRB.CreateAlignedStore(NewV, &NewAI, NewAI.getAlignment(),
-                                                SI.isVolatile());
-      (void)NewSI;
-      Pass.DeadInsts.push_back(&SI);
-
-      DEBUG(dbgs() << "          to: " << *NewSI << "\n");
-      return !SI.isVolatile();
+    uint64_t Size = EndOffset - BeginOffset;
+    if (Size < TD.getTypeStoreSize(V->getType())) {
+      assert(!SI.isVolatile());
+      assert(V->getType()->isIntegerTy() &&
+             "Only integer type loads and stores are split");
+      assert(V->getType()->getIntegerBitWidth() ==
+             TD.getTypeStoreSizeInBits(V->getType()) &&
+             "Non-byte-multiple bit width");
+      assert(V->getType()->getIntegerBitWidth() ==
+             TD.getTypeSizeInBits(OldAI.getAllocatedType()) &&
+             "Only alloca-wide stores can be split and recomposed");
+      IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8);
+      V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset,
+                         getName(".extract"));
      }
  
-    Value *NewPtr = getAdjustedAllocaPtr(IRB,
-                                         SI.getPointerOperand()->getType());
-    SI.setOperand(1, NewPtr);
-    SI.setAlignment(getPartitionTypeAlign(SI.getValueOperand()->getType()));
-    DEBUG(dbgs() << "          to: " << SI << "\n");
+    if (VecTy)
+      return rewriteVectorizedStoreInst(IRB, V, SI, OldOp);
+    if (IntTy && V->getType()->isIntegerTy())
+      return rewriteIntegerStore(IRB, V, SI);
  
+    StoreInst *NewSI;
+    if (BeginOffset == NewAllocaBeginOffset &&
+        canConvertValue(TD, V->getType(), NewAllocaTy)) {
+      V = convertValue(TD, IRB, V, NewAllocaTy);
+      NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+                                     SI.isVolatile());
+    } else {
+      Value *NewPtr = getAdjustedAllocaPtr(IRB, V->getType()->getPointerTo());
+      NewSI = IRB.CreateAlignedStore(V, NewPtr,
+                                     getPartitionTypeAlign(V->getType()),
+                                     SI.isVolatile());
+    }
+    (void)NewSI;
+    Pass.DeadInsts.insert(&SI);
      deleteIfTriviallyDead(OldOp);
-    return NewPtr == &NewAI && !SI.isVolatile();
+
+    DEBUG(dbgs() << "          to: " << *NewSI << "\n");
+    return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
    }
  
    bool visitMemSetInst(MemSetInst &II) {
@@ -2524,18 +2718,18 @@ private:
      }
  
      // Record this instruction for deletion.
-    if (Pass.DeadSplitInsts.insert(&II))
-      Pass.DeadInsts.push_back(&II);
+    Pass.DeadInsts.insert(&II);
  
      Type *AllocaTy = NewAI.getAllocatedType();
      Type *ScalarTy = AllocaTy->getScalarType();
  
      // If this doesn't map cleanly onto the alloca type, and that type isn't
      // a single value type, just emit a memset.
-    if (!VecTy && (BeginOffset != NewAllocaBeginOffset ||
-                   EndOffset != NewAllocaEndOffset ||
-                   !AllocaTy->isSingleValueType() ||
-                   !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)))) {
+    if (!VecTy && !IntTy &&
+        (BeginOffset != NewAllocaBeginOffset ||
+         EndOffset != NewAllocaEndOffset ||
+         !AllocaTy->isSingleValueType() ||
+         !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)))) {
        Type *SizeTy = II.getLength()->getType();
        Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
        CallInst *New
@@ -2553,37 +2747,29 @@ private:
      // a sensible representation for the alloca type. This is essentially
      // splatting the byte to a sufficiently wide integer, bitcasting to the
      // desired scalar type, and splatting it across any desired vector type.
+    uint64_t Size = EndOffset - BeginOffset;
      Value *V = II.getValue();
      IntegerType *VTy = cast<IntegerType>(V->getType());
-    Type *IntTy = Type::getIntNTy(VTy->getContext(),
-                                  TD.getTypeSizeInBits(ScalarTy));
-    if (TD.getTypeSizeInBits(ScalarTy) > VTy->getBitWidth())
-      V = IRB.CreateMul(IRB.CreateZExt(V, IntTy, getName(".zext")),
+    Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8);
+    if (Size*8 > VTy->getBitWidth())
+      V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, getName(".zext")),
                          ConstantExpr::getUDiv(
-                          Constant::getAllOnesValue(IntTy),
+                          Constant::getAllOnesValue(SplatIntTy),
                            ConstantExpr::getZExt(
                              Constant::getAllOnesValue(V->getType()),
-                            IntTy)),
+                            SplatIntTy)),
                          getName(".isplat"));
-    if (V->getType() != ScalarTy) {
-      if (ScalarTy->isPointerTy())
-        V = IRB.CreateIntToPtr(V, ScalarTy);
-      else if (ScalarTy->isPrimitiveType() || ScalarTy->isVectorTy())
-        V = IRB.CreateBitCast(V, ScalarTy);
-      else if (ScalarTy->isIntegerTy())
-        llvm_unreachable("Computed different integer types with equal widths");
-      else
-        llvm_unreachable("Invalid scalar type");
-    }
  
      // If this is an element-wide memset of a vectorizable alloca, insert it.
      if (VecTy && (BeginOffset > NewAllocaBeginOffset ||
                    EndOffset < NewAllocaEndOffset)) {
+      if (V->getType() != ScalarTy)
+        V = convertValue(TD, IRB, V, ScalarTy);
        StoreInst *Store = IRB.CreateAlignedStore(
          IRB.CreateInsertElement(IRB.CreateAlignedLoad(&NewAI,
                                                        NewAI.getAlignment(),
                                                        getName(".load")),
-                                V, getIndex(IRB, BeginOffset),
+                                V, IRB.getInt32(getIndex(BeginOffset)),
                                  getName(".insert")),
          &NewAI, NewAI.getAlignment());
        (void)Store;
@@ -2591,18 +2777,22 @@ private:
        return true;
      }
  
-    // Splat to a vector if needed.
-    if (VectorType *VecTy = dyn_cast<VectorType>(AllocaTy)) {
-      VectorType *SplatSourceTy = VectorType::get(V->getType(), 1);
-      V = IRB.CreateShuffleVector(
-        IRB.CreateInsertElement(UndefValue::get(SplatSourceTy), V,
-                                IRB.getInt32(0), getName(".vsplat.insert")),
-        UndefValue::get(SplatSourceTy),
-        ConstantVector::getSplat(VecTy->getNumElements(), IRB.getInt32(0)),
-        getName(".vsplat.shuffle"));
-      assert(V->getType() == VecTy);
+    // If this is a memset on an alloca where we can widen stores, insert the
+    // set integer.
+    if (IntTy && (BeginOffset > NewAllocaBeginOffset ||
+                  EndOffset < NewAllocaEndOffset)) {
+      assert(!II.isVolatile());
+      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                         getName(".oldload"));
+      Old = convertValue(TD, IRB, Old, IntTy);
+      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+      V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert"));
      }
  
+    if (V->getType() != AllocaTy)
+      V = convertValue(TD, IRB, V, AllocaTy);
+
      Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
                                          II.isVolatile());
      (void)New;
@@ -2663,9 +2853,9 @@ private:
      // If this doesn't map cleanly onto the alloca type, and that type isn't
      // a single value type, just emit a memcpy.
      bool EmitMemCpy
-      = !VecTy && (BeginOffset != NewAllocaBeginOffset ||
-                   EndOffset != NewAllocaEndOffset ||
-                   !NewAI.getAllocatedType()->isSingleValueType());
+      = !VecTy && !IntTy && (BeginOffset != NewAllocaBeginOffset ||
+                             EndOffset != NewAllocaEndOffset ||
+                             !NewAI.getAllocatedType()->isSingleValueType());
  
      // If we're just going to emit a memcpy, the alloca hasn't changed, and the
      // size hasn't been shrunk based on analysis of the viable range, this is
@@ -2684,17 +2874,25 @@ private:
        return false;
      }
      // Record this instruction for deletion.
-    if (Pass.DeadSplitInsts.insert(&II))
-      Pass.DeadInsts.push_back(&II);
+    Pass.DeadInsts.insert(&II);
  
-    bool IsVectorElement = VecTy && (BeginOffset > NewAllocaBeginOffset ||
-                                     EndOffset < NewAllocaEndOffset);
+    bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset &&
+                         EndOffset == NewAllocaEndOffset;
+    bool IsVectorElement = VecTy && !IsWholeAlloca;
+    uint64_t Size = EndOffset - BeginOffset;
+    IntegerType *SubIntTy
+      = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0;
  
      Type *OtherPtrTy = IsDest ? II.getRawSource()->getType()
                                : II.getRawDest()->getType();
-    if (!EmitMemCpy)
-      OtherPtrTy = IsVectorElement ? VecTy->getElementType()->getPointerTo()
-                                   : NewAI.getType();
+    if (!EmitMemCpy) {
+      if (IsVectorElement)
+        OtherPtrTy = VecTy->getElementType()->getPointerTo();
+      else if (IntTy && !IsWholeAlloca)
+        OtherPtrTy = SubIntTy->getPointerTo();
+      else
+        OtherPtrTy = NewAI.getType();
+    }
  
      // Compute the other pointer, folding as much as possible to produce
      // a single, simple GEP in most cases.
@@ -2739,18 +2937,35 @@ private:
        // We have to extract rather than load.
        Src = IRB.CreateExtractElement(
          IRB.CreateAlignedLoad(SrcPtr, Align, getName(".copyload")),
-        getIndex(IRB, BeginOffset),
+        IRB.getInt32(getIndex(BeginOffset)),
          getName(".copyextract"));
+    } else if (IntTy && !IsWholeAlloca && !IsDest) {
+      Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                  getName(".load"));
+      Src = convertValue(TD, IRB, Src, IntTy);
+      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+      Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, getName(".extract"));
      } else {
        Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(),
                                    getName(".copyload"));
      }
  
+    if (IntTy && !IsWholeAlloca && IsDest) {
+      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                         getName(".oldload"));
+      Old = convertValue(TD, IRB, Old, IntTy);
+      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+      Src = insertInteger(TD, IRB, Old, Src, Offset, getName(".insert"));
+      Src = convertValue(TD, IRB, Src, NewAllocaTy);
+    }
+
      if (IsVectorElement && IsDest) {
        // We have to insert into a loaded copy before storing.
        Src = IRB.CreateInsertElement(
          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")),
-        Src, getIndex(IRB, BeginOffset),
+        Src, IRB.getInt32(getIndex(BeginOffset)),
          getName(".insert"));
      }
  
@@ -2769,8 +2984,7 @@ private:
      assert(II.getArgOperand(1) == OldPtr);
  
      // Record this instruction for deletion.
-    if (Pass.DeadSplitInsts.insert(&II))
-      Pass.DeadInsts.push_back(&II);
+    Pass.DeadInsts.insert(&II);
  
      ConstantInt *Size
        = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
@@ -2797,10 +3011,7 @@ private:
  
      Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType());
      // Replace the operands which were using the old pointer.
-    User::op_iterator OI = PN.op_begin(), OE = PN.op_end();
-    for (; OI != OE; ++OI)
-      if (*OI == OldPtr)
-        *OI = NewPtr;
+    std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
  
      DEBUG(dbgs() << "          to: " << PN << "\n");
      deleteIfTriviallyDead(OldPtr);
@@ -3038,6 +3249,36 @@ private:
  };
  }
  
+/// \brief Strip aggregate type wrapping.
+///
+/// This removes no-op aggregate types wrapping an underlying type. It will
+/// strip as many layers of types as it can without changing either the type
+/// size or the allocated size.
+static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
+  if (Ty->isSingleValueType())
+    return Ty;
+
+  uint64_t AllocSize = DL.getTypeAllocSize(Ty);
+  uint64_t TypeSize = DL.getTypeSizeInBits(Ty);
+
+  Type *InnerTy;
+  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+    InnerTy = ArrTy->getElementType();
+  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+    const StructLayout *SL = DL.getStructLayout(STy);
+    unsigned Index = SL->getElementContainingOffset(0);
+    InnerTy = STy->getElementType(Index);
+  } else {
+    return Ty;
+  }
+
+  if (AllocSize > DL.getTypeAllocSize(InnerTy) ||
+      TypeSize > DL.getTypeSizeInBits(InnerTy))
+    return Ty;
+
+  return stripAggregateTypeWrapping(DL, InnerTy);
+}
+
  /// \brief Try to find a partition of the aggregate type passed in for a given
  /// offset and size.
  ///
@@ -3054,7 +3295,10 @@ private:
  static Type *getTypePartition(const DataLayout &TD, Type *Ty,
                                uint64_t Offset, uint64_t Size) {
    if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size)
-    return Ty;
+    return stripAggregateTypeWrapping(TD, Ty);
+  if (Offset > TD.getTypeAllocSize(Ty) ||
+      (TD.getTypeAllocSize(Ty) - Offset) < Size)
+    return 0;
  
    if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
      // We can't partition pointers...
@@ -3083,7 +3327,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty,
      assert(Offset == 0);
  
      if (Size == ElementSize)
-      return ElementTy;
+      return stripAggregateTypeWrapping(TD, ElementTy);
      assert(Size > ElementSize);
      uint64_t NumElements = Size / ElementSize;
      if (NumElements * ElementSize != Size)
@@ -3119,7 +3363,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty,
    assert(Offset == 0);
  
    if (Size == ElementSize)
-    return ElementTy;
+    return stripAggregateTypeWrapping(TD, ElementTy);
  
    StructType::element_iterator EI = STy->element_begin() + Index,
                                 EE = STy->element_end();
@@ -3140,11 +3384,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty,
    }
  
    // Try to build up a sub-structure.
-  SmallVector<Type *, 4> ElementTys;
-  do {
-    ElementTys.push_back(*EI++);
-  } while (EI != EE);
-  StructType *SubTy = StructType::get(STy->getContext(), ElementTys,
+  StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE),
                                        STy->isPacked());
    const StructLayout *SubSL = TD.getStructLayout(SubTy);
    if (Size != SubSL->getSizeInBytes())
@@ -3313,7 +3553,7 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
         DI != DE; ++DI) {
      Changed = true;
      (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType()));
-    DeadInsts.push_back(*DI);
+    DeadInsts.insert(*DI);
    }
    for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(),
                                              DE = P.dead_op_end();
@@ -3324,7 +3564,7 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
      if (Instruction *OldI = dyn_cast<Instruction>(OldV))
        if (isInstructionTriviallyDead(OldI)) {
          Changed = true;
-        DeadInsts.push_back(OldI);
+        DeadInsts.insert(OldI);
        }
    }
  
@@ -3345,17 +3585,18 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
  /// We also record the alloca instructions deleted here so that they aren't
  /// subsequently handed to mem2reg to promote.
  void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
-  DeadSplitInsts.clear();
    while (!DeadInsts.empty()) {
      Instruction *I = DeadInsts.pop_back_val();
      DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
  
+    I->replaceAllUsesWith(UndefValue::get(I->getType()));
+
      for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
        if (Instruction *U = dyn_cast<Instruction>(*OI)) {
          // Zero out the operand and see if it becomes trivially dead.
          *OI = 0;
          if (isInstructionTriviallyDead(U))
-          DeadInsts.push_back(U);
+          DeadInsts.insert(U);
        }
  
      if (AllocaInst *AI = dyn_cast<AllocaInst>(I))