Make sure scalarrepl picks the correct alloca when it rewrites a bitcast. Fixes...

[oota-llvm.git] / lib / Transforms / Scalar / ScalarReplAggregates.cpp
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp

index c3ca85280ee76ba0b74bdbe2bef8884ec92eddfd..c12f403d57e177b96c35f7facdd27347a479eec9 100644 (file)
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -30,6 +30,8 @@
  #include "llvm/LLVMContext.h"
  #include "llvm/Module.h"
  #include "llvm/Pass.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Analysis/DIBuilder.h"
  #include "llvm/Analysis/Dominators.h"
  #include "llvm/Analysis/Loads.h"
  #include "llvm/Analysis/ValueTracking.h"
@@ -127,11 +129,11 @@ namespace {
                                           AllocaInfo &Info);
      void isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info);
      void isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
-                         const Type *MemOpType, bool isStore, AllocaInfo &Info,
+                         Type *MemOpType, bool isStore, AllocaInfo &Info,
                           Instruction *TheAccess, bool AllowWholeAccess);
-    bool TypeHasComponent(const Type *T, uint64_t Offset, uint64_t Size);
-    uint64_t FindElementAndOffset(const Type *&T, uint64_t &Offset,
-                                  const Type *&IdxTy);
+    bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size);
+    uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset,
+                                  Type *&IdxTy);
  
      void DoScalarReplacement(AllocaInst *AI,
                               std::vector<AllocaInst*> &WorkList);
@@ -143,6 +145,9 @@ namespace {
                          SmallVector<AllocaInst*, 32> &NewElts);
      void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
                      SmallVector<AllocaInst*, 32> &NewElts);
+    void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
+                                  uint64_t Offset,
+                                  SmallVector<AllocaInst*, 32> &NewElts);
      void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
                                        AllocaInst *AI,
                                        SmallVector<AllocaInst*, 32> &NewElts);
@@ -151,7 +156,8 @@ namespace {
      void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
                                        SmallVector<AllocaInst*, 32> &NewElts);
  
-    static MemTransferInst *isOnlyCopiedFromConstantGlobal(AllocaInst *AI);
+    static MemTransferInst *isOnlyCopiedFromConstantGlobal(
+        AllocaInst *AI, SmallVector<Instruction*, 4> &ToDelete);
    };
    
    // SROA_DT - SROA that uses DominatorTree.
@@ -219,7 +225,7 @@ namespace {
  /// optimization, which scans the uses of an alloca and determines if it can
  /// rewrite it in terms of a single new alloca that can be mem2reg'd.
  class ConvertToScalarInfo {
-  /// AllocaSize - The size of the alloca being considered.
+  /// AllocaSize - The size of the alloca being considered in bytes.
    unsigned AllocaSize;
    const TargetData &TD;
  
@@ -227,33 +233,50 @@ class ConvertToScalarInfo {
    /// which means that mem2reg can't promote it.
    bool IsNotTrivial;
  
+  /// ScalarKind - Tracks the kind of alloca being considered for promotion,
+  /// computed based on the uses of the alloca rather than the LLVM type system.
+  enum {
+    Unknown,
+
+    // Accesses via GEPs that are consistent with element access of a vector
+    // type. This will not be converted into a vector unless there is a later
+    // access using an actual vector type.
+    ImplicitVector,
+
+    // Accesses via vector operations and GEPs that are consistent with the
+    // layout of a vector type.
+    Vector,
+
+    // An integer bag-of-bits with bitwise operations for insertion and
+    // extraction. Any combination of types can be converted into this kind
+    // of scalar.
+    Integer
+  } ScalarKind;
+
    /// VectorTy - This tracks the type that we should promote the vector to if
    /// it is possible to turn it into a vector.  This starts out null, and if it
    /// isn't possible to turn into a vector type, it gets set to VoidTy.
-  const Type *VectorTy;
+  VectorType *VectorTy;
  
-  /// HadAVector - True if there is at least one vector access to the alloca.
-  /// We don't want to turn random arrays into vectors and use vector element
-  /// insert/extract, but if there are element accesses to something that is
-  /// also declared as a vector, we do want to promote to a vector.
-  bool HadAVector;
+  /// HadNonMemTransferAccess - True if there is at least one access to the 
+  /// alloca that is not a MemTransferInst.  We don't want to turn structs into
+  /// large integers unless there is some potential for optimization.
+  bool HadNonMemTransferAccess;
  
  public:
    explicit ConvertToScalarInfo(unsigned Size, const TargetData &td)
-    : AllocaSize(Size), TD(td) {
-    IsNotTrivial = false;
-    VectorTy = 0;
-    HadAVector = false;
-  }
+    : AllocaSize(Size), TD(td), IsNotTrivial(false), ScalarKind(Unknown),
+      VectorTy(0), HadNonMemTransferAccess(false) { }
  
    AllocaInst *TryConvert(AllocaInst *AI);
  
  private:
    bool CanConvertToScalar(Value *V, uint64_t Offset);
-  void MergeInType(const Type *In, uint64_t Offset);
+  void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset);
+  bool MergeInVectorType(VectorType *VInTy, uint64_t Offset);
    void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset);
  
-  Value *ConvertScalar_ExtractValue(Value *NV, const Type *ToType,
+  Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType,
                                      uint64_t Offset, IRBuilder<> &Builder);
    Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal,
                                     uint64_t Offset, IRBuilder<> &Builder);
@@ -270,29 +293,44 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
    if (!CanConvertToScalar(AI, 0) || !IsNotTrivial)
      return 0;
  
+  // If an alloca has only memset / memcpy uses, it may still have an Unknown
+  // ScalarKind. Treat it as an Integer below.
+  if (ScalarKind == Unknown)
+    ScalarKind = Integer;
+
+  if (ScalarKind == Vector && VectorTy->getBitWidth() != AllocaSize * 8)
+    ScalarKind = Integer;
+
    // If we were able to find a vector type that can handle this with
    // insert/extract elements, and if there was at least one use that had
    // a vector type, promote this to a vector.  We don't want to promote
    // random stuff that doesn't use vectors (e.g. <9 x double>) because then
    // we just get a lot of insert/extracts.  If at least one vector is
    // involved, then we probably really do have a union of vector/array.
-  const Type *NewTy;
-  if (VectorTy && VectorTy->isVectorTy() && HadAVector) {
+  Type *NewTy;
+  if (ScalarKind == Vector) {
+    assert(VectorTy && "Missing type for vector scalar.");
      DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n  TYPE = "
            << *VectorTy << '\n');
      NewTy = VectorTy;  // Use the vector type.
    } else {
+    unsigned BitWidth = AllocaSize * 8;
+    if ((ScalarKind == ImplicitVector || ScalarKind == Integer) &&
+        !HadNonMemTransferAccess && !TD.fitsInLegalInteger(BitWidth))
+      return 0;
+
      DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n");
      // Create and insert the integer alloca.
-    NewTy = IntegerType::get(AI->getContext(), AllocaSize*8);
+    NewTy = IntegerType::get(AI->getContext(), BitWidth);
    }
    AllocaInst *NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin());
    ConvertUsesToScalar(AI, NewAI, 0);
    return NewAI;
  }
  
-/// MergeInType - Add the 'In' type to the accumulated vector type (VectorTy)
-/// so far at the offset specified by Offset (which is specified in bytes).
+/// MergeInTypeForLoadOrStore - Add the 'In' type to the accumulated vector type
+/// (VectorTy) so far at the offset specified by Offset (which is specified in
+/// bytes).
  ///
  /// There are two cases we handle here:
  ///   1) A union of vector types of the same size and potentially its elements.
@@ -303,50 +341,65 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
  ///      large) integer type with extract and insert operations where the loads
  ///      and stores would mutate the memory.  We mark this by setting VectorTy
  ///      to VoidTy.
-void ConvertToScalarInfo::MergeInType(const Type *In, uint64_t Offset) {
+void ConvertToScalarInfo::MergeInTypeForLoadOrStore(Type *In,
+                                                    uint64_t Offset) {
    // If we already decided to turn this into a blob of integer memory, there is
    // nothing to be done.
-  if (VectorTy && VectorTy->isVoidTy())
+  if (ScalarKind == Integer)
      return;
  
    // If this could be contributing to a vector, analyze it.
  
    // If the In type is a vector that is the same size as the alloca, see if it
    // matches the existing VecTy.
-  if (const VectorType *VInTy = dyn_cast<VectorType>(In)) {
-    // Remember if we saw a vector type.
-    HadAVector = true;
-
-    if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) {
-      // If we're storing/loading a vector of the right size, allow it as a
-      // vector.  If this the first vector we see, remember the type so that
-      // we know the element size.  If this is a subsequent access, ignore it
-      // even if it is a differing type but the same size.  Worst case we can
-      // bitcast the resultant vectors.
-      if (VectorTy == 0)
-        VectorTy = VInTy;
+  if (VectorType *VInTy = dyn_cast<VectorType>(In)) {
+    if (MergeInVectorType(VInTy, Offset))
        return;
-    }
    } else if (In->isFloatTy() || In->isDoubleTy() ||
               (In->isIntegerTy() && In->getPrimitiveSizeInBits() >= 8 &&
                isPowerOf2_32(In->getPrimitiveSizeInBits()))) {
+    // Full width accesses can be ignored, because they can always be turned
+    // into bitcasts.
+    unsigned EltSize = In->getPrimitiveSizeInBits()/8;
+    if (EltSize == AllocaSize)
+      return;
+
      // If we're accessing something that could be an element of a vector, see
      // if the implied vector agrees with what we already have and if Offset is
      // compatible with it.
-    unsigned EltSize = In->getPrimitiveSizeInBits()/8;
      if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 &&
-        (VectorTy == 0 ||
-         cast<VectorType>(VectorTy)->getElementType()
-               ->getPrimitiveSizeInBits()/8 == EltSize)) {
-      if (VectorTy == 0)
+        (!VectorTy || EltSize == VectorTy->getElementType()
+                                         ->getPrimitiveSizeInBits()/8)) {
+      if (!VectorTy) {
+        ScalarKind = ImplicitVector;
          VectorTy = VectorType::get(In, AllocaSize/EltSize);
+      }
        return;
      }
    }
  
    // Otherwise, we have a case that we can't handle with an optimized vector
    // form.  We can still turn this into a large integer.
-  VectorTy = Type::getVoidTy(In->getContext());
+  ScalarKind = Integer;
+}
+
+/// MergeInVectorType - Handles the vector case of MergeInTypeForLoadOrStore,
+/// returning true if the type was successfully merged and false otherwise.
+bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy,
+                                            uint64_t Offset) {
+  if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) {
+    // If we're storing/loading a vector of the right size, allow it as a
+    // vector.  If this is the first vector we see, remember the type so that
+    // we know the element size. If this is a subsequent access, ignore it
+    // even if it is a differing type but the same size. Worst case we can
+    // bitcast the resultant vectors.
+    if (!VectorTy)
+      VectorTy = VInTy;
+    ScalarKind = Vector;
+    return true;
+  }
+
+  return false;
  }
  
  /// CanConvertToScalar - V is a pointer.  If we can convert the pointee and all
@@ -364,27 +417,30 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
  
      if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
        // Don't break volatile loads.
-      if (LI->isVolatile())
+      if (!LI->isSimple())
          return false;
        // Don't touch MMX operations.
        if (LI->getType()->isX86_MMXTy())
          return false;
-      MergeInType(LI->getType(), Offset);
+      HadNonMemTransferAccess = true;
+      MergeInTypeForLoadOrStore(LI->getType(), Offset);
        continue;
      }
  
      if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
        // Storing the pointer, not into the value?
-      if (SI->getOperand(0) == V || SI->isVolatile()) return false;
+      if (SI->getOperand(0) == V || !SI->isSimple()) return false;
        // Don't touch MMX operations.
        if (SI->getOperand(0)->getType()->isX86_MMXTy())
          return false;
-      MergeInType(SI->getOperand(0)->getType(), Offset);
+      HadNonMemTransferAccess = true;
+      MergeInTypeForLoadOrStore(SI->getOperand(0)->getType(), Offset);
        continue;
      }
  
      if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
-      IsNotTrivial = true;  // Can't be mem2reg'd.
+      if (!onlyUsedByLifetimeMarkers(BCI))
+        IsNotTrivial = true;  // Can't be mem2reg'd.
        if (!CanConvertToScalar(BCI, Offset))
          return false;
        continue;
@@ -398,22 +454,36 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
        // Compute the offset that this GEP adds to the pointer.
        SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
        uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(),
-                                               &Indices[0], Indices.size());
+                                               Indices);
        // See if all uses can be converted.
        if (!CanConvertToScalar(GEP, Offset+GEPOffset))
          return false;
        IsNotTrivial = true;  // Can't be mem2reg'd.
+      HadNonMemTransferAccess = true;
        continue;
      }
  
      // If this is a constant sized memset of a constant value (e.g. 0) we can
      // handle it.
      if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
-      // Store of constant value and constant size.
-      if (!isa<ConstantInt>(MSI->getValue()) ||
-          !isa<ConstantInt>(MSI->getLength()))
+      // Store of constant value.
+      if (!isa<ConstantInt>(MSI->getValue()))
+        return false;
+
+      // Store of constant size.
+      ConstantInt *Len = dyn_cast<ConstantInt>(MSI->getLength());
+      if (!Len)
          return false;
+
+      // If the size differs from the alloca, we can only convert the alloca to
+      // an integer bag-of-bits.
+      // FIXME: This should handle all of the cases that are currently accepted
+      // as vector element insertions.
+      if (Len->getZExtValue() != AllocaSize || Offset != 0)
+        ScalarKind = Integer;
+
        IsNotTrivial = true;  // Can't be mem2reg'd.
+      HadNonMemTransferAccess = true;
        continue;
      }
  
@@ -428,6 +498,14 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
        continue;
      }
  
+    // If this is a lifetime intrinsic, we can handle it.
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+      if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+          II->getIntrinsicID() == Intrinsic::lifetime_end) {
+        continue;
+      }
+    }
+
      // Otherwise, we cannot handle this!
      return false;
    }
@@ -457,7 +535,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
        // Compute the offset that this GEP adds to the pointer.
        SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
        uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(),
-                                               &Indices[0], Indices.size());
+                                               Indices);
        ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8);
        GEP->eraseFromParent();
        continue;
@@ -467,7 +545,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
  
      if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
        // The load is a bit extract from NewAI shifted right by Offset bits.
-      Value *LoadedVal = Builder.CreateLoad(NewAI, "tmp");
+      Value *LoadedVal = Builder.CreateLoad(NewAI);
        Value *NewLoadVal
          = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder);
        LI->replaceAllUsesWith(NewLoadVal);
@@ -536,8 +614,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
          // pointer (bitcasted), then a store to our new alloca.
          assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?");
          Value *SrcPtr = MTI->getSource();
-        const PointerType* SPTy = cast<PointerType>(SrcPtr->getType());
-        const PointerType* AIPTy = cast<PointerType>(NewAI->getType());
+        PointerType* SPTy = cast<PointerType>(SrcPtr->getType());
+        PointerType* AIPTy = cast<PointerType>(NewAI->getType());
          if (SPTy->getAddressSpace() != AIPTy->getAddressSpace()) {
            AIPTy = PointerType::get(AIPTy->getElementType(),
                                     SPTy->getAddressSpace());
@@ -553,8 +631,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
          assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?");
          LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval");
  
-        const PointerType* DPTy = cast<PointerType>(MTI->getDest()->getType());
-        const PointerType* AIPTy = cast<PointerType>(NewAI->getType());
+        PointerType* DPTy = cast<PointerType>(MTI->getDest()->getType());
+        PointerType* AIPTy = cast<PointerType>(NewAI->getType());
          if (DPTy->getAddressSpace() != AIPTy->getAddressSpace()) {
            AIPTy = PointerType::get(AIPTy->getElementType(),
                                     DPTy->getAddressSpace());
@@ -571,6 +649,16 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
        continue;
      }
  
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+      if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+          II->getIntrinsicID() == Intrinsic::lifetime_end) {
+        // There's no need to preserve these, as the resulting alloca will be
+        // converted to a register anyway.
+        II->eraseFromParent();
+        continue;
+      }
+    }
+
      llvm_unreachable("Unsupported operation!");
    }
  }
@@ -586,17 +674,20 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
  /// Offset is an offset from the original alloca, in bits that need to be
  /// shifted to the right.
  Value *ConvertToScalarInfo::
-ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType,
+ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
                             uint64_t Offset, IRBuilder<> &Builder) {
    // If the load is of the whole new alloca, no conversion is needed.
-  if (FromVal->getType() == ToType && Offset == 0)
+  Type *FromType = FromVal->getType();
+  if (FromType == ToType && Offset == 0)
      return FromVal;
  
    // If the result alloca is a vector type, this is either an element
    // access or a bitcast to another vector type of the same size.
-  if (const VectorType *VTy = dyn_cast<VectorType>(FromVal->getType())) {
-    if (ToType->isVectorTy())
-      return Builder.CreateBitCast(FromVal, ToType, "tmp");
+  if (VectorType *VTy = dyn_cast<VectorType>(FromType)) {
+    unsigned FromTypeSize = TD.getTypeAllocSize(FromType);
+    unsigned ToTypeSize = TD.getTypeAllocSize(ToType);
+    if (FromTypeSize == ToTypeSize)
+        return Builder.CreateBitCast(FromVal, ToType);
  
      // Otherwise it must be an element access.
      unsigned Elt = 0;
@@ -606,40 +697,39 @@ ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType,
        assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
      }
      // Return the element extracted out of it.
-    Value *V = Builder.CreateExtractElement(FromVal, ConstantInt::get(
-                    Type::getInt32Ty(FromVal->getContext()), Elt), "tmp");
+    Value *V = Builder.CreateExtractElement(FromVal, Builder.getInt32(Elt));
      if (V->getType() != ToType)
-      V = Builder.CreateBitCast(V, ToType, "tmp");
+      V = Builder.CreateBitCast(V, ToType);
      return V;
    }
  
    // If ToType is a first class aggregate, extract out each of the pieces and
    // use insertvalue's to form the FCA.
-  if (const StructType *ST = dyn_cast<StructType>(ToType)) {
+  if (StructType *ST = dyn_cast<StructType>(ToType)) {
      const StructLayout &Layout = *TD.getStructLayout(ST);
      Value *Res = UndefValue::get(ST);
      for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
        Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i),
                                          Offset+Layout.getElementOffsetInBits(i),
                                                Builder);
-      Res = Builder.CreateInsertValue(Res, Elt, i, "tmp");
+      Res = Builder.CreateInsertValue(Res, Elt, i);
      }
      return Res;
    }
  
-  if (const ArrayType *AT = dyn_cast<ArrayType>(ToType)) {
+  if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) {
      uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
      Value *Res = UndefValue::get(AT);
      for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
        Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(),
                                                Offset+i*EltSize, Builder);
-      Res = Builder.CreateInsertValue(Res, Elt, i, "tmp");
+      Res = Builder.CreateInsertValue(Res, Elt, i);
      }
      return Res;
    }
  
    // Otherwise, this must be a union that was converted to an integer value.
-  const IntegerType *NTy = cast<IntegerType>(FromVal->getType());
+  IntegerType *NTy = cast<IntegerType>(FromVal->getType());
  
    // If this is a big-endian system and the load is narrower than the
    // full alloca type, we need to do a shift to get the right bits.
@@ -659,33 +749,31 @@ ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType,
    // only some bits are used.
    if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth())
      FromVal = Builder.CreateLShr(FromVal,
-                                 ConstantInt::get(FromVal->getType(),
-                                                           ShAmt), "tmp");
+                                 ConstantInt::get(FromVal->getType(), ShAmt));
    else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth())
      FromVal = Builder.CreateShl(FromVal,
-                                ConstantInt::get(FromVal->getType(),
-                                                          -ShAmt), "tmp");
+                                ConstantInt::get(FromVal->getType(), -ShAmt));
  
    // Finally, unconditionally truncate the integer to the right width.
    unsigned LIBitWidth = TD.getTypeSizeInBits(ToType);
    if (LIBitWidth < NTy->getBitWidth())
      FromVal =
        Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(),
-                                                    LIBitWidth), "tmp");
+                                                    LIBitWidth));
    else if (LIBitWidth > NTy->getBitWidth())
      FromVal =
         Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(),
-                                                    LIBitWidth), "tmp");
+                                                    LIBitWidth));
  
    // If the result is an integer, this is a trunc or bitcast.
    if (ToType->isIntegerTy()) {
      // Should be done.
    } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) {
      // Just do a bitcast, we know the sizes match up.
-    FromVal = Builder.CreateBitCast(FromVal, ToType, "tmp");
+    FromVal = Builder.CreateBitCast(FromVal, ToType);
    } else {
      // Otherwise must be a pointer.
-    FromVal = Builder.CreateIntToPtr(FromVal, ToType, "tmp");
+    FromVal = Builder.CreateIntToPtr(FromVal, ToType);
    }
    assert(FromVal->getType() == ToType && "Didn't convert right?");
    return FromVal;
@@ -705,37 +793,32 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
                            uint64_t Offset, IRBuilder<> &Builder) {
    // Convert the stored type to the actual type, shift it left to insert
    // then 'or' into place.
-  const Type *AllocaType = Old->getType();
+  Type *AllocaType = Old->getType();
    LLVMContext &Context = Old->getContext();
  
-  if (const VectorType *VTy = dyn_cast<VectorType>(AllocaType)) {
+  if (VectorType *VTy = dyn_cast<VectorType>(AllocaType)) {
      uint64_t VecSize = TD.getTypeAllocSizeInBits(VTy);
      uint64_t ValSize = TD.getTypeAllocSizeInBits(SV->getType());
  
      // Changing the whole vector with memset or with an access of a different
      // vector type?
      if (ValSize == VecSize)
-      return Builder.CreateBitCast(SV, AllocaType, "tmp");
-
-    uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType());
+        return Builder.CreateBitCast(SV, AllocaType);
  
      // Must be an element insertion.
+    Type *EltTy = VTy->getElementType();
+    if (SV->getType() != EltTy)
+      SV = Builder.CreateBitCast(SV, EltTy);
+    uint64_t EltSize = TD.getTypeAllocSizeInBits(EltTy);
      unsigned Elt = Offset/EltSize;
-
-    if (SV->getType() != VTy->getElementType())
-      SV = Builder.CreateBitCast(SV, VTy->getElementType(), "tmp");
-
-    SV = Builder.CreateInsertElement(Old, SV,
-                     ConstantInt::get(Type::getInt32Ty(SV->getContext()), Elt),
-                                     "tmp");
-    return SV;
+    return Builder.CreateInsertElement(Old, SV, Builder.getInt32(Elt));
    }
  
    // If SV is a first-class aggregate value, insert each value recursively.
-  if (const StructType *ST = dyn_cast<StructType>(SV->getType())) {
+  if (StructType *ST = dyn_cast<StructType>(SV->getType())) {
      const StructLayout &Layout = *TD.getStructLayout(ST);
      for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
-      Value *Elt = Builder.CreateExtractValue(SV, i, "tmp");
+      Value *Elt = Builder.CreateExtractValue(SV, i);
        Old = ConvertScalar_InsertValue(Elt, Old,
                                        Offset+Layout.getElementOffsetInBits(i),
                                        Builder);
@@ -743,10 +826,10 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
      return Old;
    }
  
-  if (const ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) {
+  if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) {
      uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
      for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
-      Value *Elt = Builder.CreateExtractValue(SV, i, "tmp");
+      Value *Elt = Builder.CreateExtractValue(SV, i);
        Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder);
      }
      return Old;
@@ -759,20 +842,19 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
    unsigned SrcStoreWidth = TD.getTypeStoreSizeInBits(SV->getType());
    unsigned DestStoreWidth = TD.getTypeStoreSizeInBits(AllocaType);
    if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy())
-    SV = Builder.CreateBitCast(SV,
-                            IntegerType::get(SV->getContext(),SrcWidth), "tmp");
+    SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth));
    else if (SV->getType()->isPointerTy())
-    SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext()), "tmp");
+    SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext()));
  
    // Zero extend or truncate the value if needed.
    if (SV->getType() != AllocaType) {
      if (SV->getType()->getPrimitiveSizeInBits() <
               AllocaType->getPrimitiveSizeInBits())
-      SV = Builder.CreateZExt(SV, AllocaType, "tmp");
+      SV = Builder.CreateZExt(SV, AllocaType);
      else {
        // Truncation may be needed if storing more than the alloca can hold
        // (undefined behavior).
-      SV = Builder.CreateTrunc(SV, AllocaType, "tmp");
+      SV = Builder.CreateTrunc(SV, AllocaType);
        SrcWidth = DestWidth;
        SrcStoreWidth = DestStoreWidth;
      }
@@ -795,12 +877,10 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
    // only some bits in the structure are set.
    APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth));
    if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) {
-    SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(),
-                           ShAmt), "tmp");
+    SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), ShAmt));
      Mask <<= ShAmt;
    } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) {
-    SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(),
-                            -ShAmt), "tmp");
+    SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), -ShAmt));
      Mask = Mask.lshr(-ShAmt);
    }
  
@@ -845,15 +925,37 @@ bool SROA::runOnFunction(Function &F) {
  namespace {
  class AllocaPromoter : public LoadAndStorePromoter {
    AllocaInst *AI;
+  DIBuilder *DIB;
+  SmallVector<DbgDeclareInst *, 4> DDIs;
+  SmallVector<DbgValueInst *, 4> DVIs;
  public:
-  AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S)
-    : LoadAndStorePromoter(Insts, S), AI(0) {}
+  AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
+                 DIBuilder *DB)
+    : LoadAndStorePromoter(Insts, S), AI(0), DIB(DB) {}
    
    void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) {
      // Remember which alloca we're promoting (for isInstInList).
      this->AI = AI;
+    if (MDNode *DebugNode = MDNode::getIfExists(AI->getContext(), AI))
+      for (Value::use_iterator UI = DebugNode->use_begin(),
+             E = DebugNode->use_end(); UI != E; ++UI)
+        if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI))
+          DDIs.push_back(DDI);
+        else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI))
+          DVIs.push_back(DVI);
+
      LoadAndStorePromoter::run(Insts);
      AI->eraseFromParent();
+    for (SmallVector<DbgDeclareInst *, 4>::iterator I = DDIs.begin(), 
+           E = DDIs.end(); I != E; ++I) {
+      DbgDeclareInst *DDI = *I;
+      DDI->eraseFromParent();
+    }
+    for (SmallVector<DbgValueInst *, 4>::iterator I = DVIs.begin(), 
+           E = DVIs.end(); I != E; ++I) {
+      DbgValueInst *DVI = *I;
+      DVI->eraseFromParent();
+    }
    }
    
    virtual bool isInstInList(Instruction *I,
@@ -862,6 +964,45 @@ public:
        return LI->getOperand(0) == AI;
      return cast<StoreInst>(I)->getPointerOperand() == AI;
    }
+
+  virtual void updateDebugInfo(Instruction *Inst) const {
+    for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), 
+           E = DDIs.end(); I != E; ++I) {
+      DbgDeclareInst *DDI = *I;
+      if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+        ConvertDebugDeclareToDebugValue(DDI, SI, *DIB);
+      else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+        ConvertDebugDeclareToDebugValue(DDI, LI, *DIB);
+    }
+    for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), 
+           E = DVIs.end(); I != E; ++I) {
+      DbgValueInst *DVI = *I;
+      if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+        Instruction *DbgVal = NULL;
+        // If an argument is zero or sign extended then use the argument
+        // directly. The extension may be zapped by a later optimization pass.
+        Argument *ExtendedArg = NULL;
+        if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
+          ExtendedArg = dyn_cast<Argument>(ZExt->getOperand(0));
+        if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
+          ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0));
+        if (ExtendedArg)
+          DbgVal = DIB->insertDbgValueIntrinsic(ExtendedArg, 0, 
+                                                DIVariable(DVI->getVariable()),
+                                                SI);
+        else
+          DbgVal = DIB->insertDbgValueIntrinsic(SI->getOperand(0), 0, 
+                                                DIVariable(DVI->getVariable()),
+                                                SI);
+        DbgVal->setDebugLoc(DVI->getDebugLoc());
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+        Instruction *DbgVal = 
+          DIB->insertDbgValueIntrinsic(LI->getOperand(0), 0, 
+                                       DIVariable(DVI->getVariable()), LI);
+        DbgVal->setDebugLoc(DVI->getDebugLoc());
+      }
+    }
+  }
  };
  } // end anon namespace
  
@@ -885,7 +1026,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) {
    for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end();
         UI != UE; ++UI) {
      LoadInst *LI = dyn_cast<LoadInst>(*UI);
-    if (LI == 0 || LI->isVolatile()) return false;
+    if (LI == 0 || !LI->isSimple()) return false;
      
      // Both operands to the select need to be dereferencable, either absolutely
      // (e.g. allocas) or at this point because we can see other accesses to it.
@@ -926,7 +1067,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) {
    for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end();
         UI != UE; ++UI) {
      LoadInst *LI = dyn_cast<LoadInst>(*UI);
-    if (LI == 0 || LI->isVolatile()) return false;
+    if (LI == 0 || !LI->isSimple()) return false;
      
      // For now we only allow loads in the same block as the PHI.  This is a
      // common case that happens when instcombine merges two loads through a PHI.
@@ -947,17 +1088,21 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) {
    // trapping load in the predecessor if it is a critical edge.
    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
      BasicBlock *Pred = PN->getIncomingBlock(i);
+    Value *InVal = PN->getIncomingValue(i);
+
+    // If the terminator of the predecessor has side-effects (an invoke),
+    // there is no safe place to put a load in the predecessor.
+    if (Pred->getTerminator()->mayHaveSideEffects())
+      return false;
+
+    // If the value is produced by the terminator of the predecessor
+    // (an invoke), there is no valid place to put a load in the predecessor.
+    if (Pred->getTerminator() == InVal)
+      return false;
  
      // If the predecessor has a single successor, then the edge isn't critical.
      if (Pred->getTerminator()->getNumSuccessors() == 1)
        continue;
-    
-    Value *InVal = PN->getIncomingValue(i);
-    
-    // If the InVal is an invoke in the pred, we can't put a load on the edge.
-    if (InvokeInst *II = dyn_cast<InvokeInst>(InVal))
-      if (II->getParent() == Pred)
-        return false;
  
      // If this pointer is always safe to load, or if we can prove that there is
      // already a load in the block, then we can move the load to the pred block.
@@ -984,13 +1129,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
         UI != UE; ++UI) {
      User *U = *UI;
      if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
-      if (LI->isVolatile())
+      if (!LI->isSimple())
          return false;
        continue;
      }
      
      if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
-      if (SI->getOperand(0) == AI || SI->isVolatile())
+      if (SI->getOperand(0) == AI || !SI->isSimple())
          return false;   // Don't allow a store OF the AI, only INTO the AI.
        continue;
      }
@@ -1032,6 +1177,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
        continue;
      }
      
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+      if (onlyUsedByLifetimeMarkers(BCI)) {
+        InstsToRewrite.insert(BCI);
+        continue;
+      }
+    }
+    
      return false;
    }
  
@@ -1043,6 +1195,18 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
    // If we have instructions that need to be rewritten for this to be promotable
    // take care of it now.
    for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) {
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(InstsToRewrite[i])) {
+      // This could only be a bitcast used by nothing but lifetime intrinsics.
+      for (BitCastInst::use_iterator I = BCI->use_begin(), E = BCI->use_end();
+           I != E;) {
+        Use &U = I.getUse();
+        ++I;
+        cast<Instruction>(U.getUser())->eraseFromParent();
+      }
+      BCI->eraseFromParent();
+      continue;
+    }
+
      if (SelectInst *SI = dyn_cast<SelectInst>(InstsToRewrite[i])) {
        // Selects in InstsToRewrite only have load uses.  Rewrite each as two
        // loads with a new select.
@@ -1053,7 +1217,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
          LoadInst *TrueLoad = 
            Builder.CreateLoad(SI->getTrueValue(), LI->getName()+".t");
          LoadInst *FalseLoad = 
-          Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".t");
+          Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".f");
          
          // Transfer alignment and TBAA info if present.
          TrueLoad->setAlignment(LI->getAlignment());
@@ -1082,8 +1246,9 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
        continue;
      }
      
-    const Type *LoadTy = cast<PointerType>(PN->getType())->getElementType();
-    PHINode *NewPN = PHINode::Create(LoadTy, PN->getName()+".ld", PN);
+    Type *LoadTy = cast<PointerType>(PN->getType())->getElementType();
+    PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(),
+                                     PN->getName()+".ld", PN);
  
      // Get the TBAA tag and alignment to use from one of the loads.  It doesn't
      // matter which one we get and if any differ, it doesn't matter.
@@ -1123,7 +1288,6 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
    return true;
  }
  
-
  bool SROA::performPromotion(Function &F) {
    std::vector<AllocaInst*> Allocas;
    DominatorTree *DT = 0;
@@ -1131,7 +1295,7 @@ bool SROA::performPromotion(Function &F) {
      DT = &getAnalysis<DominatorTree>();
  
    BasicBlock &BB = F.getEntryBlock();  // Get the entry node for the function
-
+  DIBuilder DIB(*F.getParent());
    bool Changed = false;
    SmallVector<Instruction*, 64> Insts;
    while (1) {
@@ -1157,8 +1321,7 @@ bool SROA::performPromotion(Function &F) {
          for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
               UI != E; ++UI)
            Insts.push_back(cast<Instruction>(*UI));
-        
-        AllocaPromoter(Insts, SSA).run(AI, Insts);
+        AllocaPromoter(Insts, SSA, &DIB).run(AI, Insts);
          Insts.clear();
        }
      }
@@ -1173,21 +1336,21 @@ bool SROA::performPromotion(Function &F) {
  /// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for
  /// SROA.  It must be a struct or array type with a small number of elements.
  static bool ShouldAttemptScalarRepl(AllocaInst *AI) {
-  const Type *T = AI->getAllocatedType();
+  Type *T = AI->getAllocatedType();
    // Do not promote any struct into more than 32 separate vars.
-  if (const StructType *ST = dyn_cast<StructType>(T))
+  if (StructType *ST = dyn_cast<StructType>(T))
      return ST->getNumElements() <= 32;
    // Arrays are much less likely to be safe for SROA; only consider
    // them if they are very small.
-  if (const ArrayType *AT = dyn_cast<ArrayType>(T))
+  if (ArrayType *AT = dyn_cast<ArrayType>(T))
      return AT->getNumElements() <= 8;
    return false;
  }
  
  
  // performScalarRepl - This algorithm is a simple worklist driven algorithm,
-// which runs on all of the malloc/alloca instructions in the function, removing
-// them if they are only used by getelementptr instructions.
+// which runs on all of the alloca instructions in the function, removing them
+// if they are only used by getelementptr instructions.
  //
  bool SROA::performScalarRepl(Function &F) {
    std::vector<AllocaInst*> WorkList;
@@ -1221,12 +1384,15 @@ bool SROA::performScalarRepl(Function &F) {
      // the constant global instead.  This is commonly produced by the CFE by
      // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
      // is only subsequently read.
-    if (MemTransferInst *TheCopy = isOnlyCopiedFromConstantGlobal(AI)) {
+    SmallVector<Instruction *, 4> ToDelete;
+    if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(AI, ToDelete)) {
        DEBUG(dbgs() << "Found alloca equal to global: " << *AI << '\n');
-      DEBUG(dbgs() << "  memcpy = " << *TheCopy << '\n');
-      Constant *TheSrc = cast<Constant>(TheCopy->getSource());
+      DEBUG(dbgs() << "  memcpy = " << *Copy << '\n');
+      for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
+        ToDelete[i]->eraseFromParent();
+      Constant *TheSrc = cast<Constant>(Copy->getSource());
        AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType()));
-      TheCopy->eraseFromParent();  // Don't mutate the global.
+      Copy->eraseFromParent();  // Don't mutate the global.
        AI->eraseFromParent();
        ++NumGlobals;
        Changed = true;
@@ -1281,7 +1447,7 @@ void SROA::DoScalarReplacement(AllocaInst *AI,
                                 std::vector<AllocaInst*> &WorkList) {
    DEBUG(dbgs() << "Found inst to SROA: " << *AI << '\n');
    SmallVector<AllocaInst*, 32> ElementAllocas;
-  if (const StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
+  if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
      ElementAllocas.reserve(ST->getNumContainedTypes());
      for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) {
        AllocaInst *NA = new AllocaInst(ST->getContainedType(i), 0,
@@ -1291,9 +1457,9 @@ void SROA::DoScalarReplacement(AllocaInst *AI,
        WorkList.push_back(NA);  // Add to worklist for recursive processing
      }
    } else {
-    const ArrayType *AT = cast<ArrayType>(AI->getAllocatedType());
+    ArrayType *AT = cast<ArrayType>(AI->getAllocatedType());
      ElementAllocas.reserve(AT->getNumElements());
-    const Type *ElTy = AT->getElementType();
+    Type *ElTy = AT->getElementType();
      for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
        AllocaInst *NA = new AllocaInst(ElTy, 0, AI->getAlignment(),
                                        AI->getName() + "." + Twine(i), AI);
@@ -1357,22 +1523,26 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset,
                        UI.getOperandNo() == 0, Info, MI,
                        true /*AllowWholeAccess*/);
      } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
-      if (LI->isVolatile())
+      if (!LI->isSimple())
          return MarkUnsafe(Info, User);
-      const Type *LIType = LI->getType();
+      Type *LIType = LI->getType();
        isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType),
                        LIType, false, Info, LI, true /*AllowWholeAccess*/);
        Info.hasALoadOrStore = true;
          
      } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
        // Store is ok if storing INTO the pointer, not storing the pointer
-      if (SI->isVolatile() || SI->getOperand(0) == I)
+      if (!SI->isSimple() || SI->getOperand(0) == I)
          return MarkUnsafe(Info, User);
          
-      const Type *SIType = SI->getOperand(0)->getType();
+      Type *SIType = SI->getOperand(0)->getType();
        isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType),
                        SIType, true, Info, SI, true /*AllowWholeAccess*/);
        Info.hasALoadOrStore = true;
+    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+      if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+          II->getIntrinsicID() != Intrinsic::lifetime_end)
+        return MarkUnsafe(Info, User);
      } else if (isa<PHINode>(User) || isa<SelectInst>(User)) {
        isSafePHISelectUseForScalarRepl(User, Offset, Info);
      } else {
@@ -1412,19 +1582,19 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset,
          return MarkUnsafe(Info, User);
        isSafePHISelectUseForScalarRepl(GEPI, Offset, Info);
      } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
-      if (LI->isVolatile())
+      if (!LI->isSimple())
          return MarkUnsafe(Info, User);
-      const Type *LIType = LI->getType();
+      Type *LIType = LI->getType();
        isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType),
                        LIType, false, Info, LI, false /*AllowWholeAccess*/);
        Info.hasALoadOrStore = true;
        
      } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
        // Store is ok if storing INTO the pointer, not storing the pointer
-      if (SI->isVolatile() || SI->getOperand(0) == I)
+      if (!SI->isSimple() || SI->getOperand(0) == I)
          return MarkUnsafe(Info, User);
        
-      const Type *SIType = SI->getOperand(0)->getType();
+      Type *SIType = SI->getOperand(0)->getType();
        isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType),
                        SIType, true, Info, SI, false /*AllowWholeAccess*/);
        Info.hasALoadOrStore = true;
@@ -1463,8 +1633,7 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI,
    // Compute the offset due to this GEP and check if the alloca has a
    // component element at that offset.
    SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
-  Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(),
-                                 &Indices[0], Indices.size());
+  Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices);
    if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, 0))
      MarkUnsafe(Info, GEPI);
  }
@@ -1473,14 +1642,14 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI,
  /// elements of the same type (which is always true for arrays).  If so,
  /// return true with NumElts and EltTy set to the number of elements and the
  /// element type, respectively.
-static bool isHomogeneousAggregate(const Type *T, unsigned &NumElts,
-                                   const Type *&EltTy) {
-  if (const ArrayType *AT = dyn_cast<ArrayType>(T)) {
+static bool isHomogeneousAggregate(Type *T, unsigned &NumElts,
+                                   Type *&EltTy) {
+  if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
      NumElts = AT->getNumElements();
      EltTy = (NumElts == 0 ? 0 : AT->getElementType());
      return true;
    }
-  if (const StructType *ST = dyn_cast<StructType>(T)) {
+  if (StructType *ST = dyn_cast<StructType>(T)) {
      NumElts = ST->getNumContainedTypes();
      EltTy = (NumElts == 0 ? 0 : ST->getContainedType(0));
      for (unsigned n = 1; n < NumElts; ++n) {
@@ -1494,12 +1663,12 @@ static bool isHomogeneousAggregate(const Type *T, unsigned &NumElts,
  
  /// isCompatibleAggregate - Check if T1 and T2 are either the same type or are
  /// "homogeneous" aggregates with the same element type and number of elements.
-static bool isCompatibleAggregate(const Type *T1, const Type *T2) {
+static bool isCompatibleAggregate(Type *T1, Type *T2) {
    if (T1 == T2)
      return true;
  
    unsigned NumElts1, NumElts2;
-  const Type *EltTy1, *EltTy2;
+  Type *EltTy1, *EltTy2;
    if (isHomogeneousAggregate(T1, NumElts1, EltTy1) &&
        isHomogeneousAggregate(T2, NumElts2, EltTy2) &&
        NumElts1 == NumElts2 &&
@@ -1517,7 +1686,7 @@ static bool isCompatibleAggregate(const Type *T1, const Type *T2) {
  /// If AllowWholeAccess is true, then this allows uses of the entire alloca as a
  /// unit.  If false, it only allows accesses known to be in a single element.
  void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
-                           const Type *MemOpType, bool isStore,
+                           Type *MemOpType, bool isStore,
                             AllocaInfo &Info, Instruction *TheAccess,
                             bool AllowWholeAccess) {
    // Check if this is a load/store of the entire alloca.
@@ -1544,7 +1713,7 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
      }
    }
    // Check if the offset/size correspond to a component within the alloca type.
-  const Type *T = Info.AI->getAllocatedType();
+  Type *T = Info.AI->getAllocatedType();
    if (TypeHasComponent(T, Offset, MemSize)) {
      Info.hasSubelementAccess = true;
      return;
@@ -1555,16 +1724,16 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
  
  /// TypeHasComponent - Return true if T has a component type with the
  /// specified offset and size.  If Size is zero, do not check the size.
-bool SROA::TypeHasComponent(const Type *T, uint64_t Offset, uint64_t Size) {
-  const Type *EltTy;
+bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) {
+  Type *EltTy;
    uint64_t EltSize;
-  if (const StructType *ST = dyn_cast<StructType>(T)) {
+  if (StructType *ST = dyn_cast<StructType>(T)) {
      const StructLayout *Layout = TD->getStructLayout(ST);
      unsigned EltIdx = Layout->getElementContainingOffset(Offset);
      EltTy = ST->getContainedType(EltIdx);
      EltSize = TD->getTypeAllocSize(EltTy);
      Offset -= Layout->getElementOffset(EltIdx);
-  } else if (const ArrayType *AT = dyn_cast<ArrayType>(T)) {
+  } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
      EltTy = AT->getElementType();
      EltSize = TD->getTypeAllocSize(EltTy);
      if (Offset >= AT->getNumElements() * EltSize)
@@ -1611,9 +1780,17 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
        // address operand will be updated, so nothing else needs to be done.
        continue;
      }
+
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+      if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+          II->getIntrinsicID() == Intrinsic::lifetime_end) {
+        RewriteLifetimeIntrinsic(II, AI, Offset, NewElts);
+      }
+      continue;
+    }
      
      if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
-      const Type *LIType = LI->getType();
+      Type *LIType = LI->getType();
        
        if (isCompatibleAggregate(LIType, AI->getAllocatedType())) {
          // Replace:
@@ -1625,9 +1802,10 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
          //   %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1
          // (Also works for arrays instead of structs)
          Value *Insert = UndefValue::get(LIType);
+        IRBuilder<> Builder(LI);
          for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
-          Value *Load = new LoadInst(NewElts[i], "load", LI);
-          Insert = InsertValueInst::Create(Insert, Load, i, "insert", LI);
+          Value *Load = Builder.CreateLoad(NewElts[i], "load");
+          Insert = Builder.CreateInsertValue(Insert, Load, i, "insert");
          }
          LI->replaceAllUsesWith(Insert);
          DeadInsts.push_back(LI);
@@ -1642,7 +1820,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
      
      if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
        Value *Val = SI->getOperand(0);
-      const Type *SIType = Val->getType();
+      Type *SIType = Val->getType();
        if (isCompatibleAggregate(SIType, AI->getAllocatedType())) {
          // Replace:
          //   store { i32, i32 } %val, { i32, i32 }* %alloc
@@ -1652,9 +1830,10 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
          //   %val.1 = extractvalue { i32, i32 } %val, 1
          //   store i32 %val.1, i32* %alloc.1
          // (Also works for arrays instead of structs)
+        IRBuilder<> Builder(SI);
          for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
-          Value *Extract = ExtractValueInst::Create(Val, i, Val->getName(), SI);
-          new StoreInst(Extract, NewElts[i], SI);
+          Value *Extract = Builder.CreateExtractValue(Val, i, Val->getName());
+          Builder.CreateStore(Extract, NewElts[i]);
          }
          DeadInsts.push_back(SI);
        } else if (SIType->isIntegerTy() &&
@@ -1696,8 +1875,14 @@ void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
      return;
  
    // The bitcast references the original alloca.  Replace its uses with
-  // references to the first new element alloca.
-  Instruction *Val = NewElts[0];
+  // references to the alloca containing offset zero (which is normally at
+  // index zero, but might not be in cases involving structs with elements
+  // of size zero).
+  Type *T = AI->getAllocatedType();
+  uint64_t EltOffset = 0;
+  Type *IdxTy;
+  uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy);
+  Instruction *Val = NewElts[Idx];
    if (Val->getType() != BC->getDestTy()) {
      Val = new BitCastInst(Val, BC->getDestTy(), "", BC);
      Val->takeName(BC);
@@ -1711,10 +1896,10 @@ void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
  /// Sets T to the type of the element and Offset to the offset within that
  /// element.  IdxTy is set to the type of the index result to be used in a
  /// GEP instruction.
-uint64_t SROA::FindElementAndOffset(const Type *&T, uint64_t &Offset,
-                                    const Type *&IdxTy) {
+uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset,
+                                    Type *&IdxTy) {
    uint64_t Idx = 0;
-  if (const StructType *ST = dyn_cast<StructType>(T)) {
+  if (StructType *ST = dyn_cast<StructType>(T)) {
      const StructLayout *Layout = TD->getStructLayout(ST);
      Idx = Layout->getElementContainingOffset(Offset);
      T = ST->getContainedType(Idx);
@@ -1722,7 +1907,7 @@ uint64_t SROA::FindElementAndOffset(const Type *&T, uint64_t &Offset,
      IdxTy = Type::getInt32Ty(T->getContext());
      return Idx;
    }
-  const ArrayType *AT = cast<ArrayType>(T);
+  ArrayType *AT = cast<ArrayType>(T);
    T = AT->getElementType();
    uint64_t EltSize = TD->getTypeAllocSize(T);
    Idx = Offset / EltSize;
@@ -1738,13 +1923,12 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
                        SmallVector<AllocaInst*, 32> &NewElts) {
    uint64_t OldOffset = Offset;
    SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
-  Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(),
-                                 &Indices[0], Indices.size());
+  Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices);
  
    RewriteForScalarRepl(GEPI, AI, Offset, NewElts);
  
-  const Type *T = AI->getAllocatedType();
-  const Type *IdxTy;
+  Type *T = AI->getAllocatedType();
+  Type *IdxTy;
    uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy);
    if (GEPI->getOperand(0) == AI)
      OldIdx = ~0ULL; // Force the GEP to be rewritten.
@@ -1758,7 +1942,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
    if (Idx == OldIdx)
      return;
  
-  const Type *i32Ty = Type::getInt32Ty(AI->getContext());
+  Type *i32Ty = Type::getInt32Ty(AI->getContext());
    SmallVector<Value*, 8> NewArgs;
    NewArgs.push_back(Constant::getNullValue(i32Ty));
    while (EltOffset != 0) {
@@ -1767,8 +1951,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
    }
    Instruction *Val = NewElts[Idx];
    if (NewArgs.size() > 1) {
-    Val = GetElementPtrInst::CreateInBounds(Val, NewArgs.begin(),
-                                            NewArgs.end(), "", GEPI);
+    Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI);
      Val->takeName(GEPI);
    }
    if (Val->getType() != GEPI->getType())
@@ -1777,6 +1960,62 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
    DeadInsts.push_back(GEPI);
  }
  
+/// RewriteLifetimeIntrinsic - II is a lifetime.start/lifetime.end. Rewrite it
+/// to mark the lifetime of the scalarized memory.
+void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
+                                    uint64_t Offset,
+                                    SmallVector<AllocaInst*, 32> &NewElts) {
+  ConstantInt *OldSize = cast<ConstantInt>(II->getArgOperand(0));
+  // Put matching lifetime markers on everything from Offset up to
+  // Offset+OldSize.
+  Type *AIType = AI->getAllocatedType();
+  uint64_t NewOffset = Offset;
+  Type *IdxTy;
+  uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy);
+
+  IRBuilder<> Builder(II);
+  uint64_t Size = OldSize->getLimitedValue();
+
+  if (NewOffset) {
+    // Bitcast the first affected element to i8* and index 'NewOffset' bytes
+    // into it.  SROA will split the alloca again later.
+    Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy());
+    V = Builder.CreateGEP(V, Builder.getInt64(NewOffset));
+
+    IdxTy = NewElts[Idx]->getAllocatedType();
+    uint64_t EltSize = TD->getTypeAllocSize(IdxTy) - NewOffset;
+    if (EltSize > Size) {
+      EltSize = Size;
+      Size = 0;
+    } else {
+      Size -= EltSize;
+    }
+    if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+      Builder.CreateLifetimeStart(V, Builder.getInt64(EltSize));
+    else
+      Builder.CreateLifetimeEnd(V, Builder.getInt64(EltSize));
+    ++Idx;
+  }
+
+  for (; Idx != NewElts.size() && Size; ++Idx) {
+    IdxTy = NewElts[Idx]->getAllocatedType();
+    uint64_t EltSize = TD->getTypeAllocSize(IdxTy);
+    if (EltSize > Size) {
+      EltSize = Size;
+      Size = 0;
+    } else {
+      Size -= EltSize;
+    }
+    if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+      Builder.CreateLifetimeStart(NewElts[Idx],
+                                  Builder.getInt64(EltSize));
+    else
+      Builder.CreateLifetimeEnd(NewElts[Idx],
+                                Builder.getInt64(EltSize));
+  }
+  DeadInsts.push_back(II);
+}
+
  /// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI.
  /// Rewrite it to copy or set the elements of the scalarized memory.
  void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
@@ -1824,7 +2063,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
  
      // If the pointer is not the right type, insert a bitcast to the right
      // type.
-    const Type *NewTy =
+    Type *NewTy =
        PointerType::get(AI->getType()->getElementType(), AddrSpace);
  
      if (OtherPtr->getType() != NewTy)
@@ -1844,16 +2083,16 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
      if (OtherPtr) {
        Value *Idx[2] = { Zero,
                        ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) };
-      OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx, Idx + 2,
+      OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx,
                                                OtherPtr->getName()+"."+Twine(i),
                                                     MI);
        uint64_t EltOffset;
-      const PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType());
-      const Type *OtherTy = OtherPtrTy->getElementType();
-      if (const StructType *ST = dyn_cast<StructType>(OtherTy)) {
+      PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType());
+      Type *OtherTy = OtherPtrTy->getElementType();
+      if (StructType *ST = dyn_cast<StructType>(OtherTy)) {
          EltOffset = TD->getStructLayout(ST)->getElementOffset(i);
        } else {
-        const Type *EltTy = cast<SequentialType>(OtherTy)->getElementType();
+        Type *EltTy = cast<SequentialType>(OtherTy)->getElementType();
          EltOffset = TD->getTypeAllocSize(EltTy)*i;
        }
  
@@ -1866,7 +2105,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
      }
  
      Value *EltPtr = NewElts[i];
-    const Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType();
+    Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType();
  
      // If we got down to a scalar, insert a load or store as appropriate.
      if (EltTy->isSingleValueType()) {
@@ -1892,7 +2131,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
            StoreVal = Constant::getNullValue(EltTy);  // 0.0, null, 0, <0,0>
          } else {
            // If EltTy is a vector type, get the element type.
-          const Type *ValTy = EltTy->getScalarType();
+          Type *ValTy = EltTy->getScalarType();
  
            // Construct an integer with the right value.
            unsigned EltSize = TD->getTypeSizeInBits(ValTy);
@@ -1913,8 +2152,8 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
            assert(StoreVal->getType() == ValTy && "Type mismatch!");
  
            // If the requested value was a vector constant, create it.
-          if (EltTy != ValTy) {
-            unsigned NumElts = cast<VectorType>(ValTy)->getNumElements();
+          if (EltTy->isVectorTy()) {
+            unsigned NumElts = cast<VectorType>(EltTy)->getNumElements();
              SmallVector<Constant*, 16> Elts(NumElts, StoreVal);
              StoreVal = ConstantVector::get(Elts);
            }
@@ -1927,6 +2166,8 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
      }
  
      unsigned EltSize = TD->getTypeAllocSize(EltTy);
+    if (!EltSize)
+      continue;
  
      IRBuilder<> Builder(MI);
  
@@ -1956,7 +2197,7 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
    // Extract each element out of the integer according to its structure offset
    // and store the element value to the individual alloca.
    Value *SrcVal = SI->getOperand(0);
-  const Type *AllocaEltTy = AI->getAllocatedType();
+  Type *AllocaEltTy = AI->getAllocatedType();
    uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy);
  
    IRBuilder<> Builder(SI);
@@ -1971,12 +2212,12 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
  
    // There are two forms here: AI could be an array or struct.  Both cases
    // have different ways to compute the element offset.
-  if (const StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
+  if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
      const StructLayout *Layout = TD->getStructLayout(EltSTy);
  
      for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
        // Get the number of bits to shift SrcVal to get the value.
-      const Type *FieldTy = EltSTy->getElementType(i);
+      Type *FieldTy = EltSTy->getElementType(i);
        uint64_t Shift = Layout->getElementOffsetInBits(i);
  
        if (TD->isBigEndian())
@@ -2012,8 +2253,8 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
      }
  
    } else {
-    const ArrayType *ATy = cast<ArrayType>(AllocaEltTy);
-    const Type *ArrayEltTy = ATy->getElementType();
+    ArrayType *ATy = cast<ArrayType>(AllocaEltTy);
+    Type *ArrayEltTy = ATy->getElementType();
      uint64_t ElementOffset = TD->getTypeAllocSizeInBits(ArrayEltTy);
      uint64_t ElementSizeBits = TD->getTypeSizeInBits(ArrayEltTy);
  
@@ -2069,7 +2310,7 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
                                          SmallVector<AllocaInst*, 32> &NewElts) {
    // Extract each element out of the NewElts according to its structure offset
    // and form the result value.
-  const Type *AllocaEltTy = AI->getAllocatedType();
+  Type *AllocaEltTy = AI->getAllocatedType();
    uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy);
  
    DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI
@@ -2079,10 +2320,10 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
    // have different ways to compute the element offset.
    const StructLayout *Layout = 0;
    uint64_t ArrayEltBitOffset = 0;
-  if (const StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
+  if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
      Layout = TD->getStructLayout(EltSTy);
    } else {
-    const Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType();
+    Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType();
      ArrayEltBitOffset = TD->getTypeAllocSizeInBits(ArrayEltTy);
    }
  
@@ -2093,14 +2334,14 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
      // Load the value from the alloca.  If the NewElt is an aggregate, cast
      // the pointer to an integer of the same size before doing the load.
      Value *SrcField = NewElts[i];
-    const Type *FieldTy =
+    Type *FieldTy =
        cast<PointerType>(SrcField->getType())->getElementType();
      uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy);
  
      // Ignore zero sized fields like {}, they obviously contain no data.
      if (FieldSizeBits == 0) continue;
  
-    const IntegerType *FieldIntTy = IntegerType::get(LI->getContext(),
+    IntegerType *FieldIntTy = IntegerType::get(LI->getContext(),
                                                       FieldSizeBits);
      if (!FieldTy->isIntegerTy() && !FieldTy->isFloatingPointTy() &&
          !FieldTy->isVectorTy())
@@ -2153,14 +2394,14 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
  /// HasPadding - Return true if the specified type has any structure or
  /// alignment padding in between the elements that would be split apart
  /// by SROA; return false otherwise.
-static bool HasPadding(const Type *Ty, const TargetData &TD) {
-  if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+static bool HasPadding(Type *Ty, const TargetData &TD) {
+  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
      Ty = ATy->getElementType();
      return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty);
    }
  
    // SROA currently handles only Arrays and Structs.
-  const StructType *STy = cast<StructType>(Ty);
+  StructType *STy = cast<StructType>(Ty);
    const StructLayout *SL = TD.getStructLayout(STy);
    unsigned PrevFieldBitOffset = 0;
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
@@ -2215,7 +2456,7 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) {
    // and fusion code.
    if (!Info.hasSubelementAccess && Info.hasALoadOrStore) {
      // If the struct/array just has one element, use basic SRoA.
-    if (const StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
+    if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
        if (ST->getNumElements() > 1) return false;
      } else {
        if (cast<ArrayType>(AI->getAllocatedType())->getNumElements() > 1)
@@ -2248,20 +2489,27 @@ static bool PointsToConstantGlobal(Value *V) {
  /// the uses.  If we see a memcpy/memmove that targets an unoffseted pointer to
  /// the alloca, and if the source pointer is a pointer to a constant global, we
  /// can optimize this.
-static bool isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
-                                           bool isOffset) {
+static bool
+isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
+                               bool isOffset,
+                               SmallVector<Instruction *, 4> &LifetimeMarkers) {
+  // We track lifetime intrinsics as we encounter them.  If we decide to go
+  // ahead and replace the value with the global, this lets the caller quickly
+  // eliminate the markers.
+
    for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
      User *U = cast<Instruction>(*UI);
  
      if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
        // Ignore simple (non-volatile, non-atomic) loads, they are always ok.
-      if (LI->isVolatile()) return false;
+      if (!LI->isSimple()) return false;
        continue;
      }
  
      if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
        // If uses of the bitcast are ok, we are ok.
-      if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, isOffset))
+      if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, isOffset,
+                                          LifetimeMarkers))
          return false;
        continue;
      }
@@ -2269,29 +2517,43 @@ static bool isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
        // If the GEP has all zero indices, it doesn't offset the pointer.  If it
        // doesn't, it does.
        if (!isOnlyCopiedFromConstantGlobal(GEP, TheCopy,
-                                         isOffset || !GEP->hasAllZeroIndices()))
+                                          isOffset || !GEP->hasAllZeroIndices(),
+                                          LifetimeMarkers))
          return false;
        continue;
      }
  
      if (CallSite CS = U) {
-      // If this is a readonly/readnone call site, then we know it is just a
-      // load and we can ignore it.
-      if (CS.onlyReadsMemory())
-        continue;
-
        // If this is the function being called then we treat it like a load and
        // ignore it.
        if (CS.isCallee(UI))
          continue;
  
+      // If this is a readonly/readnone call site, then we know it is just a
+      // load (but one that potentially returns the value itself), so we can
+      // ignore it if we know that the value isn't captured.
+      unsigned ArgNo = CS.getArgumentNo(UI);
+      if (CS.onlyReadsMemory() &&
+          (CS.getInstruction()->use_empty() ||
+           CS.paramHasAttr(ArgNo+1, Attribute::NoCapture)))
+        continue;
+
        // If this is being passed as a byval argument, the caller is making a
        // copy, so it is only a read of the alloca.
-      unsigned ArgNo = CS.getArgumentNo(UI);
        if (CS.paramHasAttr(ArgNo+1, Attribute::ByVal))
          continue;
      }
  
+    // Lifetime intrinsics can be handled by the caller.
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+      if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+          II->getIntrinsicID() == Intrinsic::lifetime_end) {
+        assert(II->use_empty() && "Lifetime markers have no result to use!");
+        LifetimeMarkers.push_back(II);
+        continue;
+      }
+    }
+
      // If this isn't our memcpy/memmove, reject it as something we can't
      // handle.
      MemTransferInst *MI = dyn_cast<MemTransferInst>(U);
@@ -2328,9 +2590,11 @@ static bool isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
  /// isOnlyCopiedFromConstantGlobal - Return true if the specified alloca is only
  /// modified by a copy from a constant global.  If we can prove this, we can
  /// replace any uses of the alloca with uses of the global directly.
-MemTransferInst *SROA::isOnlyCopiedFromConstantGlobal(AllocaInst *AI) {
+MemTransferInst *
+SROA::isOnlyCopiedFromConstantGlobal(AllocaInst *AI,
+                                     SmallVector<Instruction*, 4> &ToDelete) {
    MemTransferInst *TheCopy = 0;
-  if (::isOnlyCopiedFromConstantGlobal(AI, TheCopy, false))
+  if (::isOnlyCopiedFromConstantGlobal(AI, TheCopy, false, ToDelete))
      return TheCopy;
    return 0;
  }