[NaryReassociate] speeds up candidate searching

[oota-llvm.git] / lib / Transforms / Scalar / SROA.cpp
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp

index ef91effe3ff35ffa040b44fcddcb49fa453d7bb0..59dc52811c96547d6ee13946e3663992c8e53642 100644 (file)
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -28,7 +28,7 @@
  #include "llvm/ADT/SetVector.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
  #include "llvm/Analysis/Loads.h"
  #include "llvm/Analysis/PtrUseVisitor.h"
  #include "llvm/Analysis/ValueTracking.h"
@@ -247,7 +247,7 @@ public:
    /// hold.
    void insert(ArrayRef<Slice> NewSlices) {
      int OldSize = Slices.size();
-    std::move(NewSlices.begin(), NewSlices.end(), std::back_inserter(Slices));
+    Slices.append(NewSlices.begin(), NewSlices.end());
      auto SliceI = Slices.begin() + OldSize;
      std::sort(SliceI, Slices.end());
      std::inplace_merge(Slices.begin(), SliceI, Slices.end());
@@ -701,6 +701,7 @@ private:
        // by writing out the code here where we have tho underlying allocation
        // size readily available.
        APInt GEPOffset = Offset;
+      const DataLayout &DL = GEPI.getModule()->getDataLayout();
        for (gep_type_iterator GTI = gep_type_begin(GEPI),
                               GTE = gep_type_end(GEPI);
             GTI != GTE; ++GTI) {
@@ -735,16 +736,10 @@ private:
  
    void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
                           uint64_t Size, bool IsVolatile) {
-    // We allow splitting of loads and stores where the type is an integer type
-    // and cover the entire alloca. This prevents us from splitting over
-    // eagerly.
-    // FIXME: In the great blue eventually, we should eagerly split all integer
-    // loads and stores, and then have a separate step that merges adjacent
-    // alloca partitions into a single partition suitable for integer widening.
-    // Or we should skip the merge step and rely on GVN and other passes to
-    // merge adjacent loads and stores that survive mem2reg.
-    bool IsSplittable =
-        Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize;
+    // We allow splitting of non-volatile loads and stores where the type is an
+    // integer type. These may be used to implement 'memcpy' or other "transfer
+    // of bits" patterns.
+    bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;
  
      insertUse(I, Offset, Size, IsSplittable);
    }
@@ -756,6 +751,7 @@ private:
      if (!IsOffsetKnown)
        return PI.setAborted(&LI);
  
+    const DataLayout &DL = LI.getModule()->getDataLayout();
      uint64_t Size = DL.getTypeStoreSize(LI.getType());
      return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
    }
@@ -767,6 +763,7 @@ private:
      if (!IsOffsetKnown)
        return PI.setAborted(&SI);
  
+    const DataLayout &DL = SI.getModule()->getDataLayout();
      uint64_t Size = DL.getTypeStoreSize(ValOp->getType());
  
      // If this memory access can be shown to *statically* extend outside the
@@ -904,6 +901,7 @@ private:
      SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
      Visited.insert(Root);
      Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
+    const DataLayout &DL = Root->getModule()->getDataLayout();
      // If there are no loads or stores, the access is dead. We mark that as
      // a size zero access.
      Size = 0;
@@ -1168,10 +1166,9 @@ public:
        } else {
          continue;
        }
-      Instruction *DbgVal =
-          DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()),
-                                      DIExpression(DVI->getExpression()), Inst);
-      DbgVal->setDebugLoc(DVI->getDebugLoc());
+      DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()),
+                                  DIExpression(DVI->getExpression()),
+                                  DVI->getDebugLoc(), Inst);
      }
    }
  };
@@ -1200,9 +1197,8 @@ class SROA : public FunctionPass {
    const bool RequiresDomTree;
  
    LLVMContext *C;
-  const DataLayout *DL;
    DominatorTree *DT;
-  AssumptionTracker *AT;
+  AssumptionCache *AC;
  
    /// \brief Worklist of alloca instructions to simplify.
    ///
@@ -1249,7 +1245,7 @@ class SROA : public FunctionPass {
  public:
    SROA(bool RequiresDomTree = true)
        : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr),
-        DL(nullptr), DT(nullptr) {
+        DT(nullptr) {
      initializeSROAPass(*PassRegistry::getPassRegistry());
    }
    bool runOnFunction(Function &F) override;
@@ -1263,8 +1259,8 @@ private:
    friend class AllocaSliceRewriter;
  
    bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
-  bool rewritePartition(AllocaInst &AI, AllocaSlices &AS,
-                        AllocaSlices::Partition &P);
+  AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS,
+                               AllocaSlices::Partition &P);
    bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
    bool runOnAlloca(AllocaInst &AI);
    void clobberUse(Use &U);
@@ -1281,7 +1277,7 @@ FunctionPass *llvm::createSROAPass(bool RequiresDomTree) {
  
  INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false,
                        false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionTracker)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
  INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false,
                      false)
@@ -1355,7 +1351,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B,
  ///
  /// FIXME: This should be hoisted into a generic utility, likely in
  /// Transforms/Util/Local.h
-static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) {
+static bool isSafePHIToSpeculate(PHINode &PN) {
    // For now, we can only do this promotion if the load is in the same block
    // as the PHI, and if there are no stores between the phi and load.
    // TODO: Allow recursive phi users.
@@ -1387,6 +1383,8 @@ static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) {
    if (!HaveLoad)
      return false;
  
+  const DataLayout &DL = PN.getModule()->getDataLayout();
+
    // We can only transform this if it is safe to push the loads into the
    // predecessor blocks. The only thing to watch out for is that we can't put
    // a possibly trapping load in the predecessor if it is a critical edge.
@@ -1409,7 +1407,7 @@ static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) {
      // is already a load in the block, then we can move the load to the pred
      // block.
      if (InVal->isDereferenceablePointer(DL) ||
-        isSafeToLoadUnconditionally(InVal, TI, MaxAlign, DL))
+        isSafeToLoadUnconditionally(InVal, TI, MaxAlign))
        continue;
  
      return false;
@@ -1474,10 +1472,10 @@ static void speculatePHINodeLoads(PHINode &PN) {
  ///
  /// We can do this to a select if its only uses are loads and if the operand
  /// to the select can be loaded unconditionally.
-static bool isSafeSelectToSpeculate(SelectInst &SI,
-                                    const DataLayout *DL = nullptr) {
+static bool isSafeSelectToSpeculate(SelectInst &SI) {
    Value *TValue = SI.getTrueValue();
    Value *FValue = SI.getFalseValue();
+  const DataLayout &DL = SI.getModule()->getDataLayout();
    bool TDerefable = TValue->isDereferenceablePointer(DL);
    bool FDerefable = FValue->isDereferenceablePointer(DL);
  
@@ -1490,10 +1488,10 @@ static bool isSafeSelectToSpeculate(SelectInst &SI,
      // absolutely (e.g. allocas) or at this point because we can see other
      // accesses to it.
      if (!TDerefable &&
-        !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment(), DL))
+        !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment()))
        return false;
      if (!FDerefable &&
-        !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment(), DL))
+        !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment()))
        return false;
    }
  
@@ -1553,7 +1551,8 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
    if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
      return BasePtr;
  
-  return IRB.CreateInBoundsGEP(BasePtr, Indices, NamePrefix + "sroa_idx");
+  return IRB.CreateInBoundsGEP(nullptr, BasePtr, Indices,
+                               NamePrefix + "sroa_idx");
  }
  
  /// \brief Get a natural GEP off of the BasePtr walking through Ty toward
@@ -1804,7 +1803,8 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
  
      OffsetPtr = Int8PtrOffset == 0
                      ? Int8Ptr
-                    : IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
+                    : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr,
+                                            IRB.getInt(Int8PtrOffset),
                                              NamePrefix + "sroa_raw_idx");
    }
    Ptr = OffsetPtr;
@@ -2620,7 +2620,8 @@ private:
        // LI only used for this computation.
        Value *Placeholder =
            new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
-      V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset, "insert");
+      V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
+                        "insert");
        LI.replaceAllUsesWith(V);
        Placeholder->replaceAllUsesWith(&LI);
        delete Placeholder;
@@ -2699,7 +2700,8 @@ private:
                   DL.getTypeStoreSizeInBits(V->getType()) &&
               "Non-byte-multiple bit width");
        IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
-      V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset, "extract");
+      V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
+                         "extract");
      }
  
      if (VecTy)
@@ -3249,7 +3251,8 @@ private:
      void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
        assert(Ty->isSingleValueType());
        // Load the single value and insert it using the indices.
-      Value *GEP = IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep");
+      Value *GEP =
+          IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep");
        Value *Load = IRB.CreateLoad(GEP, Name + ".load");
        Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
        DEBUG(dbgs() << "          to: " << *Load << "\n");
@@ -3282,7 +3285,7 @@ private:
        // Extract the single value and store it using the indices.
        Value *Store = IRB.CreateStore(
            IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
-          IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
+          IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep"));
        (void)Store;
        DEBUG(dbgs() << "          to: " << *Store << "\n");
      }
@@ -3520,18 +3523,34 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
    };
    SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
  
+  // Track loads out of this alloca which cannot, for any reason, be pre-split.
+  // This is important as we also cannot pre-split stores of those loads!
+  // FIXME: This is all pretty gross. It means that we can be more aggressive
+  // in pre-splitting when the load feeding the store happens to come from
+  // a separate alloca. Put another way, the effectiveness of SROA would be
+  // decreased by a frontend which just concatenated all of its local allocas
+  // into one big flat alloca. But defeating such patterns is exactly the job
+  // SROA is tasked with! Sadly, to not have this discrepancy we would have
+  // change store pre-splitting to actually force pre-splitting of the load
+  // that feeds it *and all stores*. That makes pre-splitting much harder, but
+  // maybe it would make it more principled?
+  SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
+
    DEBUG(dbgs() << "  Searching for candidate loads and stores\n");
    for (auto &P : AS.partitions()) {
      for (Slice &S : P) {
-      if (!S.isSplittable())
-        continue;
-      if (S.endOffset() <= P.endOffset())
+      Instruction *I = cast<Instruction>(S.getUse()->getUser());
+      if (!S.isSplittable() ||S.endOffset() <= P.endOffset()) {
+        // If this was a load we have to track that it can't participate in any
+        // pre-splitting!
+        if (auto *LI = dyn_cast<LoadInst>(I))
+          UnsplittableLoads.insert(LI);
          continue;
+      }
        assert(P.endOffset() > S.beginOffset() &&
               "Empty or backwards partition!");
  
        // Determine if this is a pre-splittable slice.
-      Instruction *I = cast<Instruction>(S.getUse()->getUser());
        if (auto *LI = dyn_cast<LoadInst>(I)) {
          assert(!LI->isVolatile() && "Cannot split volatile loads!");
  
@@ -3546,8 +3565,10 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
            }
            return true;
          };
-        if (!IsLoadSimplyStored(LI))
+        if (!IsLoadSimplyStored(LI)) {
+          UnsplittableLoads.insert(LI);
            continue;
+        }
  
          Loads.push_back(LI);
        } else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) {
@@ -3571,7 +3592,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
        assert(Offsets.Splits.empty() &&
               "Should not have splits the first time we see an instruction!");
        Offsets.S = &S;
-      Offsets.Splits.push_back(P.endOffset());
+      Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
      }
  
      // Now scan the already split slices, and add a split for any of them which
@@ -3586,13 +3607,14 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
        assert(Offsets.S == S && "Found a mismatched slice!");
        assert(!Offsets.Splits.empty() &&
               "Cannot have an empty set of splits on the second partition!");
-      assert(Offsets.Splits.back() == P.beginOffset() &&
+      assert(Offsets.Splits.back() ==
+                 P.beginOffset() - Offsets.S->beginOffset() &&
               "Previous split does not end where this one begins!");
  
        // Record each split. The last partition's end isn't needed as the size
        // of the slice dictates that.
        if (S->endOffset() > P.endOffset())
-        Offsets.Splits.push_back(P.endOffset());
+        Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
      }
    }
  
@@ -3600,16 +3622,20 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
    // such loads and stores, we can only pre-split them if their splits exactly
    // match relative to their starting offset. We have to verify this prior to
    // any rewriting.
-  SmallPtrSet<LoadInst *, 4> BadSplitLoads;
    Stores.erase(
        std::remove_if(Stores.begin(), Stores.end(),
-                     [&BadSplitLoads, &SplitOffsetsMap](StoreInst *SI) {
+                     [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
                         // Lookup the load we are storing in our map of split
                         // offsets.
                         auto *LI = cast<LoadInst>(SI->getValueOperand());
+                       // If it was completely unsplittable, then we're done,
+                       // and this store can't be pre-split.
+                       if (UnsplittableLoads.count(LI))
+                         return true;
+
                         auto LoadOffsetsI = SplitOffsetsMap.find(LI);
                         if (LoadOffsetsI == SplitOffsetsMap.end())
-                         return false; // Unrelated loads are always safe.
+                         return false; // Unrelated loads are definitely safe.
                         auto &LoadOffsets = LoadOffsetsI->second;
  
                         // Now lookup the store's offsets.
@@ -3630,16 +3656,30 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
                         // with mismatched relative splits. Just give up on them
                         // and remove both instructions from our list of
                         // candidates.
-                       BadSplitLoads.insert(LI);
+                       UnsplittableLoads.insert(LI);
                         return true;
                       }),
        Stores.end());
+  // Now we have to go *back* through all te stores, because a later store may
+  // have caused an earlier store's load to become unsplittable and if it is
+  // unsplittable for the later store, then we can't rely on it being split in
+  // the earlier store either.
+  Stores.erase(std::remove_if(Stores.begin(), Stores.end(),
+                              [&UnsplittableLoads](StoreInst *SI) {
+                                auto *LI =
+                                    cast<LoadInst>(SI->getValueOperand());
+                                return UnsplittableLoads.count(LI);
+                              }),
+               Stores.end());
+  // Once we've established all the loads that can't be split for some reason,
+  // filter any that made it into our list out.
    Loads.erase(std::remove_if(Loads.begin(), Loads.end(),
-                             [&BadSplitLoads](LoadInst *LI) {
-                               return BadSplitLoads.count(LI);
+                             [&UnsplittableLoads](LoadInst *LI) {
+                               return UnsplittableLoads.count(LI);
                               }),
                Loads.end());
  
+
    // If no loads or stores are left, there is no pre-splitting to be done for
    // this alloca.
    if (Loads.empty() && Stores.empty())
@@ -3666,6 +3706,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
    // them to the alloca slices.
    SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
    std::vector<LoadInst *> SplitLoads;
+  const DataLayout &DL = AI.getModule()->getDataLayout();
    for (LoadInst *LI : Loads) {
      SplitLoads.clear();
  
@@ -3691,10 +3732,10 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
        auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
        auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
        LoadInst *PLoad = IRB.CreateAlignedLoad(
-          getAdjustedPtr(IRB, *DL, BasePtr,
-                         APInt(DL->getPointerSizeInBits(), PartOffset),
+          getAdjustedPtr(IRB, DL, BasePtr,
+                         APInt(DL.getPointerSizeInBits(), PartOffset),
                           PartPtrTy, BasePtr->getName() + "."),
-          getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+          getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
            LI->getName());
  
        // Append this load onto the list of split loads so we can find it later
@@ -3705,7 +3746,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
        NewSlices.push_back(
            Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                  &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
-                /*IsSplittable*/ true));
+                /*IsSplittable*/ false));
        DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
                     << ", " << NewSlices.back().endOffset() << "): " << *PLoad
                     << "\n");
@@ -3744,10 +3785,10 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
              PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
  
          StoreInst *PStore = IRB.CreateAlignedStore(
-            PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
-                                  APInt(DL->getPointerSizeInBits(), PartOffset),
+            PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
+                                  APInt(DL.getPointerSizeInBits(), PartOffset),
                                    PartPtrTy, StoreBasePtr->getName() + "."),
-            getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+            getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
          (void)PStore;
          DEBUG(dbgs() << "      +" << PartOffset << ":" << *PStore << "\n");
        }
@@ -3824,26 +3865,26 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
        } else {
          IRB.SetInsertPoint(BasicBlock::iterator(LI));
          PLoad = IRB.CreateAlignedLoad(
-            getAdjustedPtr(IRB, *DL, LoadBasePtr,
-                           APInt(DL->getPointerSizeInBits(), PartOffset),
+            getAdjustedPtr(IRB, DL, LoadBasePtr,
+                           APInt(DL.getPointerSizeInBits(), PartOffset),
                             PartPtrTy, LoadBasePtr->getName() + "."),
-            getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+            getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
              LI->getName());
        }
  
        // And store this partition.
        IRB.SetInsertPoint(BasicBlock::iterator(SI));
        StoreInst *PStore = IRB.CreateAlignedStore(
-          PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
-                                APInt(DL->getPointerSizeInBits(), PartOffset),
+          PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
+                                APInt(DL.getPointerSizeInBits(), PartOffset),
                                  PartPtrTy, StoreBasePtr->getName() + "."),
-          getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+          getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
  
        // Now build a new slice for the alloca.
        NewSlices.push_back(
            Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                  &PStore->getOperandUse(PStore->getPointerOperandIndex()),
-                /*IsSplittable*/ true));
+                /*IsSplittable*/ false));
        DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
                     << ", " << NewSlices.back().endOffset() << "): " << *PStore
                     << "\n");
@@ -3879,24 +3920,29 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
      }
  
      // Mark the original store as dead now that we've split it up and kill its
-    // slice. Note that we leave the original load in place. It may in turn be
-    // split up if it is an alloca load for some other alloca, but it may be
-    // a normal load. This may introduce redundant loads, but where those can
-    // be merged the rest of the optimizer should handle the merging, and this
-    // uncovers SSA splits which is more important. In practice, the original
-    // loads will almost always be fully split and removed eventually, and the
-    // splits will be merged by any trivial CSE, including instcombine.
+    // slice. Note that we leave the original load in place unless this store
+    // was its ownly use. It may in turn be split up if it is an alloca load
+    // for some other alloca, but it may be a normal load. This may introduce
+    // redundant loads, but where those can be merged the rest of the optimizer
+    // should handle the merging, and this uncovers SSA splits which is more
+    // important. In practice, the original loads will almost always be fully
+    // split and removed eventually, and the splits will be merged by any
+    // trivial CSE, including instcombine.
+    if (LI->hasOneUse()) {
+      assert(*LI->user_begin() == SI && "Single use isn't this store!");
+      DeadInsts.insert(LI);
+    }
      DeadInsts.insert(SI);
      Offsets.S->kill();
    }
  
-  // Now we need to remove the killed slices, sort the newly added slices, and
-  // merge the two sorted ranges of slices so that the entire range is sorted
-  // properly for us to re-compute the partitions.
+  // Remove the killed slices that have ben pre-split.
    AS.erase(std::remove_if(AS.begin(), AS.end(), [](const Slice &S) {
      return S.isDead();
    }), AS.end());
  
+  // Insert our new slices. This will sort and merge them into the sorted
+  // sequence.
    AS.insert(NewSlices);
  
    DEBUG(dbgs() << "  Pre-split slices:\n");
@@ -3926,31 +3972,32 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
  /// appropriate new offsets. It also evaluates how successful the rewrite was
  /// at enabling promotion and if it was successful queues the alloca to be
  /// promoted.
-bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
-                            AllocaSlices::Partition &P) {
+AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
+                                   AllocaSlices::Partition &P) {
    // Try to compute a friendly type for this partition of the alloca. This
    // won't always succeed, in which case we fall back to a legal integer type
    // or an i8 array of an appropriate size.
    Type *SliceTy = nullptr;
+  const DataLayout &DL = AI.getModule()->getDataLayout();
    if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()))
-    if (DL->getTypeAllocSize(CommonUseTy) >= P.size())
+    if (DL.getTypeAllocSize(CommonUseTy) >= P.size())
        SliceTy = CommonUseTy;
    if (!SliceTy)
-    if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(),
+    if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
                                                   P.beginOffset(), P.size()))
        SliceTy = TypePartitionTy;
    if ((!SliceTy || (SliceTy->isArrayTy() &&
                      SliceTy->getArrayElementType()->isIntegerTy())) &&
-      DL->isLegalInteger(P.size() * 8))
+      DL.isLegalInteger(P.size() * 8))
      SliceTy = Type::getIntNTy(*C, P.size() * 8);
    if (!SliceTy)
      SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
-  assert(DL->getTypeAllocSize(SliceTy) >= P.size());
+  assert(DL.getTypeAllocSize(SliceTy) >= P.size());
  
-  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, *DL);
+  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
  
    VectorType *VecTy =
-      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, *DL);
+      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
    if (VecTy)
      SliceTy = VecTy;
  
@@ -3965,18 +4012,19 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
      NewAI = &AI;
      // FIXME: We should be able to bail at this point with "nothing changed".
      // FIXME: We might want to defer PHI speculation until after here.
+    // FIXME: return nullptr;
    } else {
      unsigned Alignment = AI.getAlignment();
      if (!Alignment) {
        // The minimum alignment which users can rely on when the explicit
        // alignment is omitted or zero is that required by the ABI for this
        // type.
-      Alignment = DL->getABITypeAlignment(AI.getAllocatedType());
+      Alignment = DL.getABITypeAlignment(AI.getAllocatedType());
      }
      Alignment = MinAlign(Alignment, P.beginOffset());
      // If we will get at least this much alignment from the type alone, leave
      // the alloca's alignment unconstrained.
-    if (Alignment <= DL->getABITypeAlignment(SliceTy))
+    if (Alignment <= DL.getABITypeAlignment(SliceTy))
        Alignment = 0;
      NewAI = new AllocaInst(
          SliceTy, nullptr, Alignment,
@@ -3996,7 +4044,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
    SmallPtrSet<PHINode *, 8> PHIUsers;
    SmallPtrSet<SelectInst *, 8> SelectUsers;
  
-  AllocaSliceRewriter Rewriter(*DL, AS, *this, AI, *NewAI, P.beginOffset(),
+  AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
                                 P.endOffset(), IsIntegerPromotable, VecTy,
                                 PHIUsers, SelectUsers);
    bool Promotable = true;
@@ -4018,7 +4066,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
    for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(),
                                              E = PHIUsers.end();
         I != E; ++I)
-    if (!isSafePHIToSpeculate(**I, DL)) {
+    if (!isSafePHIToSpeculate(**I)) {
        Promotable = false;
        PHIUsers.clear();
        SelectUsers.clear();
@@ -4027,7 +4075,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
    for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(),
                                                 E = SelectUsers.end();
         I != E; ++I)
-    if (!isSafeSelectToSpeculate(**I, DL)) {
+    if (!isSafeSelectToSpeculate(**I)) {
        Promotable = false;
        PHIUsers.clear();
        SelectUsers.clear();
@@ -4060,7 +4108,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
        PostPromotionWorklist.pop_back();
    }
  
-  return true;
+  return NewAI;
  }
  
  /// \brief Walks the slices of an alloca and form partitions based on them,
@@ -4071,12 +4119,58 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
  
    unsigned NumPartitions = 0;
    bool Changed = false;
+  const DataLayout &DL = AI.getModule()->getDataLayout();
  
+  // First try to pre-split loads and stores.
    Changed |= presplitLoadsAndStores(AI, AS);
  
+  // Now that we have identified any pre-splitting opportunities, mark any
+  // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
+  // to split these during pre-splitting, we want to force them to be
+  // rewritten into a partition.
+  bool IsSorted = true;
+  for (Slice &S : AS) {
+    if (!S.isSplittable())
+      continue;
+    // FIXME: We currently leave whole-alloca splittable loads and stores. This
+    // used to be the only splittable loads and stores and we need to be
+    // confident that the above handling of splittable loads and stores is
+    // completely sufficient before we forcibly disable the remaining handling.
+    if (S.beginOffset() == 0 &&
+        S.endOffset() >= DL.getTypeAllocSize(AI.getAllocatedType()))
+      continue;
+    if (isa<LoadInst>(S.getUse()->getUser()) ||
+        isa<StoreInst>(S.getUse()->getUser())) {
+      S.makeUnsplittable();
+      IsSorted = false;
+    }
+  }
+  if (!IsSorted)
+    std::sort(AS.begin(), AS.end());
+
+  /// \brief Describes the allocas introduced by rewritePartition
+  /// in order to migrate the debug info.
+  struct Piece {
+    AllocaInst *Alloca;
+    uint64_t Offset;
+    uint64_t Size;
+    Piece(AllocaInst *AI, uint64_t O, uint64_t S)
+      : Alloca(AI), Offset(O), Size(S) {}
+  };
+  SmallVector<Piece, 4> Pieces;
+
    // Rewrite each partition.
    for (auto &P : AS.partitions()) {
-    Changed |= rewritePartition(AI, AS, P);
+    if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
+      Changed = true;
+      if (NewAI != &AI) {
+        uint64_t SizeOfByte = 8;
+        uint64_t AllocaSize = DL.getTypeSizeInBits(NewAI->getAllocatedType());
+        // Don't include any padding.
+        uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
+        Pieces.push_back(Piece(NewAI, P.beginOffset() * SizeOfByte, Size));
+      }
+    }
      ++NumPartitions;
    }
  
@@ -4084,6 +4178,42 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
    MaxPartitionsPerAlloca =
        std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca);
  
+  // Migrate debug information from the old alloca to the new alloca(s)
+  // and the individial partitions.
+  if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) {
+    DIVariable Var(DbgDecl->getVariable());
+    DIExpression Expr(DbgDecl->getExpression());
+    DIBuilder DIB(*AI.getParent()->getParent()->getParent(),
+                  /*AllowUnresolved*/ false);
+    bool IsSplit = Pieces.size() > 1;
+    for (auto Piece : Pieces) {
+      // Create a piece expression describing the new partition or reuse AI's
+      // expression if there is only one partition.
+      DIExpression PieceExpr = Expr;
+      if (IsSplit || Expr->isBitPiece()) {
+        // If this alloca is already a scalar replacement of a larger aggregate,
+        // Piece.Offset describes the offset inside the scalar.
+        uint64_t Offset = Expr->isBitPiece() ? Expr->getBitPieceOffset() : 0;
+        uint64_t Start = Offset + Piece.Offset;
+        uint64_t Size = Piece.Size;
+        if (Expr->isBitPiece()) {
+          uint64_t AbsEnd = Expr->getBitPieceOffset() + Expr->getBitPieceSize();
+          if (Start >= AbsEnd)
+            // No need to describe a SROAed padding.
+            continue;
+          Size = std::min(Size, AbsEnd - Start);
+        }
+        PieceExpr = DIB.createBitPieceExpression(Start, Size);
+      }
+
+      // Remove any existing dbg.declare intrinsic describing the same alloca.
+      if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Piece.Alloca))
+        OldDDI->eraseFromParent();
+
+      DIB.insertDeclare(Piece.Alloca, Var, PieceExpr, DbgDecl->getDebugLoc(),
+                        &AI);
+    }
+  }
    return Changed;
  }
  
@@ -4116,21 +4246,22 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
      AI.eraseFromParent();
      return true;
    }
+  const DataLayout &DL = AI.getModule()->getDataLayout();
  
    // Skip alloca forms that this analysis can't handle.
    if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
-      DL->getTypeAllocSize(AI.getAllocatedType()) == 0)
+      DL.getTypeAllocSize(AI.getAllocatedType()) == 0)
      return false;
  
    bool Changed = false;
  
    // First, split any FCA loads and stores touching this alloca to promote
    // better splitting and promotion opportunities.
-  AggLoadStoreRewriter AggRewriter(*DL);
+  AggLoadStoreRewriter AggRewriter(DL);
    Changed |= AggRewriter.rewrite(AI);
  
    // Build the slices using a recursive instruction-visiting builder.
-  AllocaSlices AS(*DL, AI);
+  AllocaSlices AS(DL, AI);
    DEBUG(AS.print(dbgs()));
    if (AS.isEscaped())
      return Changed;
@@ -4195,8 +4326,11 @@ void SROA::deleteDeadInstructions(
            DeadInsts.insert(U);
        }
  
-    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
        DeletedAllocas.insert(AI);
+      if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(AI))
+        DbgDecl->eraseFromParent();
+    }
  
      ++NumDeleted;
      I->eraseFromParent();
@@ -4227,7 +4361,7 @@ bool SROA::promoteAllocas(Function &F) {
  
    if (DT && !ForceSSAUpdater) {
      DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
-    PromoteMemToReg(PromotableAllocas, *DT, nullptr, AT);
+    PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC);
      PromotableAllocas.clear();
      return true;
    }
@@ -4300,22 +4434,17 @@ bool SROA::runOnFunction(Function &F) {
  
    DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
    C = &F.getContext();
-  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-  if (!DLP) {
-    DEBUG(dbgs() << "  Skipping SROA -- no target data!\n");
-    return false;
-  }
-  DL = &DLP->getDataLayout();
    DominatorTreeWrapperPass *DTWP =
        getAnalysisIfAvailable<DominatorTreeWrapperPass>();
    DT = DTWP ? &DTWP->getDomTree() : nullptr;
-  AT = &getAnalysis<AssumptionTracker>();
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  
    BasicBlock &EntryBB = F.getEntryBlock();
    for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
-       I != E; ++I)
+       I != E; ++I) {
      if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
        Worklist.insert(AI);
+  }
  
    bool Changed = false;
    // A set of deleted alloca instruction pointers which should be removed from
@@ -4351,7 +4480,7 @@ bool SROA::runOnFunction(Function &F) {
  }
  
  void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<AssumptionTracker>();
+  AU.addRequired<AssumptionCacheTracker>();
    if (RequiresDomTree)
      AU.addRequired<DominatorTreeWrapperPass>();
    AU.setPreservesCFG();