X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTransforms%2FScalar%2FSROA.cpp;h=cd9f8bfaa683e957fb4266f69b952e20596ce4f8;hp=e874cd51a2cf857ce33bd2de1a8d768317b5f5e2;hb=9befb59470dddd7f9f684de8c4f48748e861fe32;hpb=c807870534f0ad10051fd6c6be515d7122d1a00c diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index e874cd51a2c..cd9f8bfaa68 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -28,7 +28,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionTracker.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Analysis/ValueTracking.h" @@ -55,7 +55,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" #if __cplusplus >= 201103L && !defined(NDEBUG) // We only use this for a debug check in C++11 @@ -77,11 +76,6 @@ STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); STATISTIC(NumDeleted, "Number of instructions deleted"); STATISTIC(NumVectorized, "Number of vectorized aggregates"); -/// Hidden option to force the pass to not use DomTree and mem2reg, instead -/// forming SSA values through the SSAUpdater infrastructure. -static cl::opt ForceSSAUpdater("force-ssa-updater", cl::init(false), - cl::Hidden); - /// Hidden option to enable randomly shuffling the slices to help uncover /// instability in their order. static cl::opt SROARandomShuffleSlices("sroa-random-shuffle-slices", @@ -237,6 +231,22 @@ public: const_iterator end() const { return Slices.end(); } /// @} + /// \brief Erase a range of slices. + void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); } + + /// \brief Insert new slices for this alloca. + /// + /// This moves the slices into the alloca's slices collection, and re-sorts + /// everything so that the usual ordering properties of the alloca's slices + /// hold. + void insert(ArrayRef NewSlices) { + int OldSize = Slices.size(); + Slices.append(NewSlices.begin(), NewSlices.end()); + auto SliceI = Slices.begin() + OldSize; + std::sort(SliceI, Slices.end()); + std::inplace_merge(Slices.begin(), SliceI, Slices.end()); + } + // Forward declare an iterator to befriend it. class partition_iterator; @@ -254,14 +264,15 @@ public: friend class AllocaSlices; friend class AllocaSlices::partition_iterator; - /// \brief The begining and ending offsets of the alloca for this partition. + /// \brief The beginning and ending offsets of the alloca for this + /// partition. uint64_t BeginOffset, EndOffset; /// \brief The start end end iterators of this partition. iterator SI, SJ; - /// \brief A collection of split slices. - SmallVector SplitSlices; + /// \brief A collection of split slice tails overlapping the partition. + SmallVector SplitTails; /// \brief Raw constructor builds an empty partition starting and ending at /// the given iterator. @@ -290,16 +301,25 @@ public: /// a region occupied by split slices. bool empty() const { return SI == SJ; } - /// \name Iterate contained slices. - /// All of these slices are fully contained in the partition. They may be - /// splittable or unsplittable. + /// \name Iterate slices that start within the partition. + /// These may be splittable or unsplittable. They have a begin offset >= the + /// partition begin offset. /// @{ + // FIXME: We should probably define a "concat_iterator" helper and use that + // to stitch together pointee_iterators over the split tails and the + // contiguous iterators of the partition. That would give a much nicer + // interface here. We could then additionally expose filtered iterators for + // split, unsplit, and unsplittable splices based on the usage patterns. iterator begin() const { return SI; } iterator end() const { return SJ; } /// @} - /// \brief Get the sequence of split slices. - ArrayRef splitSlices() const { return SplitSlices; } + /// \brief Get the sequence of split slice tails. + /// + /// These tails are of slices which start before this partition but are + /// split and overlap into the partition. We accumulate these while forming + /// partitions. + ArrayRef splitSliceTails() const { return SplitTails; } }; /// \brief An iterator over partitions of the alloca's slices. @@ -341,30 +361,30 @@ public: /// /// Requires that the iterator not be at the end of the slices. void advance() { - assert((P.SI != SE || !P.SplitSlices.empty()) && + assert((P.SI != SE || !P.SplitTails.empty()) && "Cannot advance past the end of the slices!"); // Clear out any split uses which have ended. - if (!P.SplitSlices.empty()) { + if (!P.SplitTails.empty()) { if (P.EndOffset >= MaxSplitSliceEndOffset) { // If we've finished all splits, this is easy. - P.SplitSlices.clear(); + P.SplitTails.clear(); MaxSplitSliceEndOffset = 0; } else { // Remove the uses which have ended in the prior partition. This // cannot change the max split slice end because we just checked that // the prior partition ended prior to that max. - P.SplitSlices.erase( + P.SplitTails.erase( std::remove_if( - P.SplitSlices.begin(), P.SplitSlices.end(), + P.SplitTails.begin(), P.SplitTails.end(), [&](Slice *S) { return S->endOffset() <= P.EndOffset; }), - P.SplitSlices.end()); - assert(std::any_of(P.SplitSlices.begin(), P.SplitSlices.end(), + P.SplitTails.end()); + assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(), [&](Slice *S) { return S->endOffset() == MaxSplitSliceEndOffset; }) && "Could not find the current max split slice offset!"); - assert(std::all_of(P.SplitSlices.begin(), P.SplitSlices.end(), + assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(), [&](Slice *S) { return S->endOffset() <= MaxSplitSliceEndOffset; }) && @@ -375,7 +395,7 @@ public: // If P.SI is already at the end, then we've cleared the split tail and // now have an end iterator. if (P.SI == SE) { - assert(P.SplitSlices.empty() && "Failed to clear the split slices!"); + assert(P.SplitTails.empty() && "Failed to clear the split slices!"); return; } @@ -386,7 +406,7 @@ public: // partition into the split list. for (Slice &S : P) if (S.isSplittable() && S.endOffset() > P.EndOffset) { - P.SplitSlices.push_back(&S); + P.SplitTails.push_back(&S); MaxSplitSliceEndOffset = std::max(S.endOffset(), MaxSplitSliceEndOffset); } @@ -404,7 +424,7 @@ public: // If the we have split slices and the next slice is after a gap and is // not splittable immediately form an empty partition for the split // slices up until the next slice begins. - if (!P.SplitSlices.empty() && P.SI->beginOffset() != P.EndOffset && + if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset && !P.SI->isSplittable()) { P.BeginOffset = P.EndOffset; P.EndOffset = P.SI->beginOffset(); @@ -414,10 +434,10 @@ public: // OK, we need to consume new slices. Set the end offset based on the // current slice, and step SJ past it. The beginning offset of the - // parttion is the beginning offset of the next slice unless we have + // partition is the beginning offset of the next slice unless we have // pre-existing split slices that are continuing, in which case we begin // at the prior end offset. - P.BeginOffset = P.SplitSlices.empty() ? P.SI->beginOffset() : P.EndOffset; + P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset; P.EndOffset = P.SI->endOffset(); ++P.SJ; @@ -468,17 +488,17 @@ public: "End iterators don't match between compared partition iterators!"); // The observed positions of partitions is marked by the P.SI iterator and - // the emptyness of the split slices. The latter is only relevant when + // the emptiness of the split slices. The latter is only relevant when // P.SI == SE, as the end iterator will additionally have an empty split // slices list, but the prior may have the same P.SI and a tail of split // slices. if (P.SI == RHS.P.SI && - P.SplitSlices.empty() == RHS.P.SplitSlices.empty()) { + P.SplitTails.empty() == RHS.P.SplitTails.empty()) { assert(P.SJ == RHS.P.SJ && "Same set of slices formed two different sized partitions!"); - assert(P.SplitSlices.size() == RHS.P.SplitSlices.size() && + assert(P.SplitTails.size() == RHS.P.SplitTails.size() && "Same slice position with differently sized non-empty split " - "slices sets!"); + "slice tails!"); return true; } return false; @@ -676,6 +696,7 @@ private: // by writing out the code here where we have tho underlying allocation // size readily available. APInt GEPOffset = Offset; + const DataLayout &DL = GEPI.getModule()->getDataLayout(); for (gep_type_iterator GTI = gep_type_begin(GEPI), GTE = gep_type_end(GEPI); GTI != GTE; ++GTI) { @@ -710,16 +731,10 @@ private: void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset, uint64_t Size, bool IsVolatile) { - // We allow splitting of loads and stores where the type is an integer type - // and cover the entire alloca. This prevents us from splitting over - // eagerly. - // FIXME: In the great blue eventually, we should eagerly split all integer - // loads and stores, and then have a separate step that merges adjacent - // alloca partitions into a single partition suitable for integer widening. - // Or we should skip the merge step and rely on GVN and other passes to - // merge adjacent loads and stores that survive mem2reg. - bool IsSplittable = - Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize; + // We allow splitting of non-volatile loads and stores where the type is an + // integer type. These may be used to implement 'memcpy' or other "transfer + // of bits" patterns. + bool IsSplittable = Ty->isIntegerTy() && !IsVolatile; insertUse(I, Offset, Size, IsSplittable); } @@ -731,6 +746,7 @@ private: if (!IsOffsetKnown) return PI.setAborted(&LI); + const DataLayout &DL = LI.getModule()->getDataLayout(); uint64_t Size = DL.getTypeStoreSize(LI.getType()); return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile()); } @@ -742,6 +758,7 @@ private: if (!IsOffsetKnown) return PI.setAborted(&SI); + const DataLayout &DL = SI.getModule()->getDataLayout(); uint64_t Size = DL.getTypeStoreSize(ValOp->getType()); // If this memory access can be shown to *statically* extend outside the @@ -879,6 +896,7 @@ private: SmallVector, 4> Uses; Visited.insert(Root); Uses.push_back(std::make_pair(cast(*U), Root)); + const DataLayout &DL = Root->getModule()->getDataLayout(); // If there are no loads or stores, the access is dead. We mark that as // a size zero access. Size = 0; @@ -1013,6 +1031,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) void AllocaSlices::print(raw_ostream &OS, const_iterator I, StringRef Indent) const { printSlice(OS, I, Indent); + OS << "\n"; printUse(OS, I, Indent); } @@ -1020,7 +1039,7 @@ void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I, StringRef Indent) const { OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")" << " slice #" << (I - begin()) - << (I->isSplittable() ? " (splittable)" : "") << "\n"; + << (I->isSplittable() ? " (splittable)" : ""); } void AllocaSlices::printUse(raw_ostream &OS, const_iterator I, @@ -1048,109 +1067,6 @@ LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); } #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -namespace { -/// \brief Implementation of LoadAndStorePromoter for promoting allocas. -/// -/// This subclass of LoadAndStorePromoter adds overrides to handle promoting -/// the loads and stores of an alloca instruction, as well as updating its -/// debug information. This is used when a domtree is unavailable and thus -/// mem2reg in its full form can't be used to handle promotion of allocas to -/// scalar values. -class AllocaPromoter : public LoadAndStorePromoter { - AllocaInst &AI; - DIBuilder &DIB; - - SmallVector DDIs; - SmallVector DVIs; - -public: - AllocaPromoter(const SmallVectorImpl &Insts, SSAUpdater &S, - AllocaInst &AI, DIBuilder &DIB) - : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} - - void run(const SmallVectorImpl &Insts) { - // Retain the debug information attached to the alloca for use when - // rewriting loads and stores. - if (auto *L = LocalAsMetadata::getIfExists(&AI)) { - if (auto *DebugNode = MetadataAsValue::getIfExists(AI.getContext(), L)) { - for (User *U : DebugNode->users()) - if (DbgDeclareInst *DDI = dyn_cast(U)) - DDIs.push_back(DDI); - else if (DbgValueInst *DVI = dyn_cast(U)) - DVIs.push_back(DVI); - } - } - - LoadAndStorePromoter::run(Insts); - - // While we have the debug information, clear it off of the alloca. The - // caller takes care of deleting the alloca. - while (!DDIs.empty()) - DDIs.pop_back_val()->eraseFromParent(); - while (!DVIs.empty()) - DVIs.pop_back_val()->eraseFromParent(); - } - - bool - isInstInList(Instruction *I, - const SmallVectorImpl &Insts) const override { - Value *Ptr; - if (LoadInst *LI = dyn_cast(I)) - Ptr = LI->getOperand(0); - else - Ptr = cast(I)->getPointerOperand(); - - // Only used to detect cycles, which will be rare and quickly found as - // we're walking up a chain of defs rather than down through uses. - SmallPtrSet Visited; - - do { - if (Ptr == &AI) - return true; - - if (BitCastInst *BCI = dyn_cast(Ptr)) - Ptr = BCI->getOperand(0); - else if (GetElementPtrInst *GEPI = dyn_cast(Ptr)) - Ptr = GEPI->getPointerOperand(); - else - return false; - - } while (Visited.insert(Ptr).second); - - return false; - } - - void updateDebugInfo(Instruction *Inst) const override { - for (DbgDeclareInst *DDI : DDIs) - if (StoreInst *SI = dyn_cast(Inst)) - ConvertDebugDeclareToDebugValue(DDI, SI, DIB); - else if (LoadInst *LI = dyn_cast(Inst)) - ConvertDebugDeclareToDebugValue(DDI, LI, DIB); - for (DbgValueInst *DVI : DVIs) { - Value *Arg = nullptr; - if (StoreInst *SI = dyn_cast(Inst)) { - // If an argument is zero extended then use argument directly. The ZExt - // may be zapped by an optimization pass in future. - if (ZExtInst *ZExt = dyn_cast(SI->getOperand(0))) - Arg = dyn_cast(ZExt->getOperand(0)); - else if (SExtInst *SExt = dyn_cast(SI->getOperand(0))) - Arg = dyn_cast(SExt->getOperand(0)); - if (!Arg) - Arg = SI->getValueOperand(); - } else if (LoadInst *LI = dyn_cast(Inst)) { - Arg = LI->getPointerOperand(); - } else { - continue; - } - Instruction *DbgVal = - DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()), - DIExpression(DVI->getExpression()), Inst); - DbgVal->setDebugLoc(DVI->getDebugLoc()); - } - } -}; -} // end anon namespace - namespace { /// \brief An optimization pass providing Scalar Replacement of Aggregates. /// @@ -1171,12 +1087,9 @@ namespace { /// this form. By doing so, it will enable promotion of vector aggregates to /// SSA vector values. class SROA : public FunctionPass { - const bool RequiresDomTree; - LLVMContext *C; - const DataLayout *DL; DominatorTree *DT; - AssumptionTracker *AT; + AssumptionCache *AC; /// \brief Worklist of alloca instructions to simplify. /// @@ -1221,9 +1134,7 @@ class SROA : public FunctionPass { SetVector> SpeculatableSelects; public: - SROA(bool RequiresDomTree = true) - : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr), - DL(nullptr), DT(nullptr) { + SROA() : FunctionPass(ID), C(nullptr), DT(nullptr) { initializeSROAPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -1236,8 +1147,9 @@ private: friend class PHIOrSelectSpeculator; friend class AllocaSliceRewriter; - bool rewritePartition(AllocaInst &AI, AllocaSlices &AS, - AllocaSlices::Partition &P); + bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS); + AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, + AllocaSlices::Partition &P); bool splitAlloca(AllocaInst &AI, AllocaSlices &AS); bool runOnAlloca(AllocaInst &AI); void clobberUse(Use &U); @@ -1248,13 +1160,13 @@ private: char SROA::ID = 0; -FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { - return new SROA(RequiresDomTree); +FunctionPass *llvm::createSROAPass() { + return new SROA(); } INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false, false) @@ -1328,7 +1240,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B, /// /// FIXME: This should be hoisted into a generic utility, likely in /// Transforms/Util/Local.h -static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) { +static bool isSafePHIToSpeculate(PHINode &PN) { // For now, we can only do this promotion if the load is in the same block // as the PHI, and if there are no stores between the phi and load. // TODO: Allow recursive phi users. @@ -1360,6 +1272,8 @@ static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) { if (!HaveLoad) return false; + const DataLayout &DL = PN.getModule()->getDataLayout(); + // We can only transform this if it is safe to push the loads into the // predecessor blocks. The only thing to watch out for is that we can't put // a possibly trapping load in the predecessor if it is a critical edge. @@ -1381,8 +1295,8 @@ static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) { // If this pointer is always safe to load, or if we can prove that there // is already a load in the block, then we can move the load to the pred // block. - if (InVal->isDereferenceablePointer(DL) || - isSafeToLoadUnconditionally(InVal, TI, MaxAlign, DL)) + if (isDereferenceablePointer(InVal, DL) || + isSafeToLoadUnconditionally(InVal, TI, MaxAlign)) continue; return false; @@ -1447,12 +1361,12 @@ static void speculatePHINodeLoads(PHINode &PN) { /// /// We can do this to a select if its only uses are loads and if the operand /// to the select can be loaded unconditionally. -static bool isSafeSelectToSpeculate(SelectInst &SI, - const DataLayout *DL = nullptr) { +static bool isSafeSelectToSpeculate(SelectInst &SI) { Value *TValue = SI.getTrueValue(); Value *FValue = SI.getFalseValue(); - bool TDerefable = TValue->isDereferenceablePointer(DL); - bool FDerefable = FValue->isDereferenceablePointer(DL); + const DataLayout &DL = SI.getModule()->getDataLayout(); + bool TDerefable = isDereferenceablePointer(TValue, DL); + bool FDerefable = isDereferenceablePointer(FValue, DL); for (User *U : SI.users()) { LoadInst *LI = dyn_cast(U); @@ -1463,10 +1377,10 @@ static bool isSafeSelectToSpeculate(SelectInst &SI, // absolutely (e.g. allocas) or at this point because we can see other // accesses to it. if (!TDerefable && - !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment(), DL)) + !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment())) return false; if (!FDerefable && - !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment(), DL)) + !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment())) return false; } @@ -1526,7 +1440,8 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, if (Indices.size() == 1 && cast(Indices.back())->isZero()) return BasePtr; - return IRB.CreateInBoundsGEP(BasePtr, Indices, NamePrefix + "sroa_idx"); + return IRB.CreateInBoundsGEP(nullptr, BasePtr, Indices, + NamePrefix + "sroa_idx"); } /// \brief Get a natural GEP off of the BasePtr walking through Ty toward @@ -1706,8 +1621,9 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, // We may end up computing an offset pointer that has the wrong type. If we // never are able to compute one directly that has the correct type, we'll - // fall back to it, so keep it around here. + // fall back to it, so keep it and the base it was computed from around here. Value *OffsetPtr = nullptr; + Value *OffsetBasePtr; // Remember any i8 pointer we come across to re-use if we need to do a raw // byte offset. @@ -1732,16 +1648,19 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, Indices.clear(); if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy, Indices, NamePrefix)) { - if (P->getType() == PointerTy) { - // Zap any offset pointer that we ended up computing in previous rounds. - if (OffsetPtr && OffsetPtr->use_empty()) - if (Instruction *I = dyn_cast(OffsetPtr)) - I->eraseFromParent(); + // If we have a new natural pointer at the offset, clear out any old + // offset pointer we computed. Unless it is the base pointer or + // a non-instruction, we built a GEP we don't need. Zap it. + if (OffsetPtr && OffsetPtr != OffsetBasePtr) + if (Instruction *I = dyn_cast(OffsetPtr)) { + assert(I->use_empty() && "Built a GEP with uses some how!"); + I->eraseFromParent(); + } + OffsetPtr = P; + OffsetBasePtr = Ptr; + // If we also found a pointer of the right type, we're done. + if (P->getType() == PointerTy) return P; - } - if (!OffsetPtr) { - OffsetPtr = P; - } } // Stash this pointer if we've found an i8*. @@ -1773,7 +1692,8 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr - : IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), + : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr, + IRB.getInt(Int8PtrOffset), NamePrefix + "sroa_raw_idx"); } Ptr = OffsetPtr; @@ -1785,6 +1705,27 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, return Ptr; } +/// \brief Compute the adjusted alignment for a load or store from an offset. +static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset, + const DataLayout &DL) { + unsigned Alignment; + Type *Ty; + if (auto *LI = dyn_cast(I)) { + Alignment = LI->getAlignment(); + Ty = LI->getType(); + } else if (auto *SI = dyn_cast(I)) { + Alignment = SI->getAlignment(); + Ty = SI->getValueOperand()->getType(); + } else { + llvm_unreachable("Only loads and stores are allowed!"); + } + + if (!Alignment) + Alignment = DL.getABITypeAlignment(Ty); + + return MinAlign(Alignment, Offset); +} + /// \brief Test whether we can convert a value from the old to the new type. /// /// This predicate should be used to guard calls to convertValue in order to @@ -1794,10 +1735,17 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { if (OldTy == NewTy) return true; - if (IntegerType *OldITy = dyn_cast(OldTy)) - if (IntegerType *NewITy = dyn_cast(NewTy)) - if (NewITy->getBitWidth() >= OldITy->getBitWidth()) - return true; + + // For integer types, we can't handle any bit-width differences. This would + // break both vector conversions with extension and introduce endianness + // issues when in conjunction with loads and stores. + if (isa(OldTy) && isa(NewTy)) { + assert(cast(OldTy)->getBitWidth() != + cast(NewTy)->getBitWidth() && + "We can't have the same bitwidth for different int types"); + return false; + } + if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy)) return false; if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) @@ -1832,10 +1780,8 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, if (OldTy == NewTy) return V; - if (IntegerType *OldITy = dyn_cast(OldTy)) - if (IntegerType *NewITy = dyn_cast(NewTy)) - if (NewITy->getBitWidth() > OldITy->getBitWidth()) - return IRB.CreateZExt(V, NewITy); + assert(!(isa(OldTy) && isa(NewTy)) && + "Integer types must be the exact same to convert."); // See if we need inttoptr for this type pair. A cast involving both scalars // and vectors requires and additional bitcast. @@ -1876,7 +1822,7 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, /// \brief Test whether the given slice use can be promoted to a vector. /// -/// This function is called to test each entry in a partioning which is slated +/// This function is called to test each entry in a partition which is slated /// for a single slice. static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P, const Slice &S, VectorType *Ty, @@ -2044,7 +1990,7 @@ static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P, if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL)) return false; - for (const Slice *S : P.splitSlices()) + for (const Slice *S : P.splitSliceTails()) if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL)) return false; @@ -2072,7 +2018,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, uint64_t RelEnd = S.endOffset() - AllocBeginOffset; // We can't reasonably handle cases where the load or store extends past - // the end of the aloca's type and into its padding. + // the end of the alloca's type and into its padding. if (RelEnd > Size) return false; @@ -2081,6 +2027,9 @@ static bool isIntegerWideningViableForSlice(const Slice &S, if (LoadInst *LI = dyn_cast(U->getUser())) { if (LI->isVolatile()) return false; + // We can't handle loads that extend past the allocated memory. + if (DL.getTypeStoreSize(LI->getType()) > Size) + return false; // Note that we don't count vector loads or stores as whole-alloca // operations which enable integer widening because we would prefer to use // vector widening instead. @@ -2099,6 +2048,9 @@ static bool isIntegerWideningViableForSlice(const Slice &S, Type *ValueTy = SI->getValueOperand()->getType(); if (SI->isVolatile()) return false; + // We can't handle stores that extend past the allocated memory. + if (DL.getTypeStoreSize(ValueTy) > Size) + return false; // Note that we don't count vector loads or stores as whole-alloca // operations which enable integer widening because we would prefer to use // vector widening instead. @@ -2169,7 +2121,7 @@ static bool isIntegerWideningViable(AllocaSlices::Partition &P, Type *AllocaTy, WholeAllocaOp)) return false; - for (const Slice *S : P.splitSlices()) + for (const Slice *S : P.splitSliceTails()) if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL, WholeAllocaOp)) return false; @@ -2401,6 +2353,9 @@ public: IsSplittable = I->isSplittable(); IsSplit = BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset; + DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : "")); + DEBUG(AS.printSlice(dbgs(), I, "")); + DEBUG(dbgs() << "\n"); // Compute the intersecting offset range. assert(BeginOffset < NewAllocaEndOffset); @@ -2516,9 +2471,19 @@ private: V = convertValue(DL, IRB, V, IntTy); assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; - if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) - V = extractInteger(DL, IRB, V, cast(LI.getType()), Offset, - "extract"); + if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) { + IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8); + V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract"); + } + // It is possible that the extracted type is not the load type. This + // happens if there is a load past the end of the alloca, and as + // a consequence the slice is narrower but still a candidate for integer + // lowering. To handle this case, we just zero extend the extracted + // integer. + assert(cast(LI.getType())->getBitWidth() >= SliceSize * 8 && + "Can only handle an extract for an overly wide load"); + if (cast(LI.getType())->getBitWidth() > SliceSize * 8) + V = IRB.CreateZExt(V, LI.getType()); return V; } @@ -2529,6 +2494,7 @@ private: Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8) : LI.getType(); + const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize; bool IsPtrAdjusted = false; Value *V; if (VecTy) { @@ -2536,14 +2502,36 @@ private: } else if (IntTy && LI.getType()->isIntegerTy()) { V = rewriteIntegerLoad(LI); } else if (NewBeginOffset == NewAllocaBeginOffset && - canConvertValue(DL, NewAllocaTy, LI.getType())) { - V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(), - LI.getName()); + NewEndOffset == NewAllocaEndOffset && + (canConvertValue(DL, NewAllocaTy, TargetTy) || + (IsLoadPastEnd && NewAllocaTy->isIntegerTy() && + TargetTy->isIntegerTy()))) { + LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + LI.isVolatile(), LI.getName()); + if (LI.isVolatile()) + NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope()); + V = NewLI; + + // If this is an integer load past the end of the slice (which means the + // bytes outside the slice are undef or this load is dead) just forcibly + // fix the integer size with correct handling of endianness. + if (auto *AITy = dyn_cast(NewAllocaTy)) + if (auto *TITy = dyn_cast(TargetTy)) + if (AITy->getBitWidth() < TITy->getBitWidth()) { + V = IRB.CreateZExt(V, TITy, "load.ext"); + if (DL.isBigEndian()) + V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(), + "endian_shift"); + } } else { Type *LTy = TargetTy->getPointerTo(); - V = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy), - getSliceAlign(TargetTy), LI.isVolatile(), - LI.getName()); + LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy), + getSliceAlign(TargetTy), + LI.isVolatile(), LI.getName()); + if (LI.isVolatile()) + NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope()); + + V = NewLI; IsPtrAdjusted = true; } V = convertValue(DL, IRB, V, TargetTy); @@ -2565,7 +2553,8 @@ private: // LI only used for this computation. Value *Placeholder = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); - V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset, "insert"); + V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset, + "insert"); LI.replaceAllUsesWith(V); Placeholder->replaceAllUsesWith(&LI); delete Placeholder; @@ -2644,7 +2633,8 @@ private: DL.getTypeStoreSizeInBits(V->getType()) && "Non-byte-multiple bit width"); IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8); - V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset, "extract"); + V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset, + "extract"); } if (VecTy) @@ -2652,10 +2642,25 @@ private: if (IntTy && V->getType()->isIntegerTy()) return rewriteIntegerStore(V, SI); + const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize; StoreInst *NewSI; if (NewBeginOffset == NewAllocaBeginOffset && NewEndOffset == NewAllocaEndOffset && - canConvertValue(DL, V->getType(), NewAllocaTy)) { + (canConvertValue(DL, V->getType(), NewAllocaTy) || + (IsStorePastEnd && NewAllocaTy->isIntegerTy() && + V->getType()->isIntegerTy()))) { + // If this is an integer store past the end of slice (and thus the bytes + // past that point are irrelevant or this is unreachable), truncate the + // value prior to storing. + if (auto *VITy = dyn_cast(V->getType())) + if (auto *AITy = dyn_cast(NewAllocaTy)) + if (VITy->getBitWidth() > AITy->getBitWidth()) { + if (DL.isBigEndian()) + V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(), + "endian_shift"); + V = IRB.CreateTrunc(V, AITy, "load.trunc"); + } + V = convertValue(DL, IRB, V, NewAllocaTy); NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), SI.isVolatile()); @@ -2664,7 +2669,8 @@ private: NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()), SI.isVolatile()); } - (void)NewSI; + if (SI.isVolatile()) + NewSI->setAtomic(SI.getOrdering(), SI.getSynchScope()); Pass.DeadInsts.insert(&SI); deleteIfTriviallyDead(OldOp); @@ -3194,7 +3200,8 @@ private: void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { assert(Ty->isSingleValueType()); // Load the single value and insert it using the indices. - Value *GEP = IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"); + Value *GEP = + IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep"); Value *Load = IRB.CreateLoad(GEP, Name + ".load"); Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); DEBUG(dbgs() << " to: " << *Load << "\n"); @@ -3227,7 +3234,7 @@ private: // Extract the single value and store it using the indices. Value *Store = IRB.CreateStore( IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), - IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep")); + IRB.CreateInBoundsGEP(nullptr, Ptr, GEPIndices, Name + ".gep")); (void)Store; DEBUG(dbgs() << " to: " << *Store << "\n"); } @@ -3415,6 +3422,495 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, return SubTy; } +/// \brief Pre-split loads and stores to simplify rewriting. +/// +/// We want to break up the splittable load+store pairs as much as +/// possible. This is important to do as a preprocessing step, as once we +/// start rewriting the accesses to partitions of the alloca we lose the +/// necessary information to correctly split apart paired loads and stores +/// which both point into this alloca. The case to consider is something like +/// the following: +/// +/// %a = alloca [12 x i8] +/// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0 +/// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4 +/// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8 +/// %iptr1 = bitcast i8* %gep1 to i64* +/// %iptr2 = bitcast i8* %gep2 to i64* +/// %fptr1 = bitcast i8* %gep1 to float* +/// %fptr2 = bitcast i8* %gep2 to float* +/// %fptr3 = bitcast i8* %gep3 to float* +/// store float 0.0, float* %fptr1 +/// store float 1.0, float* %fptr2 +/// %v = load i64* %iptr1 +/// store i64 %v, i64* %iptr2 +/// %f1 = load float* %fptr2 +/// %f2 = load float* %fptr3 +/// +/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and +/// promote everything so we recover the 2 SSA values that should have been +/// there all along. +/// +/// \returns true if any changes are made. +bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { + DEBUG(dbgs() << "Pre-splitting loads and stores\n"); + + // Track the loads and stores which are candidates for pre-splitting here, in + // the order they first appear during the partition scan. These give stable + // iteration order and a basis for tracking which loads and stores we + // actually split. + SmallVector Loads; + SmallVector Stores; + + // We need to accumulate the splits required of each load or store where we + // can find them via a direct lookup. This is important to cross-check loads + // and stores against each other. We also track the slice so that we can kill + // all the slices that end up split. + struct SplitOffsets { + Slice *S; + std::vector Splits; + }; + SmallDenseMap SplitOffsetsMap; + + // Track loads out of this alloca which cannot, for any reason, be pre-split. + // This is important as we also cannot pre-split stores of those loads! + // FIXME: This is all pretty gross. It means that we can be more aggressive + // in pre-splitting when the load feeding the store happens to come from + // a separate alloca. Put another way, the effectiveness of SROA would be + // decreased by a frontend which just concatenated all of its local allocas + // into one big flat alloca. But defeating such patterns is exactly the job + // SROA is tasked with! Sadly, to not have this discrepancy we would have + // change store pre-splitting to actually force pre-splitting of the load + // that feeds it *and all stores*. That makes pre-splitting much harder, but + // maybe it would make it more principled? + SmallPtrSet UnsplittableLoads; + + DEBUG(dbgs() << " Searching for candidate loads and stores\n"); + for (auto &P : AS.partitions()) { + for (Slice &S : P) { + Instruction *I = cast(S.getUse()->getUser()); + if (!S.isSplittable() ||S.endOffset() <= P.endOffset()) { + // If this was a load we have to track that it can't participate in any + // pre-splitting! + if (auto *LI = dyn_cast(I)) + UnsplittableLoads.insert(LI); + continue; + } + assert(P.endOffset() > S.beginOffset() && + "Empty or backwards partition!"); + + // Determine if this is a pre-splittable slice. + if (auto *LI = dyn_cast(I)) { + assert(!LI->isVolatile() && "Cannot split volatile loads!"); + + // The load must be used exclusively to store into other pointers for + // us to be able to arbitrarily pre-split it. The stores must also be + // simple to avoid changing semantics. + auto IsLoadSimplyStored = [](LoadInst *LI) { + for (User *LU : LI->users()) { + auto *SI = dyn_cast(LU); + if (!SI || !SI->isSimple()) + return false; + } + return true; + }; + if (!IsLoadSimplyStored(LI)) { + UnsplittableLoads.insert(LI); + continue; + } + + Loads.push_back(LI); + } else if (auto *SI = dyn_cast(S.getUse()->getUser())) { + if (!SI || + S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex())) + continue; + auto *StoredLoad = dyn_cast(SI->getValueOperand()); + if (!StoredLoad || !StoredLoad->isSimple()) + continue; + assert(!SI->isVolatile() && "Cannot split volatile stores!"); + + Stores.push_back(SI); + } else { + // Other uses cannot be pre-split. + continue; + } + + // Record the initial split. + DEBUG(dbgs() << " Candidate: " << *I << "\n"); + auto &Offsets = SplitOffsetsMap[I]; + assert(Offsets.Splits.empty() && + "Should not have splits the first time we see an instruction!"); + Offsets.S = &S; + Offsets.Splits.push_back(P.endOffset() - S.beginOffset()); + } + + // Now scan the already split slices, and add a split for any of them which + // we're going to pre-split. + for (Slice *S : P.splitSliceTails()) { + auto SplitOffsetsMapI = + SplitOffsetsMap.find(cast(S->getUse()->getUser())); + if (SplitOffsetsMapI == SplitOffsetsMap.end()) + continue; + auto &Offsets = SplitOffsetsMapI->second; + + assert(Offsets.S == S && "Found a mismatched slice!"); + assert(!Offsets.Splits.empty() && + "Cannot have an empty set of splits on the second partition!"); + assert(Offsets.Splits.back() == + P.beginOffset() - Offsets.S->beginOffset() && + "Previous split does not end where this one begins!"); + + // Record each split. The last partition's end isn't needed as the size + // of the slice dictates that. + if (S->endOffset() > P.endOffset()) + Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset()); + } + } + + // We may have split loads where some of their stores are split stores. For + // such loads and stores, we can only pre-split them if their splits exactly + // match relative to their starting offset. We have to verify this prior to + // any rewriting. + Stores.erase( + std::remove_if(Stores.begin(), Stores.end(), + [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) { + // Lookup the load we are storing in our map of split + // offsets. + auto *LI = cast(SI->getValueOperand()); + // If it was completely unsplittable, then we're done, + // and this store can't be pre-split. + if (UnsplittableLoads.count(LI)) + return true; + + auto LoadOffsetsI = SplitOffsetsMap.find(LI); + if (LoadOffsetsI == SplitOffsetsMap.end()) + return false; // Unrelated loads are definitely safe. + auto &LoadOffsets = LoadOffsetsI->second; + + // Now lookup the store's offsets. + auto &StoreOffsets = SplitOffsetsMap[SI]; + + // If the relative offsets of each split in the load and + // store match exactly, then we can split them and we + // don't need to remove them here. + if (LoadOffsets.Splits == StoreOffsets.Splits) + return false; + + DEBUG(dbgs() + << " Mismatched splits for load and store:\n" + << " " << *LI << "\n" + << " " << *SI << "\n"); + + // We've found a store and load that we need to split + // with mismatched relative splits. Just give up on them + // and remove both instructions from our list of + // candidates. + UnsplittableLoads.insert(LI); + return true; + }), + Stores.end()); + // Now we have to go *back* through all the stores, because a later store may + // have caused an earlier store's load to become unsplittable and if it is + // unsplittable for the later store, then we can't rely on it being split in + // the earlier store either. + Stores.erase(std::remove_if(Stores.begin(), Stores.end(), + [&UnsplittableLoads](StoreInst *SI) { + auto *LI = + cast(SI->getValueOperand()); + return UnsplittableLoads.count(LI); + }), + Stores.end()); + // Once we've established all the loads that can't be split for some reason, + // filter any that made it into our list out. + Loads.erase(std::remove_if(Loads.begin(), Loads.end(), + [&UnsplittableLoads](LoadInst *LI) { + return UnsplittableLoads.count(LI); + }), + Loads.end()); + + + // If no loads or stores are left, there is no pre-splitting to be done for + // this alloca. + if (Loads.empty() && Stores.empty()) + return false; + + // From here on, we can't fail and will be building new accesses, so rig up + // an IR builder. + IRBuilderTy IRB(&AI); + + // Collect the new slices which we will merge into the alloca slices. + SmallVector NewSlices; + + // Track any allocas we end up splitting loads and stores for so we iterate + // on them. + SmallPtrSet ResplitPromotableAllocas; + + // At this point, we have collected all of the loads and stores we can + // pre-split, and the specific splits needed for them. We actually do the + // splitting in a specific order in order to handle when one of the loads in + // the value operand to one of the stores. + // + // First, we rewrite all of the split loads, and just accumulate each split + // load in a parallel structure. We also build the slices for them and append + // them to the alloca slices. + SmallDenseMap, 1> SplitLoadsMap; + std::vector SplitLoads; + const DataLayout &DL = AI.getModule()->getDataLayout(); + for (LoadInst *LI : Loads) { + SplitLoads.clear(); + + IntegerType *Ty = cast(LI->getType()); + uint64_t LoadSize = Ty->getBitWidth() / 8; + assert(LoadSize > 0 && "Cannot have a zero-sized integer load!"); + + auto &Offsets = SplitOffsetsMap[LI]; + assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() && + "Slice size should always match load size exactly!"); + uint64_t BaseOffset = Offsets.S->beginOffset(); + assert(BaseOffset + LoadSize > BaseOffset && + "Cannot represent alloca access size using 64-bit integers!"); + + Instruction *BasePtr = cast(LI->getPointerOperand()); + IRB.SetInsertPoint(BasicBlock::iterator(LI)); + + DEBUG(dbgs() << " Splitting load: " << *LI << "\n"); + + uint64_t PartOffset = 0, PartSize = Offsets.Splits.front(); + int Idx = 0, Size = Offsets.Splits.size(); + for (;;) { + auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); + auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace()); + LoadInst *PLoad = IRB.CreateAlignedLoad( + getAdjustedPtr(IRB, DL, BasePtr, + APInt(DL.getPointerSizeInBits(), PartOffset), + PartPtrTy, BasePtr->getName() + "."), + getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, + LI->getName()); + + // Append this load onto the list of split loads so we can find it later + // to rewrite the stores. + SplitLoads.push_back(PLoad); + + // Now build a new slice for the alloca. + NewSlices.push_back( + Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize, + &PLoad->getOperandUse(PLoad->getPointerOperandIndex()), + /*IsSplittable*/ false)); + DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset() + << ", " << NewSlices.back().endOffset() << "): " << *PLoad + << "\n"); + + // See if we've handled all the splits. + if (Idx >= Size) + break; + + // Setup the next partition. + PartOffset = Offsets.Splits[Idx]; + ++Idx; + PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset; + } + + // Now that we have the split loads, do the slow walk over all uses of the + // load and rewrite them as split stores, or save the split loads to use + // below if the store is going to be split there anyways. + bool DeferredStores = false; + for (User *LU : LI->users()) { + StoreInst *SI = cast(LU); + if (!Stores.empty() && SplitOffsetsMap.count(SI)) { + DeferredStores = true; + DEBUG(dbgs() << " Deferred splitting of store: " << *SI << "\n"); + continue; + } + + Value *StoreBasePtr = SI->getPointerOperand(); + IRB.SetInsertPoint(BasicBlock::iterator(SI)); + + DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n"); + + for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) { + LoadInst *PLoad = SplitLoads[Idx]; + uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1]; + auto *PartPtrTy = + PLoad->getType()->getPointerTo(SI->getPointerAddressSpace()); + + StoreInst *PStore = IRB.CreateAlignedStore( + PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr, + APInt(DL.getPointerSizeInBits(), PartOffset), + PartPtrTy, StoreBasePtr->getName() + "."), + getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false); + (void)PStore; + DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n"); + } + + // We want to immediately iterate on any allocas impacted by splitting + // this store, and we have to track any promotable alloca (indicated by + // a direct store) as needing to be resplit because it is no longer + // promotable. + if (AllocaInst *OtherAI = dyn_cast(StoreBasePtr)) { + ResplitPromotableAllocas.insert(OtherAI); + Worklist.insert(OtherAI); + } else if (AllocaInst *OtherAI = dyn_cast( + StoreBasePtr->stripInBoundsOffsets())) { + Worklist.insert(OtherAI); + } + + // Mark the original store as dead. + DeadInsts.insert(SI); + } + + // Save the split loads if there are deferred stores among the users. + if (DeferredStores) + SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads))); + + // Mark the original load as dead and kill the original slice. + DeadInsts.insert(LI); + Offsets.S->kill(); + } + + // Second, we rewrite all of the split stores. At this point, we know that + // all loads from this alloca have been split already. For stores of such + // loads, we can simply look up the pre-existing split loads. For stores of + // other loads, we split those loads first and then write split stores of + // them. + for (StoreInst *SI : Stores) { + auto *LI = cast(SI->getValueOperand()); + IntegerType *Ty = cast(LI->getType()); + uint64_t StoreSize = Ty->getBitWidth() / 8; + assert(StoreSize > 0 && "Cannot have a zero-sized integer store!"); + + auto &Offsets = SplitOffsetsMap[SI]; + assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() && + "Slice size should always match load size exactly!"); + uint64_t BaseOffset = Offsets.S->beginOffset(); + assert(BaseOffset + StoreSize > BaseOffset && + "Cannot represent alloca access size using 64-bit integers!"); + + Value *LoadBasePtr = LI->getPointerOperand(); + Instruction *StoreBasePtr = cast(SI->getPointerOperand()); + + DEBUG(dbgs() << " Splitting store: " << *SI << "\n"); + + // Check whether we have an already split load. + auto SplitLoadsMapI = SplitLoadsMap.find(LI); + std::vector *SplitLoads = nullptr; + if (SplitLoadsMapI != SplitLoadsMap.end()) { + SplitLoads = &SplitLoadsMapI->second; + assert(SplitLoads->size() == Offsets.Splits.size() + 1 && + "Too few split loads for the number of splits in the store!"); + } else { + DEBUG(dbgs() << " of load: " << *LI << "\n"); + } + + uint64_t PartOffset = 0, PartSize = Offsets.Splits.front(); + int Idx = 0, Size = Offsets.Splits.size(); + for (;;) { + auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); + auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace()); + + // Either lookup a split load or create one. + LoadInst *PLoad; + if (SplitLoads) { + PLoad = (*SplitLoads)[Idx]; + } else { + IRB.SetInsertPoint(BasicBlock::iterator(LI)); + PLoad = IRB.CreateAlignedLoad( + getAdjustedPtr(IRB, DL, LoadBasePtr, + APInt(DL.getPointerSizeInBits(), PartOffset), + PartPtrTy, LoadBasePtr->getName() + "."), + getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, + LI->getName()); + } + + // And store this partition. + IRB.SetInsertPoint(BasicBlock::iterator(SI)); + StoreInst *PStore = IRB.CreateAlignedStore( + PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr, + APInt(DL.getPointerSizeInBits(), PartOffset), + PartPtrTy, StoreBasePtr->getName() + "."), + getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false); + + // Now build a new slice for the alloca. + NewSlices.push_back( + Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize, + &PStore->getOperandUse(PStore->getPointerOperandIndex()), + /*IsSplittable*/ false)); + DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset() + << ", " << NewSlices.back().endOffset() << "): " << *PStore + << "\n"); + if (!SplitLoads) { + DEBUG(dbgs() << " of split load: " << *PLoad << "\n"); + } + + // See if we've finished all the splits. + if (Idx >= Size) + break; + + // Setup the next partition. + PartOffset = Offsets.Splits[Idx]; + ++Idx; + PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset; + } + + // We want to immediately iterate on any allocas impacted by splitting + // this load, which is only relevant if it isn't a load of this alloca and + // thus we didn't already split the loads above. We also have to keep track + // of any promotable allocas we split loads on as they can no longer be + // promoted. + if (!SplitLoads) { + if (AllocaInst *OtherAI = dyn_cast(LoadBasePtr)) { + assert(OtherAI != &AI && "We can't re-split our own alloca!"); + ResplitPromotableAllocas.insert(OtherAI); + Worklist.insert(OtherAI); + } else if (AllocaInst *OtherAI = dyn_cast( + LoadBasePtr->stripInBoundsOffsets())) { + assert(OtherAI != &AI && "We can't re-split our own alloca!"); + Worklist.insert(OtherAI); + } + } + + // Mark the original store as dead now that we've split it up and kill its + // slice. Note that we leave the original load in place unless this store + // was its only use. It may in turn be split up if it is an alloca load + // for some other alloca, but it may be a normal load. This may introduce + // redundant loads, but where those can be merged the rest of the optimizer + // should handle the merging, and this uncovers SSA splits which is more + // important. In practice, the original loads will almost always be fully + // split and removed eventually, and the splits will be merged by any + // trivial CSE, including instcombine. + if (LI->hasOneUse()) { + assert(*LI->user_begin() == SI && "Single use isn't this store!"); + DeadInsts.insert(LI); + } + DeadInsts.insert(SI); + Offsets.S->kill(); + } + + // Remove the killed slices that have ben pre-split. + AS.erase(std::remove_if(AS.begin(), AS.end(), [](const Slice &S) { + return S.isDead(); + }), AS.end()); + + // Insert our new slices. This will sort and merge them into the sorted + // sequence. + AS.insert(NewSlices); + + DEBUG(dbgs() << " Pre-split slices:\n"); +#ifndef NDEBUG + for (auto I = AS.begin(), E = AS.end(); I != E; ++I) + DEBUG(AS.print(dbgs(), I, " ")); +#endif + + // Finally, don't try to promote any allocas that new require re-splitting. + // They have already been added to the worklist above. + PromotableAllocas.erase( + std::remove_if( + PromotableAllocas.begin(), PromotableAllocas.end(), + [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }), + PromotableAllocas.end()); + + return true; +} + /// \brief Rewrite an alloca partition's users. /// /// This routine drives both of the rewriting goals of the SROA pass. It tries @@ -3425,31 +3921,32 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, /// appropriate new offsets. It also evaluates how successful the rewrite was /// at enabling promotion and if it was successful queues the alloca to be /// promoted. -bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, - AllocaSlices::Partition &P) { +AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, + AllocaSlices::Partition &P) { // Try to compute a friendly type for this partition of the alloca. This // won't always succeed, in which case we fall back to a legal integer type // or an i8 array of an appropriate size. Type *SliceTy = nullptr; + const DataLayout &DL = AI.getModule()->getDataLayout(); if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset())) - if (DL->getTypeAllocSize(CommonUseTy) >= P.size()) + if (DL.getTypeAllocSize(CommonUseTy) >= P.size()) SliceTy = CommonUseTy; if (!SliceTy) - if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(), + if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), P.beginOffset(), P.size())) SliceTy = TypePartitionTy; if ((!SliceTy || (SliceTy->isArrayTy() && SliceTy->getArrayElementType()->isIntegerTy())) && - DL->isLegalInteger(P.size() * 8)) + DL.isLegalInteger(P.size() * 8)) SliceTy = Type::getIntNTy(*C, P.size() * 8); if (!SliceTy) SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size()); - assert(DL->getTypeAllocSize(SliceTy) >= P.size()); + assert(DL.getTypeAllocSize(SliceTy) >= P.size()); - bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, *DL); + bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL); VectorType *VecTy = - IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, *DL); + IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL); if (VecTy) SliceTy = VecTy; @@ -3464,18 +3961,19 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, NewAI = &AI; // FIXME: We should be able to bail at this point with "nothing changed". // FIXME: We might want to defer PHI speculation until after here. + // FIXME: return nullptr; } else { unsigned Alignment = AI.getAlignment(); if (!Alignment) { // The minimum alignment which users can rely on when the explicit // alignment is omitted or zero is that required by the ABI for this // type. - Alignment = DL->getABITypeAlignment(AI.getAllocatedType()); + Alignment = DL.getABITypeAlignment(AI.getAllocatedType()); } Alignment = MinAlign(Alignment, P.beginOffset()); // If we will get at least this much alignment from the type alone, leave // the alloca's alignment unconstrained. - if (Alignment <= DL->getABITypeAlignment(SliceTy)) + if (Alignment <= DL.getABITypeAlignment(SliceTy)) Alignment = 0; NewAI = new AllocaInst( SliceTy, nullptr, Alignment, @@ -3495,19 +3993,15 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, SmallPtrSet PHIUsers; SmallPtrSet SelectUsers; - AllocaSliceRewriter Rewriter(*DL, AS, *this, AI, *NewAI, P.beginOffset(), + AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(), P.endOffset(), IsIntegerPromotable, VecTy, PHIUsers, SelectUsers); bool Promotable = true; - for (Slice *S : P.splitSlices()) { - DEBUG(dbgs() << " rewriting split "); - DEBUG(AS.printSlice(dbgs(), S, "")); + for (Slice *S : P.splitSliceTails()) { Promotable &= Rewriter.visit(S); ++NumUses; } for (Slice &S : P) { - DEBUG(dbgs() << " rewriting "); - DEBUG(AS.printSlice(dbgs(), &S, "")); Promotable &= Rewriter.visit(&S); ++NumUses; } @@ -3521,7 +4015,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, for (SmallPtrSetImpl::iterator I = PHIUsers.begin(), E = PHIUsers.end(); I != E; ++I) - if (!isSafePHIToSpeculate(**I, DL)) { + if (!isSafePHIToSpeculate(**I)) { Promotable = false; PHIUsers.clear(); SelectUsers.clear(); @@ -3530,7 +4024,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, for (SmallPtrSetImpl::iterator I = SelectUsers.begin(), E = SelectUsers.end(); I != E; ++I) - if (!isSafeSelectToSpeculate(**I, DL)) { + if (!isSafeSelectToSpeculate(**I)) { Promotable = false; PHIUsers.clear(); SelectUsers.clear(); @@ -3563,7 +4057,7 @@ bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, PostPromotionWorklist.pop_back(); } - return true; + return NewAI; } /// \brief Walks the slices of an alloca and form partitions based on them, @@ -3574,10 +4068,58 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { unsigned NumPartitions = 0; bool Changed = false; + const DataLayout &DL = AI.getModule()->getDataLayout(); - // Rewrite each parttion. + // First try to pre-split loads and stores. + Changed |= presplitLoadsAndStores(AI, AS); + + // Now that we have identified any pre-splitting opportunities, mark any + // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail + // to split these during pre-splitting, we want to force them to be + // rewritten into a partition. + bool IsSorted = true; + for (Slice &S : AS) { + if (!S.isSplittable()) + continue; + // FIXME: We currently leave whole-alloca splittable loads and stores. This + // used to be the only splittable loads and stores and we need to be + // confident that the above handling of splittable loads and stores is + // completely sufficient before we forcibly disable the remaining handling. + if (S.beginOffset() == 0 && + S.endOffset() >= DL.getTypeAllocSize(AI.getAllocatedType())) + continue; + if (isa(S.getUse()->getUser()) || + isa(S.getUse()->getUser())) { + S.makeUnsplittable(); + IsSorted = false; + } + } + if (!IsSorted) + std::sort(AS.begin(), AS.end()); + + /// \brief Describes the allocas introduced by rewritePartition + /// in order to migrate the debug info. + struct Piece { + AllocaInst *Alloca; + uint64_t Offset; + uint64_t Size; + Piece(AllocaInst *AI, uint64_t O, uint64_t S) + : Alloca(AI), Offset(O), Size(S) {} + }; + SmallVector Pieces; + + // Rewrite each partition. for (auto &P : AS.partitions()) { - Changed |= rewritePartition(AI, AS, P); + if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) { + Changed = true; + if (NewAI != &AI) { + uint64_t SizeOfByte = 8; + uint64_t AllocaSize = DL.getTypeSizeInBits(NewAI->getAllocatedType()); + // Don't include any padding. + uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte); + Pieces.push_back(Piece(NewAI, P.beginOffset() * SizeOfByte, Size)); + } + } ++NumPartitions; } @@ -3585,6 +4127,42 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { MaxPartitionsPerAlloca = std::max(NumPartitions, MaxPartitionsPerAlloca); + // Migrate debug information from the old alloca to the new alloca(s) + // and the individual partitions. + if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) { + auto *Var = DbgDecl->getVariable(); + auto *Expr = DbgDecl->getExpression(); + DIBuilder DIB(*AI.getParent()->getParent()->getParent(), + /*AllowUnresolved*/ false); + bool IsSplit = Pieces.size() > 1; + for (auto Piece : Pieces) { + // Create a piece expression describing the new partition or reuse AI's + // expression if there is only one partition. + auto *PieceExpr = Expr; + if (IsSplit || Expr->isBitPiece()) { + // If this alloca is already a scalar replacement of a larger aggregate, + // Piece.Offset describes the offset inside the scalar. + uint64_t Offset = Expr->isBitPiece() ? Expr->getBitPieceOffset() : 0; + uint64_t Start = Offset + Piece.Offset; + uint64_t Size = Piece.Size; + if (Expr->isBitPiece()) { + uint64_t AbsEnd = Expr->getBitPieceOffset() + Expr->getBitPieceSize(); + if (Start >= AbsEnd) + // No need to describe a SROAed padding. + continue; + Size = std::min(Size, AbsEnd - Start); + } + PieceExpr = DIB.createBitPieceExpression(Start, Size); + } + + // Remove any existing dbg.declare intrinsic describing the same alloca. + if (DbgDeclareInst *OldDDI = FindAllocaDbgDeclare(Piece.Alloca)) + OldDDI->eraseFromParent(); + + DIB.insertDeclare(Piece.Alloca, Var, PieceExpr, DbgDecl->getDebugLoc(), + &AI); + } + } return Changed; } @@ -3617,21 +4195,22 @@ bool SROA::runOnAlloca(AllocaInst &AI) { AI.eraseFromParent(); return true; } + const DataLayout &DL = AI.getModule()->getDataLayout(); // Skip alloca forms that this analysis can't handle. if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() || - DL->getTypeAllocSize(AI.getAllocatedType()) == 0) + DL.getTypeAllocSize(AI.getAllocatedType()) == 0) return false; bool Changed = false; // First, split any FCA loads and stores touching this alloca to promote // better splitting and promotion opportunities. - AggLoadStoreRewriter AggRewriter(*DL); + AggLoadStoreRewriter AggRewriter(DL); Changed |= AggRewriter.rewrite(AI); // Build the slices using a recursive instruction-visiting builder. - AllocaSlices AS(*DL, AI); + AllocaSlices AS(DL, AI); DEBUG(AS.print(dbgs())); if (AS.isEscaped()) return Changed; @@ -3696,101 +4275,30 @@ void SROA::deleteDeadInstructions( DeadInsts.insert(U); } - if (AllocaInst *AI = dyn_cast(I)) + if (AllocaInst *AI = dyn_cast(I)) { DeletedAllocas.insert(AI); + if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(AI)) + DbgDecl->eraseFromParent(); + } ++NumDeleted; I->eraseFromParent(); } } -static void enqueueUsersInWorklist(Instruction &I, - SmallVectorImpl &Worklist, - SmallPtrSetImpl &Visited) { - for (User *U : I.users()) - if (Visited.insert(cast(U)).second) - Worklist.push_back(cast(U)); -} - /// \brief Promote the allocas, using the best available technique. /// /// This attempts to promote whatever allocas have been identified as viable in /// the PromotableAllocas list. If that list is empty, there is nothing to do. -/// If there is a domtree available, we attempt to promote using the full power -/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is -/// based on the SSAUpdater utilities. This function returns whether any -/// promotion occurred. +/// This function returns whether any promotion occurred. bool SROA::promoteAllocas(Function &F) { if (PromotableAllocas.empty()) return false; NumPromoted += PromotableAllocas.size(); - if (DT && !ForceSSAUpdater) { - DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); - PromoteMemToReg(PromotableAllocas, *DT, nullptr, AT); - PromotableAllocas.clear(); - return true; - } - - DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); - SSAUpdater SSA; - DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); - SmallVector Insts; - - // We need a worklist to walk the uses of each alloca. - SmallVector Worklist; - SmallPtrSet Visited; - SmallVector DeadInsts; - - for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) { - AllocaInst *AI = PromotableAllocas[Idx]; - Insts.clear(); - Worklist.clear(); - Visited.clear(); - - enqueueUsersInWorklist(*AI, Worklist, Visited); - - while (!Worklist.empty()) { - Instruction *I = Worklist.pop_back_val(); - - // FIXME: Currently the SSAUpdater infrastructure doesn't reason about - // lifetime intrinsics and so we strip them (and the bitcasts+GEPs - // leading to them) here. Eventually it should use them to optimize the - // scalar values produced. - if (IntrinsicInst *II = dyn_cast(I)) { - assert(II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end); - II->eraseFromParent(); - continue; - } - - // Push the loads and stores we find onto the list. SROA will already - // have validated that all loads and stores are viable candidates for - // promotion. - if (LoadInst *LI = dyn_cast(I)) { - assert(LI->getType() == AI->getAllocatedType()); - Insts.push_back(LI); - continue; - } - if (StoreInst *SI = dyn_cast(I)) { - assert(SI->getValueOperand()->getType() == AI->getAllocatedType()); - Insts.push_back(SI); - continue; - } - - // For everything else, we know that only no-op bitcasts and GEPs will - // make it this far, just recurse through them and recall them for later - // removal. - DeadInsts.push_back(I); - enqueueUsersInWorklist(*I, Worklist, Visited); - } - AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts); - while (!DeadInsts.empty()) - DeadInsts.pop_back_val()->eraseFromParent(); - AI->eraseFromParent(); - } - + DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); + PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC); PromotableAllocas.clear(); return true; } @@ -3801,22 +4309,15 @@ bool SROA::runOnFunction(Function &F) { DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); C = &F.getContext(); - DataLayoutPass *DLP = getAnalysisIfAvailable(); - if (!DLP) { - DEBUG(dbgs() << " Skipping SROA -- no target data!\n"); - return false; - } - DL = &DLP->getDataLayout(); - DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable(); - DT = DTWP ? &DTWP->getDomTree() : nullptr; - AT = &getAnalysis(); + DT = &getAnalysis().getDomTree(); + AC = &getAnalysis().getAssumptionCache(F); BasicBlock &EntryBB = F.getEntryBlock(); for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); - I != E; ++I) + I != E; ++I) { if (AllocaInst *AI = dyn_cast(I)) Worklist.insert(AI); + } bool Changed = false; // A set of deleted alloca instruction pointers which should be removed from @@ -3852,8 +4353,7 @@ bool SROA::runOnFunction(Function &F) { } void SROA::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); - if (RequiresDomTree) - AU.addRequired(); + AU.addRequired(); + AU.addRequired(); AU.setPreservesCFG(); }