X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTransforms%2FScalar%2FSROA.cpp;h=f43bb96facce040d32a8d9f266f338dd0cee6a21;hp=daf99fb4f2d354fef901b6de901ce3fc99762c6f;hb=f3840d2c16a4ec4c879a8ded402835746de380f8;hpb=63392ea3ba295d59260553245b14a435b5f71a3e diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index daf99fb4f2d..f43bb96facc 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -30,7 +30,6 @@ #include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" -#include "llvm/GlobalVariable.h" #include "llvm/IRBuilder.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" @@ -42,20 +41,17 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/TinyPtrVector.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Support/CallSite.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/InstVisitor.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -110,8 +106,13 @@ public: } /// \brief Support comparison with a single offset to allow binary searches. - bool operator<(uint64_t RHSOffset) const { - return BeginOffset < RHSOffset; + friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) { + return LHS.BeginOffset < RHSOffset; + } + + friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, + const ByteRange &RHS) { + return LHSOffset < RHS.BeginOffset; } bool operator==(const ByteRange &RHS) const { @@ -136,6 +137,23 @@ public: /// splittable and eagerly split them into scalar values. bool IsSplittable; + /// \brief Test whether a partition has been marked as dead. + bool isDead() const { + if (BeginOffset == UINT64_MAX) { + assert(EndOffset == UINT64_MAX); + return true; + } + return false; + } + + /// \brief Kill a partition. + /// This is accomplished by setting both its beginning and end offset to + /// the maximum possible value. + void kill() { + assert(!isDead() && "He's Dead, Jim!"); + BeginOffset = EndOffset = UINT64_MAX; + } + Partition() : ByteRange(), IsSplittable() {} Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable) : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {} @@ -150,23 +168,22 @@ public: /// memory use itself with a particular partition. As a consequence there is /// intentionally overlap between various uses of the same partition. struct PartitionUse : public ByteRange { - /// \brief The user of this range of the alloca. - AssertingVH User; - - /// \brief The particular pointer value derived from this alloca in use. - AssertingVH Ptr; + /// \brief The use in question. Provides access to both user and used value. + /// + /// Note that this may be null if the partition use is *dead*, that is, it + /// should be ignored. 
+ Use *U; - PartitionUse() : ByteRange(), User(), Ptr() {} - PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, - Instruction *User, Instruction *Ptr) - : ByteRange(BeginOffset, EndOffset), User(User), Ptr(Ptr) {} + PartitionUse() : ByteRange(), U() {} + PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U) + : ByteRange(BeginOffset, EndOffset), U(U) {} }; /// \brief Construct a partitioning of a particular alloca. /// /// Construction does most of the work for partitioning the alloca. This /// performs the necessary walks of users and builds a partitioning from it. - AllocaPartitioning(const TargetData &TD, AllocaInst &AI); + AllocaPartitioning(const DataLayout &TD, AllocaInst &AI); /// \brief Test whether a pointer to the allocation escapes our analysis. /// @@ -197,16 +214,6 @@ public: use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); } use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); } use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); } - void use_insert(unsigned Idx, use_iterator UI, const PartitionUse &U) { - Uses[Idx].insert(UI, U); - } - void use_insert(const_iterator I, use_iterator UI, const PartitionUse &U) { - Uses[I - begin()].insert(UI, U); - } - void use_erase(unsigned Idx, use_iterator UI) { Uses[Idx].erase(UI); } - void use_erase(const_iterator I, use_iterator UI) { - Uses[I - begin()].erase(UI); - } typedef SmallVectorImpl::const_iterator const_use_iterator; const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); } @@ -217,6 +224,22 @@ public: const_use_iterator use_end(const_iterator I) const { return Uses[I - begin()].end(); } + + unsigned use_size(unsigned Idx) const { return Uses[Idx].size(); } + unsigned use_size(const_iterator I) const { return Uses[I - begin()].size(); } + const PartitionUse &getUse(unsigned PIdx, unsigned UIdx) const { + return Uses[PIdx][UIdx]; + } + const PartitionUse &getUse(const_iterator I, unsigned UIdx) const { + return Uses[I - begin()][UIdx]; + } + + void use_push_back(unsigned Idx, const PartitionUse &PU) { + Uses[Idx].push_back(PU); + } + void use_push_back(const_iterator I, const PartitionUse &PU) { + Uses[I - begin()].push_back(PU); + } /// @} /// \brief Allow iterating the dead users for this alloca. @@ -249,8 +272,16 @@ public: /// correctly represent. We stash extra data to help us untangle this /// after the partitioning is complete. struct MemTransferOffsets { + /// The destination begin and end offsets when the destination is within + /// this alloca. If the end offset is zero the destination is not within + /// this alloca. uint64_t DestBegin, DestEnd; + + /// The source begin and end offsets when the source is within this alloca. + /// If the end offset is zero, the source is not within this alloca. uint64_t SourceBegin, SourceEnd; + + /// Flag for whether an alloca is splittable. bool IsSplittable; }; MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const { @@ -262,10 +293,9 @@ public: /// When manipulating PHI nodes or selects, they can use more than one /// partition of an alloca. We store a special mapping to allow finding the /// partition referenced by each of these operands, if any. 
- iterator findPartitionForPHIOrSelectOperand(Instruction &I, Value *Op) { - SmallDenseMap, - std::pair >::const_iterator MapIt - = PHIOrSelectOpMap.find(std::make_pair(&I, Op)); + iterator findPartitionForPHIOrSelectOperand(Use *U) { + SmallDenseMap >::const_iterator MapIt + = PHIOrSelectOpMap.find(U); if (MapIt == PHIOrSelectOpMap.end()) return end(); @@ -277,11 +307,9 @@ public: /// /// Similar to mapping these operands back to the partitions, this maps /// directly to the use structure of that partition. - use_iterator findPartitionUseForPHIOrSelectOperand(Instruction &I, - Value *Op) { - SmallDenseMap, - std::pair >::const_iterator MapIt - = PHIOrSelectOpMap.find(std::make_pair(&I, Op)); + use_iterator findPartitionUseForPHIOrSelectOperand(Use *U) { + SmallDenseMap >::const_iterator MapIt + = PHIOrSelectOpMap.find(U); assert(MapIt != PHIOrSelectOpMap.end()); return Uses[MapIt->second.first].begin() + MapIt->second.second; } @@ -368,8 +396,7 @@ private: SmallDenseMap > PHIOrSelectSizes; /// \brief Auxiliary information for particular PHI or select operands. - SmallDenseMap, - std::pair, 4> PHIOrSelectOpMap; + SmallDenseMap, 4> PHIOrSelectOpMap; /// \brief A utility routine called from the constructor. /// @@ -384,7 +411,7 @@ template class AllocaPartitioning::BuilderBase : public InstVisitor { public: - BuilderBase(const TargetData &TD, AllocaInst &AI, AllocaPartitioning &P) + BuilderBase(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P) : TD(TD), AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())), P(P) { @@ -392,34 +419,35 @@ public: } protected: - const TargetData &TD; + const DataLayout &TD; const uint64_t AllocSize; AllocaPartitioning &P; + SmallPtrSet VisitedUses; + struct OffsetUse { Use *U; - uint64_t Offset; + int64_t Offset; }; SmallVector Queue; // The active offset and use while visiting. Use *U; - uint64_t Offset; + int64_t Offset; - void enqueueUsers(Instruction &I, uint64_t UserOffset) { - SmallPtrSet UserSet; + void enqueueUsers(Instruction &I, int64_t UserOffset) { for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; ++UI) { - if (!UserSet.insert(*UI)) - continue; - - OffsetUse OU = { &UI.getUse(), UserOffset }; - Queue.push_back(OU); + if (VisitedUses.insert(&UI.getUse())) { + OffsetUse OU = { &UI.getUse(), UserOffset }; + Queue.push_back(OU); + } } } - bool computeConstantGEPOffset(GetElementPtrInst &GEPI, uint64_t &GEPOffset) { + bool computeConstantGEPOffset(GetElementPtrInst &GEPI, int64_t &GEPOffset) { GEPOffset = Offset; + unsigned int AS = GEPI.getPointerAddressSpace(); for (gep_type_iterator GTI = gep_type_begin(GEPI), GTE = gep_type_end(GEPI); GTI != GTE; ++GTI) { ConstantInt *OpC = dyn_cast(GTI.getOperand()); @@ -432,12 +460,37 @@ protected: if (StructType *STy = dyn_cast(*GTI)) { unsigned ElementIdx = OpC->getZExtValue(); const StructLayout *SL = TD.getStructLayout(STy); - GEPOffset += SL->getElementOffset(ElementIdx); + uint64_t ElementOffset = SL->getElementOffset(ElementIdx); + // Check that we can continue to model this GEP in a signed 64-bit offset. 
+ if (ElementOffset > INT64_MAX || + (GEPOffset >= 0 && + ((uint64_t)GEPOffset + ElementOffset) > INT64_MAX)) { + DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding " + << "what can be represented in an int64_t!\n" + << " alloca: " << P.AI << "\n"); + return false; + } + if (GEPOffset < 0) + GEPOffset = ElementOffset + (uint64_t)-GEPOffset; + else + GEPOffset += ElementOffset; continue; } - GEPOffset - += OpC->getZExtValue() * TD.getTypeAllocSize(GTI.getIndexedType()); + APInt Index = OpC->getValue().sextOrTrunc(TD.getPointerSizeInBits(AS)); + Index *= APInt(Index.getBitWidth(), + TD.getTypeAllocSize(GTI.getIndexedType())); + Index += APInt(Index.getBitWidth(), (uint64_t)GEPOffset, + /*isSigned*/true); + // Check if the result can be stored in our int64_t offset. + if (!Index.isSignedIntN(sizeof(GEPOffset) * 8)) { + DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding " + << "what can be represented in an int64_t!\n" + << " alloca: " << P.AI << "\n"); + return false; + } + + GEPOffset = Index.getSExtValue(); } return true; } @@ -468,7 +521,7 @@ class AllocaPartitioning::PartitionBuilder SmallDenseMap MemTransferPartitionMap; public: - PartitionBuilder(const TargetData &TD, AllocaInst &AI, AllocaPartitioning &P) + PartitionBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P) : BuilderBase(TD, AI, P) {} /// \brief Run the builder over the allocation. @@ -490,12 +543,13 @@ private: return false; } - void insertUse(Instruction &I, uint64_t Offset, uint64_t Size, + void insertUse(Instruction &I, int64_t Offset, uint64_t Size, bool IsSplittable = false) { - uint64_t BeginOffset = Offset, EndOffset = Offset + Size; - - // Completely skip uses which start outside of the allocation. - if (BeginOffset >= AllocSize) { + // Completely skip uses which have a zero size or don't overlap the + // allocation. + if (Size == 0 || + (Offset >= 0 && (uint64_t)Offset >= AllocSize) || + (Offset < 0 && (uint64_t)-Offset >= Size)) { DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset << " which starts past the end of the " << AllocSize << " byte alloca:\n" @@ -504,28 +558,34 @@ private: return; } - // Clamp the size to the allocation. - if (EndOffset > AllocSize) { + // Clamp the start to the beginning of the allocation. + if (Offset < 0) { DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset - << " to remain within the " << AllocSize << " byte alloca:\n" + << " to start at the beginning of the alloca:\n" << " alloca: " << P.AI << "\n" << " use: " << I << "\n"); - EndOffset = AllocSize; + Size -= (uint64_t)-Offset; + Offset = 0; } - // See if we can just add a user onto the last slot currently occupied. - if (!P.Partitions.empty() && - P.Partitions.back().BeginOffset == BeginOffset && - P.Partitions.back().EndOffset == EndOffset) { - P.Partitions.back().IsSplittable &= IsSplittable; - return; + uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size; + + // Clamp the end offset to the end of the allocation. Note that this is + // formulated to handle even the case where "BeginOffset + Size" overflows. + assert(AllocSize >= BeginOffset); // Established above. 
+ if (Size > AllocSize - BeginOffset) { + DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset + << " to remain within the " << AllocSize << " byte alloca:\n" + << " alloca: " << P.AI << "\n" + << " use: " << I << "\n"); + EndOffset = AllocSize; } Partition New(BeginOffset, EndOffset, IsSplittable); P.Partitions.push_back(New); } - bool handleLoadOrStore(Type *Ty, Instruction &I, uint64_t Offset) { + bool handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset) { uint64_t Size = TD.getTypeStoreSize(Ty); // If this memory access can be shown to *statically* extend outside the @@ -535,7 +595,8 @@ private: // risk of overflow. // FIXME: We should instead consider the pointer to have escaped if this // function is being instrumented for addressing bugs or race conditions. - if (Offset >= AllocSize || Size > AllocSize || Offset + Size > AllocSize) { + if (Offset < 0 || (uint64_t)Offset >= AllocSize || + Size > (AllocSize - (uint64_t)Offset)) { DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte " << (isa(I) ? "load" : "store") << " @" << Offset << " which extends past the end of the " << AllocSize @@ -555,7 +616,7 @@ private: } bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { - uint64_t GEPOffset; + int64_t GEPOffset; if (!computeConstantGEPOffset(GEPI, GEPOffset)) return markAsEscaping(GEPI); @@ -564,14 +625,19 @@ private: } bool visitLoadInst(LoadInst &LI) { + assert((!LI.isSimple() || LI.getType()->isSingleValueType()) && + "All simple FCA loads should have been pre-split"); return handleLoadOrStore(LI.getType(), LI, Offset); } bool visitStoreInst(StoreInst &SI) { - if (SI.getOperand(0) == *U) + Value *ValOp = SI.getValueOperand(); + if (ValOp == *U) return markAsEscaping(SI); - return handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset); + assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && + "All simple FCA stores should have been pre-split"); + return handleLoadOrStore(ValOp->getType(), SI, Offset); } @@ -595,30 +661,57 @@ private: // Only intrinsics with a constant length can be split. Offsets.IsSplittable = Length; - if (*U != II.getRawDest()) { - assert(*U == II.getRawSource()); - Offsets.SourceBegin = Offset; - Offsets.SourceEnd = Offset + Size; - } else { + if (*U == II.getRawDest()) { Offsets.DestBegin = Offset; Offsets.DestEnd = Offset + Size; } + if (*U == II.getRawSource()) { + Offsets.SourceBegin = Offset; + Offsets.SourceEnd = Offset + Size; + } - insertUse(II, Offset, Size, Offsets.IsSplittable); - unsigned NewIdx = P.Partitions.size() - 1; - - SmallDenseMap::const_iterator PMI; - bool Inserted = false; - llvm::tie(PMI, Inserted) - = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)); - if (!Inserted && Offsets.IsSplittable) { - // We've found a memory transfer intrinsic which refers to the alloca as - // both a source and dest. We refuse to split these to simplify splitting - // logic. If possible, SROA will still split them into separate allocas - // and then re-analyze. + // If we have set up end offsets for both the source and the destination, + // we have found both sides of this transfer pointing at the same alloca. + bool SeenBothEnds = Offsets.SourceEnd && Offsets.DestEnd; + if (SeenBothEnds && II.getRawDest() != II.getRawSource()) { + unsigned PrevIdx = MemTransferPartitionMap[&II]; + + // Check if the begin offsets match and this is a non-volatile transfer. + // In that case, we can completely elide the transfer. 
+      if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) {
+        P.Partitions[PrevIdx].kill();
+        return true;
+      }
+
+      // Otherwise we have an offset transfer within the same alloca. We can't
+      // split those.
+      P.Partitions[PrevIdx].IsSplittable = Offsets.IsSplittable = false;
+    } else if (SeenBothEnds) {
+      // Handle the case where this exact use provides both ends of the
+      // operation.
+      assert(II.getRawDest() == II.getRawSource());
+
+      // For non-volatile transfers this is a no-op.
+      if (!II.isVolatile())
+        return true;
+
+      // Otherwise just suppress splitting.
       Offsets.IsSplittable = false;
-    P.Partitions[PMI->second].IsSplittable = false;
-    P.Partitions[NewIdx].IsSplittable = false;
+    }
+
+    // Insert the use now that we've fixed up the splittable nature.
+    insertUse(II, Offset, Size, Offsets.IsSplittable);
+
+    // Set up the mapping from intrinsic to partition if we've not seen both
+    // ends of this transfer.
+    if (!SeenBothEnds) {
+      unsigned NewIdx = P.Partitions.size() - 1;
+      bool Inserted
+        = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)).second;
+      assert(Inserted &&
+             "Already have intrinsic in map but haven't seen both ends");
+      (void)Inserted;
     }
 
     return true;
@@ -648,6 +741,9 @@ private:
     SmallVector<std::pair<Value *, Instruction *>, 4> Uses;
     Visited.insert(Root);
     Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
+    // If there are no loads or stores, the access is dead. We mark that as
+    // a size zero access.
+    Size = 0;
     do {
       Instruction *I, *UsedI;
       llvm::tie(UsedI, I) = Uses.pop_back_val();
@@ -754,7 +850,7 @@ class AllocaPartitioning::UseBuilder : public BuilderBase<UseBuilder> {
   SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
 
 public:
-  UseBuilder(const TargetData &TD, AllocaInst &AI, AllocaPartitioning &P)
+  UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
     : BuilderBase(TD, AI, P) {}
 
   /// \brief Run the builder over the allocation.
@@ -774,16 +870,25 @@ private:
     P.DeadUsers.push_back(&I);
   }
 
-  void insertUse(Instruction &User, uint64_t Offset, uint64_t Size) {
-    uint64_t BeginOffset = Offset, EndOffset = Offset + Size;
-
-    // If the use extends outside of the allocation, record it as a dead use
-    // for elimination later.
-    if (BeginOffset >= AllocSize || Size == 0)
+  void insertUse(Instruction &User, int64_t Offset, uint64_t Size) {
+    // If the use has a zero size or extends outside of the allocation, record
+    // it as a dead use for elimination later.
+    if (Size == 0 || (uint64_t)Offset >= AllocSize ||
+        (Offset < 0 && (uint64_t)-Offset >= Size))
      return markAsDead(User);
 
-    // Bound the use by the size of the allocation.
-    if (EndOffset > AllocSize)
+    // Clamp the start to the beginning of the allocation.
+    if (Offset < 0) {
+      Size -= (uint64_t)-Offset;
+      Offset = 0;
+    }
+
+    uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size;
+
+    // Clamp the end offset to the end of the allocation. Note that this is
+    // formulated to handle even the case where "BeginOffset + Size" overflows.
+    assert(AllocSize >= BeginOffset); // Established above.
+    if (Size > AllocSize - BeginOffset)
      EndOffset = AllocSize;
 
     // NB: This only works if we have zero overlapping partitions. 
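The clamp in insertUse() above is deliberately written as "Size > AllocSize - BeginOffset" rather than the naive "BeginOffset + Size > AllocSize", because the addition can wrap around in 64 bits. A minimal standalone sketch of why the subtraction form is safe; clampedEnd and the constants here are illustrative, not part of the patch:

    #include <cassert>
    #include <cstdint>

    // Mirrors the patch's clamp: the subtraction cannot overflow because
    // AllocSize >= BeginOffset is established first, whereas the naive
    // "BeginOffset + Size > AllocSize" comparison can wrap around.
    static uint64_t clampedEnd(uint64_t AllocSize, uint64_t BeginOffset,
                               uint64_t Size) {
      assert(AllocSize >= BeginOffset && "use starts past the end");
      if (Size > AllocSize - BeginOffset) // Never overflows.
        return AllocSize;
      return BeginOffset + Size; // Safe: Size <= AllocSize - BeginOffset.
    }

    int main() {
      // A 16-byte alloca with a use at offset 8 whose size wraps uint64_t:
      // 8 + UINT64_MAX wraps to 7, so the naive form would fail to clamp.
      assert(clampedEnd(16, 8, UINT64_MAX) == 16);
      assert(clampedEnd(16, 8, 4) == 12); // In-bounds uses pass through.
    }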
@@ -792,24 +897,24 @@ private: B = llvm::prior(B); for (iterator I = B, E = P.end(); I != E && I->BeginOffset < EndOffset; ++I) { - PartitionUse NewUse(std::max(I->BeginOffset, BeginOffset), - std::min(I->EndOffset, EndOffset), - &User, cast(*U)); - P.Uses[I - P.begin()].push_back(NewUse); + PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset), + std::min(I->EndOffset, EndOffset), U); + P.use_push_back(I, NewPU); if (isa(U->getUser()) || isa(U->getUser())) - P.PHIOrSelectOpMap[std::make_pair(&User, U->get())] + P.PHIOrSelectOpMap[U] = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1); } } - void handleLoadOrStore(Type *Ty, Instruction &I, uint64_t Offset) { + void handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset) { uint64_t Size = TD.getTypeStoreSize(Ty); // If this memory access can be shown to *statically* extend outside the // bounds of of the allocation, it's behavior is undefined, so simply // ignore it. Note that this is more strict than the generic clamping // behavior of insertUse. - if (Offset >= AllocSize || Size > AllocSize || Offset + Size > AllocSize) + if (Offset < 0 || (uint64_t)Offset >= AllocSize || + Size > (AllocSize - (uint64_t)Offset)) return markAsDead(I); insertUse(I, Offset, Size); @@ -826,7 +931,7 @@ private: if (GEPI.use_empty()) return markAsDead(GEPI); - uint64_t GEPOffset; + int64_t GEPOffset; if (!computeConstantGEPOffset(GEPI, GEPOffset)) llvm_unreachable("Unable to compute constant offset for use"); @@ -850,6 +955,14 @@ private: void visitMemTransferInst(MemTransferInst &II) { ConstantInt *Length = dyn_cast(II.getLength()); uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset; + if (!Size) + return markAsDead(II); + + MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; + if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd && + Offsets.DestBegin == Offsets.SourceBegin) + return markAsDead(II); // Skip identity transfers without side-effects. + insertUse(II, Offset, Size); } @@ -891,6 +1004,10 @@ private: // If the result of the constant fold will be the pointer, recurse // through the select as if we had RAUW'ed it. enqueueUsers(SI, Offset); + else + // Otherwise the operand to the select is dead, and we can replace it + // with undef. + P.DeadOperands.push_back(U); return; } @@ -944,7 +1061,7 @@ void AllocaPartitioning::splitAndMergePartitions() { SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset); } - Partitions[j].BeginOffset = Partitions[j].EndOffset = UINT64_MAX; + Partitions[j].kill(); ++NumDeadPartitions; ++j; } @@ -965,7 +1082,7 @@ void AllocaPartitioning::splitAndMergePartitions() { if (New.BeginOffset != New.EndOffset) Partitions.push_back(New); // Mark the old one for removal. - Partitions[i].BeginOffset = Partitions[i].EndOffset = UINT64_MAX; + Partitions[i].kill(); ++NumDeadPartitions; } @@ -992,15 +1109,14 @@ void AllocaPartitioning::splitAndMergePartitions() { // replaced in the process. 
std::sort(Partitions.begin(), Partitions.end()); if (NumDeadPartitions) { - assert(Partitions.back().BeginOffset == UINT64_MAX); - assert(Partitions.back().EndOffset == UINT64_MAX); + assert(Partitions.back().isDead()); assert((ptrdiff_t)NumDeadPartitions == std::count(Partitions.begin(), Partitions.end(), Partitions.back())); } Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end()); } -AllocaPartitioning::AllocaPartitioning(const TargetData &TD, AllocaInst &AI) +AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI) : #ifndef NDEBUG AI(AI), @@ -1010,11 +1126,15 @@ AllocaPartitioning::AllocaPartitioning(const TargetData &TD, AllocaInst &AI) if (!PB()) return; - if (Partitions.size() > 1) { - // Sort the uses. This arranges for the offsets to be in ascending order, - // and the sizes to be in descending order. - std::sort(Partitions.begin(), Partitions.end()); + // Sort the uses. This arranges for the offsets to be in ascending order, + // and the sizes to be in descending order. + std::sort(Partitions.begin(), Partitions.end()); + + // Remove any partitions from the back which are marked as dead. + while (!Partitions.empty() && Partitions.back().isDead()) + Partitions.pop_back(); + if (Partitions.size() > 1) { // Intersect splittability for all partitions with equal offsets and sizes. // Then remove all but the first so that we have a sequence of non-equal but // potentially overlapping partitions. @@ -1039,29 +1159,23 @@ AllocaPartitioning::AllocaPartitioning(const TargetData &TD, AllocaInst &AI) Uses.resize(Partitions.size()); UseBuilder UB(TD, AI, *this); UB(); - for (iterator I = Partitions.begin(), E = Partitions.end(); I != E; ++I) - std::stable_sort(use_begin(I), use_end(I)); } Type *AllocaPartitioning::getCommonType(iterator I) const { Type *Ty = 0; for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { - if (isa(*UI->User)) + if (!UI->U) + continue; // Skip dead uses. + if (isa(*UI->U->getUser())) continue; if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset) - break; + continue; Type *UserTy = 0; - if (LoadInst *LI = dyn_cast(&*UI->User)) { + if (LoadInst *LI = dyn_cast(UI->U->getUser())) { UserTy = LI->getType(); - } else if (StoreInst *SI = dyn_cast(&*UI->User)) { + } else if (StoreInst *SI = dyn_cast(UI->U->getUser())) { UserTy = SI->getValueOperand()->getType(); - } else if (SelectInst *SI = dyn_cast(&*UI->User)) { - if (PointerType *PtrTy = dyn_cast(SI->getType())) - UserTy = PtrTy->getElementType(); - } else if (PHINode *PN = dyn_cast(&*UI->User)) { - if (PointerType *PtrTy = dyn_cast(PN->getType())) - UserTy = PtrTy->getElementType(); } if (Ty && Ty != UserTy) @@ -1087,9 +1201,11 @@ void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I, StringRef Indent) const { for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { + if (!UI->U) + continue; // Skip dead uses. OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") " - << "used by: " << *UI->User << "\n"; - if (MemTransferInst *II = dyn_cast(&*UI->User)) { + << "used by: " << *UI->U->getUser() << "\n"; + if (MemTransferInst *II = dyn_cast(UI->U->getUser())) { const MemTransferOffsets &MTO = MemTransferInstData.lookup(II); bool IsDest; if (!MTO.IsSplittable) @@ -1232,7 +1348,7 @@ class SROA : public FunctionPass { const bool RequiresDomTree; LLVMContext *C; - const TargetData *TD; + const DataLayout *TD; DominatorTree *DT; /// \brief Worklist of alloca instructions to simplify. 
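The constructor change above sorts the partitions and then pops dead ones off the back. This works because kill() writes the UINT64_MAX sentinel into both offsets, so a dead partition compares greater than any live byte range. A compact sketch of that invariant; Range and the offsets are simplified stand-ins for the patch's Partition type:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Simplified Partition: kill() stores the UINT64_MAX sentinel, so dead
    // entries sort after every live range and cluster at the back.
    struct Range {
      uint64_t Begin, End;
      void kill() { Begin = End = UINT64_MAX; }
      bool isDead() const { return Begin == UINT64_MAX; }
      bool operator<(const Range &RHS) const { return Begin < RHS.Begin; }
    };

    int main() {
      std::vector<Range> Partitions = {{0, 8}, {8, 16}, {16, 24}};
      Partitions[1].kill(); // e.g. an elided self-copy memcpy partition

      std::sort(Partitions.begin(), Partitions.end());
      while (!Partitions.empty() && Partitions.back().isDead())
        Partitions.pop_back(); // All dead partitions are at the back now.

      assert(Partitions.size() == 2);
    }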
@@ -1253,6 +1369,16 @@ class SROA : public FunctionPass { /// uses as dead. Only used to guard insertion into DeadInsts. SmallPtrSet DeadSplitInsts; + /// \brief Post-promotion worklist. + /// + /// Sometimes we discover an alloca which has a high probability of becoming + /// viable for SROA after a round of promotion takes place. In those cases, + /// the alloca is enqueued here for re-processing. + /// + /// Note that we have to be very careful to clear allocas out of this list in + /// the event they are deleted. + SetVector > PostPromotionWorklist; + /// \brief A collection of alloca instructions we can directly promote. std::vector PromotableAllocas; @@ -1269,6 +1395,7 @@ public: static char ID; private: + friend class PHIOrSelectSpeculator; friend class AllocaPartitionRewriter; friend class AllocaPartitionVectorRewriter; @@ -1294,136 +1421,423 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTree) INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false, false) -/// \brief Accumulate the constant offsets in a GEP into a single APInt offset. -/// -/// If the provided GEP is all-constant, the total byte offset formed by the -/// GEP is computed and Offset is set to it. If the GEP has any non-constant -/// operands, the function returns false and the value of Offset is unmodified. -static bool accumulateGEPOffsets(const TargetData &TD, GEPOperator &GEP, - APInt &Offset) { - APInt GEPOffset(Offset.getBitWidth(), 0); - for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); - GTI != GTE; ++GTI) { - ConstantInt *OpC = dyn_cast(GTI.getOperand()); - if (!OpC) - return false; - if (OpC->isZero()) continue; +namespace { +/// \brief Visitor to speculate PHIs and Selects where possible. +class PHIOrSelectSpeculator : public InstVisitor { + // Befriend the base class so it can delegate to private visit methods. + friend class llvm::InstVisitor; - // Handle a struct index, which adds its field offset to the pointer. - if (StructType *STy = dyn_cast(*GTI)) { - unsigned ElementIdx = OpC->getZExtValue(); - const StructLayout *SL = TD.getStructLayout(STy); - GEPOffset += APInt(Offset.getBitWidth(), - SL->getElementOffset(ElementIdx)); - continue; - } + const DataLayout &TD; + AllocaPartitioning &P; + SROA &Pass; - APInt TypeSize(Offset.getBitWidth(), - TD.getTypeAllocSize(GTI.getIndexedType())); - if (VectorType *VTy = dyn_cast(*GTI)) { - assert((VTy->getScalarSizeInBits() % 8) == 0 && - "vector element size is not a multiple of 8, cannot GEP over it"); - TypeSize = VTy->getScalarSizeInBits() / 8; - } +public: + PHIOrSelectSpeculator(const DataLayout &TD, AllocaPartitioning &P, SROA &Pass) + : TD(TD), P(P), Pass(Pass) {} - GEPOffset += OpC->getValue().sextOrTrunc(Offset.getBitWidth()) * TypeSize; + /// \brief Visit the users of an alloca partition and rewrite them. + void visitUsers(AllocaPartitioning::const_iterator PI) { + // Note that we need to use an index here as the underlying vector of uses + // may be grown during speculation. However, we never need to re-visit the + // new uses, and so we can use the initial size bound. + for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) { + const AllocaPartitioning::PartitionUse &PU = P.getUse(PI, Idx); + if (!PU.U) + continue; // Skip dead use. + + visit(cast(PU.U->getUser())); + } } - Offset = GEPOffset; - return true; -} -/// \brief Build a GEP out of a base pointer and indices. 
-/// -/// This will return the BasePtr if that is valid, or build a new GEP -/// instruction using the IRBuilder if GEP-ing is needed. -static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr, - SmallVectorImpl &Indices, - const Twine &Prefix) { - if (Indices.empty()) - return BasePtr; +private: + // By default, skip this instruction. + void visitInstruction(Instruction &I) {} - // A single zero index is a no-op, so check for this and avoid building a GEP - // in that case. - if (Indices.size() == 1 && cast(Indices.back())->isZero()) - return BasePtr; + /// PHI instructions that use an alloca and are subsequently loaded can be + /// rewritten to load both input pointers in the pred blocks and then PHI the + /// results, allowing the load of the alloca to be promoted. + /// From this: + /// %P2 = phi [i32* %Alloca, i32* %Other] + /// %V = load i32* %P2 + /// to: + /// %V1 = load i32* %Alloca -> will be mem2reg'd + /// ... + /// %V2 = load i32* %Other + /// ... + /// %V = phi [i32 %V1, i32 %V2] + /// + /// We can do this to a select if its only uses are loads and if the operands + /// to the select can be loaded unconditionally. + /// + /// FIXME: This should be hoisted into a generic utility, likely in + /// Transforms/Util/Local.h + bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl &Loads) { + // For now, we can only do this promotion if the load is in the same block + // as the PHI, and if there are no stores between the phi and load. + // TODO: Allow recursive phi users. + // TODO: Allow stores. + BasicBlock *BB = PN.getParent(); + unsigned MaxAlign = 0; + for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); + UI != UE; ++UI) { + LoadInst *LI = dyn_cast(*UI); + if (LI == 0 || !LI->isSimple()) return false; - return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx"); -} + // For now we only allow loads in the same block as the PHI. This is + // a common case that happens when instcombine merges two loads through + // a PHI. + if (LI->getParent() != BB) return false; -/// \brief Get a natural GEP off of the BasePtr walking through Ty toward -/// TargetTy without changing the offset of the pointer. -/// -/// This routine assumes we've already established a properly offset GEP with -/// Indices, and arrived at the Ty type. The goal is to continue to GEP with -/// zero-indices down through type layers until we find one the same as -/// TargetTy. If we can't find one with the same type, we at least try to use -/// one with the same size. If none of that works, we just produce the GEP as -/// indicated by Indices to have the correct offset. -static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const TargetData &TD, - Value *BasePtr, Type *Ty, Type *TargetTy, - SmallVectorImpl &Indices, - const Twine &Prefix) { - if (Ty == TargetTy) - return buildGEP(IRB, BasePtr, Indices, Prefix); + // Ensure that there are no instructions between the PHI and the load that + // could store. + for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) + if (BBI->mayWriteToMemory()) + return false; - // See if we can descend into a struct and locate a field with the correct - // type. 
- unsigned NumLayers = 0; - Type *ElementTy = Ty; - do { - if (ElementTy->isPointerTy()) - break; - if (SequentialType *SeqTy = dyn_cast(ElementTy)) { - ElementTy = SeqTy->getElementType(); - Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(), 0))); - } else if (StructType *STy = dyn_cast(ElementTy)) { - ElementTy = *STy->element_begin(); - Indices.push_back(IRB.getInt32(0)); - } else { - break; + MaxAlign = std::max(MaxAlign, LI->getAlignment()); + Loads.push_back(LI); } - ++NumLayers; - } while (ElementTy != TargetTy); - if (ElementTy != TargetTy) - Indices.erase(Indices.end() - NumLayers, Indices.end()); - return buildGEP(IRB, BasePtr, Indices, Prefix); -} + // We can only transform this if it is safe to push the loads into the + // predecessor blocks. The only thing to watch out for is that we can't put + // a possibly trapping load in the predecessor if it is a critical edge. + for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; + ++Idx) { + TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); + Value *InVal = PN.getIncomingValue(Idx); -/// \brief Recursively compute indices for a natural GEP. -/// -/// This is the recursive step for getNaturalGEPWithOffset that walks down the -/// element types adding appropriate indices for the GEP. -static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const TargetData &TD, - Value *Ptr, Type *Ty, APInt &Offset, - Type *TargetTy, - SmallVectorImpl &Indices, - const Twine &Prefix) { - if (Offset == 0) - return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix); + // If the value is produced by the terminator of the predecessor (an + // invoke) or it has side-effects, there is no valid place to put a load + // in the predecessor. + if (TI == InVal || TI->mayHaveSideEffects()) + return false; - // We can't recurse through pointer types. - if (Ty->isPointerTy()) - return 0; + // If the predecessor has a single successor, then the edge isn't + // critical. + if (TI->getNumSuccessors() == 1) + continue; - // We try to analyze GEPs over vectors here, but note that these GEPs are - // extremely poorly defined currently. The long-term goal is to remove GEPing - // over a vector from the IR completely. - if (VectorType *VecTy = dyn_cast(Ty)) { - unsigned ElementSizeInBits = VecTy->getScalarSizeInBits(); - if (ElementSizeInBits % 8) - return 0; // GEPs over non-multiple of 8 size vector elements are invalid. - APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); - APInt NumSkippedElements = Offset.udiv(ElementSize); - if (NumSkippedElements.ugt(VecTy->getNumElements())) - return 0; - Offset -= NumSkippedElements * ElementSize; - Indices.push_back(IRB.getInt(NumSkippedElements)); - return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(), - Offset, TargetTy, Indices, Prefix); - } + // If this pointer is always safe to load, or if we can prove that there + // is already a load in the block, then we can move the load to the pred + // block. 
+ if (InVal->isDereferenceablePointer() || + isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD)) + continue; - if (ArrayType *ArrTy = dyn_cast(Ty)) { + return false; + } + + return true; + } + + void visitPHINode(PHINode &PN) { + DEBUG(dbgs() << " original: " << PN << "\n"); + + SmallVector Loads; + if (!isSafePHIToSpeculate(PN, Loads)) + return; + + assert(!Loads.empty()); + + Type *LoadTy = cast(PN.getType())->getElementType(); + IRBuilder<> PHIBuilder(&PN); + PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), + PN.getName() + ".sroa.speculated"); + + // Get the TBAA tag and alignment to use from one of the loads. It doesn't + // matter which one we get and if any differ, it doesn't matter. + LoadInst *SomeLoad = cast(Loads.back()); + MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); + unsigned Align = SomeLoad->getAlignment(); + + // Rewrite all loads of the PN to use the new PHI. + do { + LoadInst *LI = Loads.pop_back_val(); + LI->replaceAllUsesWith(NewPN); + Pass.DeadInsts.push_back(LI); + } while (!Loads.empty()); + + // Inject loads into all of the pred blocks. + for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { + BasicBlock *Pred = PN.getIncomingBlock(Idx); + TerminatorInst *TI = Pred->getTerminator(); + Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx)); + Value *InVal = PN.getIncomingValue(Idx); + IRBuilder<> PredBuilder(TI); + + LoadInst *Load + = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." + + Pred->getName())); + ++NumLoadsSpeculated; + Load->setAlignment(Align); + if (TBAATag) + Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); + NewPN->addIncoming(Load, Pred); + + Instruction *Ptr = dyn_cast(InVal); + if (!Ptr) + // No uses to rewrite. + continue; + + // Try to lookup and rewrite any partition uses corresponding to this phi + // input. + AllocaPartitioning::iterator PI + = P.findPartitionForPHIOrSelectOperand(InUse); + if (PI == P.end()) + continue; + + // Replace the Use in the PartitionUse for this operand with the Use + // inside the load. + AllocaPartitioning::use_iterator UI + = P.findPartitionUseForPHIOrSelectOperand(InUse); + assert(isa(*UI->U->getUser())); + UI->U = &Load->getOperandUse(Load->getPointerOperandIndex()); + } + DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); + } + + /// Select instructions that use an alloca and are subsequently loaded can be + /// rewritten to load both input pointers and then select between the result, + /// allowing the load of the alloca to be promoted. + /// From this: + /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other + /// %V = load i32* %P2 + /// to: + /// %V1 = load i32* %Alloca -> will be mem2reg'd + /// %V2 = load i32* %Other + /// %V = select i1 %cond, i32 %V1, i32 %V2 + /// + /// We can do this to a select if its only uses are loads and if the operand + /// to the select can be loaded unconditionally. + bool isSafeSelectToSpeculate(SelectInst &SI, + SmallVectorImpl &Loads) { + Value *TValue = SI.getTrueValue(); + Value *FValue = SI.getFalseValue(); + bool TDerefable = TValue->isDereferenceablePointer(); + bool FDerefable = FValue->isDereferenceablePointer(); + + for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); + UI != UE; ++UI) { + LoadInst *LI = dyn_cast(*UI); + if (LI == 0 || !LI->isSimple()) return false; + + // Both operands to the select need to be dereferencable, either + // absolutely (e.g. allocas) or at this point because we can see other + // accesses to it. 
+ if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI, + LI->getAlignment(), &TD)) + return false; + if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI, + LI->getAlignment(), &TD)) + return false; + Loads.push_back(LI); + } + + return true; + } + + void visitSelectInst(SelectInst &SI) { + DEBUG(dbgs() << " original: " << SI << "\n"); + IRBuilder<> IRB(&SI); + + // If the select isn't safe to speculate, just use simple logic to emit it. + SmallVector Loads; + if (!isSafeSelectToSpeculate(SI, Loads)) + return; + + Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) }; + AllocaPartitioning::iterator PIs[2]; + AllocaPartitioning::PartitionUse PUs[2]; + for (unsigned i = 0, e = 2; i != e; ++i) { + PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]); + if (PIs[i] != P.end()) { + // If the pointer is within the partitioning, remove the select from + // its uses. We'll add in the new loads below. + AllocaPartitioning::use_iterator UI + = P.findPartitionUseForPHIOrSelectOperand(Ops[i]); + PUs[i] = *UI; + // Clear out the use here so that the offsets into the use list remain + // stable but this use is ignored when rewriting. + UI->U = 0; + } + } + + Value *TV = SI.getTrueValue(); + Value *FV = SI.getFalseValue(); + // Replace the loads of the select with a select of two loads. + while (!Loads.empty()) { + LoadInst *LI = Loads.pop_back_val(); + + IRB.SetInsertPoint(LI); + LoadInst *TL = + IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true"); + LoadInst *FL = + IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false"); + NumLoadsSpeculated += 2; + + // Transfer alignment and TBAA info if present. + TL->setAlignment(LI->getAlignment()); + FL->setAlignment(LI->getAlignment()); + if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { + TL->setMetadata(LLVMContext::MD_tbaa, Tag); + FL->setMetadata(LLVMContext::MD_tbaa, Tag); + } + + Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, + LI->getName() + ".sroa.speculated"); + + LoadInst *Loads[2] = { TL, FL }; + for (unsigned i = 0, e = 2; i != e; ++i) { + if (PIs[i] != P.end()) { + Use *LoadUse = &Loads[i]->getOperandUse(0); + assert(PUs[i].U->get() == LoadUse->get()); + PUs[i].U = LoadUse; + P.use_push_back(PIs[i], PUs[i]); + } + } + + DEBUG(dbgs() << " speculated to: " << *V << "\n"); + LI->replaceAllUsesWith(V); + Pass.DeadInsts.push_back(LI); + } + } +}; +} + +/// \brief Accumulate the constant offsets in a GEP into a single APInt offset. +/// +/// If the provided GEP is all-constant, the total byte offset formed by the +/// GEP is computed and Offset is set to it. If the GEP has any non-constant +/// operands, the function returns false and the value of Offset is unmodified. +static bool accumulateGEPOffsets(const DataLayout &TD, GEPOperator &GEP, + APInt &Offset) { + APInt GEPOffset(Offset.getBitWidth(), 0); + for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); + GTI != GTE; ++GTI) { + ConstantInt *OpC = dyn_cast(GTI.getOperand()); + if (!OpC) + return false; + if (OpC->isZero()) continue; + + // Handle a struct index, which adds its field offset to the pointer. 
+ if (StructType *STy = dyn_cast(*GTI)) { + unsigned ElementIdx = OpC->getZExtValue(); + const StructLayout *SL = TD.getStructLayout(STy); + GEPOffset += APInt(Offset.getBitWidth(), + SL->getElementOffset(ElementIdx)); + continue; + } + + APInt TypeSize(Offset.getBitWidth(), + TD.getTypeAllocSize(GTI.getIndexedType())); + if (VectorType *VTy = dyn_cast(*GTI)) { + assert((VTy->getScalarSizeInBits() % 8) == 0 && + "vector element size is not a multiple of 8, cannot GEP over it"); + TypeSize = VTy->getScalarSizeInBits() / 8; + } + + GEPOffset += OpC->getValue().sextOrTrunc(Offset.getBitWidth()) * TypeSize; + } + Offset = GEPOffset; + return true; +} + +/// \brief Build a GEP out of a base pointer and indices. +/// +/// This will return the BasePtr if that is valid, or build a new GEP +/// instruction using the IRBuilder if GEP-ing is needed. +static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr, + SmallVectorImpl &Indices, + const Twine &Prefix) { + if (Indices.empty()) + return BasePtr; + + // A single zero index is a no-op, so check for this and avoid building a GEP + // in that case. + if (Indices.size() == 1 && cast(Indices.back())->isZero()) + return BasePtr; + + return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx"); +} + +/// \brief Get a natural GEP off of the BasePtr walking through Ty toward +/// TargetTy without changing the offset of the pointer. +/// +/// This routine assumes we've already established a properly offset GEP with +/// Indices, and arrived at the Ty type. The goal is to continue to GEP with +/// zero-indices down through type layers until we find one the same as +/// TargetTy. If we can't find one with the same type, we at least try to use +/// one with the same size. If none of that works, we just produce the GEP as +/// indicated by Indices to have the correct offset. +static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD, + Value *BasePtr, Type *Ty, Type *TargetTy, + SmallVectorImpl &Indices, + const Twine &Prefix) { + if (Ty == TargetTy) + return buildGEP(IRB, BasePtr, Indices, Prefix); + + // See if we can descend into a struct and locate a field with the correct + // type. + unsigned NumLayers = 0; + Type *ElementTy = Ty; + do { + if (ElementTy->isPointerTy()) + break; + if (SequentialType *SeqTy = dyn_cast(ElementTy)) { + ElementTy = SeqTy->getElementType(); + Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits( + ElementTy->isPointerTy() ? + cast(ElementTy)->getAddressSpace(): 0), 0))); + } else if (StructType *STy = dyn_cast(ElementTy)) { + if (STy->element_begin() == STy->element_end()) + break; // Nothing left to descend into. + ElementTy = *STy->element_begin(); + Indices.push_back(IRB.getInt32(0)); + } else { + break; + } + ++NumLayers; + } while (ElementTy != TargetTy); + if (ElementTy != TargetTy) + Indices.erase(Indices.end() - NumLayers, Indices.end()); + + return buildGEP(IRB, BasePtr, Indices, Prefix); +} + +/// \brief Recursively compute indices for a natural GEP. +/// +/// This is the recursive step for getNaturalGEPWithOffset that walks down the +/// element types adding appropriate indices for the GEP. +static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, + Value *Ptr, Type *Ty, APInt &Offset, + Type *TargetTy, + SmallVectorImpl &Indices, + const Twine &Prefix) { + if (Offset == 0) + return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix); + + // We can't recurse through pointer types. 
+ if (Ty->isPointerTy()) + return 0; + + // We try to analyze GEPs over vectors here, but note that these GEPs are + // extremely poorly defined currently. The long-term goal is to remove GEPing + // over a vector from the IR completely. + if (VectorType *VecTy = dyn_cast(Ty)) { + unsigned ElementSizeInBits = VecTy->getScalarSizeInBits(); + if (ElementSizeInBits % 8) + return 0; // GEPs over non-multiple of 8 size vector elements are invalid. + APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); + APInt NumSkippedElements = Offset.udiv(ElementSize); + if (NumSkippedElements.ugt(VecTy->getNumElements())) + return 0; + Offset -= NumSkippedElements * ElementSize; + Indices.push_back(IRB.getInt(NumSkippedElements)); + return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(), + Offset, TargetTy, Indices, Prefix); + } + + if (ArrayType *ArrTy = dyn_cast(Ty)) { Type *ElementTy = ArrTy->getElementType(); APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); APInt NumSkippedElements = Offset.udiv(ElementSize); @@ -1465,7 +1879,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const TargetData &TD, /// Indices, and setting Ty to the result subtype. /// /// If no natural GEP can be constructed, this function returns null. -static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const TargetData &TD, +static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD, Value *Ptr, APInt Offset, Type *TargetTy, SmallVectorImpl &Indices, const Twine &Prefix) { @@ -1477,6 +1891,8 @@ static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const TargetData &TD, return 0; Type *ElementTy = Ty->getElementType(); + if (!ElementTy->isSized()) + return 0; // We can't GEP through an unsized element. APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); if (ElementSize == 0) return 0; // Zero-length arrays can't help us build a natural GEP. @@ -1503,7 +1919,7 @@ static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const TargetData &TD, /// properities. The algorithm tries to fold as many constant indices into /// a single GEP as possible, thus making each GEP more independent of the /// surrounding code. -static Value *getAdjustedPtr(IRBuilder<> &IRB, const TargetData &TD, +static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD, Value *Ptr, APInt Offset, Type *PointerTy, const Twine &Prefix) { // Even though we don't look through PHI nodes, we could be called on an @@ -1599,7 +2015,7 @@ static Value *getAdjustedPtr(IRBuilder<> &IRB, const TargetData &TD, /// SSA value. We only can ensure this for a limited set of operations, and we /// don't want to do the rewrites unless we are confident that the result will /// be promotable, so we have an early test here. -static bool isVectorPromotionViable(const TargetData &TD, +static bool isVectorPromotionViable(const DataLayout &TD, Type *AllocaTy, AllocaPartitioning &P, uint64_t PartitionBeginOffset, @@ -1622,6 +2038,9 @@ static bool isVectorPromotionViable(const TargetData &TD, ElementSize /= 8; for (; I != E; ++I) { + if (!I->U) + continue; // Skip dead use. 
+
     uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset;
     uint64_t BeginIndex = BeginOffset / ElementSize;
     if (BeginIndex * ElementSize != BeginOffset ||
@@ -1639,25 +2058,83 @@ static bool isVectorPromotionViable(const TargetData &TD,
         (EndOffset - BeginOffset) != VecSize)
       return false;
 
-    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&*I->User)) {
+    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
       if (MI->isVolatile())
         return false;
-      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(&*I->User)) {
+      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) {
         const AllocaPartitioning::MemTransferOffsets &MTO
           = P.getMemTransferOffsets(*MTI);
         if (!MTO.IsSplittable)
           return false;
       }
-    } else if (I->Ptr->getType()->getPointerElementType()->isStructTy()) {
+    } else if (I->U->get()->getType()->getPointerElementType()->isStructTy()) {
       // Disable vector promotion when there are loads or stores of an FCA.
       return false;
-    } else if (!isa<LoadInst>(*I->User) && !isa<StoreInst>(*I->User)) {
+    } else if (!isa<LoadInst>(I->U->getUser()) &&
+               !isa<StoreInst>(I->U->getUser())) {
       return false;
     }
   }
   return true;
 }
 
+/// \brief Test whether the given alloca partition can be promoted to an int.
+///
+/// This is a quick test to check whether we can rewrite a particular alloca
+/// partition (and its newly formed alloca) into an integer alloca suitable for
+/// promotion to an SSA value. We only can ensure this for a limited set of
+/// operations, and we don't want to do the rewrites unless we are confident
+/// that the result will be promotable, so we have an early test here.
+static bool isIntegerPromotionViable(const DataLayout &TD,
+                                     Type *AllocaTy,
+                                     uint64_t AllocBeginOffset,
+                                     AllocaPartitioning &P,
+                                     AllocaPartitioning::const_use_iterator I,
+                                     AllocaPartitioning::const_use_iterator E) {
+  IntegerType *Ty = dyn_cast<IntegerType>(AllocaTy);
+  if (!Ty || 8*TD.getTypeStoreSize(Ty) != Ty->getBitWidth())
+    return false;
+
+  // Check the uses to ensure the uses are (likely) promotable integer uses.
+  // Also ensure that the alloca has a covering load or store. We don't want
+  // to promote because of some other unsplittable entry (which we may make
+  // splittable later) and lose the ability to promote each element access.
+  bool WholeAllocaOp = false;
+  for (; I != E; ++I) {
+    if (!I->U)
+      continue; // Skip dead use.
+
+    // We can't reasonably handle cases where the load or store extends past
+    // the end of the alloca's type and into its padding.
+    if ((I->EndOffset - AllocBeginOffset) > TD.getTypeStoreSize(Ty))
+      return false;
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
+      if (LI->isVolatile() || !LI->getType()->isIntegerTy())
+        return false;
+      if (LI->getType() == Ty)
+        WholeAllocaOp = true;
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) {
+      if (SI->isVolatile() || !SI->getValueOperand()->getType()->isIntegerTy())
+        return false;
+      if (SI->getValueOperand()->getType() == Ty)
+        WholeAllocaOp = true;
+    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
+      if (MI->isVolatile())
+        return false;
+      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) {
+        const AllocaPartitioning::MemTransferOffsets &MTO
+          = P.getMemTransferOffsets(*MTI);
+        if (!MTO.IsSplittable)
+          return false;
+      }
+    } else {
+      return false;
+    }
+  }
+  return WholeAllocaOp;
+}
+
 namespace {
 /// \brief Visitor to rewrite instructions using a partition of an alloca to
 /// use a new alloca. 
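isIntegerPromotionViable above only gates the integer-widening path; the individual loads and stores are later rewritten by the patch's extractInteger/insertInteger helpers using shift, truncate, zero-extend, and mask operations. A little-endian sketch of that arithmetic on plain integers; extract32/insert32 are illustrative names, and the patch itself works on APInt and also handles big-endian layouts:

    #include <cassert>
    #include <cstdint>

    // Reading a narrow value at byte offset RelOffset out of a wide integer
    // "alloca" is a logical shift right plus a truncate.
    static uint32_t extract32(uint64_t Wide, uint64_t RelOffset) {
      return (uint32_t)(Wide >> (8 * RelOffset)); // lshr + trunc
    }

    // Writing it back is a zero-extend, shift left, and masked merge, so the
    // other bytes of the wide value are preserved.
    static uint64_t insert32(uint64_t Wide, uint32_t V, uint64_t RelOffset) {
      uint64_t ShAmt = 8 * RelOffset;
      uint64_t Mask = ~(0xFFFFFFFFull << ShAmt);     // clear the target lane
      return (Wide & Mask) | ((uint64_t)V << ShAmt); // or in the zext'ed value
    }

    int main() {
      uint64_t Alloca = 0;
      Alloca = insert32(Alloca, 0xAAAAAAAA, 0); // store i32 at offset 0
      Alloca = insert32(Alloca, 0xBBBBBBBB, 4); // store i32 at offset 4
      assert(extract32(Alloca, 0) == 0xAAAAAAAA);
      assert(extract32(Alloca, 4) == 0xBBBBBBBB);
    }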
@@ -1670,7 +2147,7 @@ class AllocaPartitionRewriter : public InstVisitor; - const TargetData &TD; + const DataLayout &TD; AllocaPartitioning &P; SROA &Pass; AllocaInst &OldAI, &NewAI; @@ -1689,15 +2166,22 @@ class AllocaPartitionRewriter : public InstVisitorgetScalarSizeInBits() % 8) == 0 && "Only multiple-of-8 sized vector elements are viable"); ElementSize = VecTy->getScalarSizeInBits() / 8; + } else if (isIntegerPromotionViable(TD, NewAI.getAllocatedType(), + NewAllocaBeginOffset, P, I, E)) { + IntPromotionTy = cast(NewAI.getAllocatedType()); } bool CanSROA = true; for (; I != E; ++I) { + if (!I->U) + continue; // Skip dead uses. BeginOffset = I->BeginOffset; EndOffset = I->EndOffset; - OldPtr = I->Ptr; + OldUse = I->U; + OldPtr = cast(I->U->get()); NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str(); - CanSROA &= visit(I->User); + CanSROA &= visit(cast(I->U->getUser())); } if (VecTy) { assert(CanSROA); @@ -1752,10 +2242,43 @@ private: Value *getAdjustedAllocaPtr(IRBuilder<> &IRB, Type *PointerTy) { assert(BeginOffset >= NewAllocaBeginOffset); - APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset); + unsigned AS = cast(PointerTy)->getAddressSpace(); + APInt Offset(TD.getPointerSizeInBits(AS), BeginOffset - NewAllocaBeginOffset); return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName("")); } + /// \brief Compute suitable alignment to access an offset into the new alloca. + unsigned getOffsetAlign(uint64_t Offset) { + unsigned NewAIAlign = NewAI.getAlignment(); + if (!NewAIAlign) + NewAIAlign = TD.getABITypeAlignment(NewAI.getAllocatedType()); + return MinAlign(NewAIAlign, Offset); + } + + /// \brief Compute suitable alignment to access this partition of the new + /// alloca. + unsigned getPartitionAlign() { + return getOffsetAlign(BeginOffset - NewAllocaBeginOffset); + } + + /// \brief Compute suitable alignment to access a type at an offset of the + /// new alloca. + /// + /// \returns zero if the type's ABI alignment is a suitable alignment, + /// otherwise returns the maximal suitable alignment. + unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) { + unsigned Align = getOffsetAlign(Offset); + return Align == TD.getABITypeAlignment(Ty) ? 0 : Align; + } + + /// \brief Compute suitable alignment to access a type at the beginning of + /// this partition of the new alloca. + /// + /// See \c getOffsetTypeAlign for details; this routine delegates to it. 
+ unsigned getPartitionTypeAlign(Type *Ty) { + return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset); + } + ConstantInt *getIndex(IRBuilder<> &IRB, uint64_t Offset) { assert(VecTy && "Can only call getIndex when rewriting a vector"); uint64_t RelOffset = Offset - NewAllocaBeginOffset; @@ -1765,6 +2288,59 @@ private: return IRB.getInt32(Index); } + Value *extractInteger(IRBuilder<> &IRB, IntegerType *TargetTy, + uint64_t Offset) { + assert(IntPromotionTy && "Alloca is not an integer we can extract from"); + Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t RelOffset = Offset - NewAllocaBeginOffset; + assert(TD.getTypeStoreSize(TargetTy) + RelOffset <= + TD.getTypeStoreSize(IntPromotionTy) && + "Element load outside of alloca store"); + uint64_t ShAmt = 8*RelOffset; + if (TD.isBigEndian()) + ShAmt = 8*(TD.getTypeStoreSize(IntPromotionTy) - + TD.getTypeStoreSize(TargetTy) - RelOffset); + if (ShAmt) + V = IRB.CreateLShr(V, ShAmt, getName(".shift")); + if (TargetTy != IntPromotionTy) { + assert(TargetTy->getBitWidth() < IntPromotionTy->getBitWidth() && + "Cannot extract to a larger integer!"); + V = IRB.CreateTrunc(V, TargetTy, getName(".trunc")); + } + return V; + } + + StoreInst *insertInteger(IRBuilder<> &IRB, Value *V, uint64_t Offset) { + IntegerType *Ty = cast(V->getType()); + if (Ty == IntPromotionTy) + return IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); + + assert(Ty->getBitWidth() < IntPromotionTy->getBitWidth() && + "Cannot insert a larger integer!"); + V = IRB.CreateZExt(V, IntPromotionTy, getName(".ext")); + assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t RelOffset = Offset - NewAllocaBeginOffset; + assert(TD.getTypeStoreSize(Ty) + RelOffset <= + TD.getTypeStoreSize(IntPromotionTy) && + "Element store outside of alloca store"); + uint64_t ShAmt = 8*RelOffset; + if (TD.isBigEndian()) + ShAmt = 8*(TD.getTypeStoreSize(IntPromotionTy) - TD.getTypeStoreSize(Ty) + - RelOffset); + if (ShAmt) + V = IRB.CreateShl(V, ShAmt, getName(".shift")); + + APInt Mask = ~Ty->getMask().zext(IntPromotionTy->getBitWidth()).shl(ShAmt); + Value *Old = IRB.CreateAnd(IRB.CreateAlignedLoad(&NewAI, + NewAI.getAlignment(), + getName(".oldload")), + Mask, getName(".mask")); + return IRB.CreateAlignedStore(IRB.CreateOr(Old, V, getName(".insert")), + &NewAI, NewAI.getAlignment()); + } + void deleteIfTriviallyDead(Value *V) { Instruction *I = cast(V); if (isInstructionTriviallyDead(I)) @@ -1784,12 +2360,12 @@ private: Value *Result; if (LI.getType() == VecTy->getElementType() || BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) { - Result - = IRB.CreateExtractElement(IRB.CreateLoad(&NewAI, getName(".load")), - getIndex(IRB, BeginOffset), - getName(".extract")); + Result = IRB.CreateExtractElement( + IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")), + getIndex(IRB, BeginOffset), getName(".extract")); } else { - Result = IRB.CreateLoad(&NewAI, getName(".load")); + Result = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); } if (Result->getType() != LI.getType()) Result = getValueCast(IRB, Result, LI.getType()); @@ -1800,6 +2376,16 @@ private: return true; } + bool rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) { + assert(!LI.isVolatile()); + Value *Result = extractInteger(IRB, cast(LI.getType()), + BeginOffset); + LI.replaceAllUsesWith(Result); + Pass.DeadInsts.push_back(&LI); + 
   bool visitLoadInst(LoadInst &LI) {
     DEBUG(dbgs() << " original: " << LI << "\n");
     Value *OldOp = LI.getOperand(0);
@@ -1808,10 +2394,13 @@ private:
     if (VecTy)
       return rewriteVectorizedLoadInst(IRB, LI, OldOp);
+    if (IntPromotionTy)
+      return rewriteIntegerLoad(IRB, LI);

     Value *NewPtr = getAdjustedAllocaPtr(IRB, LI.getPointerOperand()->getType());
     LI.setOperand(0, NewPtr);
+    LI.setAlignment(getPartitionTypeAlign(LI.getType()));
     DEBUG(dbgs() << " to: " << LI << "\n");

     deleteIfTriviallyDead(OldOp);
@@ -1825,13 +2414,14 @@ private:
         BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
       if (V->getType() != ElementTy)
         V = getValueCast(IRB, V, ElementTy);
-      V = IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")), V,
-                                  getIndex(IRB, BeginOffset),
+      LoadInst *LI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                           getName(".load"));
+      V = IRB.CreateInsertElement(LI, V, getIndex(IRB, BeginOffset),
                                   getName(".insert"));
     } else if (V->getType() != VecTy) {
       V = getValueCast(IRB, V, VecTy);
     }
-    StoreInst *Store = IRB.CreateStore(V, &NewAI);
+    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
     Pass.DeadInsts.push_back(&SI);
     (void)Store;
@@ -1839,6 +2429,15 @@ private:
     return true;
   }

+  bool rewriteIntegerStore(IRBuilder<> &IRB, StoreInst &SI) {
+    assert(!SI.isVolatile());
+    StoreInst *Store = insertInteger(IRB, SI.getValueOperand(), BeginOffset);
+    Pass.DeadInsts.push_back(&SI);
+    (void)Store;
+    DEBUG(dbgs() << " to: " << *Store << "\n");
+    return true;
+  }
+
   bool visitStoreInst(StoreInst &SI) {
     DEBUG(dbgs() << " original: " << SI << "\n");
     Value *OldOp = SI.getOperand(1);
@@ -1847,10 +2446,20 @@ private:
     if (VecTy)
       return rewriteVectorizedStoreInst(IRB, SI, OldOp);
+    if (IntPromotionTy)
+      return rewriteIntegerStore(IRB, SI);
+
+    // Strip all inbounds GEPs and pointer casts to try to dig out any root
+    // alloca that should be re-examined after promoting this alloca.
+    if (SI.getValueOperand()->getType()->isPointerTy())
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(SI.getValueOperand()
+                                                  ->stripInBoundsOffsets()))
+        Pass.PostPromotionWorklist.insert(AI);

     Value *NewPtr = getAdjustedAllocaPtr(IRB, SI.getPointerOperand()->getType());
     SI.setOperand(1, NewPtr);
+    SI.setAlignment(getPartitionTypeAlign(SI.getValueOperand()->getType()));
     DEBUG(dbgs() << " to: " << SI << "\n");

     deleteIfTriviallyDead(OldOp);
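The pointer-stripping in visitStoreInst deserves a note: a stored pointer like
&A.field[2] peels back to its underlying alloca A, so A can be queued for
re-examination once the current alloca is promoted. A minimal model of that
idea; the mini-IR below is hypothetical, not LLVM's Value hierarchy:

    #include <vector>

    struct Val {
      enum Kind { Alloca, Gep, Cast, Other } K;
      Val *Base; // pointer operand for Gep/Cast nodes, null otherwise
    };

    // Peel inbounds GEPs and pointer casts down to the underlying object,
    // in the spirit of Value::stripInBoundsOffsets.
    static Val *stripInBoundsOffsets(Val *V) {
      while (V->K == Val::Gep || V->K == Val::Cast)
        V = V->Base;
      return V;
    }

    static void noteStoredPointer(Val *StoredV,
                                  std::vector<Val *> &PostPromotion) {
      Val *Root = stripInBoundsOffsets(StoredV);
      if (Root->K == Val::Alloca)
        PostPromotion.push_back(Root); // revisit after this alloca is promoted
    }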
@@ -1866,6 +2475,9 @@ private:
     // pointer to the new alloca.
     if (!isa<Constant>(II.getLength())) {
       II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()));
+      Type *CstTy = II.getAlignmentCst()->getType();
+      II.setAlignment(ConstantInt::get(CstTy, getPartitionAlign()));
+
       deleteIfTriviallyDead(OldPtr);
       return false;
     }
@@ -1885,11 +2497,10 @@ private:
                          !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)))) {
       Type *SizeTy = II.getLength()->getType();
       Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
-
       CallInst *New
         = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()),
-                           II.getValue(), Size, II.getAlignment(),
+                           II.getValue(), Size, getPartitionAlign(),
                            II.isVolatile());
       (void)New;
       DEBUG(dbgs() << " to: " << *New << "\n");
@@ -1927,11 +2538,13 @@ private:
     // If this is an element-wide memset of a vectorizable alloca, insert it.
     if (VecTy && (BeginOffset > NewAllocaBeginOffset ||
                   EndOffset < NewAllocaEndOffset)) {
-      StoreInst *Store = IRB.CreateStore(
-        IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")), V,
-                                getIndex(IRB, BeginOffset),
+      StoreInst *Store = IRB.CreateAlignedStore(
+        IRB.CreateInsertElement(IRB.CreateAlignedLoad(&NewAI,
+                                                      NewAI.getAlignment(),
+                                                      getName(".load")),
+                                V, getIndex(IRB, BeginOffset),
                                 getName(".insert")),
-        &NewAI);
+        &NewAI, NewAI.getAlignment());
       (void)Store;
       DEBUG(dbgs() << " to: " << *Store << "\n");
       return true;
@@ -1949,7 +2562,8 @@ private:
       assert(V->getType() == VecTy);
     }

-    Value *New = IRB.CreateStore(V, &NewAI, II.isVolatile());
+    Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+                                        II.isVolatile());
     (void)New;
     DEBUG(dbgs() << " to: " << *New << "\n");
     return !II.isVolatile();
@@ -1968,6 +2582,18 @@ private:
     const AllocaPartitioning::MemTransferOffsets &MTO
       = P.getMemTransferOffsets(II);

+    assert(OldPtr->getType()->isPointerTy() && "Must be a pointer type!");
+    unsigned AS = cast<PointerType>(OldPtr->getType())->getAddressSpace();
+    // Compute the relative offset within the transfer.
+    unsigned IntPtrWidth = TD.getPointerSizeInBits(AS);
+    APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin
+                                                       : MTO.SourceBegin));
+
+    unsigned Align = II.getAlignment();
+    if (Align > 1)
+      Align = MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(),
+                       MinAlign(II.getAlignment(), getPartitionAlign()));
+
     // For unsplit intrinsics, we simply modify the source and destination
     // pointers in place. This isn't just an optimization, it is a matter of
     // correctness. With unsplit intrinsics we may be dealing with transfers
@@ -1982,6 +2608,9 @@ private:
     else
       II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType()));

+    Type *CstTy = II.getAlignmentCst()->getType();
+    II.setAlignment(ConstantInt::get(CstTy, Align));
+
     DEBUG(dbgs() << " to: " << II << "\n");
     deleteIfTriviallyDead(OldOp);
     return false;
@@ -1992,11 +2621,6 @@ private:
     // memmove with memcpy, and we don't need to worry about all manner of
     // downsides to splitting and transforming the operations.

-    // Compute the relative offset within the transfer.
-    unsigned IntPtrWidth = TD.getPointerSizeInBits();
-    APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin
-                                                       : MTO.SourceBegin));
-
     // If this doesn't map cleanly onto the alloca type, and that type isn't
     // a single value type, just emit a memcpy.
     bool EmitMemCpy
@@ -2054,277 +2678,94 @@ private:
       CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr,
                                        IsDest ? OtherPtr : OurPtr,
-                                       Size, II.getAlignment(),
-                                       II.isVolatile());
+                                       Size, Align, II.isVolatile());
       (void)New;
       DEBUG(dbgs() << " to: " << *New << "\n");
       return false;
     }

+    // Note that we clamp the alignment to 1 here as a 0 alignment for a memcpy
+    // is equivalent to 1, but that isn't true if we end up rewriting this as
+    // a load or store.
+    if (!Align)
+      Align = 1;
+
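// Worked example of the alignment logic above (hypothetical numbers): take a
// memcpy with align 8 whose destination starts at byte 0 of the old alloca,
// being rewritten for the partition [12, 16) of an alloca whose partition
// alignment also works out to 8. Then:
//   RelOffset = 12 - 0 = 12      (offset of this partition within the copy)
//   MinAlign(8, 8) = 8           (intrinsic align vs. partition align)
//   Align = MinAlign(12, 8) = 4  (an offset of 12 only guarantees 4 bytes)
// so the split operation must be emitted with align 4, not the original 8.
// The clamp to 1 just above exists because a plain load or store, unlike a
// memcpy intrinsic, does not treat alignment 0 as "default".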
     Value *SrcPtr = OtherPtr;
     Value *DstPtr = &NewAI;
     if (!IsDest)
       std::swap(SrcPtr, DstPtr);

     Value *Src;
-    if (IsVectorElement && !IsDest) {
-      // We have to extract rather than load.
-      Src = IRB.CreateExtractElement(IRB.CreateLoad(SrcPtr,
-                                                    getName(".copyload")),
-                                     getIndex(IRB, BeginOffset),
-                                     getName(".copyextract"));
-    } else {
-      Src = IRB.CreateLoad(SrcPtr, II.isVolatile(), getName(".copyload"));
-    }
-
-    if (IsVectorElement && IsDest) {
-      // We have to insert into a loaded copy before storing.
-      Src = IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")),
-                                    Src, getIndex(IRB, BeginOffset),
-                                    getName(".insert"));
-    }
-
-    Value *Store = IRB.CreateStore(Src, DstPtr, II.isVolatile());
-    (void)Store;
-    DEBUG(dbgs() << " to: " << *Store << "\n");
-    return !II.isVolatile();
-  }
-
-  bool visitIntrinsicInst(IntrinsicInst &II) {
-    assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
-           II.getIntrinsicID() == Intrinsic::lifetime_end);
-    DEBUG(dbgs() << " original: " << II << "\n");
-    IRBuilder<> IRB(&II);
-    assert(II.getArgOperand(1) == OldPtr);
-
-    // Record this instruction for deletion.
-    if (Pass.DeadSplitInsts.insert(&II))
-      Pass.DeadInsts.push_back(&II);
-
-    ConstantInt *Size
-      = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
-                         EndOffset - BeginOffset);
-    Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType());
-    Value *New;
-    if (II.getIntrinsicID() == Intrinsic::lifetime_start)
-      New = IRB.CreateLifetimeStart(Ptr, Size);
-    else
-      New = IRB.CreateLifetimeEnd(Ptr, Size);
-
-    DEBUG(dbgs() << " to: " << *New << "\n");
-    return true;
-  }
-
-  /// PHI instructions that use an alloca and are subsequently loaded can be
-  /// rewritten to load both input pointers in the pred blocks and then PHI the
-  /// results, allowing the load of the alloca to be promoted.
-  /// From this:
-  ///   %P2 = phi [i32* %Alloca, i32* %Other]
-  ///   %V = load i32* %P2
-  /// to:
-  ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
-  ///   ...
-  ///   %V2 = load i32* %Other
-  ///   ...
-  ///   %V = phi [i32 %V1, i32 %V2]
-  ///
-  /// We can do this to a select if its only uses are loads and if the operand
-  /// to the select can be loaded unconditionally.
-  ///
-  /// FIXME: This should be hoisted into a generic utility, likely in
-  /// Transforms/Util/Local.h
-  bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) {
-    // For now, we can only do this promotion if the load is in the same block
-    // as the PHI, and if there are no stores between the phi and load.
-    // TODO: Allow recursive phi users.
-    // TODO: Allow stores.
-    BasicBlock *BB = PN.getParent();
-    unsigned MaxAlign = 0;
-    for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end();
-         UI != UE; ++UI) {
-      LoadInst *LI = dyn_cast<LoadInst>(*UI);
-      if (LI == 0 || !LI->isSimple()) return false;
-
-      // For now we only allow loads in the same block as the PHI. This is
-      // a common case that happens when instcombine merges two loads through
-      // a PHI.
-      if (LI->getParent() != BB) return false;
-
-      // Ensure that there are no instructions between the PHI and the load that
-      // could store.
-      for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI)
-        if (BBI->mayWriteToMemory())
-          return false;
-
-      MaxAlign = std::max(MaxAlign, LI->getAlignment());
-      Loads.push_back(LI);
+    if (IsVectorElement && !IsDest) {
+      // We have to extract rather than load.
+      Src = IRB.CreateExtractElement(
+        IRB.CreateAlignedLoad(SrcPtr, Align, getName(".copyload")),
+        getIndex(IRB, BeginOffset),
+        getName(".copyextract"));
+    } else {
+      Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(),
+                                  getName(".copyload"));
     }

-    // We can only transform this if it is safe to push the loads into the
-    // predecessor blocks. The only thing to watch out for is that we can't put
-    // a possibly trapping load in the predecessor if it is a critical edge.
-    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num;
-         ++Idx) {
-      TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
-      Value *InVal = PN.getIncomingValue(Idx);
+    if (IsVectorElement && IsDest) {
+      // We have to insert into a loaded copy before storing.
+      Src = IRB.CreateInsertElement(
+        IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")),
+        Src, getIndex(IRB, BeginOffset),
+        getName(".insert"));
+    }

-      // If the value is produced by the terminator of the predecessor (an
-      // invoke) or it has side-effects, there is no valid place to put a load
-      // in the predecessor.
-      if (TI == InVal || TI->mayHaveSideEffects())
-        return false;
+    StoreInst *Store = cast<StoreInst>(
+      IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile()));
+    (void)Store;
+    DEBUG(dbgs() << " to: " << *Store << "\n");
+    return !II.isVolatile();
+  }

-      // If the predecessor has a single successor, then the edge isn't
-      // critical.
-      if (TI->getNumSuccessors() == 1)
-        continue;
+  bool visitIntrinsicInst(IntrinsicInst &II) {
+    assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
+           II.getIntrinsicID() == Intrinsic::lifetime_end);
+    DEBUG(dbgs() << " original: " << II << "\n");
+    IRBuilder<> IRB(&II);
+    assert(II.getArgOperand(1) == OldPtr);

-      // If this pointer is always safe to load, or if we can prove that there
-      // is already a load in the block, then we can move the load to the pred
-      // block.
-      if (InVal->isDereferenceablePointer() ||
-          isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD))
-        continue;
+    // Record this instruction for deletion.
+    if (Pass.DeadSplitInsts.insert(&II))
+      Pass.DeadInsts.push_back(&II);

-      return false;
-    }
+    ConstantInt *Size
+      = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
+                         EndOffset - BeginOffset);
+    Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType());
+    Value *New;
+    if (II.getIntrinsicID() == Intrinsic::lifetime_start)
+      New = IRB.CreateLifetimeStart(Ptr, Size);
+    else
+      New = IRB.CreateLifetimeEnd(Ptr, Size);

+    DEBUG(dbgs() << " to: " << *New << "\n");
     return true;
   }
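One detail in the new visitIntrinsicInst is easy to miss: a split lifetime
intrinsic is visited once per overlapping partition, but it must be queued for
deletion exactly once, which is why the push is guarded by the set-insert
result. A standalone sketch of that de-duplication pattern with simplified
stand-in types:

    #include <set>
    #include <vector>

    struct DeadInstQueue {
      std::set<const void *> Seen;        // plays the role of DeadSplitInsts
      std::vector<const void *> ToDelete; // plays the role of DeadInsts

      void markDead(const void *I) {
        if (Seen.insert(I).second) // true only the first time I is seen
          ToDelete.push_back(I);
      }
    };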
   bool visitPHINode(PHINode &PN) {
     DEBUG(dbgs() << " original: " << PN << "\n");
+
     // We would like to compute a new pointer in only one place, but have it be
     // as local as possible to the PHI. To do that, we re-use the location of
     // the old pointer, which necessarily must be in the right position to
     // dominate the PHI.
     IRBuilder<> PtrBuilder(cast<Instruction>(OldPtr));
-    SmallVector<LoadInst *, 4> Loads;
-    if (!isSafePHIToSpeculate(PN, Loads)) {
-      Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType());
-      // Replace the operands which were using the old pointer.
-      User::op_iterator OI = PN.op_begin(), OE = PN.op_end();
-      for (; OI != OE; ++OI)
-        if (*OI == OldPtr)
-          *OI = NewPtr;
-
-      DEBUG(dbgs() << " to: " << PN << "\n");
-      deleteIfTriviallyDead(OldPtr);
-      return false;
-    }
-    assert(!Loads.empty());
-
-    Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
-    IRBuilder<> PHIBuilder(&PN);
-    PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues());
-    NewPN->takeName(&PN);
-
-    // Get the TBAA tag and alignment to use from one of the loads. It doesn't
-    // matter which one we get and if any differ, it doesn't matter.
-    LoadInst *SomeLoad = cast<LoadInst>(Loads.back());
-    MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
-    unsigned Align = SomeLoad->getAlignment();

     Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType());
+    // Replace the operands which were using the old pointer.
+    User::op_iterator OI = PN.op_begin(), OE = PN.op_end();
+    for (; OI != OE; ++OI)
+      if (*OI == OldPtr)
+        *OI = NewPtr;

-    // Rewrite all loads of the PN to use the new PHI.
-    do {
-      LoadInst *LI = Loads.pop_back_val();
-      LI->replaceAllUsesWith(NewPN);
-      Pass.DeadInsts.push_back(LI);
-    } while (!Loads.empty());
-
-    // Inject loads into all of the pred blocks.
-    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
-      BasicBlock *Pred = PN.getIncomingBlock(Idx);
-      TerminatorInst *TI = Pred->getTerminator();
-      Value *InVal = PN.getIncomingValue(Idx);
-      IRBuilder<> PredBuilder(TI);
-
-      // Map the value to the new alloca pointer if this was the old alloca
-      // pointer.
-      bool ThisOperand = InVal == OldPtr;
-      if (ThisOperand)
-        InVal = NewPtr;
-
-      LoadInst *Load
-        = PredBuilder.CreateLoad(InVal, getName(".sroa.speculate." +
-                                                Pred->getName()));
-      ++NumLoadsSpeculated;
-      Load->setAlignment(Align);
-      if (TBAATag)
-        Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
-      NewPN->addIncoming(Load, Pred);
-
-      if (ThisOperand)
-        continue;
-      Instruction *OtherPtr = dyn_cast<Instruction>(InVal);
-      if (!OtherPtr)
-        // No uses to rewrite.
-        continue;
-
-      // Try to lookup and rewrite any partition uses corresponding to this phi
-      // input.
-      AllocaPartitioning::iterator PI
-        = P.findPartitionForPHIOrSelectOperand(PN, OtherPtr);
-      if (PI != P.end()) {
-        // If the other pointer is within the partitioning, replace the PHI in
-        // its uses with the load we just speculated, or add another load for
-        // it to rewrite if we've already replaced the PHI.
-        AllocaPartitioning::use_iterator UI
-          = P.findPartitionUseForPHIOrSelectOperand(PN, OtherPtr);
-        if (isa<PHINode>(*UI->User))
-          UI->User = Load;
-        else {
-          AllocaPartitioning::PartitionUse OtherUse = *UI;
-          OtherUse.User = Load;
-          P.use_insert(PI, std::upper_bound(UI, P.use_end(PI), OtherUse),
-                       OtherUse);
-        }
-      }
-    }
-    DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
-    return NewPtr == &NewAI;
-  }
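The speculation machinery being deleted here is not lost; per the
rewriteAllocaPartition changes later in this patch it now runs up front in a
PHIOrSelectSpeculator. The transformation itself is easiest to see in a scalar
analogue: a load guarded behind a pointer PHI or select becomes unconditional
loads of each input plus a select/PHI of the loaded values, leaving the alloca
load/store-only for mem2reg. Illustrative only, not the pass's code:

    static int speculateLoadOverSelect(bool Cond, int X, int Y) {
      // Before: int *P = Cond ? &X : &Y; return *P;
      int VX = X;            // load of the true operand
      int VY = Y;            // load of the false operand
      return Cond ? VX : VY; // select of the loaded values
    }

    int main() { return speculateLoadOverSelect(true, 0, 1); }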
-
-  /// Select instructions that use an alloca and are subsequently loaded can be
-  /// rewritten to load both input pointers and then select between the result,
-  /// allowing the load of the alloca to be promoted.
-  /// From this:
-  ///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
-  ///   %V = load i32* %P2
-  /// to:
-  ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
-  ///   %V2 = load i32* %Other
-  ///   %V = select i1 %cond, i32 %V1, i32 %V2
-  ///
-  /// We can do this to a select if its only uses are loads and if the operand
-  /// to the select can be loaded unconditionally.
-  bool isSafeSelectToSpeculate(SelectInst &SI,
-                               SmallVectorImpl<LoadInst *> &Loads) {
-    Value *TValue = SI.getTrueValue();
-    Value *FValue = SI.getFalseValue();
-    bool TDerefable = TValue->isDereferenceablePointer();
-    bool FDerefable = FValue->isDereferenceablePointer();
-
-    for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end();
-         UI != UE; ++UI) {
-      LoadInst *LI = dyn_cast<LoadInst>(*UI);
-      if (LI == 0 || !LI->isSimple()) return false;
-
-      // Both operands to the select need to be dereferencable, either
-      // absolutely (e.g. allocas) or at this point because we can see other
-      // accesses to it.
-      if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI,
-                                                      LI->getAlignment(), &TD))
-        return false;
-      if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI,
-                                                      LI->getAlignment(), &TD))
-        return false;
-      Loads.push_back(LI);
-    }
-
-    return true;
+    DEBUG(dbgs() << " to: " << PN << "\n");
+    deleteIfTriviallyDead(OldPtr);
+    return false;
   }

   bool visitSelectInst(SelectInst &SI) {
@@ -2337,70 +2778,224 @@ private:
       assert(SI.getFalseValue() != OldPtr && "Pointer is both operands!");
     else
       assert(SI.getFalseValue() == OldPtr && "Pointer isn't an operand!");
+    Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType());
+    SI.setOperand(IsTrueVal ? 1 : 2, NewPtr);

+    DEBUG(dbgs() << " to: " << SI << "\n");
+    deleteIfTriviallyDead(OldPtr);
+    return false;
+  }

-    // If the select isn't safe to speculate, just use simple logic to emit it.
-    SmallVector<LoadInst *, 4> Loads;
-    if (!isSafeSelectToSpeculate(SI, Loads)) {
-      SI.setOperand(IsTrueVal ? 1 : 2, NewPtr);
-      DEBUG(dbgs() << " to: " << SI << "\n");
-      deleteIfTriviallyDead(OldPtr);
-      return false;
-    }
+};
+}

-    Value *OtherPtr = IsTrueVal ? SI.getFalseValue() : SI.getTrueValue();
-    AllocaPartitioning::iterator PI
-      = P.findPartitionForPHIOrSelectOperand(SI, OtherPtr);
-    AllocaPartitioning::PartitionUse OtherUse;
-    if (PI != P.end()) {
-      // If the other pointer is within the partitioning, remove the select
-      // from its uses. We'll add in the new loads below.
-      AllocaPartitioning::use_iterator UI
-        = P.findPartitionUseForPHIOrSelectOperand(SI, OtherPtr);
-      OtherUse = *UI;
-      P.use_erase(PI, UI);
+namespace {
+/// \brief Visitor to rewrite aggregate loads and stores as scalar.
+///
+/// This pass aggressively rewrites all aggregate loads and stores on
+/// a particular pointer (or any pointer derived from it which we can identify)
+/// with scalar loads and stores.
+class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
+  // Befriend the base class so it can delegate to private visit methods.
+  friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
+
+  const DataLayout &TD;
+
+  /// Queue of pointer uses to analyze and potentially rewrite.
+  SmallVector<Use *, 8> Queue;
+
+  /// Set to prevent us from cycling with phi nodes and loops.
+  SmallPtrSet<User *, 8> Visited;
+
+  /// The current pointer use being rewritten. This is used to dig up the used
+  /// value (as opposed to the user).
+  Use *U;
+
+public:
+  AggLoadStoreRewriter(const DataLayout &TD) : TD(TD) {}
+
+  /// Rewrite loads and stores through a pointer and all pointers derived from
+  /// it.
+  bool rewrite(Instruction &I) {
+    DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
+    enqueueUsers(I);
+    bool Changed = false;
+    while (!Queue.empty()) {
+      U = Queue.pop_back_val();
+      Changed |= visit(cast<Instruction>(U->getUser()));
     }
+    return Changed;
+  }

-    Value *TV = IsTrueVal ? NewPtr : SI.getTrueValue();
-    Value *FV = IsTrueVal ? SI.getFalseValue() : NewPtr;
-    // Replace the loads of the select with a select of two loads.
-    while (!Loads.empty()) {
-      LoadInst *LI = Loads.pop_back_val();
+private:
+  /// Enqueue all the users of the given instruction for further processing.
+  /// This uses a set to de-duplicate users.
+  void enqueueUsers(Instruction &I) {
+    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
+         ++UI)
+      if (Visited.insert(*UI))
+        Queue.push_back(&UI.getUse());
+  }
+
+  // Conservative default is to not rewrite anything.
+  bool visitInstruction(Instruction &I) { return false; }
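The OpSplitter that follows uses the curiously recurring template pattern: the
base class walks the aggregate and calls static_cast<Derived *>(this)->emitFunc
at each scalar leaf, so the load splitter and store splitter share one
traversal with no virtual dispatch. A minimal standalone sketch of the idiom,
with hypothetical names:

    #include <cstdio>

    template <typename Derived> struct WalkerBase {
      // Statically dispatch to whatever leaf handler the derived class defines.
      void walk(int Leaf) { static_cast<Derived *>(this)->emitLeaf(Leaf); }
    };

    struct PrintWalker : WalkerBase<PrintWalker> {
      void emitLeaf(int Leaf) { std::printf("leaf %d\n", Leaf); }
    };

    int main() {
      PrintWalker().walk(42); // resolved at compile time, no vtable
      return 0;
    }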
+  /// \brief Generic recursive split emission class.
+  template <typename Derived>
+  class OpSplitter {
+  protected:
+    /// The builder used to form new instructions.
+    IRBuilder<> IRB;
+    /// The indices to be used with insertvalue or extractvalue to select the
+    /// appropriate value within the aggregate.
+    SmallVector<unsigned, 4> Indices;
+    /// The indices to a GEP instruction which will move Ptr to the correct slot
+    /// within the aggregate.
+    SmallVector<Value *, 4> GEPIndices;
+    /// The base pointer of the original op, used as a base for GEPing the
+    /// split operations.
+    Value *Ptr;
+
+    /// Initialize the splitter with an insertion point, Ptr and start with a
+    /// single zero GEP index.
+    OpSplitter(Instruction *InsertionPoint, Value *Ptr)
+      : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}

-      IRB.SetInsertPoint(LI);
-      LoadInst *TL =
-        IRB.CreateLoad(TV, getName("." + LI->getName() + ".true"));
-      LoadInst *FL =
-        IRB.CreateLoad(FV, getName("." + LI->getName() + ".false"));
-      NumLoadsSpeculated += 2;
-      if (PI != P.end()) {
-        LoadInst *OtherLoad = IsTrueVal ? FL : TL;
-        assert(OtherUse.Ptr == OtherLoad->getOperand(0));
-        OtherUse.User = OtherLoad;
-        P.use_insert(PI, P.use_end(PI), OtherUse);
+  public:
+    /// \brief Generic recursive split emission routine.
+    ///
+    /// This method recursively splits an aggregate op (load or store) into
+    /// scalar or vector ops. It splits recursively until it hits a single value
+    /// and emits that single value operation via the template argument.
+    ///
+    /// The logic of this routine relies on GEPs and insertvalue and
+    /// extractvalue all operating with the same fundamental index list, merely
+    /// formatted differently (GEPs need actual values).
+    ///
+    /// \param Ty  The type being split recursively into smaller ops.
+    /// \param Agg The aggregate value being built up or stored, depending on
+    /// whether this is splitting a load or a store respectively.
+    void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
+      if (Ty->isSingleValueType())
+        return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name);
+
+      if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+        unsigned OldSize = Indices.size();
+        (void)OldSize;
+        for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
+             ++Idx) {
+          assert(Indices.size() == OldSize && "Did not return to the old size");
+          Indices.push_back(Idx);
+          GEPIndices.push_back(IRB.getInt32(Idx));
+          emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
+          GEPIndices.pop_back();
+          Indices.pop_back();
+        }
+        return;
       }

-      // Transfer alignment and TBAA info if present.
-      TL->setAlignment(LI->getAlignment());
-      FL->setAlignment(LI->getAlignment());
-      if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
-        TL->setMetadata(LLVMContext::MD_tbaa, Tag);
-        FL->setMetadata(LLVMContext::MD_tbaa, Tag);
+      if (StructType *STy = dyn_cast<StructType>(Ty)) {
+        unsigned OldSize = Indices.size();
+        (void)OldSize;
+        for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
+             ++Idx) {
+          assert(Indices.size() == OldSize && "Did not return to the old size");
+          Indices.push_back(Idx);
+          GEPIndices.push_back(IRB.getInt32(Idx));
+          emitSplitOps(STy->getElementType(Idx), Agg, Name + "." +
+                       Twine(Idx));
+          GEPIndices.pop_back();
+          Indices.pop_back();
+        }
+        return;
       }

-      Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL);
-      V->takeName(LI);
-      DEBUG(dbgs() << " speculated to: " << *V << "\n");
-      LI->replaceAllUsesWith(V);
-      Pass.DeadInsts.push_back(LI);
+      llvm_unreachable("Only arrays and structs are aggregate loadable types");
     }
-    if (PI != P.end())
-      std::stable_sort(P.use_begin(PI), P.use_end(PI));
+  };

-    deleteIfTriviallyDead(OldPtr);
-    return NewPtr == &NewAI;
+  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
+    LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
+      : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
+
+    /// Emit a leaf load of a single value. This is called at the leaves of the
+    /// recursive emission to actually load values.
+    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+      assert(Ty->isSingleValueType());
+      // Load the single value and insert it using the indices.
+      Value *Load = IRB.CreateLoad(IRB.CreateInBoundsGEP(Ptr, GEPIndices,
+                                                         Name + ".gep"),
+                                   Name + ".load");
+      Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
+      DEBUG(dbgs() << " to: " << *Load << "\n");
+    }
+  };
+
+  bool visitLoadInst(LoadInst &LI) {
+    assert(LI.getPointerOperand() == *U);
+    if (!LI.isSimple() || LI.getType()->isSingleValueType())
+      return false;
+
+    // We have an aggregate being loaded, split it apart.
+    DEBUG(dbgs() << " original: " << LI << "\n");
+    LoadOpSplitter Splitter(&LI, *U);
+    Value *V = UndefValue::get(LI.getType());
+    Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
+    LI.replaceAllUsesWith(V);
+    LI.eraseFromParent();
+    return true;
+  }
+
+  struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
+    StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
+      : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
+
+    /// Emit a leaf store of a single value. This is called at the leaves of the
+    /// recursive emission to actually produce stores.
+    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+      assert(Ty->isSingleValueType());
+      // Extract the single value and store it using the indices.
+      Value *Store = IRB.CreateStore(
+        IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
+        IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
+      (void)Store;
+      DEBUG(dbgs() << " to: " << *Store << "\n");
+    }
+  };
+
+  bool visitStoreInst(StoreInst &SI) {
+    if (!SI.isSimple() || SI.getPointerOperand() != *U)
+      return false;
+    Value *V = SI.getValueOperand();
+    if (V->getType()->isSingleValueType())
+      return false;
+
+    // We have an aggregate being stored, split it apart.
+    DEBUG(dbgs() << " original: " << SI << "\n");
+    StoreOpSplitter Splitter(&SI, *U);
+    Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
+    SI.eraseFromParent();
+    return true;
+  }
+
+  bool visitBitCastInst(BitCastInst &BC) {
+    enqueueUsers(BC);
+    return false;
+  }
+
+  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+    enqueueUsers(GEPI);
+    return false;
+  }
+
+  bool visitPHINode(PHINode &PN) {
+    enqueueUsers(PN);
+    return false;
+  }
+
+  bool visitSelectInst(SelectInst &SI) {
+    enqueueUsers(SI);
+    return false;
+  }
+};
+}
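The heart of this rewriter is the emitSplitOps recursion: it walks a nested
aggregate while carrying an index path that serves both as insertvalue or
extractvalue indices and, reformatted as constants, as GEP indices. A
self-contained sketch of that traversal; "emitting" here just prints the
index list, and Ty is a toy stand-in for LLVM's type hierarchy:

    #include <cstdio>
    #include <vector>

    struct Ty {
      std::vector<Ty> Elems; // empty means a scalar leaf
    };

    static void splitOps(const Ty &T, std::vector<unsigned> &Indices) {
      if (T.Elems.empty()) { // single value type: emit the leaf op here
        std::printf("leaf at");
        for (size_t I = 0; I != Indices.size(); ++I)
          std::printf(" %u", Indices[I]);
        std::printf("\n");
        return;
      }
      for (unsigned Idx = 0; Idx != T.Elems.size(); ++Idx) {
        Indices.push_back(Idx);          // descend one level
        splitOps(T.Elems[Idx], Indices);
        Indices.pop_back();              // restore, as the asserts above verify
      }
    }

    int main() {
      Ty Scalar;
      Ty Pair;
      Pair.Elems.push_back(Scalar);
      Pair.Elems.push_back(Scalar);
      Ty Agg;                  // models { {s, s}, s }
      Agg.Elems.push_back(Pair);
      Agg.Elems.push_back(Scalar);
      std::vector<unsigned> Indices;
      splitOps(Agg, Indices);  // prints paths 0 0, 0 1, and 1
      return 0;
    }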
@@ -2417,7 +3012,7 @@ private:
 /// when the size or offset cause either end of type-based partition to be off.
 /// Also, this is a best-effort routine. It is reasonable to give up and not
 /// return a type if necessary.
-static Type *getTypePartition(const TargetData &TD, Type *Ty,
+static Type *getTypePartition(const DataLayout &TD, Type *Ty,
                               uint64_t Offset, uint64_t Size) {
   if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size)
     return Ty;
@@ -2533,9 +3128,23 @@ bool SROA::rewriteAllocaPartition(AllocaInst &AI,
                                  AllocaPartitioning &P,
                                  AllocaPartitioning::iterator PI) {
   uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset;
-  if (P.use_begin(PI) == P.use_end(PI))
+  bool IsLive = false;
+  for (AllocaPartitioning::use_iterator UI = P.use_begin(PI),
+                                        UE = P.use_end(PI);
+       UI != UE && !IsLive; ++UI)
+    if (UI->U)
+      IsLive = true;
+  if (!IsLive)
     return false; // No live uses left of this partition.

+  DEBUG(dbgs() << "Speculating PHIs and selects in partition "
+               << "[" << PI->BeginOffset << "," << PI->EndOffset << ")\n");
+
+  PHIOrSelectSpeculator Speculator(*TD, P, *this);
+  DEBUG(dbgs() << " speculating ");
+  DEBUG(P.print(dbgs(), PI, ""));
+  Speculator.visitUsers(PI);
+
   // Try to compute a friendly type for this partition of the alloca. This
   // won't always succeed, in which case we fall back to a legal integer type
   // or an i8 array of an appropriate size.
@@ -2567,9 +3176,19 @@ bool SROA::rewriteAllocaPartition(AllocaInst &AI,
     assert(PI == P.begin() && "Begin offset is zero on later partition");
     NewAI = &AI;
   } else {
-    // FIXME: The alignment here is overly conservative -- we could in many
-    // cases get away with much weaker alignment constraints.
-    NewAI = new AllocaInst(AllocaTy, 0, AI.getAlignment(),
+    unsigned Alignment = AI.getAlignment();
+    if (!Alignment) {
+      // The minimum alignment which users can rely on when the explicit
+      // alignment is omitted or zero is that required by the ABI for this
+      // type.
+      Alignment = TD->getABITypeAlignment(AI.getAllocatedType());
+    }
+    Alignment = MinAlign(Alignment, PI->BeginOffset);
+    // If we will get at least this much alignment from the type alone, leave
+    // the alloca's alignment unconstrained.
+    if (Alignment <= TD->getABITypeAlignment(AllocaTy))
+      Alignment = 0;
+    NewAI = new AllocaInst(AllocaTy, 0, Alignment,
                            AI.getName() + ".sroa." + Twine(PI - P.begin()),
                            &AI);
     ++NumNewAllocas;
@@ -2579,11 +3198,16 @@ bool SROA::rewriteAllocaPartition(AllocaInst &AI,
                << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: "
                << *NewAI << "\n");

+  // Track the high watermark of the post-promotion worklist. We will reset it
+  // to this point if the alloca is not in fact scheduled for promotion.
+  unsigned PPWOldSize = PostPromotionWorklist.size();
+
   AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI,
                                    PI->BeginOffset, PI->EndOffset);
   DEBUG(dbgs() << " rewriting ");
   DEBUG(P.print(dbgs(), PI, ""));
-  if (Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI))) {
+  bool Promotable = Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI));
+  if (Promotable) {
     DEBUG(dbgs() << " and queuing for promotion\n");
     PromotableAllocas.push_back(NewAI);
   } else if (NewAI != &AI) {
@@ -2592,6 +3216,12 @@ bool SROA::rewriteAllocaPartition(AllocaInst &AI,
     // alloca which didn't actually change and didn't get promoted.
     Worklist.insert(NewAI);
   }
+
+  // Drop any post-promotion work items if promotion didn't happen.
+  if (!Promotable)
+    while (PostPromotionWorklist.size() > PPWOldSize)
+      PostPromotionWorklist.pop_back();
+
   return true;
 }
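The new alignment selection above is worth tracing numerically: the explicit
alignment (or, when omitted, the old type's ABI alignment) is weakened by the
partition's offset, then dropped to 0, meaning "use the ABI default", when the
partition type already guarantees at least that much. A sketch with
hypothetical numbers; minAlign mirrors llvm::MinAlign and OldAlign is assumed
to be already resolved to a nonzero value:

    #include <cassert>
    #include <cstdint>

    static uint64_t minAlign(uint64_t A, uint64_t B) {
      return (A | B) & (1 + ~(A | B)); // largest common power-of-two factor
    }

    static unsigned newAllocaAlign(unsigned OldAlign, uint64_t PartitionOffset,
                                   unsigned PartitionTypeABIAlign) {
      unsigned Align = (unsigned)minAlign(OldAlign, PartitionOffset);
      return Align <= PartitionTypeABIAlign ? 0 : Align;
    }

    int main() {
      // 16-byte-aligned alloca, partition at offset 8 holding an i32 (ABI 4):
      assert(newAllocaAlign(16, 8, 4) == 8);
      // At offset 4 only 4-byte alignment survives, which the i32's ABI
      // alignment already provides, so leave the new alloca unconstrained.
      assert(newAllocaAlign(16, 4, 4) == 0);
      return 0;
    }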
@@ -2625,25 +3255,20 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
       TD->getTypeAllocSize(AI.getAllocatedType()) == 0)
     return false;

-  // First check if this is a non-aggregate type that we should simply promote.
-  if (!AI.getAllocatedType()->isAggregateType() && isAllocaPromotable(&AI)) {
-    DEBUG(dbgs() << " Trivially scalar type, queuing for promotion...\n");
-    PromotableAllocas.push_back(&AI);
-    return false;
-  }
+  bool Changed = false;
+
+  // First, split any FCA loads and stores touching this alloca to expose
+  // better splitting and promotion opportunities.
+  AggLoadStoreRewriter AggRewriter(*TD);
+  Changed |= AggRewriter.rewrite(AI);

   // Build the partition set using a recursive instruction-visiting builder.
   AllocaPartitioning P(*TD, AI);
   DEBUG(P.print(dbgs()));
   if (P.isEscaped())
-    return false;
-
-  // No partitions to split. Leave the dead alloca for a later pass to clean up.
-  if (P.begin() == P.end())
-    return false;
+    return Changed;

   // Delete all the dead users of this alloca before splitting and rewriting it.
-  bool Changed = false;
   for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(),
                                               DE = P.dead_user_end();
        DI != DE; ++DI) {
@@ -2664,6 +3289,10 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
     }
   }

+  // No partitions to split. Leave the dead alloca for a later pass to clean up.
+  if (P.begin() == P.end())
+    return Changed;
+
   return splitAlloca(AI, P) || Changed;
 }

@@ -2765,15 +3394,17 @@ namespace {
   const SetType &Set;

 public:
+  typedef AllocaInst *argument_type;
+
   IsAllocaInSet(const SetType &Set) : Set(Set) {}
-  bool operator()(AllocaInst *AI) { return Set.count(AI); }
+  bool operator()(AllocaInst *AI) const { return Set.count(AI); }
 };
 }

 bool SROA::runOnFunction(Function &F) {
   DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
   C = &F.getContext();
-  TD = getAnalysisIfAvailable<TargetData>();
+  TD = getAnalysisIfAvailable<DataLayout>();
   if (!TD) {
     DEBUG(dbgs() << " Skipping SROA -- no target data!\n");
     return false;
@@ -2791,19 +3422,29 @@ bool SROA::runOnFunction(Function &F) {
   // the list of promotable allocas.
   SmallPtrSet<AllocaInst *, 4> DeletedAllocas;

-  while (!Worklist.empty()) {
-    Changed |= runOnAlloca(*Worklist.pop_back_val());
-    deleteDeadInstructions(DeletedAllocas);
-    if (!DeletedAllocas.empty()) {
-      PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
-                                             PromotableAllocas.end(),
-                                             IsAllocaInSet(DeletedAllocas)),
-                              PromotableAllocas.end());
-      DeletedAllocas.clear();
+  do {
+    while (!Worklist.empty()) {
+      Changed |= runOnAlloca(*Worklist.pop_back_val());
+      deleteDeadInstructions(DeletedAllocas);
+
+      // Remove the deleted allocas from various lists so that we don't try to
+      // continue processing them.
+      if (!DeletedAllocas.empty()) {
+        Worklist.remove_if(IsAllocaInSet(DeletedAllocas));
+        PostPromotionWorklist.remove_if(IsAllocaInSet(DeletedAllocas));
+        PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
+                                               PromotableAllocas.end(),
+                                               IsAllocaInSet(DeletedAllocas)),
+                                PromotableAllocas.end());
+        DeletedAllocas.clear();
+      }
     }
-  }

-  Changed |= promoteAllocas(F);
+    Changed |= promoteAllocas(F);
+
+    Worklist = PostPromotionWorklist;
+    PostPromotionWorklist.clear();
+  } while (!Worklist.empty());

   return Changed;
 }
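The new driver loop is a two-level fixed point: drain the worklist, promote,
then swap in whatever the rewriter recorded on the post-promotion worklist and
repeat until no new allocas surface. A minimal skeleton of that shape, where
process() is a hypothetical stand-in for runOnAlloca:

    #include <vector>

    struct Driver {
      std::vector<int> Worklist, PostPromotionWorklist;

      bool process(int Item) {
        // Rewrite Item; may push follow-up work onto PostPromotionWorklist.
        (void)Item;
        return false;
      }

      bool run() {
        bool Changed = false;
        do {
          while (!Worklist.empty()) {
            int Item = Worklist.back();
            Worklist.pop_back();
            Changed |= process(Item);
          }
          // Promotion may expose new allocas recorded during rewriting.
          Worklist.swap(PostPromotionWorklist);
          PostPromotionWorklist.clear();
        } while (!Worklist.empty());
        return Changed;
      }
    };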