From eed919b1ba4c6186258b4beb951625703c1c568e Mon Sep 17 00:00:00 2001
From: Chris Lattner
Date: Mon, 21 Sep 2009 05:57:11 +0000
Subject: [PATCH] Improve GVN to be able to forward substitute a small load
 from a piece of a large store when both are in the same block.

This allows clang to compile the testcase in PR4216 to this code:

_test_bitfield:
	movl	4(%esp), %eax
	movl	%eax, %ecx
	andl	$-65536, %ecx
	orl	$32962, %eax
	andl	$40186, %eax
	orl	%ecx, %eax
	ret

This is not ideal, but is a whole lot better than the code produced by
llvm-gcc:

_test_bitfield:
	movw	$-32574, %ax
	orw	4(%esp), %ax
	andw	$-25350, %ax
	movw	%ax, 4(%esp)
	movw	7(%esp), %cx
	shlw	$8, %cx
	movzbl	6(%esp), %edx
	orw	%cx, %dx
	movzwl	%dx, %ecx
	shll	$16, %ecx
	movzwl	%ax, %eax
	orl	%ecx, %eax
	ret

and dramatically better than that produced by gcc 4.2:

_test_bitfield:
	pushl	%ebx
	call	L3
"L00000000001$pb":
L3:
	popl	%ebx
	movl	8(%esp), %eax
	leal	0(,%eax,4), %edx
	sarb	$7, %dl
	movl	%eax, %ecx
	andl	$7168, %ecx
	andl	$-7201, %ebx
	movzbl	%dl, %edx
	andl	$1, %edx
	sall	$5, %edx
	orl	%ecx, %ebx
	orl	%edx, %ebx
	andl	$24, %eax
	andl	$-58336, %ebx
	orl	%eax, %ebx
	orl	$32962, %ebx
	movl	%ebx, %eax
	popl	%ebx
	ret

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@82439 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/GVN.cpp | 177 ++++++++++++++++++++++++++++++++--
 test/Transforms/GVN/rle.ll    |  50 ++++++++++
 2 files changed, 220 insertions(+), 7 deletions(-)

diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 2ae19820299..5f953dfbb03 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Function.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/LLVMContext.h"
+#include "llvm/Operator.h"
 #include "llvm/Value.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DepthFirstIterator.h"
@@ -38,6 +39,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -1341,6 +1343,153 @@ bool GVN::processNonLocalLoad(LoadInst *LI,
   return true;
 }
 
+/// GetBaseWithConstantOffset - Analyze the specified pointer to see if it can
+/// be expressed as a base pointer plus a constant offset.  Return the base and
+/// offset to the caller.
+static Value *GetBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
+                                        const TargetData *TD) {
+  Operator *PtrOp = dyn_cast<Operator>(Ptr);
+  if (PtrOp == 0) return Ptr;
+
+  // Just look through bitcasts.
+  if (PtrOp->getOpcode() == Instruction::BitCast)
+    return GetBaseWithConstantOffset(PtrOp->getOperand(0), Offset, TD);
+
+  // If this is a GEP with constant indices, we can look through it.
+  GEPOperator *GEP = dyn_cast<GEPOperator>(PtrOp);
+  if (GEP == 0 || !GEP->hasAllConstantIndices()) return Ptr;
+
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (User::op_iterator I = GEP->idx_begin(), E = GEP->idx_end(); I != E;
+       ++I, ++GTI) {
+    ConstantInt *OpC = cast<ConstantInt>(*I);
+    if (OpC->isZero()) continue;
+
+    // Handle struct and array indices, which add their offset to the pointer.
+    if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+      Offset += TD->getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
+    } else {
+      uint64_t Size = TD->getTypeAllocSize(GTI.getIndexedType());
+      Offset += OpC->getSExtValue()*Size;
+    }
+  }
+
+  // Re-sign extend from the pointer size if needed to get overflow edge cases
+  // right.
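+  // For example (an illustrative aside): with 32-bit pointers, an accumulated
+  // Offset of 0x00000000FFFFFFFF really denotes the 32-bit value -1; shifting
+  // left by 32 and then arithmetic-shifting right by 32 sign-extends bit 31,
+  // so Offset becomes -1 rather than 4294967295.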
+  unsigned PtrSize = TD->getPointerSizeInBits();
+  if (PtrSize < 64)
+    Offset = (Offset << (64-PtrSize)) >> (64-PtrSize);
+
+  return GetBaseWithConstantOffset(GEP->getPointerOperand(), Offset, TD);
+}
+
+
+/// HandleLoadFromClobberingStore - This function is called when we have a
+/// memdep query of a load that ends up being a clobbering store.  This means
+/// that the store *may* provide bits used by the load but we can't be sure
+/// because the pointers don't mustalias.  Check this case to see if there is
+/// anything more we can do before we give up.
+static Value *HandleLoadFromClobberingStore(LoadInst *L, StoreInst *DepSI,
+                                            const TargetData *TD) {
+  int64_t StoreOffset = 0, LoadOffset = 0;
+  Value *StoreBase =
+    GetBaseWithConstantOffset(DepSI->getPointerOperand(), StoreOffset, TD);
+  Value *LoadBase =
+    GetBaseWithConstantOffset(L->getPointerOperand(), LoadOffset, TD);
+  if (StoreBase != LoadBase)
+    return 0;
+
+  // If the load and store are to the exact same address, they should have been
+  // a must alias.  AA must have gotten confused.
+  // FIXME: Study to see if/when this happens.
+  if (LoadOffset == StoreOffset) {
+#if 0
+    errs() << "STORE/LOAD DEP WITH COMMON POINTER MISSED:\n"
+           << "Base = " << *StoreBase << "\n"
+           << "Store Ptr = " << *DepSI->getPointerOperand() << "\n"
+           << "Store Offs = " << StoreOffset << " - " << *DepSI << "\n"
+           << "Load Ptr = " << *L->getPointerOperand() << "\n"
+           << "Load Offs = " << LoadOffset << " - " << *L << "\n\n";
+    errs() << "'" << L->getParent()->getParent()->getName() << "'"
+           << *L->getParent();
+#endif
+    return 0;
+  }
+
+  // If the load and store don't overlap at all, the store doesn't provide
+  // anything to the load.  In this case, they really don't alias at all, AA
+  // must have gotten confused.
+  // FIXME: Investigate cases where this bails out, e.g. rdar://7238614. Then
+  // remove this check, as it is duplicated with what we have below.
+  uint64_t StoreSize = TD->getTypeSizeInBits(DepSI->getOperand(0)->getType());
+  uint64_t LoadSize = TD->getTypeSizeInBits(L->getType());
+
+  if ((StoreSize & 7) | (LoadSize & 7))
+    return 0;
+  StoreSize >>= 3;  // Convert to bytes.
+  LoadSize >>= 3;
+
+
+  bool isAAFailure = false;
+  if (StoreOffset < LoadOffset) {
+    isAAFailure = StoreOffset+int64_t(StoreSize) <= LoadOffset;
+  } else {
+    isAAFailure = LoadOffset+int64_t(LoadSize) <= StoreOffset;
+  }
+  if (isAAFailure) {
+#if 0
+    errs() << "STORE LOAD DEP WITH COMMON BASE:\n"
+           << "Base = " << *StoreBase << "\n"
+           << "Store Ptr = " << *DepSI->getPointerOperand() << "\n"
+           << "Store Offs = " << StoreOffset << " - " << *DepSI << "\n"
+           << "Load Ptr = " << *L->getPointerOperand() << "\n"
+           << "Load Offs = " << LoadOffset << " - " << *L << "\n\n";
+    errs() << "'" << L->getParent()->getParent()->getName() << "'"
+           << *L->getParent();
+#endif
+    return 0;
+  }
+
+  // If the Load isn't completely contained within the stored bits, we don't
+  // have all the bits to feed it.  We could do something crazy in the future
+  // (issue a smaller load then merge the bits in) but this seems unlikely to be
+  // valuable.
+  if (StoreOffset > LoadOffset ||
+      StoreOffset+StoreSize < LoadOffset+LoadSize)
+    return 0;
+
+  // Adjust LoadOffset to be relative from the start of StoreOffset.
+  LoadOffset -= StoreOffset;
+
+  Value *SrcVal = DepSI->getOperand(0);
+
+  LLVMContext &Ctx = SrcVal->getType()->getContext();
+
+  // Compute which bits of the stored value are being used by the load.  Convert
+  // to an integer type to start with.
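+  // Worked example (illustrative only): for a "store i32" covering bytes 0-3
+  // and a "load i8" at byte offset 2, StoreSize is 4, LoadSize is 1 and
+  // LoadOffset is 2, so on a little-endian target the code below emits
+  // "lshr i32 %val, 16" followed by "trunc ... to i8"; on a big-endian target
+  // the shift amount would be (4-1-2)*8 = 8 instead.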
+  if (isa<PointerType>(SrcVal->getType()))
+    SrcVal = new PtrToIntInst(SrcVal, TD->getIntPtrType(Ctx), "tmp", L);
+  if (!isa<IntegerType>(SrcVal->getType()))
+    SrcVal = new BitCastInst(SrcVal, IntegerType::get(Ctx, StoreSize*8), "", L);
+
+  // Shift the bits to the least significant depending on endianness.
+  unsigned ShiftAmt;
+  if (TD->isLittleEndian()) {
+    ShiftAmt = LoadOffset*8;
+  } else {
+    ShiftAmt = (StoreSize-LoadSize-LoadOffset)*8;
+  }
+
+  SrcVal = BinaryOperator::CreateLShr(SrcVal,
+              ConstantInt::get(SrcVal->getType(), ShiftAmt), "tmp", L);
+
+  SrcVal = new TruncInst(SrcVal, IntegerType::get(Ctx, LoadSize*8), "", L);
+
+  return CoerceAvailableValueToLoadType(SrcVal, L->getType(), L, *TD);
+}
+
+
+
 /// processLoad - Attempt to eliminate a load, first by eliminating it
 /// locally, and then attempting non-local elimination if that fails.
 bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
@@ -1352,7 +1501,12 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
 
   // If the value isn't available, don't do anything!
   if (Dep.isClobber()) {
-    // FIXME: In the future, we should handle things like:
+    // FIXME: We should handle memset/memcpy/memmove as dependent instructions
+    // to forward the value if available.
+    //if (isa<MemIntrinsic>(Dep.getInst()))
+    //errs() << "LOAD DEPENDS ON MEM: " << *L << "\n" << *Dep.getInst()<<"\n\n";
+
+    // Check to see if we have something like this:
     //   store i32 123, i32* %P
     //   %A = bitcast i32* %P to i8*
     //   %B = gep i8* %A, i32 1
@@ -1362,6 +1516,21 @@
     // a common base + constant offset, and if the previous store (or memset)
     // completely covers this load.  This sort of thing can happen in bitfield
     // access code.
+    if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst()))
+      if (const TargetData *TD = getAnalysisIfAvailable<TargetData>())
+        if (Value *AvailVal = HandleLoadFromClobberingStore(L, DepSI, TD)) {
+          DEBUG(errs() << "GVN COERCED STORE BITS:\n" << *DepSI << '\n'
+                       << *AvailVal << '\n' << *L << "\n\n\n");
+
+          // Replace the load!
+          L->replaceAllUsesWith(AvailVal);
+          if (isa<PointerType>(AvailVal->getType()))
+            MD->invalidateCachedPointerInfo(AvailVal);
+          toErase.push_back(L);
+          NumGVNLoad++;
+          return true;
+        }
+
     DEBUG(
       // fast print dep, using operator<< on instruction would be too slow
       errs() << "GVN: load ";
@@ -1429,12 +1598,6 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
     return true;
   }
 
-  // FIXME: We should handle memset/memcpy/memmove as dependent instructions to
-  // forward the value if available.
-  //if (isa<MemIntrinsic>(DepInst))
-  //  errs() << "LOAD DEPENDS ON MEM: " << *L << "\n" << *DepInst << "\n\n";
-
-
   // If this load really doesn't depend on anything, then we must be loading an
   // undef value.  This can happen when loading for a fresh allocation with no
   // intervening stores, for example.
diff --git a/test/Transforms/GVN/rle.ll b/test/Transforms/GVN/rle.ll
index 503a5bbacd1..736a7e40ecc 100644
--- a/test/Transforms/GVN/rle.ll
+++ b/test/Transforms/GVN/rle.ll
@@ -141,6 +141,35 @@ Cont:
 ; CHECK: ret i8 %A
 }
 
+;; non-local i32/float -> i8 load forwarding.  This also tests that the "P3"
+;; bitcast equivalence can be properly phi translated.
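+;; The %T and %F arms store an i32 and a float to the same address, so
+;; forwarding the i8 reload in %Cont requires merging the two incoming values
+;; with a phi.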
+define i8 @coerce_mustalias_nonlocal1(i32* %P, i1 %cond) {
+  %P2 = bitcast i32* %P to float*
+  br i1 %cond, label %T, label %F
+T:
+  store i32 42, i32* %P
+  br label %Cont
+
+F:
+  store float 1.0, float* %P2
+  br label %Cont
+
+Cont:
+  %P3 = bitcast i32* %P to i8*
+  %A = load i8* %P3
+  ret i8 %A
+
+;; FIXME: This is disabled because it caused a miscompile in the llvm-gcc
+;; bootstrap, see r82411.
+;
+; HECK: @coerce_mustalias_nonlocal1
+; HECK: Cont:
+; HECK: %A = phi i8 [
+; HECK-NOT: load
+; HECK: ret i8 %A
+}
+
+
 ;; non-local i32 -> i8 partial redundancy load forwarding.
 define i8 @coerce_mustalias_pre0(i32* %P, i1 %cond) {
   %P3 = bitcast i32* %P to i8*
@@ -165,3 +194,24 @@ Cont:
 ; CHECK: ret i8 %A
 }
 
+;;===----------------------------------------------------------------------===;;
+;; Store -> Load and Load -> Load forwarding where src and dst are different
+;; types, and the reload is an offset from the store pointer.
+;;===----------------------------------------------------------------------===;;
+
+;; i32 -> i8 forwarding at a byte offset.
+define i8 @coerce_offset0(i32 %V, i32* %P) {
+  store i32 %V, i32* %P
+
+  %P2 = bitcast i32* %P to i8*
+  %P3 = getelementptr i8* %P2, i32 2
+
+  %A = load i8* %P3
+  ret i8 %A
+; CHECK: @coerce_offset0
+; CHECK-NOT: load
+; CHECK: ret i8
+}
+
+
+
-- 
2.34.1