From: Chris Lattner Date: Sat, 19 Feb 2011 19:31:39 +0000 (+0000) Subject: Implement rdar://9009151, transforming strided loop stores of X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=3a393728a62122d7009d8e2cbe52a221874e576a Implement rdar://9009151, transforming strided loop stores of unsplatable values into memset_pattern16 when it is available (recent darwins). This transforms lots of strided loop stores of ints for example, like 5 in vpr: Formed memset: call void @memset_pattern16(i8* %4, i8* getelementptr inbounds ([16 x i8]* @.memset_pattern9, i32 0, i32 0), i64 %tmp25) from store to: {%3,+,4}<%11> at: store i32 3, i32* %scevgep, align 4, !tbaa !4 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@126040 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index d15e58d0ff6..be3ff9258fc 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -39,6 +39,7 @@ #define DEBUG_TYPE "loop-idiom" #include "llvm/Transforms/Scalar.h" #include "llvm/IntrinsicInst.h" +#include "llvm/Module.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -76,11 +77,11 @@ namespace { bool processLoopStore(StoreInst *SI, const SCEV *BECount); bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); - bool processLoopStoreOfSplatValue(Value *DestPtr, unsigned StoreSize, - unsigned StoreAlignment, - Value *SplatValue, Instruction *TheStore, - const SCEVAddRecExpr *Ev, - const SCEV *BECount); + bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, + unsigned StoreAlignment, + Value *SplatValue, Instruction *TheStore, + const SCEVAddRecExpr *Ev, + const SCEV *BECount); bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, const SCEVAddRecExpr *StoreEv, const SCEVAddRecExpr *LoadEv, @@ -275,14 +276,11 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { // validity check in mayLoopAccessLocation to be updated though. if (Stride == 0 || StoreSize != Stride->getValue()->getValue()) return false; - - // If the stored value is a byte-wise value (like i32 -1), then it may be - // turned into a memset of i8 -1, assuming that all the consecutive bytes - // are stored. A store of i32 0x01020304 can never be turned into a memset. - if (Value *SplatValue = isBytewiseValue(StoredVal)) - if (processLoopStoreOfSplatValue(StorePtr, StoreSize, SI->getAlignment(), - SplatValue, SI, StoreEv, BECount)) - return true; + + // See if we can optimize just this store in isolation. + if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(), + StoredVal, SI, StoreEv, BECount)) + return true; // If the stored value is a strided load in the same loop with the same stride // this this may be transformable into a memcpy. This kicks in for stuff like @@ -333,9 +331,9 @@ processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { if (Stride == 0 || MSI->getLength() != Stride->getValue()) return false; - return processLoopStoreOfSplatValue(Pointer, (unsigned)SizeInBytes, - MSI->getAlignment(), MSI->getValue(), - MSI, Ev, BECount); + return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, + MSI->getAlignment(), MSI->getValue(), + MSI, Ev, BECount); } @@ -372,22 +370,97 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, return false; } -/// processLoopStoreOfSplatValue - We see a strided store of a memsetable value. -/// If we can transform this into a memset in the loop preheader, do so. -bool LoopIdiomRecognize:: -processLoopStoreOfSplatValue(Value *DestPtr, unsigned StoreSize, - unsigned StoreAlignment, Value *SplatValue, - Instruction *TheStore, - const SCEVAddRecExpr *Ev, const SCEV *BECount) { - // If we're not allowed to form memset, we fail. - if (!TLI->has(LibFunc::memset)) - return false; +/// getMemSetPatternValue - If a strided store of the specified value is safe to +/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should +/// be passed in. Otherwise, return null. +/// +/// Note that we don't ever attempt to use memset_pattern8 or 4, because these +/// just replicate their input array and then pass on to memset_pattern16. +static Constant *getMemSetPatternValue(Value *V, const TargetData &TD) { + // If the value isn't a constant, we can't promote it to being in a constant + // array. We could theoretically do a store to an alloca or something, but + // that doesn't seem worthwhile. + Constant *C = dyn_cast(V); + if (C == 0) return 0; + + // Only handle simple values that are a power of two bytes in size. + uint64_t Size = TD.getTypeSizeInBits(V->getType()); + if (Size == 0 || (Size & 7) || (Size & (Size-1))) + return 0; + + // Convert the constant to an integer type of the appropriate size so we can + // start hacking on it. + if (isa(V->getType())) + C = ConstantExpr::getPtrToInt(C, IntegerType::get(C->getContext(), Size)); + else if (isa(V->getType()) || V->getType()->isFloatingPointTy()) + C = ConstantExpr::getBitCast(C, IntegerType::get(C->getContext(), Size)); + else if (!isa(V->getType())) + return 0; // Unhandled type. + + // Convert to size in bytes. + Size /= 8; + + // If we couldn't fold this to an integer, we fail. We don't bother to handle + // relocatable expressions like the address of a global yet. + // FIXME! + ConstantInt *CI = dyn_cast(C); + if (CI == 0) return 0; + + APInt CVal = CI->getValue(); + + // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see + // if the top and bottom are the same. + if (Size > 16) return 0; + // If this is a big endian target (PPC) then we need to bswap. + if (TD.isBigEndian()) + CVal = CVal.byteSwap(); + + // Determine what each byte of the pattern value should be. + char Value[16]; + for (unsigned i = 0; i != 16; ++i) { + // Get the byte value we're indexing into. + unsigned CByte = i % Size; + Value[i] = (unsigned char)(CVal.getZExtValue() >> CByte); + } - // Verify that the stored value is loop invariant. If not, we can't promote - // the memset. - if (!CurLoop->isLoopInvariant(SplatValue)) + return ConstantArray::get(V->getContext(), StringRef(Value, 16), false); +} + + +/// processLoopStridedStore - We see a strided store of some value. If we can +/// transform this into a memset or memset_pattern in the loop preheader, do so. +bool LoopIdiomRecognize:: +processLoopStridedStore(Value *DestPtr, unsigned StoreSize, + unsigned StoreAlignment, Value *StoredVal, + Instruction *TheStore, const SCEVAddRecExpr *Ev, + const SCEV *BECount) { + + // If the stored value is a byte-wise value (like i32 -1), then it may be + // turned into a memset of i8 -1, assuming that all the consecutive bytes + // are stored. A store of i32 0x01020304 can never be turned into a memset, + // but it can be turned into memset_pattern if the target supports it. + Value *SplatValue = isBytewiseValue(StoredVal); + Constant *PatternValue = 0; + + // If we're allowed to form a memset, and the stored value would be acceptable + // for memset, use it. + if (SplatValue && TLI->has(LibFunc::memset) && + // Verify that the stored value is loop invariant. If not, we can't + // promote the memset. + CurLoop->isLoopInvariant(SplatValue)) { + // Keep and use SplatValue. + PatternValue = 0; + } else if (TLI->has(LibFunc::memset_pattern16) && + (PatternValue = getMemSetPatternValue(StoredVal, *TD))) { + // It looks like we can use PatternValue! + SplatValue = 0; + } else { + // Otherwise, this isn't an idiom we can transform. For example, we can't + // do anything with a 3-byte store, for example. return false; + } + // Okay, we have a strided store "p[i]" of a splattable value. We can turn // this into a memset in the loop preheader now if we want. However, this @@ -415,7 +488,7 @@ processLoopStoreOfSplatValue(Value *DestPtr, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. - const Type *IntPtr = TD->getIntPtrType(SplatValue->getContext()); + const Type *IntPtr = TD->getIntPtrType(DestPtr->getContext()); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), @@ -427,8 +500,28 @@ processLoopStoreOfSplatValue(Value *DestPtr, unsigned StoreSize, Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); - Value *NewCall = - Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment); + Value *NewCall; + if (SplatValue) + NewCall = Builder.CreateMemSet(BasePtr, SplatValue,NumBytes,StoreAlignment); + else { + Module *M = TheStore->getParent()->getParent()->getParent(); + Value *MSP = M->getOrInsertFunction("memset_pattern16", + Builder.getVoidTy(), + Builder.getInt8PtrTy(), + Builder.getInt8PtrTy(), IntPtr, + (void*)0); + + // Otherwise we should form a memset_pattern16. PatternValue is known to be + // an constant array of 16-bytes. Plop the value into a mergable global. + GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, + GlobalValue::InternalLinkage, + PatternValue, ".memset_pattern"); + GV->setUnnamedAddr(true); // Ok to merge these. + GV->setAlignment(16); + Value *PatternPtr = Builder.CreateConstInBoundsGEP2_32(GV, 0, 0, "pattern"); + + NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes); + } DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n" << " from store to: " << *Ev << " at: " << *TheStore << "\n"); diff --git a/test/Transforms/LoopIdiom/basic.ll b/test/Transforms/LoopIdiom/basic.ll index a94aaf18faa..ead2e6f11cd 100644 --- a/test/Transforms/LoopIdiom/basic.ll +++ b/test/Transforms/LoopIdiom/basic.ll @@ -273,3 +273,30 @@ for.end13: ; preds = %for.inc10 ; CHECK-NOT: store ; CHECK: ret void } + +; On darwin10 (which is the triple in this .ll file) this loop can be turned +; into a memset_pattern call. +; rdar://9009151 +define void @test11(i32* nocapture %P) nounwind ssp { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ] + %arrayidx = getelementptr i32* %P, i64 %indvar + store i32 1, i32* %arrayidx, align 4 + %indvar.next = add i64 %indvar, 1 + %exitcond = icmp eq i64 %indvar.next, 10000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +; CHECK: @test11 +; CHECK-NEXT: entry: +; CHECK-NEXT: bitcast +; CHECK-NEXT: memset_pattern +; CHECK-NOT: store +; CHECK: ret void +} + +