Implement PR8644: forwarding a memcpy value to a byval,

author Chris Lattner <sabre@nondot.org>

Sun, 21 Nov 2010 00:28:59 +0000 (00:28 +0000)

committer Chris Lattner <sabre@nondot.org>

Sun, 21 Nov 2010 00:28:59 +0000 (00:28 +0000)
author Chris Lattner <sabre@nondot.org>
Sun, 21 Nov 2010 00:28:59 +0000 (00:28 +0000)
committer Chris Lattner <sabre@nondot.org>
Sun, 21 Nov 2010 00:28:59 +0000 (00:28 +0000)
diff --git a/include/llvm/Support/CallSite.h b/include/llvm/Support/CallSite.h

index 571a04924d169a268d5cf01204a38d765f87f5ec..8a998a8cd0d15fc5b3e6e456bbf9420528adb33e 100644 (file)
--- a/include/llvm/Support/CallSite.h
+++ b/include/llvm/Support/CallSite.h
@@ -52,11 +52,6 @@ public:
    CallSiteBase(CallTy *CI) : I(CI, true) { assert(CI); }
    CallSiteBase(InvokeTy *II) : I(II, false) { assert(II); }
    CallSiteBase(ValTy *II) { *this = get(II); }
-  CallSiteBase(InstrTy *II) {
-    assert(II && "Null instruction given?");
-    *this = get(II);
-    assert(I.getPointer() && "Not a call?");
-  }
  protected:
    /// CallSiteBase::get - This static method is sort of like a constructor.  It
    /// will create an appropriate call site for a Call or Invoke instruction, but
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp

index 6f53c322ef5fc6fba9f87a0fdb17c766485d4b37..4f1cfd622f057ec6d8426c2454319e4e2ae9a1d3 100644 (file)
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -301,11 +301,13 @@ void MemsetRanges::addStore(int64_t Start, StoreInst *SI) {
  
  namespace {
    class MemCpyOpt : public FunctionPass {
+    MemoryDependenceAnalysis *MD;
      bool runOnFunction(Function &F);
    public:
      static char ID; // Pass identification, replacement for typeid
      MemCpyOpt() : FunctionPass(ID) {
        initializeMemCpyOptPass(*PassRegistry::getPassRegistry());
+      MD = 0;
      }
  
    private:
@@ -327,6 +329,7 @@ namespace {
                                uint64_t cpyLen, CallInst *C);
      bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
                                         uint64_t MSize);
+    bool processByValArgument(CallSite CS, unsigned ArgNo);
      bool iterateOnFunction(Function &F);
    };
    
@@ -359,9 +362,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
    // a memcpy.
    if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
      if (!LI->isVolatile() && LI->hasOneUse()) {
-      MemoryDependenceAnalysis &MD = getAnalysis<MemoryDependenceAnalysis>();
-
-      MemDepResult dep = MD.getDependency(LI);
+      MemDepResult dep = MD->getDependency(LI);
        CallInst *C = 0;
        if (dep.isClobber() && !isa<MemCpyInst>(dep.getInst()))
          C = dyn_cast<CallInst>(dep.getInst());
@@ -372,7 +373,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
                          LI->getPointerOperand()->stripPointerCasts(),
                          TD->getTypeStoreSize(SI->getOperand(0)->getType()), C);
          if (changed) {
-          MD.removeInstruction(SI);
+          MD->removeInstruction(SI);
            SI->eraseFromParent();
            LI->eraseFromParent();
            ++NumMemCpyInstr;
@@ -505,8 +506,8 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
      Value *C = CallInst::Create(MemSetF, Ops, Ops+5, "", InsertPt);
      DEBUG(dbgs() << "Replace stores:\n";
            for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i)
-            dbgs() << *Range.TheStores[i];
-          dbgs() << "With: " << *C); C=C;
+            dbgs() << *Range.TheStores[i] << '\n';
+          dbgs() << "With: " << *C << '\n'); C=C;
    
      // Don't invalidate the iterator
      BBI = BI;
@@ -657,11 +658,10 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
  
    // Drop any cached information about the call, because we may have changed
    // its dependence information by changing its parameter.
-  MemoryDependenceAnalysis &MD = getAnalysis<MemoryDependenceAnalysis>();
-  MD.removeInstruction(C);
+  MD->removeInstruction(C);
  
-  // Remove the memcpy
-  MD.removeInstruction(cpy);
+  // Remove the memcpy.
+  MD->removeInstruction(cpy);
    ++NumMemCpyInstr;
  
    return true;
@@ -675,7 +675,7 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
                                                uint64_t MSize) {
    // We can only transforms memcpy's where the dest of one is the source of the
    // other.
-  if (M->getSource() != MDep->getDest())
+  if (M->getSource() != MDep->getDest() || MDep->isVolatile())
      return false;
    
    // Second, the length of the memcpy's must be the same, or the preceeding one
@@ -684,27 +684,26 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
    if (!C1) return false;
    
    uint64_t DepSize = C1->getValue().getZExtValue();
-  
    if (DepSize < MSize)
      return false;
    
-  Intrinsic::ID ResultFn = Intrinsic::memcpy;
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
    
    // If the dest of the second might alias the source of the first, then the
    // source and dest might overlap.  We still want to eliminate the intermediate
    // value, but we have to generate a memmove instead of memcpy.
-  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+  Intrinsic::ID ResultFn = Intrinsic::memcpy;
    if (!AA.isNoAlias(M->getRawDest(), MSize, MDep->getRawSource(), DepSize))
      ResultFn = Intrinsic::memmove;
    
-  // If all checks passed, then we can transform these memcpy's
+  // If all checks passed, then we can transform M.
    const Type *ArgTys[3] = {
      M->getRawDest()->getType(),
      MDep->getRawSource()->getType(),
      M->getLength()->getType()
    };
    Function *MemCpyFun =
-    Intrinsic::getDeclaration(M->getParent()->getParent()->getParent(),
+    Intrinsic::getDeclaration(MDep->getParent()->getParent()->getParent(),
                                ResultFn, ArgTys, 3);
    
    // Make sure to use the lesser of the alignment of the source and the dest
@@ -717,29 +716,30 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
      M->getRawDest(),
      MDep->getRawSource(), 
      M->getLength(),
-    ConstantInt::get(Type::getInt32Ty(M->getContext()), Align), 
+    ConstantInt::get(Type::getInt32Ty(MemCpyFun->getContext()), Align), 
      M->getVolatileCst()
    };
    CallInst *C = CallInst::Create(MemCpyFun, Args, Args+5, "", M);
    
    
-  MemoryDependenceAnalysis &MD = getAnalysis<MemoryDependenceAnalysis>();
-
    // Verify that the copied-from memory doesn't change in between the two
    // transfers.  For example, in:
    //    memcpy(a <- b)
    //    *b = 42;
    //    memcpy(c <- a)
    // It would be invalid to transform the second memcpy into memcpy(c <- b).
-  MemDepResult NewDep = MD.getDependency(C);
+  //
+  // TODO: If the code between M and MDep is transparent to the destination "c",
+  // then we could still perform the xform by moving M up to the first memcpy.
+  MemDepResult NewDep = MD->getDependency(C);
    if (!NewDep.isClobber() || NewDep.getInst() != MDep) {
-    MD.removeInstruction(C);
+    MD->removeInstruction(C);
      C->eraseFromParent();
      return false;
    }
  
    // Otherwise we're good!  Nuke the instruction we're replacing.
-  MD.removeInstruction(M);
+  MD->removeInstruction(M);
    M->eraseFromParent();
    ++NumMemCpyInstr;
    return true;
@@ -752,25 +752,23 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
  /// circumstances). This allows later passes to remove the first memcpy
  /// altogether.
  bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
-  MemoryDependenceAnalysis &MD = getAnalysis<MemoryDependenceAnalysis>();
-
-  // We can only optimize statically-sized memcpy's.
-  ConstantInt *cpyLen = dyn_cast<ConstantInt>(M->getLength());
-  if (!cpyLen) return false;
+  // We can only optimize statically-sized memcpy's that are non-volatile.
+  ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
+  if (CopySize == 0 || M->isVolatile()) return false;
  
    // The are two possible optimizations we can do for memcpy:
    //   a) memcpy-memcpy xform which exposes redundance for DSE.
    //   b) call-memcpy xform for return slot optimization.
-  MemDepResult dep = MD.getDependency(M);
-  if (!dep.isClobber())
+  MemDepResult DepInfo = MD->getDependency(M);
+  if (!DepInfo.isClobber())
      return false;
    
-  if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(dep.getInst()))
-    return processMemCpyMemCpyDependence(M, MDep, cpyLen->getZExtValue());
+  if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst()))
+    return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue());
      
-  if (CallInst *C = dyn_cast<CallInst>(dep.getInst())) {
+  if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
      bool changed = performCallSlotOptzn(M, M->getDest(), M->getSource(),
-                                        cpyLen->getZExtValue(), C);
+                                        CopySize->getZExtValue(), C);
      if (changed) M->eraseFromParent();
      return changed;
    }
@@ -805,33 +803,116 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) {
  
    // MemDep may have over conservative information about this instruction, just
    // conservatively flush it from the cache.
-  getAnalysis<MemoryDependenceAnalysis>().removeInstruction(M);
+  MD->removeInstruction(M);
  
    ++NumMoveToCpy;
    return true;
  }
    
+/// processByValArgument - This is called on every byval argument in call sites.
+bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
+  TargetData *TD = getAnalysisIfAvailable<TargetData>();
+  if (!TD) return false;
  
-// MemCpyOpt::iterateOnFunction - Executes one iteration of GVN.
+  Value *ByValArg = CS.getArgument(ArgNo);
+  
+  // MemDep doesn't have a way to do a local query with a memory location.
+  // Instead, just insert a load and ask for its dependences.
+  LoadInst *TmpLoad = new LoadInst(ByValArg, "", CS.getInstruction());
+  MemDepResult DepInfo = MD->getDependency(TmpLoad);
+  
+  MD->removeInstruction(TmpLoad);
+  TmpLoad->eraseFromParent();
+  
+  if (!DepInfo.isClobber())
+    return false;
+
+  // If the byval argument isn't fed by a memcpy, ignore it.  If it is fed by
+  // a memcpy, see if we can byval from the source of the memcpy instead of the
+  // result.
+  MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
+  if (MDep == 0 || MDep->isVolatile() ||
+      ByValArg->stripPointerCasts() != MDep->getDest())
+    return false;
+  
+  // The length of the memcpy must be larger or equal to the size of the byval.
+  // must be larger than the following one.
+  ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
+  if (C1 == 0 ||
+      C1->getValue().getZExtValue() < TD->getTypeAllocSize(ByValArg->getType()))
+    return false;
+
+  // Get the alignment of the byval.  If it is greater than the memcpy, then we
+  // can't do the substitution.  If the call doesn't specify the alignment, then
+  // it is some target specific value that we can't know.
+  unsigned ByValAlign = CS.getParamAlignment(ArgNo+1);
+  if (ByValAlign == 0 || MDep->getAlignment() < ByValAlign)
+    return false;  
+  
+  // Verify that the copied-from memory doesn't change in between the memcpy and
+  // the byval call.
+  //    memcpy(a <- b)
+  //    *b = 42;
+  //    foo(*a)
+  // It would be invalid to transform the second memcpy into foo(*b).
+  Value *TmpCast = MDep->getSource();
+  if (MDep->getSource()->getType() != ByValArg->getType())
+    TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
+                              "tmpcast", CS.getInstruction());
+  Value *TmpVal =
+    UndefValue::get(cast<PointerType>(TmpCast->getType())->getElementType());
+  Instruction *TmpStore = new StoreInst(TmpVal, TmpCast, false,
+                                        CS.getInstruction());
+  DepInfo = MD->getDependency(TmpStore);
+  bool isUnsafe = !DepInfo.isClobber() || DepInfo.getInst() != MDep;
+  MD->removeInstruction(TmpStore);
+  TmpStore->eraseFromParent();
+  
+  if (isUnsafe) {
+    // Clean up the inserted cast instruction.
+    if (TmpCast != MDep->getSource())
+      cast<Instruction>(TmpCast)->eraseFromParent();
+    return false;
+  }
+
+  DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n"
+               << "  " << *MDep << "\n"
+               << "  " << *CS.getInstruction() << "\n");
+  
+  // Otherwise we're good!  Update the byval argument.
+  CS.setArgument(ArgNo, TmpCast);
+  ++NumMemCpyInstr;
+  return true;
+}
+
+/// iterateOnFunction - Executes one iteration of MemCpyOpt.
  bool MemCpyOpt::iterateOnFunction(Function &F) {
    bool MadeChange = false;
  
    // Walk all instruction in the function.
    for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
-    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
-         BI != BE;) {
+    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
        // Avoid invalidating the iterator.
        Instruction *I = BI++;
        
+      bool RepeatInstruction = false;
+      
        if (StoreInst *SI = dyn_cast<StoreInst>(I))
          MadeChange |= processStore(SI, BI);
-      else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
-        MadeChange |= processMemCpy(M);
-      else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I)) {
-        if (processMemMove(M)) {
-          --BI;         // Reprocess the new memcpy.
-          MadeChange = true;
-        }
+      else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I)) {
+        RepeatInstruction = processMemCpy(M);
+      } else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I)) {
+        RepeatInstruction = processMemMove(M);
+      } else if (CallSite CS = (Value*)I) {
+        for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
+          if (CS.paramHasAttr(i+1, Attribute::ByVal))
+            MadeChange |= processByValArgument(CS, i);
+      }
+
+      // Reprocess the instruction if desired.
+      if (RepeatInstruction) {
+        --BI;
+        MadeChange = true;
        }
      }
    }
@@ -844,12 +925,14 @@ bool MemCpyOpt::iterateOnFunction(Function &F) {
  //
  bool MemCpyOpt::runOnFunction(Function &F) {
    bool MadeChange = false;
+  MD = &getAnalysis<MemoryDependenceAnalysis>();
    while (1) {
      if (!iterateOnFunction(F))
        break;
      MadeChange = true;
    }
    
+  MD = 0;
    return MadeChange;
  }
  
diff --git a/test/Transforms/MemCpyOpt/memcpy.ll b/test/Transforms/MemCpyOpt/memcpy.ll

index 303c2fcceef324bbe4e49f88f3a8e4ba83b2e30e..7309319c46eb1bf16845122c2e710b61292d3a44 100644 (file)
--- a/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/test/Transforms/MemCpyOpt/memcpy.ll
@@ -62,3 +62,18 @@ define void @test3({ x86_fp80, x86_fp80 }* noalias sret %agg.result) nounwind  {
  ; CHECK-NEXT: call void @llvm.memcpy
  ; CHECK-NEXT: ret void
  }
+
+
+; PR8644
+define void @test4(i8 *%P) {
+  %A = alloca {i32, i32}
+  %a = bitcast {i32, i32}* %A to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %P, i64 8, i32 4, i1 false)
+  call void @test4a(i8* byval align 1 %a) 
+  ret void
+; CHECK: @test4
+; CHECK-NEXT: call void @test4a(
+}
+
+declare void @test4a(i8* byval align 1)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/Transforms/MemCpyOpt/sret.ll b/test/Transforms/MemCpyOpt/sret.ll

index d35ab910d7c23f41ea089b6a48ba5f81244e6330..ddfd0fd1fccef846d9c3f01f1792e37df8abe086 100644 (file)
--- a/test/Transforms/MemCpyOpt/sret.ll
+++ b/test/Transforms/MemCpyOpt/sret.ll
@@ -3,7 +3,7 @@
  target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
  target triple = "i686-apple-darwin9"
  
-define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret  %agg.result, { x86_fp80, x86_fp80 }* byval  %z) nounwind  {
+define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret  %agg.result, { x86_fp80, x86_fp80 }* byval  align 8 %z) nounwind  {
  entry:
         %iz = alloca { x86_fp80, x86_fp80 }             ; <{ x86_fp80, x86_fp80 }*> [#uses=3]
         %memtmp = alloca { x86_fp80, x86_fp80 }, align 16               ; <{ x86_fp80, x86_fp80 }*> [#uses=2]
@@ -16,7 +16,7 @@ entry:
         %tmp8 = load x86_fp80* %tmp7, align 16          ; <x86_fp80> [#uses=1]
         store x86_fp80 %tmp3, x86_fp80* %real, align 16
         store x86_fp80 %tmp8, x86_fp80* %tmp4, align 16
-       call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret  %memtmp, { x86_fp80, x86_fp80 }* byval  %iz ) nounwind 
+       call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret  %memtmp, { x86_fp80, x86_fp80 }* byval align 8 %iz ) nounwind 
         %memtmp14 = bitcast { x86_fp80, x86_fp80 }* %memtmp to i8*              ; <i8*> [#uses=1]
         %agg.result15 = bitcast { x86_fp80, x86_fp80 }* %agg.result to i8*              ; <i8*> [#uses=1]
         call void @llvm.memcpy.i32( i8* %agg.result15, i8* %memtmp14, i32 32, i32 16 )
author	Chris Lattner <sabre@nondot.org>
	Sun, 21 Nov 2010 00:28:59 +0000 (00:28 +0000)
committer	Chris Lattner <sabre@nondot.org>
	Sun, 21 Nov 2010 00:28:59 +0000 (00:28 +0000)
include/llvm/Support/CallSite.h		patch \| blob \| history
lib/Transforms/Scalar/MemCpyOptimizer.cpp		patch \| blob \| history
test/Transforms/MemCpyOpt/memcpy.ll		patch \| blob \| history
test/Transforms/MemCpyOpt/sret.ll		patch \| blob \| history