move isBytewiseValue out to ValueTracking.h/cpp

[oota-llvm.git] / lib / Transforms / Scalar / MemCpyOptimizer.cpp
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp

index afbed3741f4fc835596c1e4c9c809e7a1526a044..00ee14578573f9dc2f5a51eeb0e2542d07a8aabe 100644 (file)
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -14,73 +14,26 @@
  
  #define DEBUG_TYPE "memcpyopt"
  #include "llvm/Transforms/Scalar.h"
+#include "llvm/GlobalVariable.h"
  #include "llvm/IntrinsicInst.h"
  #include "llvm/Instructions.h"
-#include "llvm/LLVMContext.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/Statistic.h"
  #include "llvm/Analysis/Dominators.h"
  #include "llvm/Analysis/AliasAnalysis.h"
  #include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/raw_ostream.h"
  #include "llvm/Target/TargetData.h"
  #include <list>
  using namespace llvm;
  
  STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
  STATISTIC(NumMemSetInfer, "Number of memsets inferred");
-
-/// isBytewiseValue - If the specified value can be set by repeating the same
-/// byte in memory, return the i8 value that it is represented with.  This is
-/// true for all i8 values obviously, but is also true for i32 0, i32 -1,
-/// i16 0xF0F0, double 0.0 etc.  If the value can't be handled with a repeated
-/// byte store (e.g. i16 0x1234), return null.
-static Value *isBytewiseValue(Value *V, LLVMContext& Context) {
-  // All byte-wide stores are splatable, even of arbitrary variables.
-  if (V->getType() == Type::Int8Ty) return V;
-  
-  // Constant float and double values can be handled as integer values if the
-  // corresponding integer value is "byteable".  An important case is 0.0. 
-  if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
-    if (CFP->getType() == Type::FloatTy)
-      V = ConstantExpr::getBitCast(CFP, Type::Int32Ty);
-    if (CFP->getType() == Type::DoubleTy)
-      V = ConstantExpr::getBitCast(CFP, Type::Int64Ty);
-    // Don't handle long double formats, which have strange constraints.
-  }
-  
-  // We can handle constant integers that are power of two in size and a 
-  // multiple of 8 bits.
-  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
-    unsigned Width = CI->getBitWidth();
-    if (isPowerOf2_32(Width) && Width > 8) {
-      // We can handle this value if the recursive binary decomposition is the
-      // same at all levels.
-      APInt Val = CI->getValue();
-      APInt Val2;
-      while (Val.getBitWidth() != 8) {
-        unsigned NextWidth = Val.getBitWidth()/2;
-        Val2  = Val.lshr(NextWidth);
-        Val2.trunc(Val.getBitWidth()/2);
-        Val.trunc(Val.getBitWidth()/2);
-
-        // If the top/bottom halves aren't the same, reject it.
-        if (Val != Val2)
-          return 0;
-      }
-      return ConstantInt::get(Context, Val);
-    }
-  }
-  
-  // Conceptually, we could handle things like:
-  //   %a = zext i8 %X to i16
-  //   %b = shl i16 %a, 8
-  //   %c = or i16 %a, %b
-  // but until there is an example that actually needs this, it doesn't seem
-  // worth worrying about.
-  return 0;
-}
+STATISTIC(NumMoveToCpy,   "Number of memmoves converted to memcpy");
+STATISTIC(NumCpyToSet,    "Number of memcpys converted to memset");
  
  static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx,
                                    bool &VariableIdxFound, TargetData &TD) {
@@ -271,6 +224,7 @@ void MemsetRanges::addStore(int64_t Start, StoreInst *SI) {
    if (Start < I->Start) {
      I->Start = Start;
      I->StartPtr = SI->getPointerOperand();
+    I->Alignment = SI->getAlignment();
    }
      
    // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
@@ -295,12 +249,15 @@ void MemsetRanges::addStore(int64_t Start, StoreInst *SI) {
  //===----------------------------------------------------------------------===//
  
  namespace {
-
-  class VISIBILITY_HIDDEN MemCpyOpt : public FunctionPass {
+  class MemCpyOpt : public FunctionPass {
+    MemoryDependenceAnalysis *MD;
      bool runOnFunction(Function &F);
    public:
      static char ID; // Pass identification, replacement for typeid
-    MemCpyOpt() : FunctionPass(&ID) {}
+    MemCpyOpt() : FunctionPass(ID) {
+      initializeMemCpyOptPass(*PassRegistry::getPassRegistry());
+      MD = 0;
+    }
  
    private:
      // This transformation requires dominator postdominator info
@@ -309,16 +266,19 @@ namespace {
        AU.addRequired<DominatorTree>();
        AU.addRequired<MemoryDependenceAnalysis>();
        AU.addRequired<AliasAnalysis>();
-      AU.addRequired<TargetData>();
        AU.addPreserved<AliasAnalysis>();
        AU.addPreserved<MemoryDependenceAnalysis>();
-      AU.addPreserved<TargetData>();
      }
    
      // Helper fuctions
-    bool processStore(StoreInst *SI, BasicBlock::iterator& BBI);
-    bool processMemCpy(MemCpyInst* M);
-    bool performCallSlotOptzn(MemCpyInst* cpy, CallInst* C);
+    bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
+    bool processMemCpy(MemCpyInst *M);
+    bool processMemMove(MemMoveInst *M);
+    bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
+                              uint64_t cpyLen, CallInst *C);
+    bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
+                                       uint64_t MSize);
+    bool processByValArgument(CallSite CS, unsigned ArgNo);
      bool iterateOnFunction(Function &F);
    };
    
@@ -328,38 +288,70 @@ namespace {
  // createMemCpyOptPass - The public interface to this file...
  FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); }
  
-static RegisterPass<MemCpyOpt> X("memcpyopt",
-                                 "MemCpy Optimization");
-
-
+INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
+                    false, false)
  
  /// processStore - When GVN is scanning forward over instructions, we look for
  /// some other patterns to fold away.  In particular, this looks for stores to
  /// neighboring locations of memory.  If it sees enough consequtive ones
  /// (currently 4) it attempts to merge them together into a memcpy/memset.
-bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator& BBI) {
+bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
    if (SI->isVolatile()) return false;
    
+  TargetData *TD = getAnalysisIfAvailable<TargetData>();
+  if (!TD) return false;
+
+  // Detect cases where we're performing call slot forwarding, but
+  // happen to be using a load-store pair to implement it, rather than
+  // a memcpy.
+  if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
+    if (!LI->isVolatile() && LI->hasOneUse()) {
+      MemDepResult dep = MD->getDependency(LI);
+      CallInst *C = 0;
+      if (dep.isClobber() && !isa<MemCpyInst>(dep.getInst()))
+        C = dyn_cast<CallInst>(dep.getInst());
+      
+      if (C) {
+        bool changed = performCallSlotOptzn(LI,
+                        SI->getPointerOperand()->stripPointerCasts(), 
+                        LI->getPointerOperand()->stripPointerCasts(),
+                        TD->getTypeStoreSize(SI->getOperand(0)->getType()), C);
+        if (changed) {
+          MD->removeInstruction(SI);
+          SI->eraseFromParent();
+          LI->eraseFromParent();
+          ++NumMemCpyInstr;
+          return true;
+        }
+      }
+    }
+  }
+  
+  LLVMContext &Context = SI->getContext();
+
    // There are two cases that are interesting for this code to handle: memcpy
    // and memset.  Right now we only handle memset.
    
    // Ensure that the value being stored is something that can be memset'able a
    // byte at a time like "0" or "-1" or any width, as well as things like
    // 0xA0A0A0A0 and 0.0.
-  Value *ByteVal = isBytewiseValue(SI->getOperand(0), SI->getContext());
+  Value *ByteVal = isBytewiseValue(SI->getOperand(0));
    if (!ByteVal)
      return false;
  
-  TargetData &TD = getAnalysis<TargetData>();
    AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
-  LLVMContext &Context = SI->getContext();
    Module *M = SI->getParent()->getParent()->getParent();
  
    // Okay, so we now have a single store that can be splatable.  Scan to find
    // all subsequent stores of the same value to offset from the same pointer.
    // Join these together into ranges, so we can decide whether contiguous blocks
    // are stored.
-  MemsetRanges Ranges(TD);
+  MemsetRanges Ranges(*TD);
    
    Value *StartPtr = SI->getPointerOperand();
    
@@ -369,7 +361,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator& BBI) {
        // If the call is readnone, ignore it, otherwise bail out.  We don't even
        // allow readonly here because we don't want something like:
        // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
-      if (AA.getModRefBehavior(CallSite::get(BI)) ==
+      if (AA.getModRefBehavior(CallSite(BI)) ==
              AliasAnalysis::DoesNotAccessMemory)
          continue;
        
@@ -387,13 +379,12 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator& BBI) {
      if (NextStore->isVolatile()) break;
      
      // Check to see if this stored value is of the same byte-splattable value.
-    if (ByteVal != isBytewiseValue(NextStore->getOperand(0), 
-                                   NextStore->getContext()))
+    if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
        break;
  
      // Check to see if this store is to a constant offset from the start ptr.
      int64_t Offset;
-    if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, TD))
+    if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, *TD))
        break;
  
      Ranges.addStore(Offset, NextStore);
@@ -408,9 +399,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator& BBI) {
    // store as well.  We try to avoid this unless there is at least something
    // interesting as a small compile-time optimization.
    Ranges.addStore(0, SI);
-
    
-  Function *MemSetF = 0;
    
    // Now that we have full information about ranges, loop over the ranges and
    // emit memset's for anything big enough to be worthwhile.
@@ -422,7 +411,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator& BBI) {
      if (Range.TheStores.size() == 1) continue;
      
      // If it is profitable to lower this range to memset, do so now.
-    if (!Range.isProfitableToUseMemset(TD))
+    if (!Range.isProfitableToUseMemset(*TD))
        continue;
      
      // Otherwise, we do want to transform this!  Create a new memset.  We put
@@ -430,40 +419,51 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator& BBI) {
      // memset block.  This ensure that the memset is dominated by any addressing
      // instruction needed by the start of the block.
      BasicBlock::iterator InsertPt = BI;
-  
-    if (MemSetF == 0) {
-      const Type *Tys[] = {Type::Int64Ty};
-      MemSetF = Intrinsic::getDeclaration(M, Intrinsic::memset,
-                                          Tys, 1);
-   }
-    
+
      // Get the starting pointer of the block.
      StartPtr = Range.StartPtr;
-  
+
+    // Determine alignment
+    unsigned Alignment = Range.Alignment;
+    if (Alignment == 0) {
+      const Type *EltType = 
+         cast<PointerType>(StartPtr->getType())->getElementType();
+      Alignment = TD->getABITypeAlignment(EltType);
+    }
+
      // Cast the start ptr to be i8* as memset requires.
-    const Type *i8Ptr = Context.getPointerTypeUnqual(Type::Int8Ty);
-    if (StartPtr->getType() != i8Ptr)
+    const PointerType* StartPTy = cast<PointerType>(StartPtr->getType());
+    const PointerType *i8Ptr = Type::getInt8PtrTy(Context,
+                                                  StartPTy->getAddressSpace());
+    if (StartPTy!= i8Ptr)
        StartPtr = new BitCastInst(StartPtr, i8Ptr, StartPtr->getName(),
                                   InsertPt);
-  
+
      Value *Ops[] = {
        StartPtr, ByteVal,   // Start, value
        // size
-      ConstantInt::get(Type::Int64Ty, Range.End-Range.Start),
+      ConstantInt::get(Type::getInt64Ty(Context), Range.End-Range.Start),
        // align
-      ConstantInt::get(Type::Int32Ty, Range.Alignment)
+      ConstantInt::get(Type::getInt32Ty(Context), Alignment),
+      // volatile
+      ConstantInt::getFalse(Context),
      };
-    Value *C = CallInst::Create(MemSetF, Ops, Ops+4, "", InsertPt);
-    DEBUG(cerr << "Replace stores:\n";
+    const Type *Tys[] = { Ops[0]->getType(), Ops[2]->getType() };
+
+    Function *MemSetF = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys, 2);
+
+    Value *C = CallInst::Create(MemSetF, Ops, Ops+5, "", InsertPt);
+    DEBUG(dbgs() << "Replace stores:\n";
            for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i)
-            cerr << *Range.TheStores[i];
-          cerr << "With: " << *C); C=C;
+            dbgs() << *Range.TheStores[i] << '\n';
+          dbgs() << "With: " << *C << '\n'); (void)C;
    
      // Don't invalidate the iterator
      BBI = BI;
    
      // Zap all the stores.
-    for (SmallVector<StoreInst*, 16>::const_iterator SI = Range.TheStores.begin(),
+    for (SmallVector<StoreInst*, 16>::const_iterator
+         SI = Range.TheStores.begin(),
           SE = Range.TheStores.end(); SI != SE; ++SI)
        (*SI)->eraseFromParent();
      ++NumMemSetInfer;
@@ -477,7 +477,9 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator& BBI) {
  /// performCallSlotOptzn - takes a memcpy and a call that it depends on,
  /// and checks for the possibility of a call slot optimization by having
  /// the call write its result directly into the destination of the memcpy.
-bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
+bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
+                                     Value *cpyDest, Value *cpySrc,
+                                     uint64_t cpyLen, CallInst *C) {
    // The general transformation to keep in mind is
    //
    //   call @func(..., src, ...)
@@ -494,56 +496,49 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
  
    // Deliberately get the source and destination with bitcasts stripped away,
    // because we'll need to do type comparisons based on the underlying type.
-  Value* cpyDest = cpy->getDest();
-  Value* cpySrc = cpy->getSource();
-  CallSite CS = CallSite::get(C);
-
-  // We need to be able to reason about the size of the memcpy, so we require
-  // that it be a constant.
-  ConstantInt* cpyLength = dyn_cast<ConstantInt>(cpy->getLength());
-  if (!cpyLength)
-    return false;
+  CallSite CS(C);
  
    // Require that src be an alloca.  This simplifies the reasoning considerably.
-  AllocaInst* srcAlloca = dyn_cast<AllocaInst>(cpySrc);
+  AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
    if (!srcAlloca)
      return false;
  
    // Check that all of src is copied to dest.
-  TargetData& TD = getAnalysis<TargetData>();
+  TargetData *TD = getAnalysisIfAvailable<TargetData>();
+  if (!TD) return false;
  
-  ConstantInt* srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
+  ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
    if (!srcArraySize)
      return false;
  
-  uint64_t srcSize = TD.getTypeAllocSize(srcAlloca->getAllocatedType()) *
+  uint64_t srcSize = TD->getTypeAllocSize(srcAlloca->getAllocatedType()) *
      srcArraySize->getZExtValue();
  
-  if (cpyLength->getZExtValue() < srcSize)
+  if (cpyLen < srcSize)
      return false;
  
    // Check that accessing the first srcSize bytes of dest will not cause a
    // trap.  Otherwise the transform is invalid since it might cause a trap
    // to occur earlier than it otherwise would.
-  if (AllocaInst* A = dyn_cast<AllocaInst>(cpyDest)) {
+  if (AllocaInst *A = dyn_cast<AllocaInst>(cpyDest)) {
      // The destination is an alloca.  Check it is larger than srcSize.
-    ConstantInt* destArraySize = dyn_cast<ConstantInt>(A->getArraySize());
+    ConstantInt *destArraySize = dyn_cast<ConstantInt>(A->getArraySize());
      if (!destArraySize)
        return false;
  
-    uint64_t destSize = TD.getTypeAllocSize(A->getAllocatedType()) *
+    uint64_t destSize = TD->getTypeAllocSize(A->getAllocatedType()) *
        destArraySize->getZExtValue();
  
      if (destSize < srcSize)
        return false;
-  } else if (Argument* A = dyn_cast<Argument>(cpyDest)) {
+  } else if (Argument *A = dyn_cast<Argument>(cpyDest)) {
      // If the destination is an sret parameter then only accesses that are
      // outside of the returned struct type can trap.
      if (!A->hasStructRetAttr())
        return false;
  
-    const Type* StructTy = cast<PointerType>(A->getType())->getElementType();
-    uint64_t destSize = TD.getTypeAllocSize(StructTy);
+    const Type *StructTy = cast<PointerType>(A->getType())->getElementType();
+    uint64_t destSize = TD->getTypeAllocSize(StructTy);
  
      if (destSize < srcSize)
        return false;
@@ -558,14 +553,13 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
    SmallVector<User*, 8> srcUseList(srcAlloca->use_begin(),
                                     srcAlloca->use_end());
    while (!srcUseList.empty()) {
-    User* UI = srcUseList.back();
-    srcUseList.pop_back();
+    User *UI = srcUseList.pop_back_val();
  
      if (isa<BitCastInst>(UI)) {
        for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
             I != E; ++I)
          srcUseList.push_back(*I);
-    } else if (GetElementPtrInst* G = dyn_cast<GetElementPtrInst>(UI)) {
+    } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(UI)) {
        if (G->hasAllZeroIndices())
          for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
               I != E; ++I)
@@ -579,8 +573,8 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
  
    // Since we're changing the parameter to the callsite, we need to make sure
    // that what would be the new parameter dominates the callsite.
-  DominatorTree& DT = getAnalysis<DominatorTree>();
-  if (Instruction* cpyDestInst = dyn_cast<Instruction>(cpyDest))
+  DominatorTree &DT = getAnalysis<DominatorTree>();
+  if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest))
      if (!DT.dominates(cpyDestInst, C))
        return false;
  
@@ -588,8 +582,8 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
    // unexpected manner, for example via a global, which we deduce from
    // the use analysis, we also need to know that it does not sneakily
    // access dest.  We rely on AA to figure this out for us.
-  AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
-  if (AA.getModRefInfo(C, cpy->getRawDest(), srcSize) !=
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+  if (AA.getModRefInfo(C, cpyDest, srcSize) !=
        AliasAnalysis::NoModRef)
      return false;
  
@@ -601,11 +595,11 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
          cpyDest = CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
                                                cpyDest->getName(), C);
        changedArgument = true;
-      if (CS.getArgument(i)->getType() != cpyDest->getType())
-        CS.setArgument(i, CastInst::CreatePointerCast(cpyDest, 
-                       CS.getArgument(i)->getType(), cpyDest->getName(), C));
-      else
+      if (CS.getArgument(i)->getType() == cpyDest->getType())
          CS.setArgument(i, cpyDest);
+      else
+        CS.setArgument(i, CastInst::CreatePointerCast(cpyDest, 
+                          CS.getArgument(i)->getType(), cpyDest->getName(), C));
      }
  
    if (!changedArgument)
@@ -613,134 +607,300 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
  
    // Drop any cached information about the call, because we may have changed
    // its dependence information by changing its parameter.
-  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
-  MD.removeInstruction(C);
+  MD->removeInstruction(C);
  
-  // Remove the memcpy
-  MD.removeInstruction(cpy);
-  cpy->eraseFromParent();
-  NumMemCpyInstr++;
+  // Remove the memcpy.
+  MD->removeInstruction(cpy);
+  ++NumMemCpyInstr;
  
    return true;
  }
  
-/// processMemCpy - perform simplication of memcpy's.  If we have memcpy A which
-/// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be
-/// a memcpy from X to Z (or potentially a memmove, depending on circumstances).
-///  This allows later passes to remove the first memcpy altogether.
-bool MemCpyOpt::processMemCpy(MemCpyInst* M) {
-  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
-
-  // The are two possible optimizations we can do for memcpy:
-  //   a) memcpy-memcpy xform which exposes redundance for DSE
-  //   b) call-memcpy xform for return slot optimization
-  MemDepResult dep = MD.getDependency(M);
-  if (!dep.isClobber())
-    return false;
-  if (!isa<MemCpyInst>(dep.getInst())) {
-    if (CallInst* C = dyn_cast<CallInst>(dep.getInst()))
-      return performCallSlotOptzn(M, C);
+/// processMemCpyMemCpyDependence - We've found that the (upward scanning)
+/// memory dependence of memcpy 'M' is the memcpy 'MDep'.  Try to simplify M to
+/// copy from MDep's input if we can.  MSize is the size of M's copy.
+/// 
+bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
+                                              uint64_t MSize) {
+  // We can only transform memcpy's where the dest of one is the source of the
+  // other.
+  if (M->getSource() != MDep->getDest() || MDep->isVolatile())
      return false;
-  }
    
-  MemCpyInst* MDep = cast<MemCpyInst>(dep.getInst());
-  
-  // We can only transforms memcpy's where the dest of one is the source of the
-  // other
-  if (M->getSource() != MDep->getDest())
+  // If dep instruction is reading from our current input, then it is a noop
+  // transfer and substituting the input won't change this instruction.  Just
+  // ignore the input and let someone else zap MDep.  This handles cases like:
+  //    memcpy(a <- a)
+  //    memcpy(b <- a)
+  if (M->getSource() == MDep->getSource())
      return false;
    
    // Second, the length of the memcpy's must be the same, or the preceeding one
    // must be larger than the following one.
-  ConstantInt* C1 = dyn_cast<ConstantInt>(MDep->getLength());
-  ConstantInt* C2 = dyn_cast<ConstantInt>(M->getLength());
-  if (!C1 || !C2)
-    return false;
-  
-  uint64_t DepSize = C1->getValue().getZExtValue();
-  uint64_t CpySize = C2->getValue().getZExtValue();
+  ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
+  if (!C1) return false;
    
-  if (DepSize < CpySize)
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+
+  // Verify that the copied-from memory doesn't change in between the two
+  // transfers.  For example, in:
+  //    memcpy(a <- b)
+  //    *b = 42;
+  //    memcpy(c <- a)
+  // It would be invalid to transform the second memcpy into memcpy(c <- b).
+  //
+  // TODO: If the code between M and MDep is transparent to the destination "c",
+  // then we could still perform the xform by moving M up to the first memcpy.
+  //
+  // NOTE: This is conservative, it will stop on any read from the source loc,
+  // not just the defining memcpy.
+  MemDepResult SourceDep =
+    MD->getPointerDependencyFrom(AA.getLocationForSource(MDep),
+                                 false, M, M->getParent());
+  if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
      return false;
    
-  // Finally, we have to make sure that the dest of the second does not
-  // alias the source of the first
-  AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
-  if (AA.alias(M->getRawDest(), CpySize, MDep->getRawSource(), DepSize) !=
+  // If the dest of the second might alias the source of the first, then the
+  // source and dest might overlap.  We still want to eliminate the intermediate
+  // value, but we have to generate a memmove instead of memcpy.
+  Intrinsic::ID ResultFn = Intrinsic::memcpy;
+  if (AA.alias(AA.getLocationForDest(M), AA.getLocationForSource(MDep)) !=
        AliasAnalysis::NoAlias)
+    ResultFn = Intrinsic::memmove;
+  
+  // If all checks passed, then we can transform M.
+  const Type *ArgTys[3] = {
+    M->getRawDest()->getType(),
+    MDep->getRawSource()->getType(),
+    M->getLength()->getType()
+  };
+  Function *MemCpyFun =
+    Intrinsic::getDeclaration(MDep->getParent()->getParent()->getParent(),
+                              ResultFn, ArgTys, 3);
+  
+  // Make sure to use the lesser of the alignment of the source and the dest
+  // since we're changing where we're reading from, but don't want to increase
+  // the alignment past what can be read from or written to.
+  // TODO: Is this worth it if we're creating a less aligned memcpy? For
+  // example we could be moving from movaps -> movq on x86.
+  unsigned Align = std::min(MDep->getAlignment(), M->getAlignment());
+  Value *Args[5] = {
+    M->getRawDest(),
+    MDep->getRawSource(), 
+    M->getLength(),
+    ConstantInt::get(Type::getInt32Ty(MemCpyFun->getContext()), Align), 
+    M->getVolatileCst()
+  };
+  CallInst::Create(MemCpyFun, Args, Args+5, "", M);
+
+  // Remove the instruction we're replacing.
+  MD->removeInstruction(M);
+  M->eraseFromParent();
+  ++NumMemCpyInstr;
+  return true;
+}
+
+
+/// processMemCpy - perform simplification of memcpy's.  If we have memcpy A
+/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
+/// B to be a memcpy from X to Z (or potentially a memmove, depending on
+/// circumstances). This allows later passes to remove the first memcpy
+/// altogether.
+bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
+  // We can only optimize statically-sized memcpy's that are non-volatile.
+  ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
+  if (CopySize == 0 || M->isVolatile()) return false;
+
+  // If the source and destination of the memcpy are the same, then zap it.
+  if (M->getSource() == M->getDest()) {
+    MD->removeInstruction(M);
+    M->eraseFromParent();
      return false;
-  else if (AA.alias(M->getRawDest(), CpySize, M->getRawSource(), CpySize) !=
-           AliasAnalysis::NoAlias)
-    return false;
-  else if (AA.alias(MDep->getRawDest(), DepSize, MDep->getRawSource(), DepSize)
-           != AliasAnalysis::NoAlias)
+  }
+
+  // If copying from a constant, try to turn the memcpy into a memset.
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
+    if (GV->isConstant() && GV->hasDefinitiveInitializer())
+      if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) {
+        Value *Ops[] = {
+          M->getRawDest(), ByteVal,               // Start, value
+          CopySize,                               // Size
+          M->getAlignmentCst(),                   // Alignment
+          ConstantInt::getFalse(M->getContext()), // volatile
+        };
+        const Type *Tys[] = { Ops[0]->getType(), Ops[2]->getType() };
+        Module *Mod = M->getParent()->getParent()->getParent();
+        Function *MemSetF = Intrinsic::getDeclaration(Mod, Intrinsic::memset,
+                                                      Tys, 2);
+        CallInst::Create(MemSetF, Ops, Ops+5, "", M);
+        MD->removeInstruction(M);
+        M->eraseFromParent();
+        ++NumCpyToSet;
+        return true;
+      }
+
+  // There are two possible optimizations we can do for memcpy:
+  //   a) memcpy-memcpy xform which exposes redundancy for DSE.
+  //   b) call-memcpy xform for return slot optimization.
+  MemDepResult DepInfo = MD->getDependency(M);
+  if (!DepInfo.isClobber())
      return false;
    
-  // If all checks passed, then we can transform these memcpy's
-  const Type *Tys[1];
-  Tys[0] = M->getLength()->getType();
-  Function* MemCpyFun = Intrinsic::getDeclaration(
-                                 M->getParent()->getParent()->getParent(),
-                                 M->getIntrinsicID(), Tys, 1);
+  if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst()))
+    return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue());
      
-  Value *Args[4] = {
-    M->getRawDest(), MDep->getRawSource(), M->getLength(), M->getAlignmentCst()
-  };
+  if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
+    if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
+                             CopySize->getZExtValue(), C)) {
+      M->eraseFromParent();
+      return true;
+    }
+  }
+  return false;
+}
+
+/// processMemMove - Transforms memmove calls to memcpy calls when the src/dst
+/// are guaranteed not to alias.
+bool MemCpyOpt::processMemMove(MemMoveInst *M) {
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+
+  // See if the pointers alias.
+  if (AA.alias(AA.getLocationForDest(M),
+               AA.getLocationForSource(M)) !=
+      AliasAnalysis::NoAlias)
+    return false;
    
-  CallInst* C = CallInst::Create(MemCpyFun, Args, Args+4, "", M);
+  DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n");
    
+  // If not, then we know we can transform this.
+  Module *Mod = M->getParent()->getParent()->getParent();
+  const Type *ArgTys[3] = { M->getRawDest()->getType(),
+                            M->getRawSource()->getType(),
+                            M->getLength()->getType() };
+  M->setCalledFunction(Intrinsic::getDeclaration(Mod, Intrinsic::memcpy,
+                                                 ArgTys, 3));
+
+  // MemDep may have overly conservative information about this instruction, so
+  // just flush it from the cache.
+  MD->removeInstruction(M);
+
+  ++NumMoveToCpy;
+  return true;
+}
    
-  // If C and M don't interfere, then this is a valid transformation.  If they
-  // did, this would mean that the two sources overlap, which would be bad.
-  if (MD.getDependency(C) == dep) {
-    MD.removeInstruction(M);
-    M->eraseFromParent();
-    NumMemCpyInstr++;
-    return true;
-  }
+/// processByValArgument - This is called on every byval argument in call sites.
+bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
+  TargetData *TD = getAnalysisIfAvailable<TargetData>();
+  if (!TD) return false;
+
+  // Find out what feeds this byval argument.
+  Value *ByValArg = CS.getArgument(ArgNo);
+  const Type *ByValTy =cast<PointerType>(ByValArg->getType())->getElementType();
+  uint64_t ByValSize = TD->getTypeAllocSize(ByValTy);
+  MemDepResult DepInfo =
+    MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize),
+                                 true, CS.getInstruction(),
+                                 CS.getInstruction()->getParent());
+  if (!DepInfo.isClobber())
+    return false;
+
+  // If the byval argument isn't fed by a memcpy, ignore it.  If it is fed by
+  // a memcpy, see if we can byval from the source of the memcpy instead of the
+  // result.
+  MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
+  if (MDep == 0 || MDep->isVolatile() ||
+      ByValArg->stripPointerCasts() != MDep->getDest())
+    return false;
    
-  // Otherwise, there was no point in doing this, so we remove the call we
-  // inserted and act like nothing happened.
-  MD.removeInstruction(C);
-  C->eraseFromParent();
-  return false;
-}
+  // The memcpy length must be greater than or equal to the size of the byval.
+  ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
+  if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize)
+    return false;
  
-// MemCpyOpt::runOnFunction - This is the main transformation entry point for a
-// function.
-//
-bool MemCpyOpt::runOnFunction(Function& F) {
+  // Get the alignment of the byval.  If it is greater than the memcpy, then we
+  // can't do the substitution.  If the call doesn't specify the alignment, then
+  // it is some target specific value that we can't know.
+  unsigned ByValAlign = CS.getParamAlignment(ArgNo+1);
+  if (ByValAlign == 0 || MDep->getAlignment() < ByValAlign)
+    return false;  
+  
+  // Verify that the copied-from memory doesn't change in between the memcpy and
+  // the byval call.
+  //    memcpy(a <- b)
+  //    *b = 42;
+  //    foo(*a)
+  // It would be invalid to transform the second memcpy into foo(*b).
+  //
+  // NOTE: This is conservative, it will stop on any read from the source loc,
+  // not just the defining memcpy.
+  MemDepResult SourceDep =
+    MD->getPointerDependencyFrom(AliasAnalysis::getLocationForSource(MDep),
+                                 false, CS.getInstruction(), MDep->getParent());
+  if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
+    return false;
    
-  bool changed = false;
-  bool shouldContinue = true;
+  Value *TmpCast = MDep->getSource();
+  if (MDep->getSource()->getType() != ByValArg->getType())
+    TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
+                              "tmpcast", CS.getInstruction());
    
-  while (shouldContinue) {
-    shouldContinue = iterateOnFunction(F);
-    changed |= shouldContinue;
-  }
+  DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n"
+               << "  " << *MDep << "\n"
+               << "  " << *CS.getInstruction() << "\n");
    
-  return changed;
+  // Otherwise we're good!  Update the byval argument.
+  CS.setArgument(ArgNo, TmpCast);
+  ++NumMemCpyInstr;
+  return true;
  }
  
-
-// MemCpyOpt::iterateOnFunction - Executes one iteration of GVN
+/// iterateOnFunction - One MemCpyOpt sweep over F; returns true on any change.
  bool MemCpyOpt::iterateOnFunction(Function &F) {
-  bool changed_function = false;
+  bool MadeChange = false;
  
-  // Walk all instruction in the function
+  // Walk all instructions in the function.
    for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
-    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
-         BI != BE;) {
-      // Avoid invalidating the iterator
-      Instruction* I = BI++;
+    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+      // Advance past I first so BI stays valid whatever happens to I.
+      Instruction *I = BI++;
+      
+      bool RepeatInstruction = false;
        
        if (StoreInst *SI = dyn_cast<StoreInst>(I))
-        changed_function |= processStore(SI, BI);
-      else if (MemCpyInst* M = dyn_cast<MemCpyInst>(I)) {
-        changed_function |= processMemCpy(M);
+        MadeChange |= processStore(SI, BI);
+      else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I)) {
+        RepeatInstruction = processMemCpy(M);
+      } else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I)) {
+        RepeatInstruction = processMemMove(M);
+      } else if (CallSite CS = (Value*)I) {
+        for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
+          if (CS.paramHasAttr(i+1, Attribute::ByVal))
+            MadeChange |= processByValArgument(CS, i);
+      }
+
+      // Reprocess if desired; BI is begin() when a leading I was removed.
+      if (RepeatInstruction) {
+        if (BI != BB->begin()) --BI;
+        MadeChange = true;
        }
      }
    }
    
-  return changed_function;
+  return MadeChange;
+}
+
+/// runOnFunction - Main transformation entry point for a single function.
+/// Re-runs iterateOnFunction until a sweep reports no change; returns true if
+/// any sweep changed the function.
+bool MemCpyOpt::runOnFunction(Function &F) {
+  // Publish the dependence analysis through MD for the processing methods.
+  MD = &getAnalysis<MemoryDependenceAnalysis>();
+
+  bool Changed = false;
+  while (iterateOnFunction(F))
+    Changed = true;
+
+  // Reset MD so no analysis pointer is kept once this run is over.
+  MD = 0;
+  return Changed;
  }