Revert r116831 and r116839, which are breaking selfhost builds.

[oota-llvm.git] / lib / Transforms / Scalar / MemCpyOptimizer.cpp
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp

index 30cdeee8262b3ba575a6eadb316f94bb960e4ef2..d4a9171e85899654e1d260d82756894097c51ba1 100644 (file)
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -42,7 +42,7 @@ static Value *isBytewiseValue(Value *V) {
    LLVMContext &Context = V->getContext();
    
    // All byte-wide stores are splatable, even of arbitrary variables.
-  if (V->getType() == Type::getInt8Ty(Context)) return V;
+  if (V->getType()->isIntegerTy(8)) return V;
    
    // Constant float and double values can be handled as integer values if the
    // corresponding integer value is "byteable".  An important case is 0.0. 
@@ -304,7 +304,9 @@ namespace {
      bool runOnFunction(Function &F);
    public:
      static char ID; // Pass identification, replacement for typeid
-    MemCpyOpt() : FunctionPass(&ID) {}
+    MemCpyOpt() : FunctionPass(ID) {
+      initializeMemCpyOptPass(*PassRegistry::getPassRegistry());
+    }
  
    private:
      // This transformation requires dominator postdominator info
@@ -321,7 +323,8 @@ namespace {
      bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
      bool processMemCpy(MemCpyInst *M);
      bool processMemMove(MemMoveInst *M);
-    bool performCallSlotOptzn(MemCpyInst *cpy, CallInst *C);
+    bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
+                              uint64_t cpyLen, CallInst *C);
      bool iterateOnFunction(Function &F);
    };
    
@@ -331,10 +334,13 @@ namespace {
  // createMemCpyOptPass - The public interface to this file...
  FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); }
  
-static RegisterPass<MemCpyOpt> X("memcpyopt",
-                                 "MemCpy Optimization");
-
-
+INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
+                    false, false)
  
  /// processStore - When GVN is scanning forward over instructions, we look for
  /// some other patterns to fold away.  In particular, this looks for stores to
@@ -343,6 +349,37 @@ static RegisterPass<MemCpyOpt> X("memcpyopt",
  bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
    if (SI->isVolatile()) return false;
    
+  TargetData *TD = getAnalysisIfAvailable<TargetData>();
+  if (!TD) return false;
+
+  // Detect cases where we're performing call slot forwarding, but
+  // happen to be using a load-store pair to implement it, rather than
+  // a memcpy.
+  if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
+    if (!LI->isVolatile() && LI->hasOneUse()) {
+      MemoryDependenceAnalysis &MD = getAnalysis<MemoryDependenceAnalysis>();
+
+      MemDepResult dep = MD.getDependency(LI);
+      CallInst *C = 0;
+      if (dep.isClobber() && !isa<MemCpyInst>(dep.getInst()))
+        C = dyn_cast<CallInst>(dep.getInst());
+      
+      if (C) {
+        bool changed = performCallSlotOptzn(LI,
+                        SI->getPointerOperand()->stripPointerCasts(), 
+                        LI->getPointerOperand()->stripPointerCasts(),
+                        TD->getTypeStoreSize(SI->getOperand(0)->getType()), C);
+        if (changed) {
+          MD.removeInstruction(SI);
+          SI->eraseFromParent();
+          LI->eraseFromParent();
+          ++NumMemCpyInstr;
+          return true;
+        }
+      }
+    }
+  }
+  
    LLVMContext &Context = SI->getContext();
  
    // There are two cases that are interesting for this code to handle: memcpy
@@ -355,8 +392,6 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
    if (!ByteVal)
      return false;
  
-  TargetData *TD = getAnalysisIfAvailable<TargetData>();
-  if (!TD) return false;
    AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
    Module *M = SI->getParent()->getParent()->getParent();
  
@@ -374,7 +409,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
        // If the call is readnone, ignore it, otherwise bail out.  We don't even
        // allow readonly here because we don't want something like:
        // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
-      if (AA.getModRefBehavior(CallSite::get(BI)) ==
+      if (AA.getModRefBehavior(CallSite(BI)) ==
              AliasAnalysis::DoesNotAccessMemory)
          continue;
        
@@ -413,7 +448,6 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
    // interesting as a small compile-time optimization.
    Ranges.addStore(0, SI);
    
-  Function *MemSetF = 0;
    
    // Now that we have full information about ranges, loop over the ranges and
    // emit memset's for anything big enough to be worthwhile.
@@ -433,33 +467,44 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
      // memset block.  This ensures that the memset is dominated by any addressing
      // instruction needed by the start of the block.
      BasicBlock::iterator InsertPt = BI;
-  
-    if (MemSetF == 0) {
-      const Type *Ty = Type::getInt64Ty(Context);
-      MemSetF = Intrinsic::getDeclaration(M, Intrinsic::memset, &Ty, 1);
-    }
-    
+
      // Get the starting pointer of the block.
      StartPtr = Range.StartPtr;
-  
+
+    // Determine alignment
+    unsigned Alignment = Range.Alignment;
+    if (Alignment == 0) {
+      const Type *EltType = 
+         cast<PointerType>(StartPtr->getType())->getElementType();
+      Alignment = TD->getABITypeAlignment(EltType);
+    }
+
      // Cast the start ptr to be i8* as memset requires.
-    const Type *i8Ptr = PointerType::getUnqual(Type::getInt8Ty(Context));
-    if (StartPtr->getType() != i8Ptr)
+    const PointerType* StartPTy = cast<PointerType>(StartPtr->getType());
+    const PointerType *i8Ptr = Type::getInt8PtrTy(Context,
+                                                  StartPTy->getAddressSpace());
+    if (StartPTy!= i8Ptr)
        StartPtr = new BitCastInst(StartPtr, i8Ptr, StartPtr->getName(),
                                   InsertPt);
-  
+
      Value *Ops[] = {
        StartPtr, ByteVal,   // Start, value
        // size
        ConstantInt::get(Type::getInt64Ty(Context), Range.End-Range.Start),
        // align
-      ConstantInt::get(Type::getInt32Ty(Context), Range.Alignment)
+      ConstantInt::get(Type::getInt32Ty(Context), Alignment),
+      // volatile
+      ConstantInt::get(Type::getInt1Ty(Context), 0),
      };
-    Value *C = CallInst::Create(MemSetF, Ops, Ops+4, "", InsertPt);
-    DEBUG(errs() << "Replace stores:\n";
+    const Type *Tys[] = { Ops[0]->getType(), Ops[2]->getType() };
+
+    Function *MemSetF = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys, 2);
+
+    Value *C = CallInst::Create(MemSetF, Ops, Ops+5, "", InsertPt);
+    DEBUG(dbgs() << "Replace stores:\n";
            for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i)
-            errs() << *Range.TheStores[i];
-          errs() << "With: " << *C); C=C;
+            dbgs() << *Range.TheStores[i];
+          dbgs() << "With: " << *C); C=C;
    
      // Don't invalidate the iterator
      BBI = BI;
@@ -480,7 +525,9 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
  /// performCallSlotOptzn - takes a memcpy and a call that it depends on,
  /// and checks for the possibility of a call slot optimization by having
  /// the call write its result directly into the destination of the memcpy.
-bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
+bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
+                                     Value *cpyDest, Value *cpySrc,
+                                     uint64_t cpyLen, CallInst *C) {
    // The general transformation to keep in mind is
    //
    //   call @func(..., src, ...)
@@ -497,15 +544,7 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
  
    // Deliberately get the source and destination with bitcasts stripped away,
    // because we'll need to do type comparisons based on the underlying type.
-  Value *cpyDest = cpy->getDest();
-  Value *cpySrc = cpy->getSource();
-  CallSite CS = CallSite::get(C);
-
-  // We need to be able to reason about the size of the memcpy, so we require
-  // that it be a constant.
-  ConstantInt *cpyLength = dyn_cast<ConstantInt>(cpy->getLength());
-  if (!cpyLength)
-    return false;
+  CallSite CS(C);
  
    // Require that src be an alloca.  This simplifies the reasoning considerably.
    AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
@@ -523,7 +562,7 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
    uint64_t srcSize = TD->getTypeAllocSize(srcAlloca->getAllocatedType()) *
      srcArraySize->getZExtValue();
  
-  if (cpyLength->getZExtValue() < srcSize)
+  if (cpyLen < srcSize)
      return false;
  
    // Check that accessing the first srcSize bytes of dest will not cause a
@@ -562,8 +601,7 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
    SmallVector<User*, 8> srcUseList(srcAlloca->use_begin(),
                                     srcAlloca->use_end());
    while (!srcUseList.empty()) {
-    User *UI = srcUseList.back();
-    srcUseList.pop_back();
+    User *UI = srcUseList.pop_back_val();
  
      if (isa<BitCastInst>(UI)) {
        for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
@@ -593,7 +631,7 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
    // the use analysis, we also need to know that it does not sneakily
    // access dest.  We rely on AA to figure this out for us.
    AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
-  if (AA.getModRefInfo(C, cpy->getRawDest(), srcSize) !=
+  if (AA.getModRefInfo(C, cpyDest, srcSize) !=
        AliasAnalysis::NoModRef)
      return false;
  
@@ -622,19 +660,23 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
  
    // Remove the memcpy
    MD.removeInstruction(cpy);
-  cpy->eraseFromParent();
-  NumMemCpyInstr++;
+  ++NumMemCpyInstr;
  
    return true;
  }
  
-/// processMemCpy - perform simplication of memcpy's.  If we have memcpy A which
-/// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be
-/// a memcpy from X to Z (or potentially a memmove, depending on circumstances).
-///  This allows later passes to remove the first memcpy altogether.
+/// processMemCpy - perform simplification of memcpy's.  If we have memcpy A
+/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
+/// B to be a memcpy from X to Z (or potentially a memmove, depending on
+/// circumstances). This allows later passes to remove the first memcpy
+/// altogether.
  bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
    MemoryDependenceAnalysis &MD = getAnalysis<MemoryDependenceAnalysis>();
  
+  // We can only optimize statically-sized memcpy's.
+  ConstantInt *cpyLen = dyn_cast<ConstantInt>(M->getLength());
+  if (!cpyLen) return false;
+
    // There are two possible optimizations we can do for memcpy:
    //   a) memcpy-memcpy xform which exposes redundance for DSE.
    //   b) call-memcpy xform for return slot optimization.
@@ -642,8 +684,12 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
    if (!dep.isClobber())
      return false;
    if (!isa<MemCpyInst>(dep.getInst())) {
-    if (CallInst *C = dyn_cast<CallInst>(dep.getInst()))
-      return performCallSlotOptzn(M, C);
+    if (CallInst *C = dyn_cast<CallInst>(dep.getInst())) {
+      bool changed = performCallSlotOptzn(M, M->getDest(), M->getSource(),
+                                  cpyLen->getZExtValue(), C);
+      if (changed) M->eraseFromParent();
+      return changed;
+    }
      return false;
    }
    
@@ -681,24 +727,34 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
      return false;
    
    // If all checks passed, then we can transform these memcpy's
-  const Type *Ty = M->getLength()->getType();
+  const Type *ArgTys[3] = { M->getRawDest()->getType(),
+                            MDep->getRawSource()->getType(),
+                            M->getLength()->getType() };
    Function *MemCpyFun = Intrinsic::getDeclaration(
                                   M->getParent()->getParent()->getParent(),
-                                 M->getIntrinsicID(), &Ty, 1);
+                                 M->getIntrinsicID(), ArgTys, 3);
      
-  Value *Args[4] = {
-    M->getRawDest(), MDep->getRawSource(), M->getLength(), M->getAlignmentCst()
+  // Make sure to use the lesser of the alignment of the source and the dest
+  // since we're changing where we're reading from, but don't want to increase
+  // the alignment past what can be read from or written to.
+  // TODO: Is this worth it if we're creating a less aligned memcpy? For
+  // example we could be moving from movaps -> movq on x86.
+  unsigned Align = std::min(MDep->getAlignmentCst()->getZExtValue(),
+                            M->getAlignmentCst()->getZExtValue());
+  LLVMContext &Context = M->getContext();
+  ConstantInt *AlignCI = ConstantInt::get(Type::getInt32Ty(Context), Align);
+  Value *Args[5] = {
+    M->getRawDest(), MDep->getRawSource(), M->getLength(),
+    AlignCI, M->getVolatileCst()
    };
-  
-  CallInst *C = CallInst::Create(MemCpyFun, Args, Args+4, "", M);
-  
+  CallInst *C = CallInst::Create(MemCpyFun, Args, Args+5, "", M);
    
    // If C and M don't interfere, then this is a valid transformation.  If they
    // did, this would mean that the two sources overlap, which would be bad.
    if (MD.getDependency(C) == dep) {
      MD.removeInstruction(M);
      M->eraseFromParent();
-    NumMemCpyInstr++;
+    ++NumMemCpyInstr;
      return true;
    }
    
@@ -716,7 +772,7 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) {
  
    // If the memmove is a constant size, use it for the alias query, this allows
    // us to optimize things like: memmove(P, P+64, 64);
-  uint64_t MemMoveSize = ~0ULL;
+  unsigned MemMoveSize = AliasAnalysis::UnknownSize;
    if (ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength()))
      MemMoveSize = Len->getZExtValue();
    
@@ -725,12 +781,15 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) {
        AliasAnalysis::NoAlias)
      return false;
    
-  DEBUG(errs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n");
+  DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n");
    
    // If not, then we know we can transform this.
    Module *Mod = M->getParent()->getParent()->getParent();
-  const Type *Ty = M->getLength()->getType();
-  M->setOperand(0, Intrinsic::getDeclaration(Mod, Intrinsic::memcpy, &Ty, 1));
+  const Type *ArgTys[3] = { M->getRawDest()->getType(),
+                            M->getRawSource()->getType(),
+                            M->getLength()->getType() };
+  M->setCalledFunction(Intrinsic::getDeclaration(Mod, Intrinsic::memcpy,
+                                                 ArgTys, 3));
  
    // MemDep may have over-conservative information about this instruction, just
    // conservatively flush it from the cache.