lib/Transforms/Scalar/MemCpyOptimizer.cpp

   1 //===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This pass performs various transformations related to eliminating memcpy
  11 // calls, or transforming sets of stores into memset's.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "llvm/Transforms/Scalar.h"
  16 #include "llvm/ADT/SmallVector.h"
  17 #include "llvm/ADT/Statistic.h"
  18 #include "llvm/Analysis/AliasAnalysis.h"
  19 #include "llvm/Analysis/AssumptionCache.h"
  20 #include "llvm/Analysis/GlobalsModRef.h"
  21 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
  22 #include "llvm/Analysis/TargetLibraryInfo.h"
  23 #include "llvm/Analysis/ValueTracking.h"
  24 #include "llvm/IR/DataLayout.h"
  25 #include "llvm/IR/Dominators.h"
  26 #include "llvm/IR/GetElementPtrTypeIterator.h"
  27 #include "llvm/IR/GlobalVariable.h"
  28 #include "llvm/IR/IRBuilder.h"
  29 #include "llvm/IR/Instructions.h"
  30 #include "llvm/IR/IntrinsicInst.h"
  31 #include "llvm/Support/Debug.h"
  32 #include "llvm/Support/raw_ostream.h"
  33 #include "llvm/Transforms/Utils/Local.h"
  34 #include <algorithm>
  35 using namespace llvm;
  36
  37 #define DEBUG_TYPE "memcpyopt"
  38
  39 STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
  40 STATISTIC(NumMemSetInfer, "Number of memsets inferred");
  41 STATISTIC(NumMoveToCpy,   "Number of memmoves converted to memcpy");
  42 STATISTIC(NumCpyToSet,    "Number of memcpys converted to memset");
  43
  44 static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx,
  45                                   bool &VariableIdxFound,
  46                                   const DataLayout &DL) {
  47   // Skip over the first indices.
  48   gep_type_iterator GTI = gep_type_begin(GEP);
  49   for (unsigned i = 1; i != Idx; ++i, ++GTI)
  50     /*skip along*/;
  51
  52   // Compute the offset implied by the rest of the indices.
  53   int64_t Offset = 0;
  54   for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
  55     ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i));
  56     if (!OpC)
  57       return VariableIdxFound = true;
  58     if (OpC->isZero()) continue;  // No offset.
  59
  60     // Handle struct indices, which add their field offset to the pointer.
  61     if (StructType *STy = dyn_cast<StructType>(*GTI)) {
  62       Offset += DL.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
  63       continue;
  64     }
  65
  66     // Otherwise, we have a sequential type like an array or vector.  Multiply
  67     // the index by the ElementSize.
  68     uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
  69     Offset += Size*OpC->getSExtValue();
  70   }
  71
  72   return Offset;
  73 }
  74
  75 /// Return true if Ptr1 is provably equal to Ptr2 plus a constant offset, and
  76 /// return that constant offset. For example, Ptr1 might be &A[42], and Ptr2
  77 /// might be &A[40]. In this case offset would be -8.
  78 static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
  79                             const DataLayout &DL) {
  80   Ptr1 = Ptr1->stripPointerCasts();
  81   Ptr2 = Ptr2->stripPointerCasts();
  82
  83   // Handle the trivial case first.
  84   if (Ptr1 == Ptr2) {
  85     Offset = 0;
  86     return true;
  87   }
  88
  89   GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1);
  90   GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2);
  91
  92   bool VariableIdxFound = false;
  93
  94   // If one pointer is a GEP and the other isn't, then see if the GEP is a
  95   // constant offset from the base, as in "P" and "gep P, 1".
  96   if (GEP1 && !GEP2 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) {
  97     Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, DL);
  98     return !VariableIdxFound;
  99   }
 100
 101   if (GEP2 && !GEP1 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) {
 102     Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, DL);
 103     return !VariableIdxFound;
 104   }
 105
 106   // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical
 107   // base.  After that base, they may have some number of common (and
 108   // potentially variable) indices.  After that they handle some constant
 109   // offset, which determines their offset from each other.  At this point, we
 110   // handle no other case.
 111   if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0))
 112     return false;
 113
 114   // Skip any common indices and track the GEP types.
 115   unsigned Idx = 1;
 116   for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx)
 117     if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx))
 118       break;
 119
 120   int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, DL);
 121   int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, DL);
 122   if (VariableIdxFound) return false;
 123
 124   Offset = Offset2-Offset1;
 125   return true;
 126 }
 127
 128
 129 /// Represents a range of memset'd bytes with the ByteVal value.
 130 /// This allows us to analyze stores like:
 131 ///   store 0 -> P+1
 132 ///   store 0 -> P+0
 133 ///   store 0 -> P+3
 134 ///   store 0 -> P+2
 135 /// which sometimes happens with stores to arrays of structs etc.  When we see
 136 /// the first store, we make a range [1, 2).  The second store extends the range
 137 /// to [0, 2).  The third makes a new range [2, 3).  The fourth store joins the
 138 /// two ranges into [0, 3) which is memset'able.
 139 namespace {
 140 struct MemsetRange {
 141   // Start/End - A semi range that describes the span that this range covers.
 142   // The range is closed at the start and open at the end: [Start, End).
 143   int64_t Start, End;
 144
 145   /// StartPtr - The getelementptr instruction that points to the start of the
 146   /// range.
 147   Value *StartPtr;
 148
 149   /// Alignment - The known alignment of the first store.
 150   unsigned Alignment;
 151
 152   /// TheStores - The actual stores that make up this range.
 153   SmallVector<Instruction*, 16> TheStores;
 154
 155   bool isProfitableToUseMemset(const DataLayout &DL) const;
 156 };
 157 } // end anon namespace
 158
 159 bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
 160   // If we found more than 4 stores to merge or 16 bytes, use memset.
 161   if (TheStores.size() >= 4 || End-Start >= 16) return true;
 162
 163   // If there is nothing to merge, don't do anything.
 164   if (TheStores.size() < 2) return false;
 165
 166   // If any of the stores are a memset, then it is always good to extend the
 167   // memset.
 168   for (unsigned i = 0, e = TheStores.size(); i != e; ++i)
 169     if (!isa<StoreInst>(TheStores[i]))
 170       return true;
 171
 172   // Assume that the code generator is capable of merging pairs of stores
 173   // together if it wants to.
 174   if (TheStores.size() == 2) return false;
 175
 176   // If we have fewer than 8 stores, it can still be worthwhile to do this.
 177   // For example, merging 4 i8 stores into an i32 store is useful almost always.
 178   // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
 179   // memset will be split into 2 32-bit stores anyway) and doing so can
 180   // pessimize the llvm optimizer.
 181   //
 182   // Since we don't have perfect knowledge here, make some assumptions: assume
 183   // the maximum GPR width is the same size as the largest legal integer
 184   // size. If so, check to see whether we will end up actually reducing the
 185   // number of stores used.
 186   unsigned Bytes = unsigned(End-Start);
 187   unsigned MaxIntSize = DL.getLargestLegalIntTypeSize();
 188   if (MaxIntSize == 0)
 189     MaxIntSize = 1;
 190   unsigned NumPointerStores = Bytes / MaxIntSize;
 191
 192   // Assume the remaining bytes if any are done a byte at a time.
 193   unsigned NumByteStores = Bytes - NumPointerStores * MaxIntSize;
 194
 195   // If we will reduce the # stores (according to this heuristic), do the
 196   // transformation.  This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
 197   // etc.
 198   return TheStores.size() > NumPointerStores+NumByteStores;
 199 }
 200
 201
 202 namespace {
 203 class MemsetRanges {
 204   /// A sorted list of the memset ranges.
 205   SmallVector<MemsetRange, 8> Ranges;
 206   typedef SmallVectorImpl<MemsetRange>::iterator range_iterator;
 207   const DataLayout &DL;
 208 public:
 209   MemsetRanges(const DataLayout &DL) : DL(DL) {}
 210
 211   typedef SmallVectorImpl<MemsetRange>::const_iterator const_iterator;
 212   const_iterator begin() const { return Ranges.begin(); }
 213   const_iterator end() const { return Ranges.end(); }
 214   bool empty() const { return Ranges.empty(); }
 215
 216   void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
 217     if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
 218       addStore(OffsetFromFirst, SI);
 219     else
 220       addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
 221   }
 222
 223   void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
 224     int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
 225
 226     addRange(OffsetFromFirst, StoreSize,
 227              SI->getPointerOperand(), SI->getAlignment(), SI);
 228   }
 229
 230   void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
 231     int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
 232     addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI);
 233   }
 234
 235   void addRange(int64_t Start, int64_t Size, Value *Ptr,
 236                 unsigned Alignment, Instruction *Inst);
 237
 238 };
 239
 240 } // end anon namespace
 241
 242
 243 /// Add a new store to the MemsetRanges data structure.  This adds a
 244 /// new range for the specified store at the specified offset, merging into
 245 /// existing ranges as appropriate.
 246 void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
 247                             unsigned Alignment, Instruction *Inst) {
 248   int64_t End = Start+Size;
 249
 250   range_iterator I = std::lower_bound(Ranges.begin(), Ranges.end(), Start,
 251     [](const MemsetRange &LHS, int64_t RHS) { return LHS.End < RHS; });
 252
 253   // We now know that I == E, in which case we didn't find anything to merge
 254   // with, or that Start <= I->End.  If End < I->Start or I == E, then we need
 255   // to insert a new range.  Handle this now.
 256   if (I == Ranges.end() || End < I->Start) {
 257     MemsetRange &R = *Ranges.insert(I, MemsetRange());
 258     R.Start        = Start;
 259     R.End          = End;
 260     R.StartPtr     = Ptr;
 261     R.Alignment    = Alignment;
 262     R.TheStores.push_back(Inst);
 263     return;
 264   }
 265
 266   // This store overlaps with I, add it.
 267   I->TheStores.push_back(Inst);
 268
 269   // At this point, we may have an interval that completely contains our store.
 270   // If so, just add it to the interval and return.
 271   if (I->Start <= Start && I->End >= End)
 272     return;
 273
 274   // Now we know that Start <= I->End and End >= I->Start so the range overlaps
 275   // but is not entirely contained within the range.
 276
 277   // See if the range extends the start of the range.  In this case, it couldn't
 278   // possibly cause it to join the prior range, because otherwise we would have
 279   // stopped on *it*.
 280   if (Start < I->Start) {
 281     I->Start = Start;
 282     I->StartPtr = Ptr;
 283     I->Alignment = Alignment;
 284   }
 285
 286   // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
 287   // is in or right at the end of I), and that End >= I->Start.  Extend I out to
 288   // End.
 289   if (End > I->End) {
 290     I->End = End;
 291     range_iterator NextI = I;
 292     while (++NextI != Ranges.end() && End >= NextI->Start) {
 293       // Merge the range in.
 294       I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
 295       if (NextI->End > I->End)
 296         I->End = NextI->End;
 297       Ranges.erase(NextI);
 298       NextI = I;
 299     }
 300   }
 301 }
 302
 303 //===----------------------------------------------------------------------===//
 304 //                         MemCpyOpt Pass
 305 //===----------------------------------------------------------------------===//
 306
 307 namespace {
 308   class MemCpyOpt : public FunctionPass {
 309     MemoryDependenceAnalysis *MD;
 310     TargetLibraryInfo *TLI;
 311   public:
 312     static char ID; // Pass identification, replacement for typeid
 313     MemCpyOpt() : FunctionPass(ID) {
 314       initializeMemCpyOptPass(*PassRegistry::getPassRegistry());
 315       MD = nullptr;
 316       TLI = nullptr;
 317     }
 318
 319     bool runOnFunction(Function &F) override;
 320
 321   private:
 322     // This transformation requires dominator postdominator info
 323     void getAnalysisUsage(AnalysisUsage &AU) const override {
 324       AU.setPreservesCFG();
 325       AU.addRequired<AssumptionCacheTracker>();
 326       AU.addRequired<DominatorTreeWrapperPass>();
 327       AU.addRequired<MemoryDependenceAnalysis>();
 328       AU.addRequired<AAResultsWrapperPass>();
 329       AU.addRequired<TargetLibraryInfoWrapperPass>();
 330       AU.addPreserved<GlobalsAAWrapperPass>();
 331       AU.addPreserved<MemoryDependenceAnalysis>();
 332     }
 333
 334     // Helper functions
 335     bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
 336     bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
 337     bool processMemCpy(MemCpyInst *M);
 338     bool processMemMove(MemMoveInst *M);
 339     bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
 340                               uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
 341     bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
 342     bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
 343     bool performMemCpyToMemSetOptzn(MemCpyInst *M, MemSetInst *MDep);
 344     bool processByValArgument(CallSite CS, unsigned ArgNo);
 345     Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
 346                                       Value *ByteVal);
 347
 348     bool iterateOnFunction(Function &F);
 349   };
 350
 351   char MemCpyOpt::ID = 0;
 352 }
 353
 354 /// The public interface to this file...
 355 FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); }
 356
 357 INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
 358                       false, false)
 359 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 360 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 361 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
 362 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 363 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 364 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
 365 INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
 366                     false, false)
 367
 368 /// When scanning forward over instructions, we look for some other patterns to
 369 /// fold away. In particular, this looks for stores to neighboring locations of
 370 /// memory. If it sees enough consecutive ones, it attempts to merge them
 371 /// together into a memcpy/memset.
 372 Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
 373                                              Value *StartPtr, Value *ByteVal) {
 374   const DataLayout &DL = StartInst->getModule()->getDataLayout();
 375
 376   // Okay, so we now have a single store that can be splatable.  Scan to find
 377   // all subsequent stores of the same value to offset from the same pointer.
 378   // Join these together into ranges, so we can decide whether contiguous blocks
 379   // are stored.
 380   MemsetRanges Ranges(DL);
 381
 382   BasicBlock::iterator BI(StartInst);
 383   for (++BI; !isa<TerminatorInst>(BI); ++BI) {
 384     if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
 385       // If the instruction is readnone, ignore it, otherwise bail out.  We
 386       // don't even allow readonly here because we don't want something like:
 387       // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
 388       if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
 389         break;
 390       continue;
 391     }
 392
 393     if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
 394       // If this is a store, see if we can merge it in.
 395       if (!NextStore->isSimple()) break;
 396
 397       // Check to see if this stored value is of the same byte-splattable value.
 398       if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
 399         break;
 400
 401       // Check to see if this store is to a constant offset from the start ptr.
 402       int64_t Offset;
 403       if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset,
 404                            DL))
 405         break;
 406
 407       Ranges.addStore(Offset, NextStore);
 408     } else {
 409       MemSetInst *MSI = cast<MemSetInst>(BI);
 410
 411       if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
 412           !isa<ConstantInt>(MSI->getLength()))
 413         break;
 414
 415       // Check to see if this store is to a constant offset from the start ptr.
 416       int64_t Offset;
 417       if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, DL))
 418         break;
 419
 420       Ranges.addMemSet(Offset, MSI);
 421     }
 422   }
 423
 424   // If we have no ranges, then we just had a single store with nothing that
 425   // could be merged in.  This is a very common case of course.
 426   if (Ranges.empty())
 427     return nullptr;
 428
 429   // If we had at least one store that could be merged in, add the starting
 430   // store as well.  We try to avoid this unless there is at least something
 431   // interesting as a small compile-time optimization.
 432   Ranges.addInst(0, StartInst);
 433
 434   // If we create any memsets, we put it right before the first instruction that
 435   // isn't part of the memset block.  This ensure that the memset is dominated
 436   // by any addressing instruction needed by the start of the block.
 437   IRBuilder<> Builder(&*BI);
 438
 439   // Now that we have full information about ranges, loop over the ranges and
 440   // emit memset's for anything big enough to be worthwhile.
 441   Instruction *AMemSet = nullptr;
 442   for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end();
 443        I != E; ++I) {
 444     const MemsetRange &Range = *I;
 445
 446     if (Range.TheStores.size() == 1) continue;
 447
 448     // If it is profitable to lower this range to memset, do so now.
 449     if (!Range.isProfitableToUseMemset(DL))
 450       continue;
 451
 452     // Otherwise, we do want to transform this!  Create a new memset.
 453     // Get the starting pointer of the block.
 454     StartPtr = Range.StartPtr;
 455
 456     // Determine alignment
 457     unsigned Alignment = Range.Alignment;
 458     if (Alignment == 0) {
 459       Type *EltType =
 460         cast<PointerType>(StartPtr->getType())->getElementType();
 461       Alignment = DL.getABITypeAlignment(EltType);
 462     }
 463
 464     AMemSet =
 465       Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
 466
 467     DEBUG(dbgs() << "Replace stores:\n";
 468           for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i)
 469             dbgs() << *Range.TheStores[i] << '\n';
 470           dbgs() << "With: " << *AMemSet << '\n');
 471
 472     if (!Range.TheStores.empty())
 473       AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
 474
 475     // Zap all the stores.
 476     for (SmallVectorImpl<Instruction *>::const_iterator
 477          SI = Range.TheStores.begin(),
 478          SE = Range.TheStores.end(); SI != SE; ++SI) {
 479       MD->removeInstruction(*SI);
 480       (*SI)->eraseFromParent();
 481     }
 482     ++NumMemSetInfer;
 483   }
 484
 485   return AMemSet;
 486 }
 487
 488
 489 bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
 490   if (!SI->isSimple()) return false;
 491
 492   // Avoid merging nontemporal stores since the resulting
 493   // memcpy/memset would not be able to preserve the nontemporal hint.
 494   // In theory we could teach how to propagate the !nontemporal metadata to
 495   // memset calls. However, that change would force the backend to
 496   // conservatively expand !nontemporal memset calls back to sequences of
 497   // store instructions (effectively undoing the merging).
 498   if (SI->getMetadata(LLVMContext::MD_nontemporal))
 499     return false;
 500
 501   const DataLayout &DL = SI->getModule()->getDataLayout();
 502
 503   // Detect cases where we're performing call slot forwarding, but
 504   // happen to be using a load-store pair to implement it, rather than
 505   // a memcpy.
 506   if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
 507     if (LI->isSimple() && LI->hasOneUse() &&
 508         LI->getParent() == SI->getParent()) {
 509       MemDepResult ldep = MD->getDependency(LI);
 510       CallInst *C = nullptr;
 511       if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
 512         C = dyn_cast<CallInst>(ldep.getInst());
 513
 514       if (C) {
 515         // Check that nothing touches the dest of the "copy" between
 516         // the call and the store.
 517         AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
 518         MemoryLocation StoreLoc = MemoryLocation::get(SI);
 519         for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator();
 520              I != E; --I) {
 521           if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) {
 522             C = nullptr;
 523             break;
 524           }
 525         }
 526       }
 527
 528       if (C) {
 529         unsigned storeAlign = SI->getAlignment();
 530         if (!storeAlign)
 531           storeAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
 532         unsigned loadAlign = LI->getAlignment();
 533         if (!loadAlign)
 534           loadAlign = DL.getABITypeAlignment(LI->getType());
 535
 536         bool changed = performCallSlotOptzn(
 537             LI, SI->getPointerOperand()->stripPointerCasts(),
 538             LI->getPointerOperand()->stripPointerCasts(),
 539             DL.getTypeStoreSize(SI->getOperand(0)->getType()),
 540             std::min(storeAlign, loadAlign), C);
 541         if (changed) {
 542           MD->removeInstruction(SI);
 543           SI->eraseFromParent();
 544           MD->removeInstruction(LI);
 545           LI->eraseFromParent();
 546           ++NumMemCpyInstr;
 547           return true;
 548         }
 549       }
 550     }
 551   }
 552
 553   // There are two cases that are interesting for this code to handle: memcpy
 554   // and memset.  Right now we only handle memset.
 555
 556   // Ensure that the value being stored is something that can be memset'able a
 557   // byte at a time like "0" or "-1" or any width, as well as things like
 558   // 0xA0A0A0A0 and 0.0.
 559   if (Value *ByteVal = isBytewiseValue(SI->getOperand(0)))
 560     if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),
 561                                               ByteVal)) {
 562       BBI = I->getIterator(); // Don't invalidate iterator.
 563       return true;
 564     }
 565
 566   return false;
 567 }
 568
 569 bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
 570   // See if there is another memset or store neighboring this memset which
 571   // allows us to widen out the memset to do a single larger store.
 572   if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
 573     if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(),
 574                                               MSI->getValue())) {
 575       BBI = I->getIterator(); // Don't invalidate iterator.
 576       return true;
 577     }
 578   return false;
 579 }
 580
 581
 582 /// Takes a memcpy and a call that it depends on,
 583 /// and checks for the possibility of a call slot optimization by having
 584 /// the call write its result directly into the destination of the memcpy.
 585 bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
 586                                      Value *cpyDest, Value *cpySrc,
 587                                      uint64_t cpyLen, unsigned cpyAlign,
 588                                      CallInst *C) {
 589   // The general transformation to keep in mind is
 590   //
 591   //   call @func(..., src, ...)
 592   //   memcpy(dest, src, ...)
 593   //
 594   // ->
 595   //
 596   //   memcpy(dest, src, ...)
 597   //   call @func(..., dest, ...)
 598   //
 599   // Since moving the memcpy is technically awkward, we additionally check that
 600   // src only holds uninitialized values at the moment of the call, meaning that
 601   // the memcpy can be discarded rather than moved.
 602
 603   // Deliberately get the source and destination with bitcasts stripped away,
 604   // because we'll need to do type comparisons based on the underlying type.
 605   CallSite CS(C);
 606
 607   // Require that src be an alloca.  This simplifies the reasoning considerably.
 608   AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
 609   if (!srcAlloca)
 610     return false;
 611
 612   ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
 613   if (!srcArraySize)
 614     return false;
 615
 616   const DataLayout &DL = cpy->getModule()->getDataLayout();
 617   uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) *
 618                      srcArraySize->getZExtValue();
 619
 620   if (cpyLen < srcSize)
 621     return false;
 622
 623   // Check that accessing the first srcSize bytes of dest will not cause a
 624   // trap.  Otherwise the transform is invalid since it might cause a trap
 625   // to occur earlier than it otherwise would.
 626   if (AllocaInst *A = dyn_cast<AllocaInst>(cpyDest)) {
 627     // The destination is an alloca.  Check it is larger than srcSize.
 628     ConstantInt *destArraySize = dyn_cast<ConstantInt>(A->getArraySize());
 629     if (!destArraySize)
 630       return false;
 631
 632     uint64_t destSize = DL.getTypeAllocSize(A->getAllocatedType()) *
 633                         destArraySize->getZExtValue();
 634
 635     if (destSize < srcSize)
 636       return false;
 637   } else if (Argument *A = dyn_cast<Argument>(cpyDest)) {
 638     if (A->getDereferenceableBytes() < srcSize) {
 639       // If the destination is an sret parameter then only accesses that are
 640       // outside of the returned struct type can trap.
 641       if (!A->hasStructRetAttr())
 642         return false;
 643
 644       Type *StructTy = cast<PointerType>(A->getType())->getElementType();
 645       if (!StructTy->isSized()) {
 646         // The call may never return and hence the copy-instruction may never
 647         // be executed, and therefore it's not safe to say "the destination
 648         // has at least <cpyLen> bytes, as implied by the copy-instruction",
 649         return false;
 650       }
 651
 652       uint64_t destSize = DL.getTypeAllocSize(StructTy);
 653       if (destSize < srcSize)
 654         return false;
 655     }
 656   } else {
 657     return false;
 658   }
 659
 660   // Check that dest points to memory that is at least as aligned as src.
 661   unsigned srcAlign = srcAlloca->getAlignment();
 662   if (!srcAlign)
 663     srcAlign = DL.getABITypeAlignment(srcAlloca->getAllocatedType());
 664   bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
 665   // If dest is not aligned enough and we can't increase its alignment then
 666   // bail out.
 667   if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest))
 668     return false;
 669
 670   // Check that src is not accessed except via the call and the memcpy.  This
 671   // guarantees that it holds only undefined values when passed in (so the final
 672   // memcpy can be dropped), that it is not read or written between the call and
 673   // the memcpy, and that writing beyond the end of it is undefined.
 674   SmallVector<User*, 8> srcUseList(srcAlloca->user_begin(),
 675                                    srcAlloca->user_end());
 676   while (!srcUseList.empty()) {
 677     User *U = srcUseList.pop_back_val();
 678
 679     if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
 680       for (User *UU : U->users())
 681         srcUseList.push_back(UU);
 682       continue;
 683     }
 684     if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
 685       if (!G->hasAllZeroIndices())
 686         return false;
 687
 688       for (User *UU : U->users())
 689         srcUseList.push_back(UU);
 690       continue;
 691     }
 692     if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U))
 693       if (IT->getIntrinsicID() == Intrinsic::lifetime_start ||
 694           IT->getIntrinsicID() == Intrinsic::lifetime_end)
 695         continue;
 696
 697     if (U != C && U != cpy)
 698       return false;
 699   }
 700
 701   // Check that src isn't captured by the called function since the
 702   // transformation can cause aliasing issues in that case.
 703   for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
 704     if (CS.getArgument(i) == cpySrc && !CS.doesNotCapture(i))
 705       return false;
 706
 707   // Since we're changing the parameter to the callsite, we need to make sure
 708   // that what would be the new parameter dominates the callsite.
 709   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 710   if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest))
 711     if (!DT.dominates(cpyDestInst, C))
 712       return false;
 713
 714   // In addition to knowing that the call does not access src in some
 715   // unexpected manner, for example via a global, which we deduce from
 716   // the use analysis, we also need to know that it does not sneakily
 717   // access dest.  We rely on AA to figure this out for us.
 718   AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
 719   ModRefInfo MR = AA.getModRefInfo(C, cpyDest, srcSize);
 720   // If necessary, perform additional analysis.
 721   if (MR != MRI_NoModRef)
 722     MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT);
 723   if (MR != MRI_NoModRef)
 724     return false;
 725
 726   // All the checks have passed, so do the transformation.
 727   bool changedArgument = false;
 728   for (unsigned i = 0; i < CS.arg_size(); ++i)
 729     if (CS.getArgument(i)->stripPointerCasts() == cpySrc) {
 730       Value *Dest = cpySrc->getType() == cpyDest->getType() ?  cpyDest
 731         : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
 732                                       cpyDest->getName(), C);
 733       changedArgument = true;
 734       if (CS.getArgument(i)->getType() == Dest->getType())
 735         CS.setArgument(i, Dest);
 736       else
 737         CS.setArgument(i, CastInst::CreatePointerCast(Dest,
 738                           CS.getArgument(i)->getType(), Dest->getName(), C));
 739     }
 740
 741   if (!changedArgument)
 742     return false;
 743
 744   // If the destination wasn't sufficiently aligned then increase its alignment.
 745   if (!isDestSufficientlyAligned) {
 746     assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
 747     cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
 748   }
 749
 750   // Drop any cached information about the call, because we may have changed
 751   // its dependence information by changing its parameter.
 752   MD->removeInstruction(C);
 753
 754   // Update AA metadata
 755   // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
 756   // handled here, but combineMetadata doesn't support them yet
 757   unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
 758                          LLVMContext::MD_noalias,
 759                          LLVMContext::MD_invariant_group};
 760   combineMetadata(C, cpy, KnownIDs);
 761
 762   // Remove the memcpy.
 763   MD->removeInstruction(cpy);
 764   ++NumMemCpyInstr;
 765
 766   return true;
 767 }
 768
 769 /// We've found that the (upward scanning) memory dependence of memcpy 'M' is
 770 /// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
 771 bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) {
 772   // We can only transforms memcpy's where the dest of one is the source of the
 773   // other.
 774   if (M->getSource() != MDep->getDest() || MDep->isVolatile())
 775     return false;
 776
 777   // If dep instruction is reading from our current input, then it is a noop
 778   // transfer and substituting the input won't change this instruction.  Just
 779   // ignore the input and let someone else zap MDep.  This handles cases like:
 780   //    memcpy(a <- a)
 781   //    memcpy(b <- a)
 782   if (M->getSource() == MDep->getSource())
 783     return false;
 784
 785   // Second, the length of the memcpy's must be the same, or the preceding one
 786   // must be larger than the following one.
 787   ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
 788   ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
 789   if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
 790     return false;
 791
 792   AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
 793
 794   // Verify that the copied-from memory doesn't change in between the two
 795   // transfers.  For example, in:
 796   //    memcpy(a <- b)
 797   //    *b = 42;
 798   //    memcpy(c <- a)
 799   // It would be invalid to transform the second memcpy into memcpy(c <- b).
 800   //
 801   // TODO: If the code between M and MDep is transparent to the destination "c",
 802   // then we could still perform the xform by moving M up to the first memcpy.
 803   //
 804   // NOTE: This is conservative, it will stop on any read from the source loc,
 805   // not just the defining memcpy.
 806   MemDepResult SourceDep =
 807       MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false,
 808                                    M->getIterator(), M->getParent());
 809   if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
 810     return false;
 811
 812   // If the dest of the second might alias the source of the first, then the
 813   // source and dest might overlap.  We still want to eliminate the intermediate
 814   // value, but we have to generate a memmove instead of memcpy.
 815   bool UseMemMove = false;
 816   if (!AA.isNoAlias(MemoryLocation::getForDest(M),
 817                     MemoryLocation::getForSource(MDep)))
 818     UseMemMove = true;
 819
 820   // If all checks passed, then we can transform M.
 821
 822   // Make sure to use the lesser of the alignment of the source and the dest
 823   // since we're changing where we're reading from, but don't want to increase
 824   // the alignment past what can be read from or written to.
 825   // TODO: Is this worth it if we're creating a less aligned memcpy? For
 826   // example we could be moving from movaps -> movq on x86.
 827   unsigned Align = std::min(MDep->getAlignment(), M->getAlignment());
 828
 829   IRBuilder<> Builder(M);
 830   if (UseMemMove)
 831     Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(),
 832                           Align, M->isVolatile());
 833   else
 834     Builder.CreateMemCpy(M->getRawDest(), MDep->getRawSource(), M->getLength(),
 835                          Align, M->isVolatile());
 836
 837   // Remove the instruction we're replacing.
 838   MD->removeInstruction(M);
 839   M->eraseFromParent();
 840   ++NumMemCpyInstr;
 841   return true;
 842 }
 843
 844 /// We've found that the (upward scanning) memory dependence of \p MemCpy is
 845 /// \p MemSet.  Try to simplify \p MemSet to only set the trailing bytes that
 846 /// weren't copied over by \p MemCpy.
 847 ///
 848 /// In other words, transform:
 849 /// \code
 850 ///   memset(dst, c, dst_size);
 851 ///   memcpy(dst, src, src_size);
 852 /// \endcode
 853 /// into:
 854 /// \code
 855 ///   memcpy(dst, src, src_size);
 856 ///   memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
 857 /// \endcode
 858 bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
 859                                               MemSetInst *MemSet) {
 860   // We can only transform memset/memcpy with the same destination.
 861   if (MemSet->getDest() != MemCpy->getDest())
 862     return false;
 863
 864   // Check that there are no other dependencies on the memset destination.
 865   MemDepResult DstDepInfo =
 866       MD->getPointerDependencyFrom(MemoryLocation::getForDest(MemSet), false,
 867                                    MemCpy->getIterator(), MemCpy->getParent());
 868   if (DstDepInfo.getInst() != MemSet)
 869     return false;
 870
 871   // Use the same i8* dest as the memcpy, killing the memset dest if different.
 872   Value *Dest = MemCpy->getRawDest();
 873   Value *DestSize = MemSet->getLength();
 874   Value *SrcSize = MemCpy->getLength();
 875
 876   // By default, create an unaligned memset.
 877   unsigned Align = 1;
 878   // If Dest is aligned, and SrcSize is constant, use the minimum alignment
 879   // of the sum.
 880   const unsigned DestAlign =
 881       std::max(MemSet->getAlignment(), MemCpy->getAlignment());
 882   if (DestAlign > 1)
 883     if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
 884       Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);
 885
 886   IRBuilder<> Builder(MemCpy);
 887
 888   // If the sizes have different types, zext the smaller one.
 889   if (DestSize->getType() != SrcSize->getType()) {
 890     if (DestSize->getType()->getIntegerBitWidth() >
 891         SrcSize->getType()->getIntegerBitWidth())
 892       SrcSize = Builder.CreateZExt(SrcSize, DestSize->getType());
 893     else
 894       DestSize = Builder.CreateZExt(DestSize, SrcSize->getType());
 895   }
 896
 897   Value *MemsetLen =
 898       Builder.CreateSelect(Builder.CreateICmpULE(DestSize, SrcSize),
 899                            ConstantInt::getNullValue(DestSize->getType()),
 900                            Builder.CreateSub(DestSize, SrcSize));
 901   Builder.CreateMemSet(Builder.CreateGEP(Dest, SrcSize), MemSet->getOperand(1),
 902                        MemsetLen, Align);
 903
 904   MD->removeInstruction(MemSet);
 905   MemSet->eraseFromParent();
 906   return true;
 907 }
 908
 909 /// Transform memcpy to memset when its source was just memset.
 910 /// In other words, turn:
 911 /// \code
 912 ///   memset(dst1, c, dst1_size);
 913 ///   memcpy(dst2, dst1, dst2_size);
 914 /// \endcode
 915 /// into:
 916 /// \code
 917 ///   memset(dst1, c, dst1_size);
 918 ///   memset(dst2, c, dst2_size);
 919 /// \endcode
 920 /// When dst2_size <= dst1_size.
 921 ///
 922 /// The \p MemCpy must have a Constant length.
 923 bool MemCpyOpt::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
 924                                            MemSetInst *MemSet) {
 925   // This only makes sense on memcpy(..., memset(...), ...).
 926   if (MemSet->getRawDest() != MemCpy->getRawSource())
 927     return false;
 928
 929   ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
 930   ConstantInt *MemSetSize = dyn_cast<ConstantInt>(MemSet->getLength());
 931   // Make sure the memcpy doesn't read any more than what the memset wrote.
 932   // Don't worry about sizes larger than i64.
 933   if (!MemSetSize || CopySize->getZExtValue() > MemSetSize->getZExtValue())
 934     return false;
 935
 936   IRBuilder<> Builder(MemCpy);
 937   Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
 938                        CopySize, MemCpy->getAlignment());
 939   return true;
 940 }
 941
 942 /// Perform simplification of memcpy's.  If we have memcpy A
 943 /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
 944 /// B to be a memcpy from X to Z (or potentially a memmove, depending on
 945 /// circumstances). This allows later passes to remove the first memcpy
 946 /// altogether.
 947 bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
 948   // We can only optimize non-volatile memcpy's.
 949   if (M->isVolatile()) return false;
 950
 951   // If the source and destination of the memcpy are the same, then zap it.
 952   if (M->getSource() == M->getDest()) {
 953     MD->removeInstruction(M);
 954     M->eraseFromParent();
 955     return false;
 956   }
 957
 958   // If copying from a constant, try to turn the memcpy into a memset.
 959   if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
 960     if (GV->isConstant() && GV->hasDefinitiveInitializer())
 961       if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) {
 962         IRBuilder<> Builder(M);
 963         Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
 964                              M->getAlignment(), false);
 965         MD->removeInstruction(M);
 966         M->eraseFromParent();
 967         ++NumCpyToSet;
 968         return true;
 969       }
 970
 971   MemDepResult DepInfo = MD->getDependency(M);
 972
 973   // Try to turn a partially redundant memset + memcpy into
 974   // memcpy + smaller memset.  We don't need the memcpy size for this.
 975   if (DepInfo.isClobber())
 976     if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
 977       if (processMemSetMemCpyDependence(M, MDep))
 978         return true;
 979
 980   // The optimizations after this point require the memcpy size.
 981   ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
 982   if (!CopySize) return false;
 983
 984   // There are four possible optimizations we can do for memcpy:
 985   //   a) memcpy-memcpy xform which exposes redundance for DSE.
 986   //   b) call-memcpy xform for return slot optimization.
 987   //   c) memcpy from freshly alloca'd space or space that has just started its
 988   //      lifetime copies undefined data, and we can therefore eliminate the
 989   //      memcpy in favor of the data that was already at the destination.
 990   //   d) memcpy from a just-memset'd source can be turned into memset.
 991   if (DepInfo.isClobber()) {
 992     if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
 993       if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
 994                                CopySize->getZExtValue(), M->getAlignment(),
 995                                C)) {
 996         MD->removeInstruction(M);
 997         M->eraseFromParent();
 998         return true;
 999       }
1000     }
1001   }
1002
1003   MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
1004   MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
1005       SrcLoc, true, M->getIterator(), M->getParent());
1006
1007   if (SrcDepInfo.isClobber()) {
1008     if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
1009       return processMemCpyMemCpyDependence(M, MDep);
1010   } else if (SrcDepInfo.isDef()) {
1011     Instruction *I = SrcDepInfo.getInst();
1012     bool hasUndefContents = false;
1013
1014     if (isa<AllocaInst>(I)) {
1015       hasUndefContents = true;
1016     } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
1017       if (II->getIntrinsicID() == Intrinsic::lifetime_start)
1018         if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
1019           if (LTSize->getZExtValue() >= CopySize->getZExtValue())
1020             hasUndefContents = true;
1021     }
1022
1023     if (hasUndefContents) {
1024       MD->removeInstruction(M);
1025       M->eraseFromParent();
1026       ++NumMemCpyInstr;
1027       return true;
1028     }
1029   }
1030
1031   if (SrcDepInfo.isClobber())
1032     if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
1033       if (performMemCpyToMemSetOptzn(M, MDep)) {
1034         MD->removeInstruction(M);
1035         M->eraseFromParent();
1036         ++NumCpyToSet;
1037         return true;
1038       }
1039
1040   return false;
1041 }
1042
1043 /// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
1044 /// not to alias.
1045 bool MemCpyOpt::processMemMove(MemMoveInst *M) {
1046   AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
1047
1048   if (!TLI->has(LibFunc::memmove))
1049     return false;
1050
1051   // See if the pointers alias.
1052   if (!AA.isNoAlias(MemoryLocation::getForDest(M),
1053                     MemoryLocation::getForSource(M)))
1054     return false;
1055
1056   DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n");
1057
1058   // If not, then we know we can transform this.
1059   Module *Mod = M->getParent()->getParent()->getParent();
1060   Type *ArgTys[3] = { M->getRawDest()->getType(),
1061                       M->getRawSource()->getType(),
1062                       M->getLength()->getType() };
1063   M->setCalledFunction(Intrinsic::getDeclaration(Mod, Intrinsic::memcpy,
1064                                                  ArgTys));
1065
1066   // MemDep may have over conservative information about this instruction, just
1067   // conservatively flush it from the cache.
1068   MD->removeInstruction(M);
1069
1070   ++NumMoveToCpy;
1071   return true;
1072 }
1073
1074 /// This is called on every byval argument in call sites.
1075 bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
1076   const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout();
1077   // Find out what feeds this byval argument.
1078   Value *ByValArg = CS.getArgument(ArgNo);
1079   Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
1080   uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
1081   MemDepResult DepInfo = MD->getPointerDependencyFrom(
1082       MemoryLocation(ByValArg, ByValSize), true,
1083       CS.getInstruction()->getIterator(), CS.getInstruction()->getParent());
1084   if (!DepInfo.isClobber())
1085     return false;
1086
1087   // If the byval argument isn't fed by a memcpy, ignore it.  If it is fed by
1088   // a memcpy, see if we can byval from the source of the memcpy instead of the
1089   // result.
1090   MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
1091   if (!MDep || MDep->isVolatile() ||
1092       ByValArg->stripPointerCasts() != MDep->getDest())
1093     return false;
1094
1095   // The length of the memcpy must be larger or equal to the size of the byval.
1096   ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
1097   if (!C1 || C1->getValue().getZExtValue() < ByValSize)
1098     return false;
1099
1100   // Get the alignment of the byval.  If the call doesn't specify the alignment,
1101   // then it is some target specific value that we can't know.
1102   unsigned ByValAlign = CS.getParamAlignment(ArgNo+1);
1103   if (ByValAlign == 0) return false;
1104
1105   // If it is greater than the memcpy, then we check to see if we can force the
1106   // source of the memcpy to the alignment we need.  If we fail, we bail out.
1107   AssumptionCache &AC =
1108       getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
1109           *CS->getParent()->getParent());
1110   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1111   if (MDep->getAlignment() < ByValAlign &&
1112       getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL,
1113                                  CS.getInstruction(), &AC, &DT) < ByValAlign)
1114     return false;
1115
1116   // Verify that the copied-from memory doesn't change in between the memcpy and
1117   // the byval call.
1118   //    memcpy(a <- b)
1119   //    *b = 42;
1120   //    foo(*a)
1121   // It would be invalid to transform the second memcpy into foo(*b).
1122   //
1123   // NOTE: This is conservative, it will stop on any read from the source loc,
1124   // not just the defining memcpy.
1125   MemDepResult SourceDep = MD->getPointerDependencyFrom(
1126       MemoryLocation::getForSource(MDep), false,
1127       CS.getInstruction()->getIterator(), MDep->getParent());
1128   if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
1129     return false;
1130
1131   Value *TmpCast = MDep->getSource();
1132   if (MDep->getSource()->getType() != ByValArg->getType())
1133     TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
1134                               "tmpcast", CS.getInstruction());
1135
1136   DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n"
1137                << "  " << *MDep << "\n"
1138                << "  " << *CS.getInstruction() << "\n");
1139
1140   // Otherwise we're good!  Update the byval argument.
1141   CS.setArgument(ArgNo, TmpCast);
1142   ++NumMemCpyInstr;
1143   return true;
1144 }
1145
1146 /// Executes one iteration of MemCpyOpt.
1147 bool MemCpyOpt::iterateOnFunction(Function &F) {
1148   bool MadeChange = false;
1149
1150   // Walk all instruction in the function.
1151   for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
1152     for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
1153       // Avoid invalidating the iterator.
1154       Instruction *I = &*BI++;
1155
1156       bool RepeatInstruction = false;
1157
1158       if (StoreInst *SI = dyn_cast<StoreInst>(I))
1159         MadeChange |= processStore(SI, BI);
1160       else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
1161         RepeatInstruction = processMemSet(M, BI);
1162       else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
1163         RepeatInstruction = processMemCpy(M);
1164       else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
1165         RepeatInstruction = processMemMove(M);
1166       else if (auto CS = CallSite(I)) {
1167         for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
1168           if (CS.isByValArgument(i))
1169             MadeChange |= processByValArgument(CS, i);
1170       }
1171
1172       // Reprocess the instruction if desired.
1173       if (RepeatInstruction) {
1174         if (BI != BB->begin()) --BI;
1175         MadeChange = true;
1176       }
1177     }
1178   }
1179
1180   return MadeChange;
1181 }
1182
1183 /// This is the main transformation entry point for a function.
1184 bool MemCpyOpt::runOnFunction(Function &F) {
1185   if (skipOptnoneFunction(F))
1186     return false;
1187
1188   bool MadeChange = false;
1189   MD = &getAnalysis<MemoryDependenceAnalysis>();
1190   TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
1191
1192   // If we don't have at least memset and memcpy, there is little point of doing
1193   // anything here.  These are required by a freestanding implementation, so if
1194   // even they are disabled, there is no point in trying hard.
1195   if (!TLI->has(LibFunc::memset) || !TLI->has(LibFunc::memcpy))
1196     return false;
1197
1198   while (1) {
1199     if (!iterateOnFunction(F))
1200       break;
1201     MadeChange = true;
1202   }
1203
1204   MD = nullptr;
1205   return MadeChange;
1206 }