lib/Transforms/Scalar/LoopIdiomRecognize.cpp

   1 //===-- LoopIdiomRecognize.cpp - Loop idiom recognition -------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This pass implements an idiom recognizer that transforms simple loops into a
  11 // non-loop form.  In cases that this kicks in, it can be a significant
  12 // performance win.
  13 //
  14 //===----------------------------------------------------------------------===//
  15 //
  16 // TODO List:
  17 //
  18 // Future loop memory idioms to recognize:
  19 //   memcmp, memmove, strlen, etc.
  20 // Future floating point idioms to recognize in -ffast-math mode:
  21 //   fpowi
  22 // Future integer operation idioms to recognize:
  23 //   ctpop, ctlz, cttz
  24 //
  25 // Beware that isel's default lowering for ctpop is highly inefficient for
  26 // i64 and larger types when i64 is legal and the value has few bits set.  It
  27 // would be good to enhance isel to emit a loop for ctpop in this case.
  28 //
  29 // We should enhance the memset/memcpy recognition to handle multiple stores in
  30 // the loop.  This would handle things like:
  31 //   void foo(_Complex float *P)
  32 //     for (i) { __real__(*P) = 0;  __imag__(*P) = 0; }
  33 //
  34 // We should enhance this to handle negative strides through memory.
  35 // Alternatively (and perhaps better) we could rely on an earlier pass to force
  36 // forward iteration through memory, which is generally better for cache
  37 // behavior.  Negative strides *do* happen for memset/memcpy loops.
  38 //
  39 // This could recognize common matrix multiplies and dot product idioms and
  40 // replace them with calls to BLAS (if linked in??).
  41 //
  42 //===----------------------------------------------------------------------===//
  43
  44 #define DEBUG_TYPE "loop-idiom"
  45 #include "llvm/Transforms/Scalar.h"
  46 #include "llvm/ADT/Statistic.h"
  47 #include "llvm/Analysis/AliasAnalysis.h"
  48 #include "llvm/Analysis/LoopPass.h"
  49 #include "llvm/Analysis/ScalarEvolutionExpander.h"
  50 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  51 #include "llvm/Analysis/TargetTransformInfo.h"
  52 #include "llvm/Analysis/ValueTracking.h"
  53 #include "llvm/IR/DataLayout.h"
  54 #include "llvm/IR/Dominators.h"
  55 #include "llvm/IR/IRBuilder.h"
  56 #include "llvm/IR/IntrinsicInst.h"
  57 #include "llvm/IR/Module.h"
  58 #include "llvm/Support/Debug.h"
  59 #include "llvm/Support/raw_ostream.h"
  60 #include "llvm/Target/TargetLibraryInfo.h"
  61 #include "llvm/Transforms/Utils/Local.h"
  62 using namespace llvm;
  63
  64 STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
  65 STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
  66
  67 namespace {
  68
  69   class LoopIdiomRecognize;
  70
  71   /// This class defines some utility functions for loop idiom recognization.
  72   class LIRUtil {
  73   public:
  74     /// Return true iff the block contains nothing but an uncondition branch
  75     /// (aka goto instruction).
  76     static bool isAlmostEmpty(BasicBlock *);
  77
  78     static BranchInst *getBranch(BasicBlock *BB) {
  79       return dyn_cast<BranchInst>(BB->getTerminator());
  80     }
  81
  82     /// Derive the precondition block (i.e the block that guards the loop
  83     /// preheader) from the given preheader.
  84     static BasicBlock *getPrecondBb(BasicBlock *PreHead);
  85   };
  86
  87   /// This class is to recoginize idioms of population-count conducted in
  88   /// a noncountable loop. Currently it only recognizes this pattern:
  89   /// \code
  90   ///   while(x) {cnt++; ...; x &= x - 1; ...}
  91   /// \endcode
  92   class NclPopcountRecognize {
  93     LoopIdiomRecognize &LIR;
  94     Loop *CurLoop;
  95     BasicBlock *PreCondBB;
  96
  97     typedef IRBuilder<> IRBuilderTy;
  98
  99   public:
 100     explicit NclPopcountRecognize(LoopIdiomRecognize &TheLIR);
 101     bool recognize();
 102
 103   private:
 104     /// Take a glimpse of the loop to see if we need to go ahead recoginizing
 105     /// the idiom.
 106     bool preliminaryScreen();
 107
 108     /// Check if the given conditional branch is based on the comparison
 109     /// between a variable and zero, and if the variable is non-zero, the
 110     /// control yields to the loop entry. If the branch matches the behavior,
 111     /// the variable involved in the comparion is returned. This function will
 112     /// be called to see if the precondition and postcondition of the loop
 113     /// are in desirable form.
 114     Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const;
 115
 116     /// Return true iff the idiom is detected in the loop. and 1) \p CntInst
 117     /// is set to the instruction counting the pupulation bit. 2) \p CntPhi
 118     /// is set to the corresponding phi node. 3) \p Var is set to the value
 119     /// whose population bits are being counted.
 120     bool detectIdiom
 121       (Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const;
 122
 123     /// Insert ctpop intrinsic function and some obviously dead instructions.
 124     void transform (Instruction *CntInst, PHINode *CntPhi, Value *Var);
 125
 126     /// Create llvm.ctpop.* intrinsic function.
 127     CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL);
 128   };
 129
 130   class LoopIdiomRecognize : public LoopPass {
 131     Loop *CurLoop;
 132     const DataLayout *DL;
 133     DominatorTree *DT;
 134     ScalarEvolution *SE;
 135     TargetLibraryInfo *TLI;
 136     const TargetTransformInfo *TTI;
 137   public:
 138     static char ID;
 139     explicit LoopIdiomRecognize() : LoopPass(ID) {
 140       initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
 141       DL = 0; DT = 0; SE = 0; TLI = 0; TTI = 0;
 142     }
 143
 144     bool runOnLoop(Loop *L, LPPassManager &LPM) override;
 145     bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
 146                         SmallVectorImpl<BasicBlock*> &ExitBlocks);
 147
 148     bool processLoopStore(StoreInst *SI, const SCEV *BECount);
 149     bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
 150
 151     bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
 152                                  unsigned StoreAlignment,
 153                                  Value *SplatValue, Instruction *TheStore,
 154                                  const SCEVAddRecExpr *Ev,
 155                                  const SCEV *BECount);
 156     bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
 157                                     const SCEVAddRecExpr *StoreEv,
 158                                     const SCEVAddRecExpr *LoadEv,
 159                                     const SCEV *BECount);
 160
 161     /// This transformation requires natural loop information & requires that
 162     /// loop preheaders be inserted into the CFG.
 163     ///
 164     void getAnalysisUsage(AnalysisUsage &AU) const override {
 165       AU.addRequired<LoopInfo>();
 166       AU.addPreserved<LoopInfo>();
 167       AU.addRequiredID(LoopSimplifyID);
 168       AU.addPreservedID(LoopSimplifyID);
 169       AU.addRequiredID(LCSSAID);
 170       AU.addPreservedID(LCSSAID);
 171       AU.addRequired<AliasAnalysis>();
 172       AU.addPreserved<AliasAnalysis>();
 173       AU.addRequired<ScalarEvolution>();
 174       AU.addPreserved<ScalarEvolution>();
 175       AU.addPreserved<DominatorTreeWrapperPass>();
 176       AU.addRequired<DominatorTreeWrapperPass>();
 177       AU.addRequired<TargetLibraryInfo>();
 178       AU.addRequired<TargetTransformInfo>();
 179     }
 180
 181     const DataLayout *getDataLayout() {
 182       if (DL)
 183         return DL;
 184       DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
 185       DL = DLP ? &DLP->getDataLayout() : 0;
 186       return DL;
 187     }
 188
 189     DominatorTree *getDominatorTree() {
 190       return DT ? DT
 191                 : (DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree());
 192     }
 193
 194     ScalarEvolution *getScalarEvolution() {
 195       return SE ? SE : (SE = &getAnalysis<ScalarEvolution>());
 196     }
 197
 198     TargetLibraryInfo *getTargetLibraryInfo() {
 199       return TLI ? TLI : (TLI = &getAnalysis<TargetLibraryInfo>());
 200     }
 201
 202     const TargetTransformInfo *getTargetTransformInfo() {
 203       return TTI ? TTI : (TTI = &getAnalysis<TargetTransformInfo>());
 204     }
 205
 206     Loop *getLoop() const { return CurLoop; }
 207
 208   private:
 209     bool runOnNoncountableLoop();
 210     bool runOnCountableLoop();
 211   };
 212 }
 213
 214 char LoopIdiomRecognize::ID = 0;
 215 INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
 216                       false, false)
 217 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
 218 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 219 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 220 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 221 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 222 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 223 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 224 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
 225 INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
 226                     false, false)
 227
 228 Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); }
 229
 230 /// deleteDeadInstruction - Delete this instruction.  Before we do, go through
 231 /// and zero out all the operands of this instruction.  If any of them become
 232 /// dead, delete them and the computation tree that feeds them.
 233 ///
 234 static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE,
 235                                   const TargetLibraryInfo *TLI) {
 236   SmallVector<Instruction*, 32> NowDeadInsts;
 237
 238   NowDeadInsts.push_back(I);
 239
 240   // Before we touch this instruction, remove it from SE!
 241   do {
 242     Instruction *DeadInst = NowDeadInsts.pop_back_val();
 243
 244     // This instruction is dead, zap it, in stages.  Start by removing it from
 245     // SCEV.
 246     SE.forgetValue(DeadInst);
 247
 248     for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
 249       Value *Op = DeadInst->getOperand(op);
 250       DeadInst->setOperand(op, 0);
 251
 252       // If this operand just became dead, add it to the NowDeadInsts list.
 253       if (!Op->use_empty()) continue;
 254
 255       if (Instruction *OpI = dyn_cast<Instruction>(Op))
 256         if (isInstructionTriviallyDead(OpI, TLI))
 257           NowDeadInsts.push_back(OpI);
 258     }
 259
 260     DeadInst->eraseFromParent();
 261
 262   } while (!NowDeadInsts.empty());
 263 }
 264
 265 /// deleteIfDeadInstruction - If the specified value is a dead instruction,
 266 /// delete it and any recursively used instructions.
 267 static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
 268                                     const TargetLibraryInfo *TLI) {
 269   if (Instruction *I = dyn_cast<Instruction>(V))
 270     if (isInstructionTriviallyDead(I, TLI))
 271       deleteDeadInstruction(I, SE, TLI);
 272 }
 273
 274 //===----------------------------------------------------------------------===//
 275 //
 276 //          Implementation of LIRUtil
 277 //
 278 //===----------------------------------------------------------------------===//
 279
 280 // This function will return true iff the given block contains nothing but goto.
 281 // A typical usage of this function is to check if the preheader function is
 282 // "almost" empty such that generated intrinsic functions can be moved across
 283 // the preheader and be placed at the end of the precondition block without
 284 // the concern of breaking data dependence.
 285 bool LIRUtil::isAlmostEmpty(BasicBlock *BB) {
 286   if (BranchInst *Br = getBranch(BB)) {
 287     return Br->isUnconditional() && BB->size() == 1;
 288   }
 289   return false;
 290 }
 291
 292 BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) {
 293   if (BasicBlock *BB = PreHead->getSinglePredecessor()) {
 294     BranchInst *Br = getBranch(BB);
 295     return Br && Br->isConditional() ? BB : 0;
 296   }
 297   return 0;
 298 }
 299
 300 //===----------------------------------------------------------------------===//
 301 //
 302 //          Implementation of NclPopcountRecognize
 303 //
 304 //===----------------------------------------------------------------------===//
 305
 306 NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR):
 307   LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(0) {
 308 }
 309
 310 bool NclPopcountRecognize::preliminaryScreen() {
 311   const TargetTransformInfo *TTI = LIR.getTargetTransformInfo();
 312   if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
 313     return false;
 314
 315   // Counting population are usually conducted by few arithmetic instructions.
 316   // Such instructions can be easilly "absorbed" by vacant slots in a
 317   // non-compact loop. Therefore, recognizing popcount idiom only makes sense
 318   // in a compact loop.
 319
 320   // Give up if the loop has multiple blocks or multiple backedges.
 321   if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
 322     return false;
 323
 324   BasicBlock *LoopBody = *(CurLoop->block_begin());
 325   if (LoopBody->size() >= 20) {
 326     // The loop is too big, bail out.
 327     return false;
 328   }
 329
 330   // It should have a preheader containing nothing but a goto instruction.
 331   BasicBlock *PreHead = CurLoop->getLoopPreheader();
 332   if (!PreHead || !LIRUtil::isAlmostEmpty(PreHead))
 333     return false;
 334
 335   // It should have a precondition block where the generated popcount instrinsic
 336   // function will be inserted.
 337   PreCondBB = LIRUtil::getPrecondBb(PreHead);
 338   if (!PreCondBB)
 339     return false;
 340
 341   return true;
 342 }
 343
 344 Value *NclPopcountRecognize::matchCondition (BranchInst *Br,
 345                                              BasicBlock *LoopEntry) const {
 346   if (!Br || !Br->isConditional())
 347     return 0;
 348
 349   ICmpInst *Cond = dyn_cast<ICmpInst>(Br->getCondition());
 350   if (!Cond)
 351     return 0;
 352
 353   ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
 354   if (!CmpZero || !CmpZero->isZero())
 355     return 0;
 356
 357   ICmpInst::Predicate Pred = Cond->getPredicate();
 358   if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) ||
 359       (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry))
 360     return Cond->getOperand(0);
 361
 362   return 0;
 363 }
 364
 365 bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst,
 366                                        PHINode *&CntPhi,
 367                                        Value *&Var) const {
 368   // Following code tries to detect this idiom:
 369   //
 370   //    if (x0 != 0)
 371   //      goto loop-exit // the precondition of the loop
 372   //    cnt0 = init-val;
 373   //    do {
 374   //       x1 = phi (x0, x2);
 375   //       cnt1 = phi(cnt0, cnt2);
 376   //
 377   //       cnt2 = cnt1 + 1;
 378   //        ...
 379   //       x2 = x1 & (x1 - 1);
 380   //        ...
 381   //    } while(x != 0);
 382   //
 383   // loop-exit:
 384   //
 385
 386   // step 1: Check to see if the look-back branch match this pattern:
 387   //    "if (a!=0) goto loop-entry".
 388   BasicBlock *LoopEntry;
 389   Instruction *DefX2, *CountInst;
 390   Value *VarX1, *VarX0;
 391   PHINode *PhiX, *CountPhi;
 392
 393   DefX2 = CountInst = 0;
 394   VarX1 = VarX0 = 0;
 395   PhiX = CountPhi = 0;
 396   LoopEntry = *(CurLoop->block_begin());
 397
 398   // step 1: Check if the loop-back branch is in desirable form.
 399   {
 400     if (Value *T = matchCondition (LIRUtil::getBranch(LoopEntry), LoopEntry))
 401       DefX2 = dyn_cast<Instruction>(T);
 402     else
 403       return false;
 404   }
 405
 406   // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
 407   {
 408     if (!DefX2 || DefX2->getOpcode() != Instruction::And)
 409       return false;
 410
 411     BinaryOperator *SubOneOp;
 412
 413     if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
 414       VarX1 = DefX2->getOperand(1);
 415     else {
 416       VarX1 = DefX2->getOperand(0);
 417       SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
 418     }
 419     if (!SubOneOp)
 420       return false;
 421
 422     Instruction *SubInst = cast<Instruction>(SubOneOp);
 423     ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
 424     if (!Dec ||
 425         !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
 426           (SubInst->getOpcode() == Instruction::Add && Dec->isAllOnesValue()))) {
 427       return false;
 428     }
 429   }
 430
 431   // step 3: Check the recurrence of variable X
 432   {
 433     PhiX = dyn_cast<PHINode>(VarX1);
 434     if (!PhiX ||
 435         (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) {
 436       return false;
 437     }
 438   }
 439
 440   // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
 441   {
 442     CountInst = NULL;
 443     for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(),
 444            IterE = LoopEntry->end(); Iter != IterE; Iter++) {
 445       Instruction *Inst = Iter;
 446       if (Inst->getOpcode() != Instruction::Add)
 447         continue;
 448
 449       ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
 450       if (!Inc || !Inc->isOne())
 451         continue;
 452
 453       PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
 454       if (!Phi || Phi->getParent() != LoopEntry)
 455         continue;
 456
 457       // Check if the result of the instruction is live of the loop.
 458       bool LiveOutLoop = false;
 459       for (User *U : Inst->users()) {
 460         if ((cast<Instruction>(U))->getParent() != LoopEntry) {
 461           LiveOutLoop = true; break;
 462         }
 463       }
 464
 465       if (LiveOutLoop) {
 466         CountInst = Inst;
 467         CountPhi = Phi;
 468         break;
 469       }
 470     }
 471
 472     if (!CountInst)
 473       return false;
 474   }
 475
 476   // step 5: check if the precondition is in this form:
 477   //   "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
 478   {
 479     BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB);
 480     Value *T = matchCondition (PreCondBr, CurLoop->getLoopPreheader());
 481     if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
 482       return false;
 483
 484     CntInst = CountInst;
 485     CntPhi = CountPhi;
 486     Var = T;
 487   }
 488
 489   return true;
 490 }
 491
 492 void NclPopcountRecognize::transform(Instruction *CntInst,
 493                                      PHINode *CntPhi, Value *Var) {
 494
 495   ScalarEvolution *SE = LIR.getScalarEvolution();
 496   TargetLibraryInfo *TLI = LIR.getTargetLibraryInfo();
 497   BasicBlock *PreHead = CurLoop->getLoopPreheader();
 498   BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB);
 499   const DebugLoc DL = CntInst->getDebugLoc();
 500
 501   // Assuming before transformation, the loop is following:
 502   //  if (x) // the precondition
 503   //     do { cnt++; x &= x - 1; } while(x);
 504
 505   // Step 1: Insert the ctpop instruction at the end of the precondition block
 506   IRBuilderTy Builder(PreCondBr);
 507   Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
 508   {
 509     PopCnt = createPopcntIntrinsic(Builder, Var, DL);
 510     NewCount = PopCntZext =
 511       Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
 512
 513     if (NewCount != PopCnt)
 514       (cast<Instruction>(NewCount))->setDebugLoc(DL);
 515
 516     // TripCnt is exactly the number of iterations the loop has
 517     TripCnt = NewCount;
 518
 519     // If the population counter's initial value is not zero, insert Add Inst.
 520     Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
 521     ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
 522     if (!InitConst || !InitConst->isZero()) {
 523       NewCount = Builder.CreateAdd(NewCount, CntInitVal);
 524       (cast<Instruction>(NewCount))->setDebugLoc(DL);
 525     }
 526   }
 527
 528   // Step 2: Replace the precondition from "if(x == 0) goto loop-exit" to
 529   //   "if(NewCount == 0) loop-exit". Withtout this change, the intrinsic
 530   //   function would be partial dead code, and downstream passes will drag
 531   //   it back from the precondition block to the preheader.
 532   {
 533     ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
 534
 535     Value *Opnd0 = PopCntZext;
 536     Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
 537     if (PreCond->getOperand(0) != Var)
 538       std::swap(Opnd0, Opnd1);
 539
 540     ICmpInst *NewPreCond =
 541       cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
 542     PreCond->replaceAllUsesWith(NewPreCond);
 543
 544     deleteDeadInstruction(PreCond, *SE, TLI);
 545   }
 546
 547   // Step 3: Note that the population count is exactly the trip count of the
 548   // loop in question, which enble us to to convert the loop from noncountable
 549   // loop into a countable one. The benefit is twofold:
 550   //
 551   //  - If the loop only counts population, the entire loop become dead after
 552   //    the transformation. It is lots easier to prove a countable loop dead
 553   //    than to prove a noncountable one. (In some C dialects, a infite loop
 554   //    isn't dead even if it computes nothing useful. In general, DCE needs
 555   //    to prove a noncountable loop finite before safely delete it.)
 556   //
 557   //  - If the loop also performs something else, it remains alive.
 558   //    Since it is transformed to countable form, it can be aggressively
 559   //    optimized by some optimizations which are in general not applicable
 560   //    to a noncountable loop.
 561   //
 562   // After this step, this loop (conceptually) would look like following:
 563   //   newcnt = __builtin_ctpop(x);
 564   //   t = newcnt;
 565   //   if (x)
 566   //     do { cnt++; x &= x-1; t--) } while (t > 0);
 567   BasicBlock *Body = *(CurLoop->block_begin());
 568   {
 569     BranchInst *LbBr = LIRUtil::getBranch(Body);
 570     ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
 571     Type *Ty = TripCnt->getType();
 572
 573     PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", Body->begin());
 574
 575     Builder.SetInsertPoint(LbCond);
 576     Value *Opnd1 = cast<Value>(TcPhi);
 577     Value *Opnd2 = cast<Value>(ConstantInt::get(Ty, 1));
 578     Instruction *TcDec =
 579       cast<Instruction>(Builder.CreateSub(Opnd1, Opnd2, "tcdec", false, true));
 580
 581     TcPhi->addIncoming(TripCnt, PreHead);
 582     TcPhi->addIncoming(TcDec, Body);
 583
 584     CmpInst::Predicate Pred = (LbBr->getSuccessor(0) == Body) ?
 585       CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
 586     LbCond->setPredicate(Pred);
 587     LbCond->setOperand(0, TcDec);
 588     LbCond->setOperand(1, cast<Value>(ConstantInt::get(Ty, 0)));
 589   }
 590
 591   // Step 4: All the references to the original population counter outside
 592   //  the loop are replaced with the NewCount -- the value returned from
 593   //  __builtin_ctpop().
 594   {
 595     SmallVector<Value *, 4> CntUses;
 596     for (User *U : CntInst->users())
 597       if (cast<Instruction>(U)->getParent() != Body)
 598         CntUses.push_back(U);
 599     for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) {
 600       (cast<Instruction>(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount);
 601     }
 602   }
 603
 604   // step 5: Forget the "non-computable" trip-count SCEV associated with the
 605   //   loop. The loop would otherwise not be deleted even if it becomes empty.
 606   SE->forgetLoop(CurLoop);
 607 }
 608
 609 CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder,
 610                                                       Value *Val, DebugLoc DL) {
 611   Value *Ops[] = { Val };
 612   Type *Tys[] = { Val->getType() };
 613
 614   Module *M = (*(CurLoop->block_begin()))->getParent()->getParent();
 615   Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
 616   CallInst *CI = IRBuilder.CreateCall(Func, Ops);
 617   CI->setDebugLoc(DL);
 618
 619   return CI;
 620 }
 621
 622 /// recognize - detect population count idiom in a non-countable loop. If
 623 ///   detected, transform the relevant code to popcount intrinsic function
 624 ///   call, and return true; otherwise, return false.
 625 bool NclPopcountRecognize::recognize() {
 626
 627   if (!LIR.getTargetTransformInfo())
 628     return false;
 629
 630   LIR.getScalarEvolution();
 631
 632   if (!preliminaryScreen())
 633     return false;
 634
 635   Instruction *CntInst;
 636   PHINode *CntPhi;
 637   Value *Val;
 638   if (!detectIdiom(CntInst, CntPhi, Val))
 639     return false;
 640
 641   transform(CntInst, CntPhi, Val);
 642   return true;
 643 }
 644
 645 //===----------------------------------------------------------------------===//
 646 //
 647 //          Implementation of LoopIdiomRecognize
 648 //
 649 //===----------------------------------------------------------------------===//
 650
 651 bool LoopIdiomRecognize::runOnCountableLoop() {
 652   const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
 653   if (isa<SCEVCouldNotCompute>(BECount)) return false;
 654
 655   // If this loop executes exactly one time, then it should be peeled, not
 656   // optimized by this pass.
 657   if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
 658     if (BECst->getValue()->getValue() == 0)
 659       return false;
 660
 661   // We require target data for now.
 662   if (!getDataLayout())
 663     return false;
 664
 665   // set DT
 666   (void)getDominatorTree();
 667
 668   LoopInfo &LI = getAnalysis<LoopInfo>();
 669   TLI = &getAnalysis<TargetLibraryInfo>();
 670
 671   // set TLI
 672   (void)getTargetLibraryInfo();
 673
 674   SmallVector<BasicBlock*, 8> ExitBlocks;
 675   CurLoop->getUniqueExitBlocks(ExitBlocks);
 676
 677   DEBUG(dbgs() << "loop-idiom Scanning: F["
 678                << CurLoop->getHeader()->getParent()->getName()
 679                << "] Loop %" << CurLoop->getHeader()->getName() << "\n");
 680
 681   bool MadeChange = false;
 682   // Scan all the blocks in the loop that are not in subloops.
 683   for (Loop::block_iterator BI = CurLoop->block_begin(),
 684          E = CurLoop->block_end(); BI != E; ++BI) {
 685     // Ignore blocks in subloops.
 686     if (LI.getLoopFor(*BI) != CurLoop)
 687       continue;
 688
 689     MadeChange |= runOnLoopBlock(*BI, BECount, ExitBlocks);
 690   }
 691   return MadeChange;
 692 }
 693
 694 bool LoopIdiomRecognize::runOnNoncountableLoop() {
 695   NclPopcountRecognize Popcount(*this);
 696   if (Popcount.recognize())
 697     return true;
 698
 699   return false;
 700 }
 701
 702 bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
 703   if (skipOptnoneFunction(L))
 704     return false;
 705
 706   CurLoop = L;
 707
 708   // If the loop could not be converted to canonical form, it must have an
 709   // indirectbr in it, just give up.
 710   if (!L->getLoopPreheader())
 711     return false;
 712
 713   // Disable loop idiom recognition if the function's name is a common idiom.
 714   StringRef Name = L->getHeader()->getParent()->getName();
 715   if (Name == "memset" || Name == "memcpy")
 716     return false;
 717
 718   SE = &getAnalysis<ScalarEvolution>();
 719   if (SE->hasLoopInvariantBackedgeTakenCount(L))
 720     return runOnCountableLoop();
 721   return runOnNoncountableLoop();
 722 }
 723
 724 /// runOnLoopBlock - Process the specified block, which lives in a counted loop
 725 /// with the specified backedge count.  This block is known to be in the current
 726 /// loop and not in any subloops.
 727 bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
 728                                      SmallVectorImpl<BasicBlock*> &ExitBlocks) {
 729   // We can only promote stores in this block if they are unconditionally
 730   // executed in the loop.  For a block to be unconditionally executed, it has
 731   // to dominate all the exit blocks of the loop.  Verify this now.
 732   for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
 733     if (!DT->dominates(BB, ExitBlocks[i]))
 734       return false;
 735
 736   bool MadeChange = false;
 737   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
 738     Instruction *Inst = I++;
 739     // Look for store instructions, which may be optimized to memset/memcpy.
 740     if (StoreInst *SI = dyn_cast<StoreInst>(Inst))  {
 741       WeakVH InstPtr(I);
 742       if (!processLoopStore(SI, BECount)) continue;
 743       MadeChange = true;
 744
 745       // If processing the store invalidated our iterator, start over from the
 746       // top of the block.
 747       if (InstPtr == 0)
 748         I = BB->begin();
 749       continue;
 750     }
 751
 752     // Look for memset instructions, which may be optimized to a larger memset.
 753     if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst))  {
 754       WeakVH InstPtr(I);
 755       if (!processLoopMemSet(MSI, BECount)) continue;
 756       MadeChange = true;
 757
 758       // If processing the memset invalidated our iterator, start over from the
 759       // top of the block.
 760       if (InstPtr == 0)
 761         I = BB->begin();
 762       continue;
 763     }
 764   }
 765
 766   return MadeChange;
 767 }
 768
 769
 770 /// processLoopStore - See if this store can be promoted to a memset or memcpy.
 771 bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) {
 772   if (!SI->isSimple()) return false;
 773
 774   Value *StoredVal = SI->getValueOperand();
 775   Value *StorePtr = SI->getPointerOperand();
 776
 777   // Reject stores that are so large that they overflow an unsigned.
 778   uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
 779   if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
 780     return false;
 781
 782   // See if the pointer expression is an AddRec like {base,+,1} on the current
 783   // loop, which indicates a strided store.  If we have something else, it's a
 784   // random store we can't handle.
 785   const SCEVAddRecExpr *StoreEv =
 786     dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
 787   if (StoreEv == 0 || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
 788     return false;
 789
 790   // Check to see if the stride matches the size of the store.  If so, then we
 791   // know that every byte is touched in the loop.
 792   unsigned StoreSize = (unsigned)SizeInBits >> 3;
 793   const SCEVConstant *Stride = dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
 794
 795   if (Stride == 0 || StoreSize != Stride->getValue()->getValue()) {
 796     // TODO: Could also handle negative stride here someday, that will require
 797     // the validity check in mayLoopAccessLocation to be updated though.
 798     // Enable this to print exact negative strides.
 799     if (0 && Stride && StoreSize == -Stride->getValue()->getValue()) {
 800       dbgs() << "NEGATIVE STRIDE: " << *SI << "\n";
 801       dbgs() << "BB: " << *SI->getParent();
 802     }
 803
 804     return false;
 805   }
 806
 807   // See if we can optimize just this store in isolation.
 808   if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(),
 809                               StoredVal, SI, StoreEv, BECount))
 810     return true;
 811
 812   // If the stored value is a strided load in the same loop with the same stride
 813   // this this may be transformable into a memcpy.  This kicks in for stuff like
 814   //   for (i) A[i] = B[i];
 815   if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
 816     const SCEVAddRecExpr *LoadEv =
 817       dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getOperand(0)));
 818     if (LoadEv && LoadEv->getLoop() == CurLoop && LoadEv->isAffine() &&
 819         StoreEv->getOperand(1) == LoadEv->getOperand(1) && LI->isSimple())
 820       if (processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, LoadEv, BECount))
 821         return true;
 822   }
 823   //errs() << "UNHANDLED strided store: " << *StoreEv << " - " << *SI << "\n";
 824
 825   return false;
 826 }
 827
 828 /// processLoopMemSet - See if this memset can be promoted to a large memset.
 829 bool LoopIdiomRecognize::
 830 processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) {
 831   // We can only handle non-volatile memsets with a constant size.
 832   if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) return false;
 833
 834   // If we're not allowed to hack on memset, we fail.
 835   if (!TLI->has(LibFunc::memset))
 836     return false;
 837
 838   Value *Pointer = MSI->getDest();
 839
 840   // See if the pointer expression is an AddRec like {base,+,1} on the current
 841   // loop, which indicates a strided store.  If we have something else, it's a
 842   // random store we can't handle.
 843   const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
 844   if (Ev == 0 || Ev->getLoop() != CurLoop || !Ev->isAffine())
 845     return false;
 846
 847   // Reject memsets that are so large that they overflow an unsigned.
 848   uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
 849   if ((SizeInBytes >> 32) != 0)
 850     return false;
 851
 852   // Check to see if the stride matches the size of the memset.  If so, then we
 853   // know that every byte is touched in the loop.
 854   const SCEVConstant *Stride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
 855
 856   // TODO: Could also handle negative stride here someday, that will require the
 857   // validity check in mayLoopAccessLocation to be updated though.
 858   if (Stride == 0 || MSI->getLength() != Stride->getValue())
 859     return false;
 860
 861   return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
 862                                  MSI->getAlignment(), MSI->getValue(),
 863                                  MSI, Ev, BECount);
 864 }
 865
 866
 867 /// mayLoopAccessLocation - Return true if the specified loop might access the
 868 /// specified pointer location, which is a loop-strided access.  The 'Access'
 869 /// argument specifies what the verboten forms of access are (read or write).
 870 static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access,
 871                                   Loop *L, const SCEV *BECount,
 872                                   unsigned StoreSize, AliasAnalysis &AA,
 873                                   Instruction *IgnoredStore) {
 874   // Get the location that may be stored across the loop.  Since the access is
 875   // strided positively through memory, we say that the modified location starts
 876   // at the pointer and has infinite size.
 877   uint64_t AccessSize = AliasAnalysis::UnknownSize;
 878
 879   // If the loop iterates a fixed number of times, we can refine the access size
 880   // to be exactly the size of the memset, which is (BECount+1)*StoreSize
 881   if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
 882     AccessSize = (BECst->getValue()->getZExtValue()+1)*StoreSize;
 883
 884   // TODO: For this to be really effective, we have to dive into the pointer
 885   // operand in the store.  Store to &A[i] of 100 will always return may alias
 886   // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
 887   // which will then no-alias a store to &A[100].
 888   AliasAnalysis::Location StoreLoc(Ptr, AccessSize);
 889
 890   for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
 891        ++BI)
 892     for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I)
 893       if (&*I != IgnoredStore &&
 894           (AA.getModRefInfo(I, StoreLoc) & Access))
 895         return true;
 896
 897   return false;
 898 }
 899
 900 /// getMemSetPatternValue - If a strided store of the specified value is safe to
 901 /// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
 902 /// be passed in.  Otherwise, return null.
 903 ///
 904 /// Note that we don't ever attempt to use memset_pattern8 or 4, because these
 905 /// just replicate their input array and then pass on to memset_pattern16.
 906 static Constant *getMemSetPatternValue(Value *V, const DataLayout &DL) {
 907   // If the value isn't a constant, we can't promote it to being in a constant
 908   // array.  We could theoretically do a store to an alloca or something, but
 909   // that doesn't seem worthwhile.
 910   Constant *C = dyn_cast<Constant>(V);
 911   if (C == 0) return 0;
 912
 913   // Only handle simple values that are a power of two bytes in size.
 914   uint64_t Size = DL.getTypeSizeInBits(V->getType());
 915   if (Size == 0 || (Size & 7) || (Size & (Size-1)))
 916     return 0;
 917
 918   // Don't care enough about darwin/ppc to implement this.
 919   if (DL.isBigEndian())
 920     return 0;
 921
 922   // Convert to size in bytes.
 923   Size /= 8;
 924
 925   // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
 926   // if the top and bottom are the same (e.g. for vectors and large integers).
 927   if (Size > 16) return 0;
 928
 929   // If the constant is exactly 16 bytes, just use it.
 930   if (Size == 16) return C;
 931
 932   // Otherwise, we'll use an array of the constants.
 933   unsigned ArraySize = 16/Size;
 934   ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
 935   return ConstantArray::get(AT, std::vector<Constant*>(ArraySize, C));
 936 }
 937
 938
 939 /// processLoopStridedStore - We see a strided store of some value.  If we can
 940 /// transform this into a memset or memset_pattern in the loop preheader, do so.
 941 bool LoopIdiomRecognize::
 942 processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
 943                         unsigned StoreAlignment, Value *StoredVal,
 944                         Instruction *TheStore, const SCEVAddRecExpr *Ev,
 945                         const SCEV *BECount) {
 946
 947   // If the stored value is a byte-wise value (like i32 -1), then it may be
 948   // turned into a memset of i8 -1, assuming that all the consecutive bytes
 949   // are stored.  A store of i32 0x01020304 can never be turned into a memset,
 950   // but it can be turned into memset_pattern if the target supports it.
 951   Value *SplatValue = isBytewiseValue(StoredVal);
 952   Constant *PatternValue = 0;
 953
 954   unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
 955
 956   // If we're allowed to form a memset, and the stored value would be acceptable
 957   // for memset, use it.
 958   if (SplatValue && TLI->has(LibFunc::memset) &&
 959       // Verify that the stored value is loop invariant.  If not, we can't
 960       // promote the memset.
 961       CurLoop->isLoopInvariant(SplatValue)) {
 962     // Keep and use SplatValue.
 963     PatternValue = 0;
 964   } else if (DestAS == 0 &&
 965              TLI->has(LibFunc::memset_pattern16) &&
 966              (PatternValue = getMemSetPatternValue(StoredVal, *DL))) {
 967     // Don't create memset_pattern16s with address spaces.
 968     // It looks like we can use PatternValue!
 969     SplatValue = 0;
 970   } else {
 971     // Otherwise, this isn't an idiom we can transform.  For example, we can't
 972     // do anything with a 3-byte store.
 973     return false;
 974   }
 975
 976   // The trip count of the loop and the base pointer of the addrec SCEV is
 977   // guaranteed to be loop invariant, which means that it should dominate the
 978   // header.  This allows us to insert code for it in the preheader.
 979   BasicBlock *Preheader = CurLoop->getLoopPreheader();
 980   IRBuilder<> Builder(Preheader->getTerminator());
 981   SCEVExpander Expander(*SE, "loop-idiom");
 982
 983   Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
 984
 985   // Okay, we have a strided store "p[i]" of a splattable value.  We can turn
 986   // this into a memset in the loop preheader now if we want.  However, this
 987   // would be unsafe to do if there is anything else in the loop that may read
 988   // or write to the aliased location.  Check for any overlap by generating the
 989   // base pointer and checking the region.
 990   Value *BasePtr =
 991     Expander.expandCodeFor(Ev->getStart(), DestInt8PtrTy,
 992                            Preheader->getTerminator());
 993
 994   if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef,
 995                             CurLoop, BECount,
 996                             StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) {
 997     Expander.clear();
 998     // If we generated new code for the base pointer, clean up.
 999     deleteIfDeadInstruction(BasePtr, *SE, TLI);
1000     return false;
1001   }
1002
1003   // Okay, everything looks good, insert the memset.
1004
1005   // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
1006   // pointer size if it isn't already.
1007   Type *IntPtr = Builder.getIntPtrTy(DL, DestAS);
1008   BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
1009
1010   const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1),
1011                                          SCEV::FlagNUW);
1012   if (StoreSize != 1) {
1013     NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
1014                                SCEV::FlagNUW);
1015   }
1016
1017   Value *NumBytes =
1018     Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
1019
1020   CallInst *NewCall;
1021   if (SplatValue) {
1022     NewCall = Builder.CreateMemSet(BasePtr,
1023                                    SplatValue,
1024                                    NumBytes,
1025                                    StoreAlignment);
1026   } else {
1027     // Everything is emitted in default address space
1028     Type *Int8PtrTy = DestInt8PtrTy;
1029
1030     Module *M = TheStore->getParent()->getParent()->getParent();
1031     Value *MSP = M->getOrInsertFunction("memset_pattern16",
1032                                         Builder.getVoidTy(),
1033                                         Int8PtrTy,
1034                                         Int8PtrTy,
1035                                         IntPtr,
1036                                         (void*)0);
1037
1038     // Otherwise we should form a memset_pattern16.  PatternValue is known to be
1039     // an constant array of 16-bytes.  Plop the value into a mergable global.
1040     GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
1041                                             GlobalValue::InternalLinkage,
1042                                             PatternValue, ".memset_pattern");
1043     GV->setUnnamedAddr(true); // Ok to merge these.
1044     GV->setAlignment(16);
1045     Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
1046     NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes);
1047   }
1048
1049   DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
1050                << "    from store to: " << *Ev << " at: " << *TheStore << "\n");
1051   NewCall->setDebugLoc(TheStore->getDebugLoc());
1052
1053   // Okay, the memset has been formed.  Zap the original store and anything that
1054   // feeds into it.
1055   deleteDeadInstruction(TheStore, *SE, TLI);
1056   ++NumMemSet;
1057   return true;
1058 }
1059
1060 /// processLoopStoreOfLoopLoad - We see a strided store whose value is a
1061 /// same-strided load.
1062 bool LoopIdiomRecognize::
1063 processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
1064                            const SCEVAddRecExpr *StoreEv,
1065                            const SCEVAddRecExpr *LoadEv,
1066                            const SCEV *BECount) {
1067   // If we're not allowed to form memcpy, we fail.
1068   if (!TLI->has(LibFunc::memcpy))
1069     return false;
1070
1071   LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
1072
1073   // The trip count of the loop and the base pointer of the addrec SCEV is
1074   // guaranteed to be loop invariant, which means that it should dominate the
1075   // header.  This allows us to insert code for it in the preheader.
1076   BasicBlock *Preheader = CurLoop->getLoopPreheader();
1077   IRBuilder<> Builder(Preheader->getTerminator());
1078   SCEVExpander Expander(*SE, "loop-idiom");
1079
1080   // Okay, we have a strided store "p[i]" of a loaded value.  We can turn
1081   // this into a memcpy in the loop preheader now if we want.  However, this
1082   // would be unsafe to do if there is anything else in the loop that may read
1083   // or write the memory region we're storing to.  This includes the load that
1084   // feeds the stores.  Check for an alias by generating the base address and
1085   // checking everything.
1086   Value *StoreBasePtr =
1087     Expander.expandCodeFor(StoreEv->getStart(),
1088                            Builder.getInt8PtrTy(SI->getPointerAddressSpace()),
1089                            Preheader->getTerminator());
1090
1091   if (mayLoopAccessLocation(StoreBasePtr, AliasAnalysis::ModRef,
1092                             CurLoop, BECount, StoreSize,
1093                             getAnalysis<AliasAnalysis>(), SI)) {
1094     Expander.clear();
1095     // If we generated new code for the base pointer, clean up.
1096     deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
1097     return false;
1098   }
1099
1100   // For a memcpy, we have to make sure that the input array is not being
1101   // mutated by the loop.
1102   Value *LoadBasePtr =
1103     Expander.expandCodeFor(LoadEv->getStart(),
1104                            Builder.getInt8PtrTy(LI->getPointerAddressSpace()),
1105                            Preheader->getTerminator());
1106
1107   if (mayLoopAccessLocation(LoadBasePtr, AliasAnalysis::Mod, CurLoop, BECount,
1108                             StoreSize, getAnalysis<AliasAnalysis>(), SI)) {
1109     Expander.clear();
1110     // If we generated new code for the base pointer, clean up.
1111     deleteIfDeadInstruction(LoadBasePtr, *SE, TLI);
1112     deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
1113     return false;
1114   }
1115
1116   // Okay, everything is safe, we can transform this!
1117
1118
1119   // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
1120   // pointer size if it isn't already.
1121   Type *IntPtrTy = Builder.getIntPtrTy(DL, SI->getPointerAddressSpace());
1122   BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
1123
1124   const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1),
1125                                          SCEV::FlagNUW);
1126   if (StoreSize != 1)
1127     NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
1128                                SCEV::FlagNUW);
1129
1130   Value *NumBytes =
1131     Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
1132
1133   CallInst *NewCall =
1134     Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes,
1135                          std::min(SI->getAlignment(), LI->getAlignment()));
1136   NewCall->setDebugLoc(SI->getDebugLoc());
1137
1138   DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
1139                << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"
1140                << "    from store ptr=" << *StoreEv << " at: " << *SI << "\n");
1141
1142
1143   // Okay, the memset has been formed.  Zap the original store and anything that
1144   // feeds into it.
1145   deleteDeadInstruction(SI, *SE, TLI);
1146   ++NumMemCpy;
1147   return true;
1148 }