lib/CodeGen/CodeGenPrepare.cpp

   1 //===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This pass munges the code in the input function to better prepare it for
  11 // SelectionDAG-based code generation. This works around limitations in it's
  12 // basic-block-at-a-time approach. It should eventually be removed.
  13 //
  14 //===----------------------------------------------------------------------===//
  15
  16 #include "llvm/CodeGen/Passes.h"
  17 #include "llvm/ADT/DenseMap.h"
  18 #include "llvm/ADT/SmallSet.h"
  19 #include "llvm/ADT/Statistic.h"
  20 #include "llvm/Analysis/InstructionSimplify.h"
  21 #include "llvm/Analysis/TargetLibraryInfo.h"
  22 #include "llvm/Analysis/TargetTransformInfo.h"
  23 #include "llvm/Analysis/ValueTracking.h"
  24 #include "llvm/IR/CallSite.h"
  25 #include "llvm/IR/Constants.h"
  26 #include "llvm/IR/DataLayout.h"
  27 #include "llvm/IR/DerivedTypes.h"
  28 #include "llvm/IR/Dominators.h"
  29 #include "llvm/IR/Function.h"
  30 #include "llvm/IR/GetElementPtrTypeIterator.h"
  31 #include "llvm/IR/IRBuilder.h"
  32 #include "llvm/IR/InlineAsm.h"
  33 #include "llvm/IR/Instructions.h"
  34 #include "llvm/IR/IntrinsicInst.h"
  35 #include "llvm/IR/MDBuilder.h"
  36 #include "llvm/IR/PatternMatch.h"
  37 #include "llvm/IR/Statepoint.h"
  38 #include "llvm/IR/ValueHandle.h"
  39 #include "llvm/IR/ValueMap.h"
  40 #include "llvm/Pass.h"
  41 #include "llvm/Support/CommandLine.h"
  42 #include "llvm/Support/Debug.h"
  43 #include "llvm/Support/raw_ostream.h"
  44 #include "llvm/Target/TargetLowering.h"
  45 #include "llvm/Target/TargetSubtargetInfo.h"
  46 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  47 #include "llvm/Transforms/Utils/BuildLibCalls.h"
  48 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
  49 #include "llvm/Transforms/Utils/Local.h"
  50 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
  51 using namespace llvm;
  52 using namespace llvm::PatternMatch;
  53
  54 #define DEBUG_TYPE "codegenprepare"
  55
  56 STATISTIC(NumBlocksElim, "Number of blocks eliminated");
  57 STATISTIC(NumPHIsElim,   "Number of trivial PHIs eliminated");
  58 STATISTIC(NumGEPsElim,   "Number of GEPs converted to casts");
  59 STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of "
  60                       "sunken Cmps");
  61 STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "
  62                        "of sunken Casts");
  63 STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
  64                           "computations were sunk");
  65 STATISTIC(NumExtsMoved,  "Number of [s|z]ext instructions combined with loads");
  66 STATISTIC(NumExtUses,    "Number of uses of [s|z]ext instructions optimized");
  67 STATISTIC(NumRetsDup,    "Number of return instructions duplicated");
  68 STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
  69 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
  70 STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");
  71 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
  72
  73 static cl::opt<bool> DisableBranchOpts(
  74   "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
  75   cl::desc("Disable branch optimizations in CodeGenPrepare"));
  76
  77 static cl::opt<bool>
  78     DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),
  79                   cl::desc("Disable GC optimizations in CodeGenPrepare"));
  80
  81 static cl::opt<bool> DisableSelectToBranch(
  82   "disable-cgp-select2branch", cl::Hidden, cl::init(false),
  83   cl::desc("Disable select to branch conversion."));
  84
  85 static cl::opt<bool> AddrSinkUsingGEPs(
  86   "addr-sink-using-gep", cl::Hidden, cl::init(false),
  87   cl::desc("Address sinking in CGP using GEPs."));
  88
  89 static cl::opt<bool> EnableAndCmpSinking(
  90    "enable-andcmp-sinking", cl::Hidden, cl::init(true),
  91    cl::desc("Enable sinkinig and/cmp into branches."));
  92
  93 static cl::opt<bool> DisableStoreExtract(
  94     "disable-cgp-store-extract", cl::Hidden, cl::init(false),
  95     cl::desc("Disable store(extract) optimizations in CodeGenPrepare"));
  96
  97 static cl::opt<bool> StressStoreExtract(
  98     "stress-cgp-store-extract", cl::Hidden, cl::init(false),
  99     cl::desc("Stress test store(extract) optimizations in CodeGenPrepare"));
 100
 101 static cl::opt<bool> DisableExtLdPromotion(
 102     "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
 103     cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in "
 104              "CodeGenPrepare"));
 105
 106 static cl::opt<bool> StressExtLdPromotion(
 107     "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
 108     cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "
 109              "optimization in CodeGenPrepare"));
 110
 111 namespace {
 112 typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
 113 typedef PointerIntPair<Type *, 1, bool> TypeIsSExt;
 114 typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy;
 115 class TypePromotionTransaction;
 116
 117   class CodeGenPrepare : public FunctionPass {
 118     const TargetMachine *TM;
 119     const TargetLowering *TLI;
 120     const TargetTransformInfo *TTI;
 121     const TargetLibraryInfo *TLInfo;
 122
 123     /// As we scan instructions optimizing them, this is the next instruction
 124     /// to optimize. Transforms that can invalidate this should update it.
 125     BasicBlock::iterator CurInstIterator;
 126
 127     /// Keeps track of non-local addresses that have been sunk into a block.
 128     /// This allows us to avoid inserting duplicate code for blocks with
 129     /// multiple load/stores of the same address.
 130     ValueMap<Value*, Value*> SunkAddrs;
 131
 132     /// Keeps track of all instructions inserted for the current function.
 133     SetOfInstrs InsertedInsts;
 134     /// Keeps track of the type of the related instruction before their
 135     /// promotion for the current function.
 136     InstrToOrigTy PromotedInsts;
 137
 138     /// True if CFG is modified in any way.
 139     bool ModifiedDT;
 140
 141     /// True if optimizing for size.
 142     bool OptSize;
 143
 144     /// DataLayout for the Function being processed.
 145     const DataLayout *DL;
 146
 147   public:
 148     static char ID; // Pass identification, replacement for typeid
 149     explicit CodeGenPrepare(const TargetMachine *TM = nullptr)
 150         : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr), DL(nullptr) {
 151         initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
 152       }
 153     bool runOnFunction(Function &F) override;
 154
 155     const char *getPassName() const override { return "CodeGen Prepare"; }
 156
 157     void getAnalysisUsage(AnalysisUsage &AU) const override {
 158       AU.addPreserved<DominatorTreeWrapperPass>();
 159       AU.addRequired<TargetLibraryInfoWrapperPass>();
 160       AU.addRequired<TargetTransformInfoWrapperPass>();
 161     }
 162
 163   private:
 164     bool eliminateFallThrough(Function &F);
 165     bool eliminateMostlyEmptyBlocks(Function &F);
 166     bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
 167     void eliminateMostlyEmptyBlock(BasicBlock *BB);
 168     bool optimizeBlock(BasicBlock &BB, bool& ModifiedDT);
 169     bool optimizeInst(Instruction *I, bool& ModifiedDT);
 170     bool optimizeMemoryInst(Instruction *I, Value *Addr,
 171                             Type *AccessTy, unsigned AS);
 172     bool optimizeInlineAsmInst(CallInst *CS);
 173     bool optimizeCallInst(CallInst *CI, bool& ModifiedDT);
 174     bool moveExtToFormExtLoad(Instruction *&I);
 175     bool optimizeExtUses(Instruction *I);
 176     bool optimizeSelectInst(SelectInst *SI);
 177     bool optimizeShuffleVectorInst(ShuffleVectorInst *SI);
 178     bool optimizeSwitchInst(SwitchInst *CI);
 179     bool optimizeExtractElementInst(Instruction *Inst);
 180     bool dupRetToEnableTailCallOpts(BasicBlock *BB);
 181     bool placeDbgValues(Function &F);
 182     bool sinkAndCmp(Function &F);
 183     bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI,
 184                         Instruction *&Inst,
 185                         const SmallVectorImpl<Instruction *> &Exts,
 186                         unsigned CreatedInstCost);
 187     bool splitBranchCondition(Function &F);
 188     bool simplifyOffsetableRelocate(Instruction &I);
 189     void stripInvariantGroupMetadata(Instruction &I);
 190   };
 191 }
 192
 193 char CodeGenPrepare::ID = 0;
 194 INITIALIZE_TM_PASS(CodeGenPrepare, "codegenprepare",
 195                    "Optimize for code generation", false, false)
 196
 197 FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) {
 198   return new CodeGenPrepare(TM);
 199 }
 200
 201 bool CodeGenPrepare::runOnFunction(Function &F) {
 202   if (skipOptnoneFunction(F))
 203     return false;
 204
 205   DL = &F.getParent()->getDataLayout();
 206
 207   bool EverMadeChange = false;
 208   // Clear per function information.
 209   InsertedInsts.clear();
 210   PromotedInsts.clear();
 211
 212   ModifiedDT = false;
 213   if (TM)
 214     TLI = TM->getSubtargetImpl(F)->getTargetLowering();
 215   TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 216   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 217   OptSize = F.optForSize();
 218
 219   /// This optimization identifies DIV instructions that can be
 220   /// profitably bypassed and carried out with a shorter, faster divide.
 221   if (!OptSize && TLI && TLI->isSlowDivBypassed()) {
 222     const DenseMap<unsigned int, unsigned int> &BypassWidths =
 223        TLI->getBypassSlowDivWidths();
 224     for (Function::iterator I = F.begin(); I != F.end(); I++)
 225       EverMadeChange |= bypassSlowDivision(F, I, BypassWidths);
 226   }
 227
 228   // Eliminate blocks that contain only PHI nodes and an
 229   // unconditional branch.
 230   EverMadeChange |= eliminateMostlyEmptyBlocks(F);
 231
 232   // llvm.dbg.value is far away from the value then iSel may not be able
 233   // handle it properly. iSel will drop llvm.dbg.value if it can not
 234   // find a node corresponding to the value.
 235   EverMadeChange |= placeDbgValues(F);
 236
 237   // If there is a mask, compare against zero, and branch that can be combined
 238   // into a single target instruction, push the mask and compare into branch
 239   // users. Do this before OptimizeBlock -> OptimizeInst ->
 240   // OptimizeCmpExpression, which perturbs the pattern being searched for.
 241   if (!DisableBranchOpts) {
 242     EverMadeChange |= sinkAndCmp(F);
 243     EverMadeChange |= splitBranchCondition(F);
 244   }
 245
 246   bool MadeChange = true;
 247   while (MadeChange) {
 248     MadeChange = false;
 249     for (Function::iterator I = F.begin(); I != F.end(); ) {
 250       BasicBlock *BB = &*I++;
 251       bool ModifiedDTOnIteration = false;
 252       MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);
 253
 254       // Restart BB iteration if the dominator tree of the Function was changed
 255       if (ModifiedDTOnIteration)
 256         break;
 257     }
 258     EverMadeChange |= MadeChange;
 259   }
 260
 261   SunkAddrs.clear();
 262
 263   if (!DisableBranchOpts) {
 264     MadeChange = false;
 265     SmallPtrSet<BasicBlock*, 8> WorkList;
 266     for (BasicBlock &BB : F) {
 267       SmallVector<BasicBlock *, 2> Successors(succ_begin(&BB), succ_end(&BB));
 268       MadeChange |= ConstantFoldTerminator(&BB, true);
 269       if (!MadeChange) continue;
 270
 271       for (SmallVectorImpl<BasicBlock*>::iterator
 272              II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
 273         if (pred_begin(*II) == pred_end(*II))
 274           WorkList.insert(*II);
 275     }
 276
 277     // Delete the dead blocks and any of their dead successors.
 278     MadeChange |= !WorkList.empty();
 279     while (!WorkList.empty()) {
 280       BasicBlock *BB = *WorkList.begin();
 281       WorkList.erase(BB);
 282       SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));
 283
 284       DeleteDeadBlock(BB);
 285
 286       for (SmallVectorImpl<BasicBlock*>::iterator
 287              II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
 288         if (pred_begin(*II) == pred_end(*II))
 289           WorkList.insert(*II);
 290     }
 291
 292     // Merge pairs of basic blocks with unconditional branches, connected by
 293     // a single edge.
 294     if (EverMadeChange || MadeChange)
 295       MadeChange |= eliminateFallThrough(F);
 296
 297     EverMadeChange |= MadeChange;
 298   }
 299
 300   if (!DisableGCOpts) {
 301     SmallVector<Instruction *, 2> Statepoints;
 302     for (BasicBlock &BB : F)
 303       for (Instruction &I : BB)
 304         if (isStatepoint(I))
 305           Statepoints.push_back(&I);
 306     for (auto &I : Statepoints)
 307       EverMadeChange |= simplifyOffsetableRelocate(*I);
 308   }
 309
 310   return EverMadeChange;
 311 }
 312
 313 /// Merge basic blocks which are connected by a single edge, where one of the
 314 /// basic blocks has a single successor pointing to the other basic block,
 315 /// which has a single predecessor.
 316 bool CodeGenPrepare::eliminateFallThrough(Function &F) {
 317   bool Changed = false;
 318   // Scan all of the blocks in the function, except for the entry block.
 319   for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) {
 320     BasicBlock *BB = &*I++;
 321     // If the destination block has a single pred, then this is a trivial
 322     // edge, just collapse it.
 323     BasicBlock *SinglePred = BB->getSinglePredecessor();
 324
 325     // Don't merge if BB's address is taken.
 326     if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue;
 327
 328     BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
 329     if (Term && !Term->isConditional()) {
 330       Changed = true;
 331       DEBUG(dbgs() << "To merge:\n"<< *SinglePred << "\n\n\n");
 332       // Remember if SinglePred was the entry block of the function.
 333       // If so, we will need to move BB back to the entry position.
 334       bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
 335       MergeBasicBlockIntoOnlyPred(BB, nullptr);
 336
 337       if (isEntry && BB != &BB->getParent()->getEntryBlock())
 338         BB->moveBefore(&BB->getParent()->getEntryBlock());
 339
 340       // We have erased a block. Update the iterator.
 341       I = BB->getIterator();
 342     }
 343   }
 344   return Changed;
 345 }
 346
 347 /// Eliminate blocks that contain only PHI nodes, debug info directives, and an
 348 /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
 349 /// edges in ways that are non-optimal for isel. Start by eliminating these
 350 /// blocks so we can split them the way we want them.
 351 bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) {
 352   bool MadeChange = false;
 353   // Note that this intentionally skips the entry block.
 354   for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) {
 355     BasicBlock *BB = &*I++;
 356
 357     // If this block doesn't end with an uncond branch, ignore it.
 358     BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
 359     if (!BI || !BI->isUnconditional())
 360       continue;
 361
 362     // If the instruction before the branch (skipping debug info) isn't a phi
 363     // node, then other stuff is happening here.
 364     BasicBlock::iterator BBI = BI->getIterator();
 365     if (BBI != BB->begin()) {
 366       --BBI;
 367       while (isa<DbgInfoIntrinsic>(BBI)) {
 368         if (BBI == BB->begin())
 369           break;
 370         --BBI;
 371       }
 372       if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
 373         continue;
 374     }
 375
 376     // Do not break infinite loops.
 377     BasicBlock *DestBB = BI->getSuccessor(0);
 378     if (DestBB == BB)
 379       continue;
 380
 381     if (!canMergeBlocks(BB, DestBB))
 382       continue;
 383
 384     eliminateMostlyEmptyBlock(BB);
 385     MadeChange = true;
 386   }
 387   return MadeChange;
 388 }
 389
 390 /// Return true if we can merge BB into DestBB if there is a single
 391 /// unconditional branch between them, and BB contains no other non-phi
 392 /// instructions.
 393 bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB,
 394                                     const BasicBlock *DestBB) const {
 395   // We only want to eliminate blocks whose phi nodes are used by phi nodes in
 396   // the successor.  If there are more complex condition (e.g. preheaders),
 397   // don't mess around with them.
 398   BasicBlock::const_iterator BBI = BB->begin();
 399   while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
 400     for (const User *U : PN->users()) {
 401       const Instruction *UI = cast<Instruction>(U);
 402       if (UI->getParent() != DestBB || !isa<PHINode>(UI))
 403         return false;
 404       // If User is inside DestBB block and it is a PHINode then check
 405       // incoming value. If incoming value is not from BB then this is
 406       // a complex condition (e.g. preheaders) we want to avoid here.
 407       if (UI->getParent() == DestBB) {
 408         if (const PHINode *UPN = dyn_cast<PHINode>(UI))
 409           for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
 410             Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
 411             if (Insn && Insn->getParent() == BB &&
 412                 Insn->getParent() != UPN->getIncomingBlock(I))
 413               return false;
 414           }
 415       }
 416     }
 417   }
 418
 419   // If BB and DestBB contain any common predecessors, then the phi nodes in BB
 420   // and DestBB may have conflicting incoming values for the block.  If so, we
 421   // can't merge the block.
 422   const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
 423   if (!DestBBPN) return true;  // no conflict.
 424
 425   // Collect the preds of BB.
 426   SmallPtrSet<const BasicBlock*, 16> BBPreds;
 427   if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
 428     // It is faster to get preds from a PHI than with pred_iterator.
 429     for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
 430       BBPreds.insert(BBPN->getIncomingBlock(i));
 431   } else {
 432     BBPreds.insert(pred_begin(BB), pred_end(BB));
 433   }
 434
 435   // Walk the preds of DestBB.
 436   for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
 437     BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
 438     if (BBPreds.count(Pred)) {   // Common predecessor?
 439       BBI = DestBB->begin();
 440       while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
 441         const Value *V1 = PN->getIncomingValueForBlock(Pred);
 442         const Value *V2 = PN->getIncomingValueForBlock(BB);
 443
 444         // If V2 is a phi node in BB, look up what the mapped value will be.
 445         if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
 446           if (V2PN->getParent() == BB)
 447             V2 = V2PN->getIncomingValueForBlock(Pred);
 448
 449         // If there is a conflict, bail out.
 450         if (V1 != V2) return false;
 451       }
 452     }
 453   }
 454
 455   return true;
 456 }
 457
 458
 459 /// Eliminate a basic block that has only phi's and an unconditional branch in
 460 /// it.
 461 void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
 462   BranchInst *BI = cast<BranchInst>(BB->getTerminator());
 463   BasicBlock *DestBB = BI->getSuccessor(0);
 464
 465   DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n" << *BB << *DestBB);
 466
 467   // If the destination block has a single pred, then this is a trivial edge,
 468   // just collapse it.
 469   if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
 470     if (SinglePred != DestBB) {
 471       // Remember if SinglePred was the entry block of the function.  If so, we
 472       // will need to move BB back to the entry position.
 473       bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
 474       MergeBasicBlockIntoOnlyPred(DestBB, nullptr);
 475
 476       if (isEntry && BB != &BB->getParent()->getEntryBlock())
 477         BB->moveBefore(&BB->getParent()->getEntryBlock());
 478
 479       DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
 480       return;
 481     }
 482   }
 483
 484   // Otherwise, we have multiple predecessors of BB.  Update the PHIs in DestBB
 485   // to handle the new incoming edges it is about to have.
 486   PHINode *PN;
 487   for (BasicBlock::iterator BBI = DestBB->begin();
 488        (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
 489     // Remove the incoming value for BB, and remember it.
 490     Value *InVal = PN->removeIncomingValue(BB, false);
 491
 492     // Two options: either the InVal is a phi node defined in BB or it is some
 493     // value that dominates BB.
 494     PHINode *InValPhi = dyn_cast<PHINode>(InVal);
 495     if (InValPhi && InValPhi->getParent() == BB) {
 496       // Add all of the input values of the input PHI as inputs of this phi.
 497       for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
 498         PN->addIncoming(InValPhi->getIncomingValue(i),
 499                         InValPhi->getIncomingBlock(i));
 500     } else {
 501       // Otherwise, add one instance of the dominating value for each edge that
 502       // we will be adding.
 503       if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
 504         for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
 505           PN->addIncoming(InVal, BBPN->getIncomingBlock(i));
 506       } else {
 507         for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
 508           PN->addIncoming(InVal, *PI);
 509       }
 510     }
 511   }
 512
 513   // The PHIs are now updated, change everything that refers to BB to use
 514   // DestBB and remove BB.
 515   BB->replaceAllUsesWith(DestBB);
 516   BB->eraseFromParent();
 517   ++NumBlocksElim;
 518
 519   DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
 520 }
 521
 522 // Computes a map of base pointer relocation instructions to corresponding
 523 // derived pointer relocation instructions given a vector of all relocate calls
 524 static void computeBaseDerivedRelocateMap(
 525     const SmallVectorImpl<User *> &AllRelocateCalls,
 526     DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> &
 527         RelocateInstMap) {
 528   // Collect information in two maps: one primarily for locating the base object
 529   // while filling the second map; the second map is the final structure holding
 530   // a mapping between Base and corresponding Derived relocate calls
 531   DenseMap<std::pair<unsigned, unsigned>, IntrinsicInst *> RelocateIdxMap;
 532   for (auto &U : AllRelocateCalls) {
 533     GCRelocateOperands ThisRelocate(U);
 534     IntrinsicInst *I = cast<IntrinsicInst>(U);
 535     auto K = std::make_pair(ThisRelocate.getBasePtrIndex(),
 536                             ThisRelocate.getDerivedPtrIndex());
 537     RelocateIdxMap.insert(std::make_pair(K, I));
 538   }
 539   for (auto &Item : RelocateIdxMap) {
 540     std::pair<unsigned, unsigned> Key = Item.first;
 541     if (Key.first == Key.second)
 542       // Base relocation: nothing to insert
 543       continue;
 544
 545     IntrinsicInst *I = Item.second;
 546     auto BaseKey = std::make_pair(Key.first, Key.first);
 547
 548     // We're iterating over RelocateIdxMap so we cannot modify it.
 549     auto MaybeBase = RelocateIdxMap.find(BaseKey);
 550     if (MaybeBase == RelocateIdxMap.end())
 551       // TODO: We might want to insert a new base object relocate and gep off
 552       // that, if there are enough derived object relocates.
 553       continue;
 554
 555     RelocateInstMap[MaybeBase->second].push_back(I);
 556   }
 557 }
 558
 559 // Accepts a GEP and extracts the operands into a vector provided they're all
 560 // small integer constants
 561 static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
 562                                           SmallVectorImpl<Value *> &OffsetV) {
 563   for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
 564     // Only accept small constant integer operands
 565     auto Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
 566     if (!Op || Op->getZExtValue() > 20)
 567       return false;
 568   }
 569
 570   for (unsigned i = 1; i < GEP->getNumOperands(); i++)
 571     OffsetV.push_back(GEP->getOperand(i));
 572   return true;
 573 }
 574
 575 // Takes a RelocatedBase (base pointer relocation instruction) and Targets to
 576 // replace, computes a replacement, and affects it.
 577 static bool
 578 simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase,
 579                           const SmallVectorImpl<IntrinsicInst *> &Targets) {
 580   bool MadeChange = false;
 581   for (auto &ToReplace : Targets) {
 582     GCRelocateOperands MasterRelocate(RelocatedBase);
 583     GCRelocateOperands ThisRelocate(ToReplace);
 584
 585     assert(ThisRelocate.getBasePtrIndex() == MasterRelocate.getBasePtrIndex() &&
 586            "Not relocating a derived object of the original base object");
 587     if (ThisRelocate.getBasePtrIndex() == ThisRelocate.getDerivedPtrIndex()) {
 588       // A duplicate relocate call. TODO: coalesce duplicates.
 589       continue;
 590     }
 591
 592     Value *Base = ThisRelocate.getBasePtr();
 593     auto Derived = dyn_cast<GetElementPtrInst>(ThisRelocate.getDerivedPtr());
 594     if (!Derived || Derived->getPointerOperand() != Base)
 595       continue;
 596
 597     SmallVector<Value *, 2> OffsetV;
 598     if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV))
 599       continue;
 600
 601     // Create a Builder and replace the target callsite with a gep
 602     assert(RelocatedBase->getNextNode() && "Should always have one since it's not a terminator");
 603
 604     // Insert after RelocatedBase
 605     IRBuilder<> Builder(RelocatedBase->getNextNode());
 606     Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());
 607
 608     // If gc_relocate does not match the actual type, cast it to the right type.
 609     // In theory, there must be a bitcast after gc_relocate if the type does not
 610     // match, and we should reuse it to get the derived pointer. But it could be
 611     // cases like this:
 612     // bb1:
 613     //  ...
 614     //  %g1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
 615     //  br label %merge
 616     //
 617     // bb2:
 618     //  ...
 619     //  %g2 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
 620     //  br label %merge
 621     //
 622     // merge:
 623     //  %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ]
 624     //  %cast = bitcast i8 addrspace(1)* %p1 in to i32 addrspace(1)*
 625     //
 626     // In this case, we can not find the bitcast any more. So we insert a new bitcast
 627     // no matter there is already one or not. In this way, we can handle all cases, and
 628     // the extra bitcast should be optimized away in later passes.
 629     Instruction *ActualRelocatedBase = RelocatedBase;
 630     if (RelocatedBase->getType() != Base->getType()) {
 631       ActualRelocatedBase =
 632           cast<Instruction>(Builder.CreateBitCast(RelocatedBase, Base->getType()));
 633     }
 634     Value *Replacement = Builder.CreateGEP(
 635         Derived->getSourceElementType(), ActualRelocatedBase, makeArrayRef(OffsetV));
 636     Instruction *ReplacementInst = cast<Instruction>(Replacement);
 637     Replacement->takeName(ToReplace);
 638     // If the newly generated derived pointer's type does not match the original derived
 639     // pointer's type, cast the new derived pointer to match it. Same reasoning as above.
 640     Instruction *ActualReplacement = ReplacementInst;
 641     if (ReplacementInst->getType() != ToReplace->getType()) {
 642       ActualReplacement =
 643           cast<Instruction>(Builder.CreateBitCast(ReplacementInst, ToReplace->getType()));
 644     }
 645     ToReplace->replaceAllUsesWith(ActualReplacement);
 646     ToReplace->eraseFromParent();
 647
 648     MadeChange = true;
 649   }
 650   return MadeChange;
 651 }
 652
 653 // Turns this:
 654 //
 655 // %base = ...
 656 // %ptr = gep %base + 15
 657 // %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
 658 // %base' = relocate(%tok, i32 4, i32 4)
 659 // %ptr' = relocate(%tok, i32 4, i32 5)
 660 // %val = load %ptr'
 661 //
 662 // into this:
 663 //
 664 // %base = ...
 665 // %ptr = gep %base + 15
 666 // %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
 667 // %base' = gc.relocate(%tok, i32 4, i32 4)
 668 // %ptr' = gep %base' + 15
 669 // %val = load %ptr'
 670 bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) {
 671   bool MadeChange = false;
 672   SmallVector<User *, 2> AllRelocateCalls;
 673
 674   for (auto *U : I.users())
 675     if (isGCRelocate(dyn_cast<Instruction>(U)))
 676       // Collect all the relocate calls associated with a statepoint
 677       AllRelocateCalls.push_back(U);
 678
 679   // We need atleast one base pointer relocation + one derived pointer
 680   // relocation to mangle
 681   if (AllRelocateCalls.size() < 2)
 682     return false;
 683
 684   // RelocateInstMap is a mapping from the base relocate instruction to the
 685   // corresponding derived relocate instructions
 686   DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> RelocateInstMap;
 687   computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
 688   if (RelocateInstMap.empty())
 689     return false;
 690
 691   for (auto &Item : RelocateInstMap)
 692     // Item.first is the RelocatedBase to offset against
 693     // Item.second is the vector of Targets to replace
 694     MadeChange = simplifyRelocatesOffABase(Item.first, Item.second);
 695   return MadeChange;
 696 }
 697
 698 /// SinkCast - Sink the specified cast instruction into its user blocks
 699 static bool SinkCast(CastInst *CI) {
 700   BasicBlock *DefBB = CI->getParent();
 701
 702   /// InsertedCasts - Only insert a cast in each block once.
 703   DenseMap<BasicBlock*, CastInst*> InsertedCasts;
 704
 705   bool MadeChange = false;
 706   for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();
 707        UI != E; ) {
 708     Use &TheUse = UI.getUse();
 709     Instruction *User = cast<Instruction>(*UI);
 710
 711     // Figure out which BB this cast is used in.  For PHI's this is the
 712     // appropriate predecessor block.
 713     BasicBlock *UserBB = User->getParent();
 714     if (PHINode *PN = dyn_cast<PHINode>(User)) {
 715       UserBB = PN->getIncomingBlock(TheUse);
 716     }
 717
 718     // Preincrement use iterator so we don't invalidate it.
 719     ++UI;
 720
 721     // If this user is in the same block as the cast, don't change the cast.
 722     if (UserBB == DefBB) continue;
 723
 724     // If we have already inserted a cast into this block, use it.
 725     CastInst *&InsertedCast = InsertedCasts[UserBB];
 726
 727     if (!InsertedCast) {
 728       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
 729       assert(InsertPt != UserBB->end());
 730       InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0),
 731                                       CI->getType(), "", &*InsertPt);
 732     }
 733
 734     // Replace a use of the cast with a use of the new cast.
 735     TheUse = InsertedCast;
 736     MadeChange = true;
 737     ++NumCastUses;
 738   }
 739
 740   // If we removed all uses, nuke the cast.
 741   if (CI->use_empty()) {
 742     CI->eraseFromParent();
 743     MadeChange = true;
 744   }
 745
 746   return MadeChange;
 747 }
 748
 749 /// If the specified cast instruction is a noop copy (e.g. it's casting from
 750 /// one pointer type to another, i32->i8 on PPC), sink it into user blocks to
 751 /// reduce the number of virtual registers that must be created and coalesced.
 752 ///
 753 /// Return true if any changes are made.
 754 ///
 755 static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
 756                                        const DataLayout &DL) {
 757   // If this is a noop copy,
 758   EVT SrcVT = TLI.getValueType(DL, CI->getOperand(0)->getType());
 759   EVT DstVT = TLI.getValueType(DL, CI->getType());
 760
 761   // This is an fp<->int conversion?
 762   if (SrcVT.isInteger() != DstVT.isInteger())
 763     return false;
 764
 765   // If this is an extension, it will be a zero or sign extension, which
 766   // isn't a noop.
 767   if (SrcVT.bitsLT(DstVT)) return false;
 768
 769   // If these values will be promoted, find out what they will be promoted
 770   // to.  This helps us consider truncates on PPC as noop copies when they
 771   // are.
 772   if (TLI.getTypeAction(CI->getContext(), SrcVT) ==
 773       TargetLowering::TypePromoteInteger)
 774     SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
 775   if (TLI.getTypeAction(CI->getContext(), DstVT) ==
 776       TargetLowering::TypePromoteInteger)
 777     DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);
 778
 779   // If, after promotion, these are the same types, this is a noop copy.
 780   if (SrcVT != DstVT)
 781     return false;
 782
 783   return SinkCast(CI);
 784 }
 785
 786 /// Try to combine CI into a call to the llvm.uadd.with.overflow intrinsic if
 787 /// possible.
 788 ///
 789 /// Return true if any changes were made.
 790 static bool CombineUAddWithOverflow(CmpInst *CI) {
 791   Value *A, *B;
 792   Instruction *AddI;
 793   if (!match(CI,
 794              m_UAddWithOverflow(m_Value(A), m_Value(B), m_Instruction(AddI))))
 795     return false;
 796
 797   Type *Ty = AddI->getType();
 798   if (!isa<IntegerType>(Ty))
 799     return false;
 800
 801   // We don't want to move around uses of condition values this late, so we we
 802   // check if it is legal to create the call to the intrinsic in the basic
 803   // block containing the icmp:
 804
 805   if (AddI->getParent() != CI->getParent() && !AddI->hasOneUse())
 806     return false;
 807
 808 #ifndef NDEBUG
 809   // Someday m_UAddWithOverflow may get smarter, but this is a safe assumption
 810   // for now:
 811   if (AddI->hasOneUse())
 812     assert(*AddI->user_begin() == CI && "expected!");
 813 #endif
 814
 815   Module *M = CI->getParent()->getParent()->getParent();
 816   Value *F = Intrinsic::getDeclaration(M, Intrinsic::uadd_with_overflow, Ty);
 817
 818   auto *InsertPt = AddI->hasOneUse() ? CI : AddI;
 819
 820   auto *UAddWithOverflow =
 821       CallInst::Create(F, {A, B}, "uadd.overflow", InsertPt);
 822   auto *UAdd = ExtractValueInst::Create(UAddWithOverflow, 0, "uadd", InsertPt);
 823   auto *Overflow =
 824       ExtractValueInst::Create(UAddWithOverflow, 1, "overflow", InsertPt);
 825
 826   CI->replaceAllUsesWith(Overflow);
 827   AddI->replaceAllUsesWith(UAdd);
 828   CI->eraseFromParent();
 829   AddI->eraseFromParent();
 830   return true;
 831 }
 832
 833 /// Sink the given CmpInst into user blocks to reduce the number of virtual
 834 /// registers that must be created and coalesced. This is a clear win except on
 835 /// targets with multiple condition code registers (PowerPC), where it might
 836 /// lose; some adjustment may be wanted there.
 837 ///
 838 /// Return true if any changes are made.
 839 static bool SinkCmpExpression(CmpInst *CI) {
 840   BasicBlock *DefBB = CI->getParent();
 841
 842   /// Only insert a cmp in each block once.
 843   DenseMap<BasicBlock*, CmpInst*> InsertedCmps;
 844
 845   bool MadeChange = false;
 846   for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();
 847        UI != E; ) {
 848     Use &TheUse = UI.getUse();
 849     Instruction *User = cast<Instruction>(*UI);
 850
 851     // Preincrement use iterator so we don't invalidate it.
 852     ++UI;
 853
 854     // Don't bother for PHI nodes.
 855     if (isa<PHINode>(User))
 856       continue;
 857
 858     // Figure out which BB this cmp is used in.
 859     BasicBlock *UserBB = User->getParent();
 860
 861     // If this user is in the same block as the cmp, don't change the cmp.
 862     if (UserBB == DefBB) continue;
 863
 864     // If we have already inserted a cmp into this block, use it.
 865     CmpInst *&InsertedCmp = InsertedCmps[UserBB];
 866
 867     if (!InsertedCmp) {
 868       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
 869       assert(InsertPt != UserBB->end());
 870       InsertedCmp =
 871           CmpInst::Create(CI->getOpcode(), CI->getPredicate(),
 872                           CI->getOperand(0), CI->getOperand(1), "", &*InsertPt);
 873     }
 874
 875     // Replace a use of the cmp with a use of the new cmp.
 876     TheUse = InsertedCmp;
 877     MadeChange = true;
 878     ++NumCmpUses;
 879   }
 880
 881   // If we removed all uses, nuke the cmp.
 882   if (CI->use_empty()) {
 883     CI->eraseFromParent();
 884     MadeChange = true;
 885   }
 886
 887   return MadeChange;
 888 }
 889
 890 static bool OptimizeCmpExpression(CmpInst *CI) {
 891   if (SinkCmpExpression(CI))
 892     return true;
 893
 894   if (CombineUAddWithOverflow(CI))
 895     return true;
 896
 897   return false;
 898 }
 899
 900 /// Check if the candidates could be combined with a shift instruction, which
 901 /// includes:
 902 /// 1. Truncate instruction
 903 /// 2. And instruction and the imm is a mask of the low bits:
 904 /// imm & (imm+1) == 0
 905 static bool isExtractBitsCandidateUse(Instruction *User) {
 906   if (!isa<TruncInst>(User)) {
 907     if (User->getOpcode() != Instruction::And ||
 908         !isa<ConstantInt>(User->getOperand(1)))
 909       return false;
 910
 911     const APInt &Cimm = cast<ConstantInt>(User->getOperand(1))->getValue();
 912
 913     if ((Cimm & (Cimm + 1)).getBoolValue())
 914       return false;
 915   }
 916   return true;
 917 }
 918
 919 /// Sink both shift and truncate instruction to the use of truncate's BB.
 920 static bool
 921 SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
 922                      DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts,
 923                      const TargetLowering &TLI, const DataLayout &DL) {
 924   BasicBlock *UserBB = User->getParent();
 925   DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
 926   TruncInst *TruncI = dyn_cast<TruncInst>(User);
 927   bool MadeChange = false;
 928
 929   for (Value::user_iterator TruncUI = TruncI->user_begin(),
 930                             TruncE = TruncI->user_end();
 931        TruncUI != TruncE;) {
 932
 933     Use &TruncTheUse = TruncUI.getUse();
 934     Instruction *TruncUser = cast<Instruction>(*TruncUI);
 935     // Preincrement use iterator so we don't invalidate it.
 936
 937     ++TruncUI;
 938
 939     int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());
 940     if (!ISDOpcode)
 941       continue;
 942
 943     // If the use is actually a legal node, there will not be an
 944     // implicit truncate.
 945     // FIXME: always querying the result type is just an
 946     // approximation; some nodes' legality is determined by the
 947     // operand or other means. There's no good way to find out though.
 948     if (TLI.isOperationLegalOrCustom(
 949             ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true)))
 950       continue;
 951
 952     // Don't bother for PHI nodes.
 953     if (isa<PHINode>(TruncUser))
 954       continue;
 955
 956     BasicBlock *TruncUserBB = TruncUser->getParent();
 957
 958     if (UserBB == TruncUserBB)
 959       continue;
 960
 961     BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
 962     CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];
 963
 964     if (!InsertedShift && !InsertedTrunc) {
 965       BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
 966       assert(InsertPt != TruncUserBB->end());
 967       // Sink the shift
 968       if (ShiftI->getOpcode() == Instruction::AShr)
 969         InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
 970                                                    "", &*InsertPt);
 971       else
 972         InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
 973                                                    "", &*InsertPt);
 974
 975       // Sink the trunc
 976       BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt();
 977       TruncInsertPt++;
 978       assert(TruncInsertPt != TruncUserBB->end());
 979
 980       InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,
 981                                        TruncI->getType(), "", &*TruncInsertPt);
 982
 983       MadeChange = true;
 984
 985       TruncTheUse = InsertedTrunc;
 986     }
 987   }
 988   return MadeChange;
 989 }
 990
 991 /// Sink the shift *right* instruction into user blocks if the uses could
 992 /// potentially be combined with this shift instruction and generate BitExtract
 993 /// instruction. It will only be applied if the architecture supports BitExtract
 994 /// instruction. Here is an example:
 995 /// BB1:
 996 ///   %x.extract.shift = lshr i64 %arg1, 32
 997 /// BB2:
 998 ///   %x.extract.trunc = trunc i64 %x.extract.shift to i16
 999 /// ==>
1000 ///
1001 /// BB2:
1002 ///   %x.extract.shift.1 = lshr i64 %arg1, 32
1003 ///   %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
1004 ///
1005 /// CodeGen will recoginze the pattern in BB2 and generate BitExtract
1006 /// instruction.
1007 /// Return true if any changes are made.
1008 static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
1009                                 const TargetLowering &TLI,
1010                                 const DataLayout &DL) {
1011   BasicBlock *DefBB = ShiftI->getParent();
1012
1013   /// Only insert instructions in each block once.
1014   DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;
1015
1016   bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType()));
1017
1018   bool MadeChange = false;
1019   for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
1020        UI != E;) {
1021     Use &TheUse = UI.getUse();
1022     Instruction *User = cast<Instruction>(*UI);
1023     // Preincrement use iterator so we don't invalidate it.
1024     ++UI;
1025
1026     // Don't bother for PHI nodes.
1027     if (isa<PHINode>(User))
1028       continue;
1029
1030     if (!isExtractBitsCandidateUse(User))
1031       continue;
1032
1033     BasicBlock *UserBB = User->getParent();
1034
1035     if (UserBB == DefBB) {
1036       // If the shift and truncate instruction are in the same BB. The use of
1037       // the truncate(TruncUse) may still introduce another truncate if not
1038       // legal. In this case, we would like to sink both shift and truncate
1039       // instruction to the BB of TruncUse.
1040       // for example:
1041       // BB1:
1042       // i64 shift.result = lshr i64 opnd, imm
1043       // trunc.result = trunc shift.result to i16
1044       //
1045       // BB2:
1046       //   ----> We will have an implicit truncate here if the architecture does
1047       //   not have i16 compare.
1048       // cmp i16 trunc.result, opnd2
1049       //
1050       if (isa<TruncInst>(User) && shiftIsLegal
1051           // If the type of the truncate is legal, no trucate will be
1052           // introduced in other basic blocks.
1053           &&
1054           (!TLI.isTypeLegal(TLI.getValueType(DL, User->getType()))))
1055         MadeChange =
1056             SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL);
1057
1058       continue;
1059     }
1060     // If we have already inserted a shift into this block, use it.
1061     BinaryOperator *&InsertedShift = InsertedShifts[UserBB];
1062
1063     if (!InsertedShift) {
1064       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
1065       assert(InsertPt != UserBB->end());
1066
1067       if (ShiftI->getOpcode() == Instruction::AShr)
1068         InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
1069                                                    "", &*InsertPt);
1070       else
1071         InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
1072                                                    "", &*InsertPt);
1073
1074       MadeChange = true;
1075     }
1076
1077     // Replace a use of the shift with a use of the new shift.
1078     TheUse = InsertedShift;
1079   }
1080
1081   // If we removed all uses, nuke the shift.
1082   if (ShiftI->use_empty())
1083     ShiftI->eraseFromParent();
1084
1085   return MadeChange;
1086 }
1087
1088 // Translate a masked load intrinsic like
1089 // <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
1090 //                               <16 x i1> %mask, <16 x i32> %passthru)
1091 // to a chain of basic blocks, with loading element one-by-one if
1092 // the appropriate mask bit is set
1093 //
1094 //  %1 = bitcast i8* %addr to i32*
1095 //  %2 = extractelement <16 x i1> %mask, i32 0
1096 //  %3 = icmp eq i1 %2, true
1097 //  br i1 %3, label %cond.load, label %else
1098 //
1099 //cond.load:                                        ; preds = %0
1100 //  %4 = getelementptr i32* %1, i32 0
1101 //  %5 = load i32* %4
1102 //  %6 = insertelement <16 x i32> undef, i32 %5, i32 0
1103 //  br label %else
1104 //
1105 //else:                                             ; preds = %0, %cond.load
1106 //  %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
1107 //  %7 = extractelement <16 x i1> %mask, i32 1
1108 //  %8 = icmp eq i1 %7, true
1109 //  br i1 %8, label %cond.load1, label %else2
1110 //
1111 //cond.load1:                                       ; preds = %else
1112 //  %9 = getelementptr i32* %1, i32 1
1113 //  %10 = load i32* %9
1114 //  %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
1115 //  br label %else2
1116 //
1117 //else2:                                            ; preds = %else, %cond.load1
1118 //  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
1119 //  %12 = extractelement <16 x i1> %mask, i32 2
1120 //  %13 = icmp eq i1 %12, true
1121 //  br i1 %13, label %cond.load4, label %else5
1122 //
1123 static void ScalarizeMaskedLoad(CallInst *CI) {
1124   Value *Ptr  = CI->getArgOperand(0);
1125   Value *Alignment = CI->getArgOperand(1);
1126   Value *Mask = CI->getArgOperand(2);
1127   Value *Src0 = CI->getArgOperand(3);
1128
1129   unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
1130   VectorType *VecType = dyn_cast<VectorType>(CI->getType());
1131   assert(VecType && "Unexpected return type of masked load intrinsic");
1132
1133   Type *EltTy = CI->getType()->getVectorElementType();
1134
1135   IRBuilder<> Builder(CI->getContext());
1136   Instruction *InsertPt = CI;
1137   BasicBlock *IfBlock = CI->getParent();
1138   BasicBlock *CondBlock = nullptr;
1139   BasicBlock *PrevIfBlock = CI->getParent();
1140
1141   Builder.SetInsertPoint(InsertPt);
1142   Builder.SetCurrentDebugLocation(CI->getDebugLoc());
1143
1144   // Short-cut if the mask is all-true.
1145   bool IsAllOnesMask = isa<Constant>(Mask) &&
1146     cast<Constant>(Mask)->isAllOnesValue();
1147
1148   if (IsAllOnesMask) {
1149     Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal);
1150     CI->replaceAllUsesWith(NewI);
1151     CI->eraseFromParent();
1152     return;
1153   }
1154
1155   // Adjust alignment for the scalar instruction.
1156   AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8);
1157   // Bitcast %addr fron i8* to EltTy*
1158   Type *NewPtrType =
1159     EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
1160   Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
1161   unsigned VectorWidth = VecType->getNumElements();
1162
1163   Value *UndefVal = UndefValue::get(VecType);
1164
1165   // The result vector
1166   Value *VResult = UndefVal;
1167
1168   if (isa<ConstantVector>(Mask)) {
1169     for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
1170       if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
1171           continue;
1172       Value *Gep =
1173           Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
1174       LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal);
1175       VResult = Builder.CreateInsertElement(VResult, Load,
1176                                             Builder.getInt32(Idx));
1177     }
1178     Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
1179     CI->replaceAllUsesWith(NewI);
1180     CI->eraseFromParent();
1181     return;
1182   }
1183
1184   PHINode *Phi = nullptr;
1185   Value *PrevPhi = UndefVal;
1186
1187   for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
1188
1189     // Fill the "else" block, created in the previous iteration
1190     //
1191     //  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
1192     //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
1193     //  %to_load = icmp eq i1 %mask_1, true
1194     //  br i1 %to_load, label %cond.load, label %else
1195     //
1196     if (Idx > 0) {
1197       Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
1198       Phi->addIncoming(VResult, CondBlock);
1199       Phi->addIncoming(PrevPhi, PrevIfBlock);
1200       PrevPhi = Phi;
1201       VResult = Phi;
1202     }
1203
1204     Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
1205     Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
1206                                     ConstantInt::get(Predicate->getType(), 1));
1207
1208     // Create "cond" block
1209     //
1210     //  %EltAddr = getelementptr i32* %1, i32 0
1211     //  %Elt = load i32* %EltAddr
1212     //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
1213     //
1214     CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load");
1215     Builder.SetInsertPoint(InsertPt);
1216
1217     Value *Gep =
1218         Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
1219     LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
1220     VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
1221
1222     // Create "else" block, fill it in the next iteration
1223     BasicBlock *NewIfBlock =
1224         CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
1225     Builder.SetInsertPoint(InsertPt);
1226     Instruction *OldBr = IfBlock->getTerminator();
1227     BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
1228     OldBr->eraseFromParent();
1229     PrevIfBlock = IfBlock;
1230     IfBlock = NewIfBlock;
1231   }
1232
1233   Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
1234   Phi->addIncoming(VResult, CondBlock);
1235   Phi->addIncoming(PrevPhi, PrevIfBlock);
1236   Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
1237   CI->replaceAllUsesWith(NewI);
1238   CI->eraseFromParent();
1239 }
1240
1241 // Translate a masked store intrinsic, like
1242 // void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
1243 //                               <16 x i1> %mask)
1244 // to a chain of basic blocks, that stores element one-by-one if
1245 // the appropriate mask bit is set
1246 //
1247 //   %1 = bitcast i8* %addr to i32*
1248 //   %2 = extractelement <16 x i1> %mask, i32 0
1249 //   %3 = icmp eq i1 %2, true
1250 //   br i1 %3, label %cond.store, label %else
1251 //
1252 // cond.store:                                       ; preds = %0
1253 //   %4 = extractelement <16 x i32> %val, i32 0
1254 //   %5 = getelementptr i32* %1, i32 0
1255 //   store i32 %4, i32* %5
1256 //   br label %else
1257 //
1258 // else:                                             ; preds = %0, %cond.store
1259 //   %6 = extractelement <16 x i1> %mask, i32 1
1260 //   %7 = icmp eq i1 %6, true
1261 //   br i1 %7, label %cond.store1, label %else2
1262 //
1263 // cond.store1:                                      ; preds = %else
1264 //   %8 = extractelement <16 x i32> %val, i32 1
1265 //   %9 = getelementptr i32* %1, i32 1
1266 //   store i32 %8, i32* %9
1267 //   br label %else2
1268 //   . . .
1269 static void ScalarizeMaskedStore(CallInst *CI) {
1270   Value *Src = CI->getArgOperand(0);
1271   Value *Ptr  = CI->getArgOperand(1);
1272   Value *Alignment = CI->getArgOperand(2);
1273   Value *Mask = CI->getArgOperand(3);
1274
1275   unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
1276   VectorType *VecType = dyn_cast<VectorType>(Src->getType());
1277   assert(VecType && "Unexpected data type in masked store intrinsic");
1278
1279   Type *EltTy = VecType->getElementType();
1280
1281   IRBuilder<> Builder(CI->getContext());
1282   Instruction *InsertPt = CI;
1283   BasicBlock *IfBlock = CI->getParent();
1284   Builder.SetInsertPoint(InsertPt);
1285   Builder.SetCurrentDebugLocation(CI->getDebugLoc());
1286
1287   // Short-cut if the mask is all-true.
1288   bool IsAllOnesMask = isa<Constant>(Mask) &&
1289     cast<Constant>(Mask)->isAllOnesValue();
1290
1291   if (IsAllOnesMask) {
1292     Builder.CreateAlignedStore(Src, Ptr, AlignVal);
1293     CI->eraseFromParent();
1294     return;
1295   }
1296
1297   // Adjust alignment for the scalar instruction.
1298   AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits()/8);
1299   // Bitcast %addr fron i8* to EltTy*
1300   Type *NewPtrType =
1301     EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
1302   Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
1303   unsigned VectorWidth = VecType->getNumElements();
1304
1305   if (isa<ConstantVector>(Mask)) {
1306     for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
1307       if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
1308           continue;
1309       Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
1310       Value *Gep =
1311           Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
1312       Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
1313     }
1314     CI->eraseFromParent();
1315     return;
1316   }
1317
1318   for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
1319
1320     // Fill the "else" block, created in the previous iteration
1321     //
1322     //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
1323     //  %to_store = icmp eq i1 %mask_1, true
1324     //  br i1 %to_store, label %cond.store, label %else
1325     //
1326     Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
1327     Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
1328                                     ConstantInt::get(Predicate->getType(), 1));
1329
1330     // Create "cond" block
1331     //
1332     //  %OneElt = extractelement <16 x i32> %Src, i32 Idx
1333     //  %EltAddr = getelementptr i32* %1, i32 0
1334     //  %store i32 %OneElt, i32* %EltAddr
1335     //
1336     BasicBlock *CondBlock =
1337         IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
1338     Builder.SetInsertPoint(InsertPt);
1339
1340     Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
1341     Value *Gep =
1342         Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
1343     Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
1344
1345     // Create "else" block, fill it in the next iteration
1346     BasicBlock *NewIfBlock =
1347         CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
1348     Builder.SetInsertPoint(InsertPt);
1349     Instruction *OldBr = IfBlock->getTerminator();
1350     BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
1351     OldBr->eraseFromParent();
1352     IfBlock = NewIfBlock;
1353   }
1354   CI->eraseFromParent();
1355 }
1356
1357 // Translate a masked gather intrinsic like
1358 // <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4,
1359 //                               <16 x i1> %Mask, <16 x i32> %Src)
1360 // to a chain of basic blocks, with loading element one-by-one if
1361 // the appropriate mask bit is set
1362 //
1363 // % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
1364 // % Mask0 = extractelement <16 x i1> %Mask, i32 0
1365 // % ToLoad0 = icmp eq i1 % Mask0, true
1366 // br i1 % ToLoad0, label %cond.load, label %else
1367 //
1368 // cond.load:
1369 // % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
1370 // % Load0 = load i32, i32* % Ptr0, align 4
1371 // % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0
1372 // br label %else
1373 //
1374 // else:
1375 // %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0]
1376 // % Mask1 = extractelement <16 x i1> %Mask, i32 1
1377 // % ToLoad1 = icmp eq i1 % Mask1, true
1378 // br i1 % ToLoad1, label %cond.load1, label %else2
1379 //
1380 // cond.load1:
1381 // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
1382 // % Load1 = load i32, i32* % Ptr1, align 4
1383 // % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1
1384 // br label %else2
1385 // . . .
1386 // % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
1387 // ret <16 x i32> %Result
1388 static void ScalarizeMaskedGather(CallInst *CI) {
1389   Value *Ptrs = CI->getArgOperand(0);
1390   Value *Alignment = CI->getArgOperand(1);
1391   Value *Mask = CI->getArgOperand(2);
1392   Value *Src0 = CI->getArgOperand(3);
1393
1394   VectorType *VecType = dyn_cast<VectorType>(CI->getType());
1395
1396   assert(VecType && "Unexpected return type of masked load intrinsic");
1397
1398   IRBuilder<> Builder(CI->getContext());
1399   Instruction *InsertPt = CI;
1400   BasicBlock *IfBlock = CI->getParent();
1401   BasicBlock *CondBlock = nullptr;
1402   BasicBlock *PrevIfBlock = CI->getParent();
1403   Builder.SetInsertPoint(InsertPt);
1404   unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
1405
1406   Builder.SetCurrentDebugLocation(CI->getDebugLoc());
1407
1408   Value *UndefVal = UndefValue::get(VecType);
1409
1410   // The result vector
1411   Value *VResult = UndefVal;
1412   unsigned VectorWidth = VecType->getNumElements();
1413
1414   // Shorten the way if the mask is a vector of constants.
1415   bool IsConstMask = isa<ConstantVector>(Mask);
1416
1417   if (IsConstMask) {
1418     for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
1419       if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
1420         continue;
1421       Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
1422                                                 "Ptr" + Twine(Idx));
1423       LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
1424                                                  "Load" + Twine(Idx));
1425       VResult = Builder.CreateInsertElement(VResult, Load,
1426                                             Builder.getInt32(Idx),
1427                                             "Res" + Twine(Idx));
1428     }
1429     Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
1430     CI->replaceAllUsesWith(NewI);
1431     CI->eraseFromParent();
1432     return;
1433   }
1434
1435   PHINode *Phi = nullptr;
1436   Value *PrevPhi = UndefVal;
1437
1438   for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
1439
1440     // Fill the "else" block, created in the previous iteration
1441     //
1442     //  %Mask1 = extractelement <16 x i1> %Mask, i32 1
1443     //  %ToLoad1 = icmp eq i1 %Mask1, true
1444     //  br i1 %ToLoad1, label %cond.load, label %else
1445     //
1446     if (Idx > 0) {
1447       Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
1448       Phi->addIncoming(VResult, CondBlock);
1449       Phi->addIncoming(PrevPhi, PrevIfBlock);
1450       PrevPhi = Phi;
1451       VResult = Phi;
1452     }
1453
1454     Value *Predicate = Builder.CreateExtractElement(Mask,
1455                                                     Builder.getInt32(Idx),
1456                                                     "Mask" + Twine(Idx));
1457     Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
1458                                     ConstantInt::get(Predicate->getType(), 1),
1459                                     "ToLoad" + Twine(Idx));
1460
1461     // Create "cond" block
1462     //
1463     //  %EltAddr = getelementptr i32* %1, i32 0
1464     //  %Elt = load i32* %EltAddr
1465     //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
1466     //
1467     CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
1468     Builder.SetInsertPoint(InsertPt);
1469
1470     Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
1471                                               "Ptr" + Twine(Idx));
1472     LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
1473                                                "Load" + Twine(Idx));
1474     VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx),
1475                                           "Res" + Twine(Idx));
1476
1477     // Create "else" block, fill it in the next iteration
1478     BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
1479     Builder.SetInsertPoint(InsertPt);
1480     Instruction *OldBr = IfBlock->getTerminator();
1481     BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
1482     OldBr->eraseFromParent();
1483     PrevIfBlock = IfBlock;
1484     IfBlock = NewIfBlock;
1485   }
1486
1487   Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
1488   Phi->addIncoming(VResult, CondBlock);
1489   Phi->addIncoming(PrevPhi, PrevIfBlock);
1490   Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
1491   CI->replaceAllUsesWith(NewI);
1492   CI->eraseFromParent();
1493 }
1494
1495 // Translate a masked scatter intrinsic, like
1496 // void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4,
1497 //                                  <16 x i1> %Mask)
1498 // to a chain of basic blocks, that stores element one-by-one if
1499 // the appropriate mask bit is set.
1500 //
1501 // % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
1502 // % Mask0 = extractelement <16 x i1> % Mask, i32 0
1503 // % ToStore0 = icmp eq i1 % Mask0, true
1504 // br i1 %ToStore0, label %cond.store, label %else
1505 //
1506 // cond.store:
1507 // % Elt0 = extractelement <16 x i32> %Src, i32 0
1508 // % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
1509 // store i32 %Elt0, i32* % Ptr0, align 4
1510 // br label %else
1511 //
1512 // else:
1513 // % Mask1 = extractelement <16 x i1> % Mask, i32 1
1514 // % ToStore1 = icmp eq i1 % Mask1, true
1515 // br i1 % ToStore1, label %cond.store1, label %else2
1516 //
1517 // cond.store1:
1518 // % Elt1 = extractelement <16 x i32> %Src, i32 1
1519 // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
1520 // store i32 % Elt1, i32* % Ptr1, align 4
1521 // br label %else2
1522 //   . . .
1523 static void ScalarizeMaskedScatter(CallInst *CI) {
1524   Value *Src = CI->getArgOperand(0);
1525   Value *Ptrs = CI->getArgOperand(1);
1526   Value *Alignment = CI->getArgOperand(2);
1527   Value *Mask = CI->getArgOperand(3);
1528
1529   assert(isa<VectorType>(Src->getType()) &&
1530          "Unexpected data type in masked scatter intrinsic");
1531   assert(isa<VectorType>(Ptrs->getType()) &&
1532          isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
1533          "Vector of pointers is expected in masked scatter intrinsic");
1534
1535   IRBuilder<> Builder(CI->getContext());
1536   Instruction *InsertPt = CI;
1537   BasicBlock *IfBlock = CI->getParent();
1538   Builder.SetInsertPoint(InsertPt);
1539   Builder.SetCurrentDebugLocation(CI->getDebugLoc());
1540
1541   unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
1542   unsigned VectorWidth = Src->getType()->getVectorNumElements();
1543
1544   // Shorten the way if the mask is a vector of constants.
1545   bool IsConstMask = isa<ConstantVector>(Mask);
1546
1547   if (IsConstMask) {
1548     for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
1549       if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
1550         continue;
1551       Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
1552                                                    "Elt" + Twine(Idx));
1553       Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
1554                                                 "Ptr" + Twine(Idx));
1555       Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
1556     }
1557     CI->eraseFromParent();
1558     return;
1559   }
1560   for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
1561     // Fill the "else" block, created in the previous iteration
1562     //
1563     //  % Mask1 = extractelement <16 x i1> % Mask, i32 Idx
1564     //  % ToStore = icmp eq i1 % Mask1, true
1565     //  br i1 % ToStore, label %cond.store, label %else
1566     //
1567     Value *Predicate = Builder.CreateExtractElement(Mask,
1568                                                     Builder.getInt32(Idx),
1569                                                     "Mask" + Twine(Idx));
1570     Value *Cmp =
1571        Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
1572                           ConstantInt::get(Predicate->getType(), 1),
1573                           "ToStore" + Twine(Idx));
1574
1575     // Create "cond" block
1576     //
1577     //  % Elt1 = extractelement <16 x i32> %Src, i32 1
1578     //  % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
1579     //  %store i32 % Elt1, i32* % Ptr1
1580     //
1581     BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
1582     Builder.SetInsertPoint(InsertPt);
1583
1584     Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
1585                                                  "Elt" + Twine(Idx));
1586     Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
1587                                               "Ptr" + Twine(Idx));
1588     Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
1589
1590     // Create "else" block, fill it in the next iteration
1591     BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
1592     Builder.SetInsertPoint(InsertPt);
1593     Instruction *OldBr = IfBlock->getTerminator();
1594     BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
1595     OldBr->eraseFromParent();
1596     IfBlock = NewIfBlock;
1597   }
1598   CI->eraseFromParent();
1599 }
1600
1601 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
1602   BasicBlock *BB = CI->getParent();
1603
1604   // Lower inline assembly if we can.
1605   // If we found an inline asm expession, and if the target knows how to
1606   // lower it to normal LLVM code, do so now.
1607   if (TLI && isa<InlineAsm>(CI->getCalledValue())) {
1608     if (TLI->ExpandInlineAsm(CI)) {
1609       // Avoid invalidating the iterator.
1610       CurInstIterator = BB->begin();
1611       // Avoid processing instructions out of order, which could cause
1612       // reuse before a value is defined.
1613       SunkAddrs.clear();
1614       return true;
1615     }
1616     // Sink address computing for memory operands into the block.
1617     if (optimizeInlineAsmInst(CI))
1618       return true;
1619   }
1620
1621   // Align the pointer arguments to this call if the target thinks it's a good
1622   // idea
1623   unsigned MinSize, PrefAlign;
1624   if (TLI && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
1625     for (auto &Arg : CI->arg_operands()) {
1626       // We want to align both objects whose address is used directly and
1627       // objects whose address is used in casts and GEPs, though it only makes
1628       // sense for GEPs if the offset is a multiple of the desired alignment and
1629       // if size - offset meets the size threshold.
1630       if (!Arg->getType()->isPointerTy())
1631         continue;
1632       APInt Offset(DL->getPointerSizeInBits(
1633                        cast<PointerType>(Arg->getType())->getAddressSpace()),
1634                    0);
1635       Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
1636       uint64_t Offset2 = Offset.getLimitedValue();
1637       if ((Offset2 & (PrefAlign-1)) != 0)
1638         continue;
1639       AllocaInst *AI;
1640       if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign &&
1641           DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
1642         AI->setAlignment(PrefAlign);
1643       // Global variables can only be aligned if they are defined in this
1644       // object (i.e. they are uniquely initialized in this object), and
1645       // over-aligning global variables that have an explicit section is
1646       // forbidden.
1647       GlobalVariable *GV;
1648       if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->hasUniqueInitializer() &&
1649           !GV->hasSection() && GV->getAlignment() < PrefAlign &&
1650           DL->getTypeAllocSize(GV->getType()->getElementType()) >=
1651               MinSize + Offset2)
1652         GV->setAlignment(PrefAlign);
1653     }
1654     // If this is a memcpy (or similar) then we may be able to improve the
1655     // alignment
1656     if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
1657       unsigned Align = getKnownAlignment(MI->getDest(), *DL);
1658       if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI))
1659         Align = std::min(Align, getKnownAlignment(MTI->getSource(), *DL));
1660       if (Align > MI->getAlignment())
1661         MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), Align));
1662     }
1663   }
1664
1665   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
1666   if (II) {
1667     switch (II->getIntrinsicID()) {
1668     default: break;
1669     case Intrinsic::objectsize: {
1670       // Lower all uses of llvm.objectsize.*
1671       bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
1672       Type *ReturnTy = CI->getType();
1673       Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
1674
1675       // Substituting this can cause recursive simplifications, which can
1676       // invalidate our iterator.  Use a WeakVH to hold onto it in case this
1677       // happens.
1678       WeakVH IterHandle(&*CurInstIterator);
1679
1680       replaceAndRecursivelySimplify(CI, RetVal,
1681                                     TLInfo, nullptr);
1682
1683       // If the iterator instruction was recursively deleted, start over at the
1684       // start of the block.
1685       if (IterHandle != CurInstIterator.getNodePtrUnchecked()) {
1686         CurInstIterator = BB->begin();
1687         SunkAddrs.clear();
1688       }
1689       return true;
1690     }
1691     case Intrinsic::masked_load: {
1692       // Scalarize unsupported vector masked load
1693       if (!TTI->isLegalMaskedLoad(CI->getType())) {
1694         ScalarizeMaskedLoad(CI);
1695         ModifiedDT = true;
1696         return true;
1697       }
1698       return false;
1699     }
1700     case Intrinsic::masked_store: {
1701       if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) {
1702         ScalarizeMaskedStore(CI);
1703         ModifiedDT = true;
1704         return true;
1705       }
1706       return false;
1707     }
1708     case Intrinsic::masked_gather: {
1709       if (!TTI->isLegalMaskedGather(CI->getType())) {
1710         ScalarizeMaskedGather(CI);
1711         ModifiedDT = true;
1712         return true;
1713       }
1714       return false;
1715     }
1716     case Intrinsic::masked_scatter: {
1717       if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
1718         ScalarizeMaskedScatter(CI);
1719         ModifiedDT = true;
1720         return true;
1721       }
1722       return false;
1723     }
1724     case Intrinsic::aarch64_stlxr:
1725     case Intrinsic::aarch64_stxr: {
1726       ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
1727       if (!ExtVal || !ExtVal->hasOneUse() ||
1728           ExtVal->getParent() == CI->getParent())
1729         return false;
1730       // Sink a zext feeding stlxr/stxr before it, so it can be folded into it.
1731       ExtVal->moveBefore(CI);
1732       // Mark this instruction as "inserted by CGP", so that other
1733       // optimizations don't touch it.
1734       InsertedInsts.insert(ExtVal);
1735       return true;
1736     }
1737     case Intrinsic::invariant_group_barrier:
1738       II->replaceAllUsesWith(II->getArgOperand(0));
1739       II->eraseFromParent();
1740       return true;
1741     }
1742
1743     if (TLI) {
1744       // Unknown address space.
1745       // TODO: Target hook to pick which address space the intrinsic cares
1746       // about?
1747       unsigned AddrSpace = ~0u;
1748       SmallVector<Value*, 2> PtrOps;
1749       Type *AccessTy;
1750       if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy, AddrSpace))
1751         while (!PtrOps.empty())
1752           if (optimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy, AddrSpace))
1753             return true;
1754     }
1755   }
1756
1757   // From here on out we're working with named functions.
1758   if (!CI->getCalledFunction()) return false;
1759
1760   // Lower all default uses of _chk calls.  This is very similar
1761   // to what InstCombineCalls does, but here we are only lowering calls
1762   // to fortified library functions (e.g. __memcpy_chk) that have the default
1763   // "don't know" as the objectsize.  Anything else should be left alone.
1764   FortifiedLibCallSimplifier Simplifier(TLInfo, true);
1765   if (Value *V = Simplifier.optimizeCall(CI)) {
1766     CI->replaceAllUsesWith(V);
1767     CI->eraseFromParent();
1768     return true;
1769   }
1770   return false;
1771 }
1772
1773 /// Look for opportunities to duplicate return instructions to the predecessor
1774 /// to enable tail call optimizations. The case it is currently looking for is:
1775 /// @code
1776 /// bb0:
1777 ///   %tmp0 = tail call i32 @f0()
1778 ///   br label %return
1779 /// bb1:
1780 ///   %tmp1 = tail call i32 @f1()
1781 ///   br label %return
1782 /// bb2:
1783 ///   %tmp2 = tail call i32 @f2()
1784 ///   br label %return
1785 /// return:
1786 ///   %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
1787 ///   ret i32 %retval
1788 /// @endcode
1789 ///
1790 /// =>
1791 ///
1792 /// @code
1793 /// bb0:
1794 ///   %tmp0 = tail call i32 @f0()
1795 ///   ret i32 %tmp0
1796 /// bb1:
1797 ///   %tmp1 = tail call i32 @f1()
1798 ///   ret i32 %tmp1
1799 /// bb2:
1800 ///   %tmp2 = tail call i32 @f2()
1801 ///   ret i32 %tmp2
1802 /// @endcode
1803 bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) {
1804   if (!TLI)
1805     return false;
1806
1807   ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator());
1808   if (!RI)
1809     return false;
1810
1811   PHINode *PN = nullptr;
1812   BitCastInst *BCI = nullptr;
1813   Value *V = RI->getReturnValue();
1814   if (V) {
1815     BCI = dyn_cast<BitCastInst>(V);
1816     if (BCI)
1817       V = BCI->getOperand(0);
1818
1819     PN = dyn_cast<PHINode>(V);
1820     if (!PN)
1821       return false;
1822   }
1823
1824   if (PN && PN->getParent() != BB)
1825     return false;
1826
1827   // It's not safe to eliminate the sign / zero extension of the return value.
1828   // See llvm::isInTailCallPosition().
1829   const Function *F = BB->getParent();
1830   AttributeSet CallerAttrs = F->getAttributes();
1831   if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) ||
1832       CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
1833     return false;
1834
1835   // Make sure there are no instructions between the PHI and return, or that the
1836   // return is the first instruction in the block.
1837   if (PN) {
1838     BasicBlock::iterator BI = BB->begin();
1839     do { ++BI; } while (isa<DbgInfoIntrinsic>(BI));
1840     if (&*BI == BCI)
1841       // Also skip over the bitcast.
1842       ++BI;
1843     if (&*BI != RI)
1844       return false;
1845   } else {
1846     BasicBlock::iterator BI = BB->begin();
1847     while (isa<DbgInfoIntrinsic>(BI)) ++BI;
1848     if (&*BI != RI)
1849       return false;
1850   }
1851
1852   /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail
1853   /// call.
1854   SmallVector<CallInst*, 4> TailCalls;
1855   if (PN) {
1856     for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
1857       CallInst *CI = dyn_cast<CallInst>(PN->getIncomingValue(I));
1858       // Make sure the phi value is indeed produced by the tail call.
1859       if (CI && CI->hasOneUse() && CI->getParent() == PN->getIncomingBlock(I) &&
1860           TLI->mayBeEmittedAsTailCall(CI))
1861         TailCalls.push_back(CI);
1862     }
1863   } else {
1864     SmallPtrSet<BasicBlock*, 4> VisitedBBs;
1865     for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
1866       if (!VisitedBBs.insert(*PI).second)
1867         continue;
1868
1869       BasicBlock::InstListType &InstList = (*PI)->getInstList();
1870       BasicBlock::InstListType::reverse_iterator RI = InstList.rbegin();
1871       BasicBlock::InstListType::reverse_iterator RE = InstList.rend();
1872       do { ++RI; } while (RI != RE && isa<DbgInfoIntrinsic>(&*RI));
1873       if (RI == RE)
1874         continue;
1875
1876       CallInst *CI = dyn_cast<CallInst>(&*RI);
1877       if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI))
1878         TailCalls.push_back(CI);
1879     }
1880   }
1881
1882   bool Changed = false;
1883   for (unsigned i = 0, e = TailCalls.size(); i != e; ++i) {
1884     CallInst *CI = TailCalls[i];
1885     CallSite CS(CI);
1886
1887     // Conservatively require the attributes of the call to match those of the
1888     // return. Ignore noalias because it doesn't affect the call sequence.
1889     AttributeSet CalleeAttrs = CS.getAttributes();
1890     if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex).
1891           removeAttribute(Attribute::NoAlias) !=
1892         AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex).
1893           removeAttribute(Attribute::NoAlias))
1894       continue;
1895
1896     // Make sure the call instruction is followed by an unconditional branch to
1897     // the return block.
1898     BasicBlock *CallBB = CI->getParent();
1899     BranchInst *BI = dyn_cast<BranchInst>(CallBB->getTerminator());
1900     if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB)
1901       continue;
1902
1903     // Duplicate the return into CallBB.
1904     (void)FoldReturnIntoUncondBranch(RI, BB, CallBB);
1905     ModifiedDT = Changed = true;
1906     ++NumRetsDup;
1907   }
1908
1909   // If we eliminated all predecessors of the block, delete the block now.
1910   if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
1911     BB->eraseFromParent();
1912
1913   return Changed;
1914 }
1915
1916 //===----------------------------------------------------------------------===//
1917 // Memory Optimization
1918 //===----------------------------------------------------------------------===//
1919
1920 namespace {
1921
1922 /// This is an extended version of TargetLowering::AddrMode
1923 /// which holds actual Value*'s for register values.
1924 struct ExtAddrMode : public TargetLowering::AddrMode {
1925   Value *BaseReg;
1926   Value *ScaledReg;
1927   ExtAddrMode() : BaseReg(nullptr), ScaledReg(nullptr) {}
1928   void print(raw_ostream &OS) const;
1929   void dump() const;
1930
1931   bool operator==(const ExtAddrMode& O) const {
1932     return (BaseReg == O.BaseReg) && (ScaledReg == O.ScaledReg) &&
1933            (BaseGV == O.BaseGV) && (BaseOffs == O.BaseOffs) &&
1934            (HasBaseReg == O.HasBaseReg) && (Scale == O.Scale);
1935   }
1936 };
1937
1938 #ifndef NDEBUG
1939 static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
1940   AM.print(OS);
1941   return OS;
1942 }
1943 #endif
1944
1945 void ExtAddrMode::print(raw_ostream &OS) const {
1946   bool NeedPlus = false;
1947   OS << "[";
1948   if (BaseGV) {
1949     OS << (NeedPlus ? " + " : "")
1950        << "GV:";
1951     BaseGV->printAsOperand(OS, /*PrintType=*/false);
1952     NeedPlus = true;
1953   }
1954
1955   if (BaseOffs) {
1956     OS << (NeedPlus ? " + " : "")
1957        << BaseOffs;
1958     NeedPlus = true;
1959   }
1960
1961   if (BaseReg) {
1962     OS << (NeedPlus ? " + " : "")
1963        << "Base:";
1964     BaseReg->printAsOperand(OS, /*PrintType=*/false);
1965     NeedPlus = true;
1966   }
1967   if (Scale) {
1968     OS << (NeedPlus ? " + " : "")
1969        << Scale << "*";
1970     ScaledReg->printAsOperand(OS, /*PrintType=*/false);
1971   }
1972
1973   OS << ']';
1974 }
1975
1976 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1977 void ExtAddrMode::dump() const {
1978   print(dbgs());
1979   dbgs() << '\n';
1980 }
1981 #endif
1982
1983 /// \brief This class provides transaction based operation on the IR.
1984 /// Every change made through this class is recorded in the internal state and
1985 /// can be undone (rollback) until commit is called.
1986 class TypePromotionTransaction {
1987
1988   /// \brief This represents the common interface of the individual transaction.
1989   /// Each class implements the logic for doing one specific modification on
1990   /// the IR via the TypePromotionTransaction.
1991   class TypePromotionAction {
1992   protected:
1993     /// The Instruction modified.
1994     Instruction *Inst;
1995
1996   public:
1997     /// \brief Constructor of the action.
1998     /// The constructor performs the related action on the IR.
1999     TypePromotionAction(Instruction *Inst) : Inst(Inst) {}
2000
2001     virtual ~TypePromotionAction() {}
2002
2003     /// \brief Undo the modification done by this action.
2004     /// When this method is called, the IR must be in the same state as it was
2005     /// before this action was applied.
2006     /// \pre Undoing the action works if and only if the IR is in the exact same
2007     /// state as it was directly after this action was applied.
2008     virtual void undo() = 0;
2009
2010     /// \brief Advocate every change made by this action.
2011     /// When the results on the IR of the action are to be kept, it is important
2012     /// to call this function, otherwise hidden information may be kept forever.
2013     virtual void commit() {
2014       // Nothing to be done, this action is not doing anything.
2015     }
2016   };
2017
2018   /// \brief Utility to remember the position of an instruction.
2019   class InsertionHandler {
2020     /// Position of an instruction.
2021     /// Either an instruction:
2022     /// - Is the first in a basic block: BB is used.
2023     /// - Has a previous instructon: PrevInst is used.
2024     union {
2025       Instruction *PrevInst;
2026       BasicBlock *BB;
2027     } Point;
2028     /// Remember whether or not the instruction had a previous instruction.
2029     bool HasPrevInstruction;
2030
2031   public:
2032     /// \brief Record the position of \p Inst.
2033     InsertionHandler(Instruction *Inst) {
2034       BasicBlock::iterator It = Inst->getIterator();
2035       HasPrevInstruction = (It != (Inst->getParent()->begin()));
2036       if (HasPrevInstruction)
2037         Point.PrevInst = &*--It;
2038       else
2039         Point.BB = Inst->getParent();
2040     }
2041
2042     /// \brief Insert \p Inst at the recorded position.
2043     void insert(Instruction *Inst) {
2044       if (HasPrevInstruction) {
2045         if (Inst->getParent())
2046           Inst->removeFromParent();
2047         Inst->insertAfter(Point.PrevInst);
2048       } else {
2049         Instruction *Position = &*Point.BB->getFirstInsertionPt();
2050         if (Inst->getParent())
2051           Inst->moveBefore(Position);
2052         else
2053           Inst->insertBefore(Position);
2054       }
2055     }
2056   };
2057
2058   /// \brief Move an instruction before another.
2059   class InstructionMoveBefore : public TypePromotionAction {
2060     /// Original position of the instruction.
2061     InsertionHandler Position;
2062
2063   public:
2064     /// \brief Move \p Inst before \p Before.
2065     InstructionMoveBefore(Instruction *Inst, Instruction *Before)
2066         : TypePromotionAction(Inst), Position(Inst) {
2067       DEBUG(dbgs() << "Do: move: " << *Inst << "\nbefore: " << *Before << "\n");
2068       Inst->moveBefore(Before);
2069     }
2070
2071     /// \brief Move the instruction back to its original position.
2072     void undo() override {
2073       DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << "\n");
2074       Position.insert(Inst);
2075     }
2076   };
2077
2078   /// \brief Set the operand of an instruction with a new value.
2079   class OperandSetter : public TypePromotionAction {
2080     /// Original operand of the instruction.
2081     Value *Origin;
2082     /// Index of the modified instruction.
2083     unsigned Idx;
2084
2085   public:
2086     /// \brief Set \p Idx operand of \p Inst with \p NewVal.
2087     OperandSetter(Instruction *Inst, unsigned Idx, Value *NewVal)
2088         : TypePromotionAction(Inst), Idx(Idx) {
2089       DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n"
2090                    << "for:" << *Inst << "\n"
2091                    << "with:" << *NewVal << "\n");
2092       Origin = Inst->getOperand(Idx);
2093       Inst->setOperand(Idx, NewVal);
2094     }
2095
2096     /// \brief Restore the original value of the instruction.
2097     void undo() override {
2098       DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n"
2099                    << "for: " << *Inst << "\n"
2100                    << "with: " << *Origin << "\n");
2101       Inst->setOperand(Idx, Origin);
2102     }
2103   };
2104
2105   /// \brief Hide the operands of an instruction.
2106   /// Do as if this instruction was not using any of its operands.
2107   class OperandsHider : public TypePromotionAction {
2108     /// The list of original operands.
2109     SmallVector<Value *, 4> OriginalValues;
2110
2111   public:
2112     /// \brief Remove \p Inst from the uses of the operands of \p Inst.
2113     OperandsHider(Instruction *Inst) : TypePromotionAction(Inst) {
2114       DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n");
2115       unsigned NumOpnds = Inst->getNumOperands();
2116       OriginalValues.reserve(NumOpnds);
2117       for (unsigned It = 0; It < NumOpnds; ++It) {
2118         // Save the current operand.
2119         Value *Val = Inst->getOperand(It);
2120         OriginalValues.push_back(Val);
2121         // Set a dummy one.
2122         // We could use OperandSetter here, but that would imply an overhead
2123         // that we are not willing to pay.
2124         Inst->setOperand(It, UndefValue::get(Val->getType()));
2125       }
2126     }
2127
2128     /// \brief Restore the original list of uses.
2129     void undo() override {
2130       DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n");
2131       for (unsigned It = 0, EndIt = OriginalValues.size(); It != EndIt; ++It)
2132         Inst->setOperand(It, OriginalValues[It]);
2133     }
2134   };
2135
2136   /// \brief Build a truncate instruction.
2137   class TruncBuilder : public TypePromotionAction {
2138     Value *Val;
2139   public:
2140     /// \brief Build a truncate instruction of \p Opnd producing a \p Ty
2141     /// result.
2142     /// trunc Opnd to Ty.
2143     TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) {
2144       IRBuilder<> Builder(Opnd);
2145       Val = Builder.CreateTrunc(Opnd, Ty, "promoted");
2146       DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n");
2147     }
2148
2149     /// \brief Get the built value.
2150     Value *getBuiltValue() { return Val; }
2151
2152     /// \brief Remove the built instruction.
2153     void undo() override {
2154       DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n");
2155       if (Instruction *IVal = dyn_cast<Instruction>(Val))
2156         IVal->eraseFromParent();
2157     }
2158   };
2159
2160   /// \brief Build a sign extension instruction.
2161   class SExtBuilder : public TypePromotionAction {
2162     Value *Val;
2163   public:
2164     /// \brief Build a sign extension instruction of \p Opnd producing a \p Ty
2165     /// result.
2166     /// sext Opnd to Ty.
2167     SExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
2168         : TypePromotionAction(InsertPt) {
2169       IRBuilder<> Builder(InsertPt);
2170       Val = Builder.CreateSExt(Opnd, Ty, "promoted");
2171       DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n");
2172     }
2173
2174     /// \brief Get the built value.
2175     Value *getBuiltValue() { return Val; }
2176
2177     /// \brief Remove the built instruction.
2178     void undo() override {
2179       DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n");
2180       if (Instruction *IVal = dyn_cast<Instruction>(Val))
2181         IVal->eraseFromParent();
2182     }
2183   };
2184
2185   /// \brief Build a zero extension instruction.
2186   class ZExtBuilder : public TypePromotionAction {
2187     Value *Val;
2188   public:
2189     /// \brief Build a zero extension instruction of \p Opnd producing a \p Ty
2190     /// result.
2191     /// zext Opnd to Ty.
2192     ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
2193         : TypePromotionAction(InsertPt) {
2194       IRBuilder<> Builder(InsertPt);
2195       Val = Builder.CreateZExt(Opnd, Ty, "promoted");
2196       DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n");
2197     }
2198
2199     /// \brief Get the built value.
2200     Value *getBuiltValue() { return Val; }
2201
2202     /// \brief Remove the built instruction.
2203     void undo() override {
2204       DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n");
2205       if (Instruction *IVal = dyn_cast<Instruction>(Val))
2206         IVal->eraseFromParent();
2207     }
2208   };
2209
2210   /// \brief Mutate an instruction to another type.
2211   class TypeMutator : public TypePromotionAction {
2212     /// Record the original type.
2213     Type *OrigTy;
2214
2215   public:
2216     /// \brief Mutate the type of \p Inst into \p NewTy.
2217     TypeMutator(Instruction *Inst, Type *NewTy)
2218         : TypePromotionAction(Inst), OrigTy(Inst->getType()) {
2219       DEBUG(dbgs() << "Do: MutateType: " << *Inst << " with " << *NewTy
2220                    << "\n");
2221       Inst->mutateType(NewTy);
2222     }
2223
2224     /// \brief Mutate the instruction back to its original type.
2225     void undo() override {
2226       DEBUG(dbgs() << "Undo: MutateType: " << *Inst << " with " << *OrigTy
2227                    << "\n");
2228       Inst->mutateType(OrigTy);
2229     }
2230   };
2231
2232   /// \brief Replace the uses of an instruction by another instruction.
2233   class UsesReplacer : public TypePromotionAction {
2234     /// Helper structure to keep track of the replaced uses.
2235     struct InstructionAndIdx {
2236       /// The instruction using the instruction.
2237       Instruction *Inst;
2238       /// The index where this instruction is used for Inst.
2239       unsigned Idx;
2240       InstructionAndIdx(Instruction *Inst, unsigned Idx)
2241           : Inst(Inst), Idx(Idx) {}
2242     };
2243
2244     /// Keep track of the original uses (pair Instruction, Index).
2245     SmallVector<InstructionAndIdx, 4> OriginalUses;
2246     typedef SmallVectorImpl<InstructionAndIdx>::iterator use_iterator;
2247
2248   public:
2249     /// \brief Replace all the use of \p Inst by \p New.
2250     UsesReplacer(Instruction *Inst, Value *New) : TypePromotionAction(Inst) {
2251       DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New
2252                    << "\n");
2253       // Record the original uses.
2254       for (Use &U : Inst->uses()) {
2255         Instruction *UserI = cast<Instruction>(U.getUser());
2256         OriginalUses.push_back(InstructionAndIdx(UserI, U.getOperandNo()));
2257       }
2258       // Now, we can replace the uses.
2259       Inst->replaceAllUsesWith(New);
2260     }
2261
2262     /// \brief Reassign the original uses of Inst to Inst.
2263     void undo() override {
2264       DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n");
2265       for (use_iterator UseIt = OriginalUses.begin(),
2266                         EndIt = OriginalUses.end();
2267            UseIt != EndIt; ++UseIt) {
2268         UseIt->Inst->setOperand(UseIt->Idx, Inst);
2269       }
2270     }
2271   };
2272
2273   /// \brief Remove an instruction from the IR.
2274   class InstructionRemover : public TypePromotionAction {
2275     /// Original position of the instruction.
2276     InsertionHandler Inserter;
2277     /// Helper structure to hide all the link to the instruction. In other
2278     /// words, this helps to do as if the instruction was removed.
2279     OperandsHider Hider;
2280     /// Keep track of the uses replaced, if any.
2281     UsesReplacer *Replacer;
2282
2283   public:
2284     /// \brief Remove all reference of \p Inst and optinally replace all its
2285     /// uses with New.
2286     /// \pre If !Inst->use_empty(), then New != nullptr
2287     InstructionRemover(Instruction *Inst, Value *New = nullptr)
2288         : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst),
2289           Replacer(nullptr) {
2290       if (New)
2291         Replacer = new UsesReplacer(Inst, New);
2292       DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");
2293       Inst->removeFromParent();
2294     }
2295
2296     ~InstructionRemover() override { delete Replacer; }
2297
2298     /// \brief Really remove the instruction.
2299     void commit() override { delete Inst; }
2300
2301     /// \brief Resurrect the instruction and reassign it to the proper uses if
2302     /// new value was provided when build this action.
2303     void undo() override {
2304       DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n");
2305       Inserter.insert(Inst);
2306       if (Replacer)
2307         Replacer->undo();
2308       Hider.undo();
2309     }
2310   };
2311
2312 public:
2313   /// Restoration point.
2314   /// The restoration point is a pointer to an action instead of an iterator
2315   /// because the iterator may be invalidated but not the pointer.
2316   typedef const TypePromotionAction *ConstRestorationPt;
2317   /// Advocate every changes made in that transaction.
2318   void commit();
2319   /// Undo all the changes made after the given point.
2320   void rollback(ConstRestorationPt Point);
2321   /// Get the current restoration point.
2322   ConstRestorationPt getRestorationPoint() const;
2323
2324   /// \name API for IR modification with state keeping to support rollback.
2325   /// @{
2326   /// Same as Instruction::setOperand.
2327   void setOperand(Instruction *Inst, unsigned Idx, Value *NewVal);
2328   /// Same as Instruction::eraseFromParent.
2329   void eraseInstruction(Instruction *Inst, Value *NewVal = nullptr);
2330   /// Same as Value::replaceAllUsesWith.
2331   void replaceAllUsesWith(Instruction *Inst, Value *New);
2332   /// Same as Value::mutateType.
2333   void mutateType(Instruction *Inst, Type *NewTy);
2334   /// Same as IRBuilder::createTrunc.
2335   Value *createTrunc(Instruction *Opnd, Type *Ty);
2336   /// Same as IRBuilder::createSExt.
2337   Value *createSExt(Instruction *Inst, Value *Opnd, Type *Ty);
2338   /// Same as IRBuilder::createZExt.
2339   Value *createZExt(Instruction *Inst, Value *Opnd, Type *Ty);
2340   /// Same as Instruction::moveBefore.
2341   void moveBefore(Instruction *Inst, Instruction *Before);
2342   /// @}
2343
2344 private:
2345   /// The ordered list of actions made so far.
2346   SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions;
2347   typedef SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator CommitPt;
2348 };
2349
2350 void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
2351                                           Value *NewVal) {
2352   Actions.push_back(
2353       make_unique<TypePromotionTransaction::OperandSetter>(Inst, Idx, NewVal));
2354 }
2355
2356 void TypePromotionTransaction::eraseInstruction(Instruction *Inst,
2357                                                 Value *NewVal) {
2358   Actions.push_back(
2359       make_unique<TypePromotionTransaction::InstructionRemover>(Inst, NewVal));
2360 }
2361
2362 void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst,
2363                                                   Value *New) {
2364   Actions.push_back(make_unique<TypePromotionTransaction::UsesReplacer>(Inst, New));
2365 }
2366
2367 void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) {
2368   Actions.push_back(make_unique<TypePromotionTransaction::TypeMutator>(Inst, NewTy));
2369 }
2370
2371 Value *TypePromotionTransaction::createTrunc(Instruction *Opnd,
2372                                              Type *Ty) {
2373   std::unique_ptr<TruncBuilder> Ptr(new TruncBuilder(Opnd, Ty));
2374   Value *Val = Ptr->getBuiltValue();
2375   Actions.push_back(std::move(Ptr));
2376   return Val;
2377 }
2378
2379 Value *TypePromotionTransaction::createSExt(Instruction *Inst,
2380                                             Value *Opnd, Type *Ty) {
2381   std::unique_ptr<SExtBuilder> Ptr(new SExtBuilder(Inst, Opnd, Ty));
2382   Value *Val = Ptr->getBuiltValue();
2383   Actions.push_back(std::move(Ptr));
2384   return Val;
2385 }
2386
2387 Value *TypePromotionTransaction::createZExt(Instruction *Inst,
2388                                             Value *Opnd, Type *Ty) {
2389   std::unique_ptr<ZExtBuilder> Ptr(new ZExtBuilder(Inst, Opnd, Ty));
2390   Value *Val = Ptr->getBuiltValue();
2391   Actions.push_back(std::move(Ptr));
2392   return Val;
2393 }
2394
2395 void TypePromotionTransaction::moveBefore(Instruction *Inst,
2396                                           Instruction *Before) {
2397   Actions.push_back(
2398       make_unique<TypePromotionTransaction::InstructionMoveBefore>(Inst, Before));
2399 }
2400
2401 TypePromotionTransaction::ConstRestorationPt
2402 TypePromotionTransaction::getRestorationPoint() const {
2403   return !Actions.empty() ? Actions.back().get() : nullptr;
2404 }
2405
2406 void TypePromotionTransaction::commit() {
2407   for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt;
2408        ++It)
2409     (*It)->commit();
2410   Actions.clear();
2411 }
2412
2413 void TypePromotionTransaction::rollback(
2414     TypePromotionTransaction::ConstRestorationPt Point) {
2415   while (!Actions.empty() && Point != Actions.back().get()) {
2416     std::unique_ptr<TypePromotionAction> Curr = Actions.pop_back_val();
2417     Curr->undo();
2418   }
2419 }
2420
2421 /// \brief A helper class for matching addressing modes.
2422 ///
2423 /// This encapsulates the logic for matching the target-legal addressing modes.
2424 class AddressingModeMatcher {
2425   SmallVectorImpl<Instruction*> &AddrModeInsts;
2426   const TargetMachine &TM;
2427   const TargetLowering &TLI;
2428   const DataLayout &DL;
2429
2430   /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
2431   /// the memory instruction that we're computing this address for.
2432   Type *AccessTy;
2433   unsigned AddrSpace;
2434   Instruction *MemoryInst;
2435
2436   /// This is the addressing mode that we're building up. This is
2437   /// part of the return value of this addressing mode matching stuff.
2438   ExtAddrMode &AddrMode;
2439
2440   /// The instructions inserted by other CodeGenPrepare optimizations.
2441   const SetOfInstrs &InsertedInsts;
2442   /// A map from the instructions to their type before promotion.
2443   InstrToOrigTy &PromotedInsts;
2444   /// The ongoing transaction where every action should be registered.
2445   TypePromotionTransaction &TPT;
2446
2447   /// This is set to true when we should not do profitability checks.
2448   /// When true, IsProfitableToFoldIntoAddressingMode always returns true.
2449   bool IgnoreProfitability;
2450
2451   AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI,
2452                         const TargetMachine &TM, Type *AT, unsigned AS,
2453                         Instruction *MI, ExtAddrMode &AM,
2454                         const SetOfInstrs &InsertedInsts,
2455                         InstrToOrigTy &PromotedInsts,
2456                         TypePromotionTransaction &TPT)
2457       : AddrModeInsts(AMI), TM(TM),
2458         TLI(*TM.getSubtargetImpl(*MI->getParent()->getParent())
2459                  ->getTargetLowering()),
2460         DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
2461         MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
2462         PromotedInsts(PromotedInsts), TPT(TPT) {
2463     IgnoreProfitability = false;
2464   }
2465 public:
2466
2467   /// Find the maximal addressing mode that a load/store of V can fold,
2468   /// give an access type of AccessTy.  This returns a list of involved
2469   /// instructions in AddrModeInsts.
2470   /// \p InsertedInsts The instructions inserted by other CodeGenPrepare
2471   /// optimizations.
2472   /// \p PromotedInsts maps the instructions to their type before promotion.
2473   /// \p The ongoing transaction where every action should be registered.
2474   static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS,
2475                            Instruction *MemoryInst,
2476                            SmallVectorImpl<Instruction*> &AddrModeInsts,
2477                            const TargetMachine &TM,
2478                            const SetOfInstrs &InsertedInsts,
2479                            InstrToOrigTy &PromotedInsts,
2480                            TypePromotionTransaction &TPT) {
2481     ExtAddrMode Result;
2482
2483     bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy, AS,
2484                                          MemoryInst, Result, InsertedInsts,
2485                                          PromotedInsts, TPT).matchAddr(V, 0);
2486     (void)Success; assert(Success && "Couldn't select *anything*?");
2487     return Result;
2488   }
2489 private:
2490   bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);
2491   bool matchAddr(Value *V, unsigned Depth);
2492   bool matchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth,
2493                           bool *MovedAway = nullptr);
2494   bool isProfitableToFoldIntoAddressingMode(Instruction *I,
2495                                             ExtAddrMode &AMBefore,
2496                                             ExtAddrMode &AMAfter);
2497   bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2);
2498   bool isPromotionProfitable(unsigned NewCost, unsigned OldCost,
2499                              Value *PromotedOperand) const;
2500 };
2501
2502 /// Try adding ScaleReg*Scale to the current addressing mode.
2503 /// Return true and update AddrMode if this addr mode is legal for the target,
2504 /// false if not.
2505 bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
2506                                              unsigned Depth) {
2507   // If Scale is 1, then this is the same as adding ScaleReg to the addressing
2508   // mode.  Just process that directly.
2509   if (Scale == 1)
2510     return matchAddr(ScaleReg, Depth);
2511
2512   // If the scale is 0, it takes nothing to add this.
2513   if (Scale == 0)
2514     return true;
2515
2516   // If we already have a scale of this value, we can add to it, otherwise, we
2517   // need an available scale field.
2518   if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
2519     return false;
2520
2521   ExtAddrMode TestAddrMode = AddrMode;
2522
2523   // Add scale to turn X*4+X*3 -> X*7.  This could also do things like
2524   // [A+B + A*7] -> [B+A*8].
2525   TestAddrMode.Scale += Scale;
2526   TestAddrMode.ScaledReg = ScaleReg;
2527
2528   // If the new address isn't legal, bail out.
2529   if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace))
2530     return false;
2531
2532   // It was legal, so commit it.
2533   AddrMode = TestAddrMode;
2534
2535   // Okay, we decided that we can add ScaleReg+Scale to AddrMode.  Check now
2536   // to see if ScaleReg is actually X+C.  If so, we can turn this into adding
2537   // X*Scale + C*Scale to addr mode.
2538   ConstantInt *CI = nullptr; Value *AddLHS = nullptr;
2539   if (isa<Instruction>(ScaleReg) &&  // not a constant expr.
2540       match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) {
2541     TestAddrMode.ScaledReg = AddLHS;
2542     TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale;
2543
2544     // If this addressing mode is legal, commit it and remember that we folded
2545     // this instruction.
2546     if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) {
2547       AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
2548       AddrMode = TestAddrMode;
2549       return true;
2550     }
2551   }
2552
2553   // Otherwise, not (x+c)*scale, just return what we have.
2554   return true;
2555 }
2556
2557 /// This is a little filter, which returns true if an addressing computation
2558 /// involving I might be folded into a load/store accessing it.
2559 /// This doesn't need to be perfect, but needs to accept at least
2560 /// the set of instructions that MatchOperationAddr can.
2561 static bool MightBeFoldableInst(Instruction *I) {
2562   switch (I->getOpcode()) {
2563   case Instruction::BitCast:
2564   case Instruction::AddrSpaceCast:
2565     // Don't touch identity bitcasts.
2566     if (I->getType() == I->getOperand(0)->getType())
2567       return false;
2568     return I->getType()->isPointerTy() || I->getType()->isIntegerTy();
2569   case Instruction::PtrToInt:
2570     // PtrToInt is always a noop, as we know that the int type is pointer sized.
2571     return true;
2572   case Instruction::IntToPtr:
2573     // We know the input is intptr_t, so this is foldable.
2574     return true;
2575   case Instruction::Add:
2576     return true;
2577   case Instruction::Mul:
2578   case Instruction::Shl:
2579     // Can only handle X*C and X << C.
2580     return isa<ConstantInt>(I->getOperand(1));
2581   case Instruction::GetElementPtr:
2582     return true;
2583   default:
2584     return false;
2585   }
2586 }
2587
2588 /// \brief Check whether or not \p Val is a legal instruction for \p TLI.
2589 /// \note \p Val is assumed to be the product of some type promotion.
2590 /// Therefore if \p Val has an undefined state in \p TLI, this is assumed
2591 /// to be legal, as the non-promoted value would have had the same state.
2592 static bool isPromotedInstructionLegal(const TargetLowering &TLI,
2593                                        const DataLayout &DL, Value *Val) {
2594   Instruction *PromotedInst = dyn_cast<Instruction>(Val);
2595   if (!PromotedInst)
2596     return false;
2597   int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode());
2598   // If the ISDOpcode is undefined, it was undefined before the promotion.
2599   if (!ISDOpcode)
2600     return true;
2601   // Otherwise, check if the promoted instruction is legal or not.
2602   return TLI.isOperationLegalOrCustom(
2603       ISDOpcode, TLI.getValueType(DL, PromotedInst->getType()));
2604 }
2605
2606 /// \brief Hepler class to perform type promotion.
2607 class TypePromotionHelper {
2608   /// \brief Utility function to check whether or not a sign or zero extension
2609   /// of \p Inst with \p ConsideredExtType can be moved through \p Inst by
2610   /// either using the operands of \p Inst or promoting \p Inst.
2611   /// The type of the extension is defined by \p IsSExt.
2612   /// In other words, check if:
2613   /// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType.
2614   /// #1 Promotion applies:
2615   /// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...).
2616   /// #2 Operand reuses:
2617   /// ext opnd1 to ConsideredExtType.
2618   /// \p PromotedInsts maps the instructions to their type before promotion.
2619   static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType,
2620                             const InstrToOrigTy &PromotedInsts, bool IsSExt);
2621
2622   /// \brief Utility function to determine if \p OpIdx should be promoted when
2623   /// promoting \p Inst.
2624   static bool shouldExtOperand(const Instruction *Inst, int OpIdx) {
2625     return !(isa<SelectInst>(Inst) && OpIdx == 0);
2626   }
2627
2628   /// \brief Utility function to promote the operand of \p Ext when this
2629   /// operand is a promotable trunc or sext or zext.
2630   /// \p PromotedInsts maps the instructions to their type before promotion.
2631   /// \p CreatedInstsCost[out] contains the cost of all instructions
2632   /// created to promote the operand of Ext.
2633   /// Newly added extensions are inserted in \p Exts.
2634   /// Newly added truncates are inserted in \p Truncs.
2635   /// Should never be called directly.
2636   /// \return The promoted value which is used instead of Ext.
2637   static Value *promoteOperandForTruncAndAnyExt(
2638       Instruction *Ext, TypePromotionTransaction &TPT,
2639       InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
2640       SmallVectorImpl<Instruction *> *Exts,
2641       SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI);
2642
2643   /// \brief Utility function to promote the operand of \p Ext when this
2644   /// operand is promotable and is not a supported trunc or sext.
2645   /// \p PromotedInsts maps the instructions to their type before promotion.
2646   /// \p CreatedInstsCost[out] contains the cost of all the instructions
2647   /// created to promote the operand of Ext.
2648   /// Newly added extensions are inserted in \p Exts.
2649   /// Newly added truncates are inserted in \p Truncs.
2650   /// Should never be called directly.
2651   /// \return The promoted value which is used instead of Ext.
2652   static Value *promoteOperandForOther(Instruction *Ext,
2653                                        TypePromotionTransaction &TPT,
2654                                        InstrToOrigTy &PromotedInsts,
2655                                        unsigned &CreatedInstsCost,
2656                                        SmallVectorImpl<Instruction *> *Exts,
2657                                        SmallVectorImpl<Instruction *> *Truncs,
2658                                        const TargetLowering &TLI, bool IsSExt);
2659
2660   /// \see promoteOperandForOther.
2661   static Value *signExtendOperandForOther(
2662       Instruction *Ext, TypePromotionTransaction &TPT,
2663       InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
2664       SmallVectorImpl<Instruction *> *Exts,
2665       SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
2666     return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
2667                                   Exts, Truncs, TLI, true);
2668   }
2669
2670   /// \see promoteOperandForOther.
2671   static Value *zeroExtendOperandForOther(
2672       Instruction *Ext, TypePromotionTransaction &TPT,
2673       InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
2674       SmallVectorImpl<Instruction *> *Exts,
2675       SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
2676     return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
2677                                   Exts, Truncs, TLI, false);
2678   }
2679
2680 public:
2681   /// Type for the utility function that promotes the operand of Ext.
2682   typedef Value *(*Action)(Instruction *Ext, TypePromotionTransaction &TPT,
2683                            InstrToOrigTy &PromotedInsts,
2684                            unsigned &CreatedInstsCost,
2685                            SmallVectorImpl<Instruction *> *Exts,
2686                            SmallVectorImpl<Instruction *> *Truncs,
2687                            const TargetLowering &TLI);
2688   /// \brief Given a sign/zero extend instruction \p Ext, return the approriate
2689   /// action to promote the operand of \p Ext instead of using Ext.
2690   /// \return NULL if no promotable action is possible with the current
2691   /// sign extension.
2692   /// \p InsertedInsts keeps track of all the instructions inserted by the
2693   /// other CodeGenPrepare optimizations. This information is important
2694   /// because we do not want to promote these instructions as CodeGenPrepare
2695   /// will reinsert them later. Thus creating an infinite loop: create/remove.
2696   /// \p PromotedInsts maps the instructions to their type before promotion.
2697   static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts,
2698                           const TargetLowering &TLI,
2699                           const InstrToOrigTy &PromotedInsts);
2700 };
2701
2702 bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
2703                                         Type *ConsideredExtType,
2704                                         const InstrToOrigTy &PromotedInsts,
2705                                         bool IsSExt) {
2706   // The promotion helper does not know how to deal with vector types yet.
2707   // To be able to fix that, we would need to fix the places where we
2708   // statically extend, e.g., constants and such.
2709   if (Inst->getType()->isVectorTy())
2710     return false;
2711
2712   // We can always get through zext.
2713   if (isa<ZExtInst>(Inst))
2714     return true;
2715
2716   // sext(sext) is ok too.
2717   if (IsSExt && isa<SExtInst>(Inst))
2718     return true;
2719
2720   // We can get through binary operator, if it is legal. In other words, the
2721   // binary operator must have a nuw or nsw flag.
2722   const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
2723   if (BinOp && isa<OverflowingBinaryOperator>(BinOp) &&
2724       ((!IsSExt && BinOp->hasNoUnsignedWrap()) ||
2725        (IsSExt && BinOp->hasNoSignedWrap())))
2726     return true;
2727
2728   // Check if we can do the following simplification.
2729   // ext(trunc(opnd)) --> ext(opnd)
2730   if (!isa<TruncInst>(Inst))
2731     return false;
2732
2733   Value *OpndVal = Inst->getOperand(0);
2734   // Check if we can use this operand in the extension.
2735   // If the type is larger than the result type of the extension, we cannot.
2736   if (!OpndVal->getType()->isIntegerTy() ||
2737       OpndVal->getType()->getIntegerBitWidth() >
2738           ConsideredExtType->getIntegerBitWidth())
2739     return false;
2740
2741   // If the operand of the truncate is not an instruction, we will not have
2742   // any information on the dropped bits.
2743   // (Actually we could for constant but it is not worth the extra logic).
2744   Instruction *Opnd = dyn_cast<Instruction>(OpndVal);
2745   if (!Opnd)
2746     return false;
2747
2748   // Check if the source of the type is narrow enough.
2749   // I.e., check that trunc just drops extended bits of the same kind of
2750   // the extension.
2751   // #1 get the type of the operand and check the kind of the extended bits.
2752   const Type *OpndType;
2753   InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd);
2754   if (It != PromotedInsts.end() && It->second.getInt() == IsSExt)
2755     OpndType = It->second.getPointer();
2756   else if ((IsSExt && isa<SExtInst>(Opnd)) || (!IsSExt && isa<ZExtInst>(Opnd)))
2757     OpndType = Opnd->getOperand(0)->getType();
2758   else
2759     return false;
2760
2761   // #2 check that the truncate just drops extended bits.
2762   return Inst->getType()->getIntegerBitWidth() >=
2763          OpndType->getIntegerBitWidth();
2764 }
2765
2766 TypePromotionHelper::Action TypePromotionHelper::getAction(
2767     Instruction *Ext, const SetOfInstrs &InsertedInsts,
2768     const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) {
2769   assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
2770          "Unexpected instruction type");
2771   Instruction *ExtOpnd = dyn_cast<Instruction>(Ext->getOperand(0));
2772   Type *ExtTy = Ext->getType();
2773   bool IsSExt = isa<SExtInst>(Ext);
2774   // If the operand of the extension is not an instruction, we cannot
2775   // get through.
2776   // If it, check we can get through.
2777   if (!ExtOpnd || !canGetThrough(ExtOpnd, ExtTy, PromotedInsts, IsSExt))
2778     return nullptr;
2779
2780   // Do not promote if the operand has been added by codegenprepare.
2781   // Otherwise, it means we are undoing an optimization that is likely to be
2782   // redone, thus causing potential infinite loop.
2783   if (isa<TruncInst>(ExtOpnd) && InsertedInsts.count(ExtOpnd))
2784     return nullptr;
2785
2786   // SExt or Trunc instructions.
2787   // Return the related handler.
2788   if (isa<SExtInst>(ExtOpnd) || isa<TruncInst>(ExtOpnd) ||
2789       isa<ZExtInst>(ExtOpnd))
2790     return promoteOperandForTruncAndAnyExt;
2791
2792   // Regular instruction.
2793   // Abort early if we will have to insert non-free instructions.
2794   if (!ExtOpnd->hasOneUse() && !TLI.isTruncateFree(ExtTy, ExtOpnd->getType()))
2795     return nullptr;
2796   return IsSExt ? signExtendOperandForOther : zeroExtendOperandForOther;
2797 }
2798
2799 Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
2800     llvm::Instruction *SExt, TypePromotionTransaction &TPT,
2801     InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
2802     SmallVectorImpl<Instruction *> *Exts,
2803     SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
2804   // By construction, the operand of SExt is an instruction. Otherwise we cannot
2805   // get through it and this method should not be called.
2806   Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0));
2807   Value *ExtVal = SExt;
2808   bool HasMergedNonFreeExt = false;
2809   if (isa<ZExtInst>(SExtOpnd)) {
2810     // Replace s|zext(zext(opnd))
2811     // => zext(opnd).
2812     HasMergedNonFreeExt = !TLI.isExtFree(SExtOpnd);
2813     Value *ZExt =
2814         TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType());
2815     TPT.replaceAllUsesWith(SExt, ZExt);
2816     TPT.eraseInstruction(SExt);
2817     ExtVal = ZExt;
2818   } else {
2819     // Replace z|sext(trunc(opnd)) or sext(sext(opnd))
2820     // => z|sext(opnd).
2821     TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0));
2822   }
2823   CreatedInstsCost = 0;
2824
2825   // Remove dead code.
2826   if (SExtOpnd->use_empty())
2827     TPT.eraseInstruction(SExtOpnd);
2828
2829   // Check if the extension is still needed.
2830   Instruction *ExtInst = dyn_cast<Instruction>(ExtVal);
2831   if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) {
2832     if (ExtInst) {
2833       if (Exts)
2834         Exts->push_back(ExtInst);
2835       CreatedInstsCost = !TLI.isExtFree(ExtInst) && !HasMergedNonFreeExt;
2836     }
2837     return ExtVal;
2838   }
2839
2840   // At this point we have: ext ty opnd to ty.
2841   // Reassign the uses of ExtInst to the opnd and remove ExtInst.
2842   Value *NextVal = ExtInst->getOperand(0);
2843   TPT.eraseInstruction(ExtInst, NextVal);
2844   return NextVal;
2845 }
2846
2847 Value *TypePromotionHelper::promoteOperandForOther(
2848     Instruction *Ext, TypePromotionTransaction &TPT,
2849     InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
2850     SmallVectorImpl<Instruction *> *Exts,
2851     SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI,
2852     bool IsSExt) {
2853   // By construction, the operand of Ext is an instruction. Otherwise we cannot
2854   // get through it and this method should not be called.
2855   Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0));
2856   CreatedInstsCost = 0;
2857   if (!ExtOpnd->hasOneUse()) {
2858     // ExtOpnd will be promoted.
2859     // All its uses, but Ext, will need to use a truncated value of the
2860     // promoted version.
2861     // Create the truncate now.
2862     Value *Trunc = TPT.createTrunc(Ext, ExtOpnd->getType());
2863     if (Instruction *ITrunc = dyn_cast<Instruction>(Trunc)) {
2864       ITrunc->removeFromParent();
2865       // Insert it just after the definition.
2866       ITrunc->insertAfter(ExtOpnd);
2867       if (Truncs)
2868         Truncs->push_back(ITrunc);
2869     }
2870
2871     TPT.replaceAllUsesWith(ExtOpnd, Trunc);
2872     // Restore the operand of Ext (which has been replaced by the previous call
2873     // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext.
2874     TPT.setOperand(Ext, 0, ExtOpnd);
2875   }
2876
2877   // Get through the Instruction:
2878   // 1. Update its type.
2879   // 2. Replace the uses of Ext by Inst.
2880   // 3. Extend each operand that needs to be extended.
2881
2882   // Remember the original type of the instruction before promotion.
2883   // This is useful to know that the high bits are sign extended bits.
2884   PromotedInsts.insert(std::pair<Instruction *, TypeIsSExt>(
2885       ExtOpnd, TypeIsSExt(ExtOpnd->getType(), IsSExt)));
2886   // Step #1.
2887   TPT.mutateType(ExtOpnd, Ext->getType());
2888   // Step #2.
2889   TPT.replaceAllUsesWith(Ext, ExtOpnd);
2890   // Step #3.
2891   Instruction *ExtForOpnd = Ext;
2892
2893   DEBUG(dbgs() << "Propagate Ext to operands\n");
2894   for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx;
2895        ++OpIdx) {
2896     DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n');
2897     if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() ||
2898         !shouldExtOperand(ExtOpnd, OpIdx)) {
2899       DEBUG(dbgs() << "No need to propagate\n");
2900       continue;
2901     }
2902     // Check if we can statically extend the operand.
2903     Value *Opnd = ExtOpnd->getOperand(OpIdx);
2904     if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
2905       DEBUG(dbgs() << "Statically extend\n");
2906       unsigned BitWidth = Ext->getType()->getIntegerBitWidth();
2907       APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth)
2908                             : Cst->getValue().zext(BitWidth);
2909       TPT.setOperand(ExtOpnd, OpIdx, ConstantInt::get(Ext->getType(), CstVal));
2910       continue;
2911     }
2912     // UndefValue are typed, so we have to statically sign extend them.
2913     if (isa<UndefValue>(Opnd)) {
2914       DEBUG(dbgs() << "Statically extend\n");
2915       TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType()));
2916       continue;
2917     }
2918
2919     // Otherwise we have to explicity sign extend the operand.
2920     // Check if Ext was reused to extend an operand.
2921     if (!ExtForOpnd) {
2922       // If yes, create a new one.
2923       DEBUG(dbgs() << "More operands to ext\n");
2924       Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType())
2925         : TPT.createZExt(Ext, Opnd, Ext->getType());
2926       if (!isa<Instruction>(ValForExtOpnd)) {
2927         TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd);
2928         continue;
2929       }
2930       ExtForOpnd = cast<Instruction>(ValForExtOpnd);
2931     }
2932     if (Exts)
2933       Exts->push_back(ExtForOpnd);
2934     TPT.setOperand(ExtForOpnd, 0, Opnd);
2935
2936     // Move the sign extension before the insertion point.
2937     TPT.moveBefore(ExtForOpnd, ExtOpnd);
2938     TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd);
2939     CreatedInstsCost += !TLI.isExtFree(ExtForOpnd);
2940     // If more sext are required, new instructions will have to be created.
2941     ExtForOpnd = nullptr;
2942   }
2943   if (ExtForOpnd == Ext) {
2944     DEBUG(dbgs() << "Extension is useless now\n");
2945     TPT.eraseInstruction(Ext);
2946   }
2947   return ExtOpnd;
2948 }
2949
2950 /// Check whether or not promoting an instruction to a wider type is profitable.
2951 /// \p NewCost gives the cost of extension instructions created by the
2952 /// promotion.
2953 /// \p OldCost gives the cost of extension instructions before the promotion
2954 /// plus the number of instructions that have been
2955 /// matched in the addressing mode the promotion.
2956 /// \p PromotedOperand is the value that has been promoted.
2957 /// \return True if the promotion is profitable, false otherwise.
2958 bool AddressingModeMatcher::isPromotionProfitable(
2959     unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const {
2960   DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost << '\n');
2961   // The cost of the new extensions is greater than the cost of the
2962   // old extension plus what we folded.
2963   // This is not profitable.
2964   if (NewCost > OldCost)
2965     return false;
2966   if (NewCost < OldCost)
2967     return true;
2968   // The promotion is neutral but it may help folding the sign extension in
2969   // loads for instance.
2970   // Check that we did not create an illegal instruction.
2971   return isPromotedInstructionLegal(TLI, DL, PromotedOperand);
2972 }
2973
2974 /// Given an instruction or constant expr, see if we can fold the operation
2975 /// into the addressing mode. If so, update the addressing mode and return
2976 /// true, otherwise return false without modifying AddrMode.
2977 /// If \p MovedAway is not NULL, it contains the information of whether or
2978 /// not AddrInst has to be folded into the addressing mode on success.
2979 /// If \p MovedAway == true, \p AddrInst will not be part of the addressing
2980 /// because it has been moved away.
2981 /// Thus AddrInst must not be added in the matched instructions.
2982 /// This state can happen when AddrInst is a sext, since it may be moved away.
2983 /// Therefore, AddrInst may not be valid when MovedAway is true and it must
2984 /// not be referenced anymore.
2985 bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
2986                                                unsigned Depth,
2987                                                bool *MovedAway) {
2988   // Avoid exponential behavior on extremely deep expression trees.
2989   if (Depth >= 5) return false;
2990
2991   // By default, all matched instructions stay in place.
2992   if (MovedAway)
2993     *MovedAway = false;
2994
2995   switch (Opcode) {
2996   case Instruction::PtrToInt:
2997     // PtrToInt is always a noop, as we know that the int type is pointer sized.
2998     return matchAddr(AddrInst->getOperand(0), Depth);
2999   case Instruction::IntToPtr: {
3000     auto AS = AddrInst->getType()->getPointerAddressSpace();
3001     auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS));
3002     // This inttoptr is a no-op if the integer type is pointer sized.
3003     if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy)
3004       return matchAddr(AddrInst->getOperand(0), Depth);
3005     return false;
3006   }
3007   case Instruction::BitCast:
3008     // BitCast is always a noop, and we can handle it as long as it is
3009     // int->int or pointer->pointer (we don't want int<->fp or something).
3010     if ((AddrInst->getOperand(0)->getType()->isPointerTy() ||
3011          AddrInst->getOperand(0)->getType()->isIntegerTy()) &&
3012         // Don't touch identity bitcasts.  These were probably put here by LSR,
3013         // and we don't want to mess around with them.  Assume it knows what it
3014         // is doing.
3015         AddrInst->getOperand(0)->getType() != AddrInst->getType())
3016       return matchAddr(AddrInst->getOperand(0), Depth);
3017     return false;
3018   case Instruction::AddrSpaceCast: {
3019     unsigned SrcAS
3020       = AddrInst->getOperand(0)->getType()->getPointerAddressSpace();
3021     unsigned DestAS = AddrInst->getType()->getPointerAddressSpace();
3022     if (TLI.isNoopAddrSpaceCast(SrcAS, DestAS))
3023       return matchAddr(AddrInst->getOperand(0), Depth);
3024     return false;
3025   }
3026   case Instruction::Add: {
3027     // Check to see if we can merge in the RHS then the LHS.  If so, we win.
3028     ExtAddrMode BackupAddrMode = AddrMode;
3029     unsigned OldSize = AddrModeInsts.size();
3030     // Start a transaction at this point.
3031     // The LHS may match but not the RHS.
3032     // Therefore, we need a higher level restoration point to undo partially
3033     // matched operation.
3034     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
3035         TPT.getRestorationPoint();
3036
3037     if (matchAddr(AddrInst->getOperand(1), Depth+1) &&
3038         matchAddr(AddrInst->getOperand(0), Depth+1))
3039       return true;
3040
3041     // Restore the old addr mode info.
3042     AddrMode = BackupAddrMode;
3043     AddrModeInsts.resize(OldSize);
3044     TPT.rollback(LastKnownGood);
3045
3046     // Otherwise this was over-aggressive.  Try merging in the LHS then the RHS.
3047     if (matchAddr(AddrInst->getOperand(0), Depth+1) &&
3048         matchAddr(AddrInst->getOperand(1), Depth+1))
3049       return true;
3050
3051     // Otherwise we definitely can't merge the ADD in.
3052     AddrMode = BackupAddrMode;
3053     AddrModeInsts.resize(OldSize);
3054     TPT.rollback(LastKnownGood);
3055     break;
3056   }
3057   //case Instruction::Or:
3058   // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
3059   //break;
3060   case Instruction::Mul:
3061   case Instruction::Shl: {
3062     // Can only handle X*C and X << C.
3063     ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
3064     if (!RHS)
3065       return false;
3066     int64_t Scale = RHS->getSExtValue();
3067     if (Opcode == Instruction::Shl)
3068       Scale = 1LL << Scale;
3069
3070     return matchScaledValue(AddrInst->getOperand(0), Scale, Depth);
3071   }
3072   case Instruction::GetElementPtr: {
3073     // Scan the GEP.  We check it if it contains constant offsets and at most
3074     // one variable offset.
3075     int VariableOperand = -1;
3076     unsigned VariableScale = 0;
3077
3078     int64_t ConstantOffset = 0;
3079     gep_type_iterator GTI = gep_type_begin(AddrInst);
3080     for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
3081       if (StructType *STy = dyn_cast<StructType>(*GTI)) {
3082         const StructLayout *SL = DL.getStructLayout(STy);
3083         unsigned Idx =
3084           cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
3085         ConstantOffset += SL->getElementOffset(Idx);
3086       } else {
3087         uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType());
3088         if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
3089           ConstantOffset += CI->getSExtValue()*TypeSize;
3090         } else if (TypeSize) {  // Scales of zero don't do anything.
3091           // We only allow one variable index at the moment.
3092           if (VariableOperand != -1)
3093             return false;
3094
3095           // Remember the variable index.
3096           VariableOperand = i;
3097           VariableScale = TypeSize;
3098         }
3099       }
3100     }
3101
3102     // A common case is for the GEP to only do a constant offset.  In this case,
3103     // just add it to the disp field and check validity.
3104     if (VariableOperand == -1) {
3105       AddrMode.BaseOffs += ConstantOffset;
3106       if (ConstantOffset == 0 ||
3107           TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) {
3108         // Check to see if we can fold the base pointer in too.
3109         if (matchAddr(AddrInst->getOperand(0), Depth+1))
3110           return true;
3111       }
3112       AddrMode.BaseOffs -= ConstantOffset;
3113       return false;
3114     }
3115
3116     // Save the valid addressing mode in case we can't match.
3117     ExtAddrMode BackupAddrMode = AddrMode;
3118     unsigned OldSize = AddrModeInsts.size();
3119
3120     // See if the scale and offset amount is valid for this target.
3121     AddrMode.BaseOffs += ConstantOffset;
3122
3123     // Match the base operand of the GEP.
3124     if (!matchAddr(AddrInst->getOperand(0), Depth+1)) {
3125       // If it couldn't be matched, just stuff the value in a register.
3126       if (AddrMode.HasBaseReg) {
3127         AddrMode = BackupAddrMode;
3128         AddrModeInsts.resize(OldSize);
3129         return false;
3130       }
3131       AddrMode.HasBaseReg = true;
3132       AddrMode.BaseReg = AddrInst->getOperand(0);
3133     }
3134
3135     // Match the remaining variable portion of the GEP.
3136     if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
3137                           Depth)) {
3138       // If it couldn't be matched, try stuffing the base into a register
3139       // instead of matching it, and retrying the match of the scale.
3140       AddrMode = BackupAddrMode;
3141       AddrModeInsts.resize(OldSize);
3142       if (AddrMode.HasBaseReg)
3143         return false;
3144       AddrMode.HasBaseReg = true;
3145       AddrMode.BaseReg = AddrInst->getOperand(0);
3146       AddrMode.BaseOffs += ConstantOffset;
3147       if (!matchScaledValue(AddrInst->getOperand(VariableOperand),
3148                             VariableScale, Depth)) {
3149         // If even that didn't work, bail.
3150         AddrMode = BackupAddrMode;
3151         AddrModeInsts.resize(OldSize);
3152         return false;
3153       }
3154     }
3155
3156     return true;
3157   }
3158   case Instruction::SExt:
3159   case Instruction::ZExt: {
3160     Instruction *Ext = dyn_cast<Instruction>(AddrInst);
3161     if (!Ext)
3162       return false;
3163
3164     // Try to move this ext out of the way of the addressing mode.
3165     // Ask for a method for doing so.
3166     TypePromotionHelper::Action TPH =
3167         TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts);
3168     if (!TPH)
3169       return false;
3170
3171     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
3172         TPT.getRestorationPoint();
3173     unsigned CreatedInstsCost = 0;
3174     unsigned ExtCost = !TLI.isExtFree(Ext);
3175     Value *PromotedOperand =
3176         TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI);
3177     // SExt has been moved away.
3178     // Thus either it will be rematched later in the recursive calls or it is
3179     // gone. Anyway, we must not fold it into the addressing mode at this point.
3180     // E.g.,
3181     // op = add opnd, 1
3182     // idx = ext op
3183     // addr = gep base, idx
3184     // is now:
3185     // promotedOpnd = ext opnd            <- no match here
3186     // op = promoted_add promotedOpnd, 1  <- match (later in recursive calls)
3187     // addr = gep base, op                <- match
3188     if (MovedAway)
3189       *MovedAway = true;
3190
3191     assert(PromotedOperand &&
3192            "TypePromotionHelper should have filtered out those cases");
3193
3194     ExtAddrMode BackupAddrMode = AddrMode;
3195     unsigned OldSize = AddrModeInsts.size();
3196
3197     if (!matchAddr(PromotedOperand, Depth) ||
3198         // The total of the new cost is equal to the cost of the created
3199         // instructions.
3200         // The total of the old cost is equal to the cost of the extension plus
3201         // what we have saved in the addressing mode.
3202         !isPromotionProfitable(CreatedInstsCost,
3203                                ExtCost + (AddrModeInsts.size() - OldSize),
3204                                PromotedOperand)) {
3205       AddrMode = BackupAddrMode;
3206       AddrModeInsts.resize(OldSize);
3207       DEBUG(dbgs() << "Sign extension does not pay off: rollback\n");
3208       TPT.rollback(LastKnownGood);
3209       return false;
3210     }
3211     return true;
3212   }
3213   }
3214   return false;
3215 }
3216
3217 /// If we can, try to add the value of 'Addr' into the current addressing mode.
3218 /// If Addr can't be added to AddrMode this returns false and leaves AddrMode
3219 /// unmodified. This assumes that Addr is either a pointer type or intptr_t
3220 /// for the target.
3221 ///
3222 bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {
3223   // Start a transaction at this point that we will rollback if the matching
3224   // fails.
3225   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
3226       TPT.getRestorationPoint();
3227   if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
3228     // Fold in immediates if legal for the target.
3229     AddrMode.BaseOffs += CI->getSExtValue();
3230     if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
3231       return true;
3232     AddrMode.BaseOffs -= CI->getSExtValue();
3233   } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
3234     // If this is a global variable, try to fold it into the addressing mode.
3235     if (!AddrMode.BaseGV) {
3236       AddrMode.BaseGV = GV;
3237       if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
3238         return true;
3239       AddrMode.BaseGV = nullptr;
3240     }
3241   } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
3242     ExtAddrMode BackupAddrMode = AddrMode;
3243     unsigned OldSize = AddrModeInsts.size();
3244
3245     // Check to see if it is possible to fold this operation.
3246     bool MovedAway = false;
3247     if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) {
3248       // This instruction may have been moved away. If so, there is nothing
3249       // to check here.
3250       if (MovedAway)
3251         return true;
3252       // Okay, it's possible to fold this.  Check to see if it is actually
3253       // *profitable* to do so.  We use a simple cost model to avoid increasing
3254       // register pressure too much.
3255       if (I->hasOneUse() ||
3256           isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
3257         AddrModeInsts.push_back(I);
3258         return true;
3259       }
3260
3261       // It isn't profitable to do this, roll back.
3262       //cerr << "NOT FOLDING: " << *I;
3263       AddrMode = BackupAddrMode;
3264       AddrModeInsts.resize(OldSize);
3265       TPT.rollback(LastKnownGood);
3266     }
3267   } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
3268     if (matchOperationAddr(CE, CE->getOpcode(), Depth))
3269       return true;
3270     TPT.rollback(LastKnownGood);
3271   } else if (isa<ConstantPointerNull>(Addr)) {
3272     // Null pointer gets folded without affecting the addressing mode.
3273     return true;
3274   }
3275
3276   // Worse case, the target should support [reg] addressing modes. :)
3277   if (!AddrMode.HasBaseReg) {
3278     AddrMode.HasBaseReg = true;
3279     AddrMode.BaseReg = Addr;
3280     // Still check for legality in case the target supports [imm] but not [i+r].
3281     if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
3282       return true;
3283     AddrMode.HasBaseReg = false;
3284     AddrMode.BaseReg = nullptr;
3285   }
3286
3287   // If the base register is already taken, see if we can do [r+r].
3288   if (AddrMode.Scale == 0) {
3289     AddrMode.Scale = 1;
3290     AddrMode.ScaledReg = Addr;
3291     if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
3292       return true;
3293     AddrMode.Scale = 0;
3294     AddrMode.ScaledReg = nullptr;
3295   }
3296   // Couldn't match.
3297   TPT.rollback(LastKnownGood);
3298   return false;
3299 }
3300
3301 /// Check to see if all uses of OpVal by the specified inline asm call are due
3302 /// to memory operands. If so, return true, otherwise return false.
3303 static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
3304                                     const TargetMachine &TM) {
3305   const Function *F = CI->getParent()->getParent();
3306   const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering();
3307   const TargetRegisterInfo *TRI = TM.getSubtargetImpl(*F)->getRegisterInfo();
3308   TargetLowering::AsmOperandInfoVector TargetConstraints =
3309       TLI->ParseConstraints(F->getParent()->getDataLayout(), TRI,
3310                             ImmutableCallSite(CI));
3311   for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
3312     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
3313
3314     // Compute the constraint code and ConstraintType to use.
3315     TLI->ComputeConstraintToUse(OpInfo, SDValue());
3316
3317     // If this asm operand is our Value*, and if it isn't an indirect memory
3318     // operand, we can't fold it!
3319     if (OpInfo.CallOperandVal == OpVal &&
3320         (OpInfo.ConstraintType != TargetLowering::C_Memory ||
3321          !OpInfo.isIndirect))
3322       return false;
3323   }
3324
3325   return true;
3326 }
3327
3328 /// Recursively walk all the uses of I until we find a memory use.
3329 /// If we find an obviously non-foldable instruction, return true.
3330 /// Add the ultimately found memory instructions to MemoryUses.
3331 static bool FindAllMemoryUses(
3332     Instruction *I,
3333     SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
3334     SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetMachine &TM) {
3335   // If we already considered this instruction, we're done.
3336   if (!ConsideredInsts.insert(I).second)
3337     return false;
3338
3339   // If this is an obviously unfoldable instruction, bail out.
3340   if (!MightBeFoldableInst(I))
3341     return true;
3342
3343   // Loop over all the uses, recursively processing them.
3344   for (Use &U : I->uses()) {
3345     Instruction *UserI = cast<Instruction>(U.getUser());
3346
3347     if (LoadInst *LI = dyn_cast<LoadInst>(UserI)) {
3348       MemoryUses.push_back(std::make_pair(LI, U.getOperandNo()));
3349       continue;
3350     }
3351
3352     if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) {
3353       unsigned opNo = U.getOperandNo();
3354       if (opNo == 0) return true; // Storing addr, not into addr.
3355       MemoryUses.push_back(std::make_pair(SI, opNo));
3356       continue;
3357     }
3358
3359     if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
3360       InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
3361       if (!IA) return true;
3362
3363       // If this is a memory operand, we're cool, otherwise bail out.
3364       if (!IsOperandAMemoryOperand(CI, IA, I, TM))
3365         return true;
3366       continue;
3367     }
3368
3369     if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TM))
3370       return true;
3371   }
3372
3373   return false;
3374 }
3375
3376 /// Return true if Val is already known to be live at the use site that we're
3377 /// folding it into. If so, there is no cost to include it in the addressing
3378 /// mode. KnownLive1 and KnownLive2 are two values that we know are live at the
3379 /// instruction already.
3380 bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
3381                                                    Value *KnownLive2) {
3382   // If Val is either of the known-live values, we know it is live!
3383   if (Val == nullptr || Val == KnownLive1 || Val == KnownLive2)
3384     return true;
3385
3386   // All values other than instructions and arguments (e.g. constants) are live.
3387   if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true;
3388
3389   // If Val is a constant sized alloca in the entry block, it is live, this is
3390   // true because it is just a reference to the stack/frame pointer, which is
3391   // live for the whole function.
3392   if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
3393     if (AI->isStaticAlloca())
3394       return true;
3395
3396   // Check to see if this value is already used in the memory instruction's
3397   // block.  If so, it's already live into the block at the very least, so we
3398   // can reasonably fold it.
3399   return Val->isUsedInBasicBlock(MemoryInst->getParent());
3400 }
3401
3402 /// It is possible for the addressing mode of the machine to fold the specified
3403 /// instruction into a load or store that ultimately uses it.
3404 /// However, the specified instruction has multiple uses.
3405 /// Given this, it may actually increase register pressure to fold it
3406 /// into the load. For example, consider this code:
3407 ///
3408 ///     X = ...
3409 ///     Y = X+1
3410 ///     use(Y)   -> nonload/store
3411 ///     Z = Y+1
3412 ///     load Z
3413 ///
3414 /// In this case, Y has multiple uses, and can be folded into the load of Z
3415 /// (yielding load [X+2]).  However, doing this will cause both "X" and "X+1" to
3416 /// be live at the use(Y) line.  If we don't fold Y into load Z, we use one
3417 /// fewer register.  Since Y can't be folded into "use(Y)" we don't increase the
3418 /// number of computations either.
3419 ///
3420 /// Note that this (like most of CodeGenPrepare) is just a rough heuristic.  If
3421 /// X was live across 'load Z' for other reasons, we actually *would* want to
3422 /// fold the addressing mode in the Z case.  This would make Y die earlier.
3423 bool AddressingModeMatcher::
3424 isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
3425                                      ExtAddrMode &AMAfter) {
3426   if (IgnoreProfitability) return true;
3427
3428   // AMBefore is the addressing mode before this instruction was folded into it,
3429   // and AMAfter is the addressing mode after the instruction was folded.  Get
3430   // the set of registers referenced by AMAfter and subtract out those
3431   // referenced by AMBefore: this is the set of values which folding in this
3432   // address extends the lifetime of.
3433   //
3434   // Note that there are only two potential values being referenced here,
3435   // BaseReg and ScaleReg (global addresses are always available, as are any
3436   // folded immediates).
3437   Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;
3438
3439   // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
3440   // lifetime wasn't extended by adding this instruction.
3441   if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
3442     BaseReg = nullptr;
3443   if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
3444     ScaledReg = nullptr;
3445
3446   // If folding this instruction (and it's subexprs) didn't extend any live
3447   // ranges, we're ok with it.
3448   if (!BaseReg && !ScaledReg)
3449     return true;
3450
3451   // If all uses of this instruction are ultimately load/store/inlineasm's,
3452   // check to see if their addressing modes will include this instruction.  If
3453   // so, we can fold it into all uses, so it doesn't matter if it has multiple
3454   // uses.
3455   SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
3456   SmallPtrSet<Instruction*, 16> ConsideredInsts;
3457   if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM))
3458     return false;  // Has a non-memory, non-foldable use!
3459
3460   // Now that we know that all uses of this instruction are part of a chain of
3461   // computation involving only operations that could theoretically be folded
3462   // into a memory use, loop over each of these uses and see if they could
3463   // *actually* fold the instruction.
3464   SmallVector<Instruction*, 32> MatchedAddrModeInsts;
3465   for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
3466     Instruction *User = MemoryUses[i].first;
3467     unsigned OpNo = MemoryUses[i].second;
3468
3469     // Get the access type of this use.  If the use isn't a pointer, we don't
3470     // know what it accesses.
3471     Value *Address = User->getOperand(OpNo);
3472     PointerType *AddrTy = dyn_cast<PointerType>(Address->getType());
3473     if (!AddrTy)
3474       return false;
3475     Type *AddressAccessTy = AddrTy->getElementType();
3476     unsigned AS = AddrTy->getAddressSpace();
3477
3478     // Do a match against the root of this address, ignoring profitability. This
3479     // will tell us if the addressing mode for the memory operation will
3480     // *actually* cover the shared instruction.
3481     ExtAddrMode Result;
3482     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
3483         TPT.getRestorationPoint();
3484     AddressingModeMatcher Matcher(MatchedAddrModeInsts, TM, AddressAccessTy, AS,
3485                                   MemoryInst, Result, InsertedInsts,
3486                                   PromotedInsts, TPT);
3487     Matcher.IgnoreProfitability = true;
3488     bool Success = Matcher.matchAddr(Address, 0);
3489     (void)Success; assert(Success && "Couldn't select *anything*?");
3490
3491     // The match was to check the profitability, the changes made are not
3492     // part of the original matcher. Therefore, they should be dropped
3493     // otherwise the original matcher will not present the right state.
3494     TPT.rollback(LastKnownGood);
3495
3496     // If the match didn't cover I, then it won't be shared by it.
3497     if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(),
3498                   I) == MatchedAddrModeInsts.end())
3499       return false;
3500
3501     MatchedAddrModeInsts.clear();
3502   }
3503
3504   return true;
3505 }
3506
3507 } // end anonymous namespace
3508
3509 /// Return true if the specified values are defined in a
3510 /// different basic block than BB.
3511 static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
3512   if (Instruction *I = dyn_cast<Instruction>(V))
3513     return I->getParent() != BB;
3514   return false;
3515 }
3516
3517 /// Load and Store Instructions often have addressing modes that can do
3518 /// significant amounts of computation. As such, instruction selection will try
3519 /// to get the load or store to do as much computation as possible for the
3520 /// program. The problem is that isel can only see within a single block. As
3521 /// such, we sink as much legal addressing mode work into the block as possible.
3522 ///
3523 /// This method is used to optimize both load/store and inline asms with memory
3524 /// operands.
3525 bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
3526                                         Type *AccessTy, unsigned AddrSpace) {
3527   Value *Repl = Addr;
3528
3529   // Try to collapse single-value PHI nodes.  This is necessary to undo
3530   // unprofitable PRE transformations.
3531   SmallVector<Value*, 8> worklist;
3532   SmallPtrSet<Value*, 16> Visited;
3533   worklist.push_back(Addr);
3534
3535   // Use a worklist to iteratively look through PHI nodes, and ensure that
3536   // the addressing mode obtained from the non-PHI roots of the graph
3537   // are equivalent.
3538   Value *Consensus = nullptr;
3539   unsigned NumUsesConsensus = 0;
3540   bool IsNumUsesConsensusValid = false;
3541   SmallVector<Instruction*, 16> AddrModeInsts;
3542   ExtAddrMode AddrMode;
3543   TypePromotionTransaction TPT;
3544   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
3545       TPT.getRestorationPoint();
3546   while (!worklist.empty()) {
3547     Value *V = worklist.back();
3548     worklist.pop_back();
3549
3550     // Break use-def graph loops.
3551     if (!Visited.insert(V).second) {
3552       Consensus = nullptr;
3553       break;
3554     }
3555
3556     // For a PHI node, push all of its incoming values.
3557     if (PHINode *P = dyn_cast<PHINode>(V)) {
3558       for (Value *IncValue : P->incoming_values())
3559         worklist.push_back(IncValue);
3560       continue;
3561     }
3562
3563     // For non-PHIs, determine the addressing mode being computed.
3564     SmallVector<Instruction*, 16> NewAddrModeInsts;
3565     ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
3566       V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM,
3567       InsertedInsts, PromotedInsts, TPT);
3568
3569     // This check is broken into two cases with very similar code to avoid using
3570     // getNumUses() as much as possible. Some values have a lot of uses, so
3571     // calling getNumUses() unconditionally caused a significant compile-time
3572     // regression.
3573     if (!Consensus) {
3574       Consensus = V;
3575       AddrMode = NewAddrMode;
3576       AddrModeInsts = NewAddrModeInsts;
3577       continue;
3578     } else if (NewAddrMode == AddrMode) {
3579       if (!IsNumUsesConsensusValid) {
3580         NumUsesConsensus = Consensus->getNumUses();
3581         IsNumUsesConsensusValid = true;
3582       }
3583
3584       // Ensure that the obtained addressing mode is equivalent to that obtained
3585       // for all other roots of the PHI traversal.  Also, when choosing one
3586       // such root as representative, select the one with the most uses in order
3587       // to keep the cost modeling heuristics in AddressingModeMatcher
3588       // applicable.
3589       unsigned NumUses = V->getNumUses();
3590       if (NumUses > NumUsesConsensus) {
3591         Consensus = V;
3592         NumUsesConsensus = NumUses;
3593         AddrModeInsts = NewAddrModeInsts;
3594       }
3595       continue;
3596     }
3597
3598     Consensus = nullptr;
3599     break;
3600   }
3601
3602   // If the addressing mode couldn't be determined, or if multiple different
3603   // ones were determined, bail out now.
3604   if (!Consensus) {
3605     TPT.rollback(LastKnownGood);
3606     return false;
3607   }
3608   TPT.commit();
3609
3610   // Check to see if any of the instructions supersumed by this addr mode are
3611   // non-local to I's BB.
3612   bool AnyNonLocal = false;
3613   for (unsigned i = 0, e = AddrModeInsts.size(); i != e; ++i) {
3614     if (IsNonLocalValue(AddrModeInsts[i], MemoryInst->getParent())) {
3615       AnyNonLocal = true;
3616       break;
3617     }
3618   }
3619
3620   // If all the instructions matched are already in this BB, don't do anything.
3621   if (!AnyNonLocal) {
3622     DEBUG(dbgs() << "CGP: Found      local addrmode: " << AddrMode << "\n");
3623     return false;
3624   }
3625
3626   // Insert this computation right after this user.  Since our caller is
3627   // scanning from the top of the BB to the bottom, reuse of the expr are
3628   // guaranteed to happen later.
3629   IRBuilder<> Builder(MemoryInst);
3630
3631   // Now that we determined the addressing expression we want to use and know
3632   // that we have to sink it into this block.  Check to see if we have already
3633   // done this for some other load/store instr in this block.  If so, reuse the
3634   // computation.
3635   Value *&SunkAddr = SunkAddrs[Addr];
3636   if (SunkAddr) {
3637     DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for "
3638                  << *MemoryInst << "\n");
3639     if (SunkAddr->getType() != Addr->getType())
3640       SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType());
3641   } else if (AddrSinkUsingGEPs ||
3642              (!AddrSinkUsingGEPs.getNumOccurrences() && TM &&
3643               TM->getSubtargetImpl(*MemoryInst->getParent()->getParent())
3644                   ->useAA())) {
3645     // By default, we use the GEP-based method when AA is used later. This
3646     // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
3647     DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
3648                  << *MemoryInst << "\n");
3649     Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
3650     Value *ResultPtr = nullptr, *ResultIndex = nullptr;
3651
3652     // First, find the pointer.
3653     if (AddrMode.BaseReg && AddrMode.BaseReg->getType()->isPointerTy()) {
3654       ResultPtr = AddrMode.BaseReg;
3655       AddrMode.BaseReg = nullptr;
3656     }
3657
3658     if (AddrMode.Scale && AddrMode.ScaledReg->getType()->isPointerTy()) {
3659       // We can't add more than one pointer together, nor can we scale a
3660       // pointer (both of which seem meaningless).
3661       if (ResultPtr || AddrMode.Scale != 1)
3662         return false;
3663
3664       ResultPtr = AddrMode.ScaledReg;
3665       AddrMode.Scale = 0;
3666     }
3667
3668     if (AddrMode.BaseGV) {
3669       if (ResultPtr)
3670         return false;
3671
3672       ResultPtr = AddrMode.BaseGV;
3673     }
3674
3675     // If the real base value actually came from an inttoptr, then the matcher
3676     // will look through it and provide only the integer value. In that case,
3677     // use it here.
3678     if (!ResultPtr && AddrMode.BaseReg) {
3679       ResultPtr =
3680         Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(), "sunkaddr");
3681       AddrMode.BaseReg = nullptr;
3682     } else if (!ResultPtr && AddrMode.Scale == 1) {
3683       ResultPtr =
3684         Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(), "sunkaddr");
3685       AddrMode.Scale = 0;
3686     }
3687
3688     if (!ResultPtr &&
3689         !AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) {
3690       SunkAddr = Constant::getNullValue(Addr->getType());
3691     } else if (!ResultPtr) {
3692       return false;
3693     } else {
3694       Type *I8PtrTy =
3695           Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace());
3696       Type *I8Ty = Builder.getInt8Ty();
3697
3698       // Start with the base register. Do this first so that subsequent address
3699       // matching finds it last, which will prevent it from trying to match it
3700       // as the scaled value in case it happens to be a mul. That would be
3701       // problematic if we've sunk a different mul for the scale, because then
3702       // we'd end up sinking both muls.
3703       if (AddrMode.BaseReg) {
3704         Value *V = AddrMode.BaseReg;
3705         if (V->getType() != IntPtrTy)
3706           V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
3707
3708         ResultIndex = V;
3709       }
3710
3711       // Add the scale value.
3712       if (AddrMode.Scale) {
3713         Value *V = AddrMode.ScaledReg;
3714         if (V->getType() == IntPtrTy) {
3715           // done.
3716         } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
3717                    cast<IntegerType>(V->getType())->getBitWidth()) {
3718           V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
3719         } else {
3720           // It is only safe to sign extend the BaseReg if we know that the math
3721           // required to create it did not overflow before we extend it. Since
3722           // the original IR value was tossed in favor of a constant back when
3723           // the AddrMode was created we need to bail out gracefully if widths
3724           // do not match instead of extending it.
3725           Instruction *I = dyn_cast_or_null<Instruction>(ResultIndex);
3726           if (I && (ResultIndex != AddrMode.BaseReg))
3727             I->eraseFromParent();
3728           return false;
3729         }
3730
3731         if (AddrMode.Scale != 1)
3732           V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
3733                                 "sunkaddr");
3734         if (ResultIndex)
3735           ResultIndex = Builder.CreateAdd(ResultIndex, V, "sunkaddr");
3736         else
3737           ResultIndex = V;
3738       }
3739
3740       // Add in the Base Offset if present.
3741       if (AddrMode.BaseOffs) {
3742         Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
3743         if (ResultIndex) {
3744           // We need to add this separately from the scale above to help with
3745           // SDAG consecutive load/store merging.
3746           if (ResultPtr->getType() != I8PtrTy)
3747             ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
3748           ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
3749         }
3750
3751         ResultIndex = V;
3752       }
3753
3754       if (!ResultIndex) {
3755         SunkAddr = ResultPtr;
3756       } else {
3757         if (ResultPtr->getType() != I8PtrTy)
3758           ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
3759         SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
3760       }
3761
3762       if (SunkAddr->getType() != Addr->getType())
3763         SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType());
3764     }
3765   } else {
3766     DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
3767                  << *MemoryInst << "\n");
3768     Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
3769     Value *Result = nullptr;
3770
3771     // Start with the base register. Do this first so that subsequent address
3772     // matching finds it last, which will prevent it from trying to match it
3773     // as the scaled value in case it happens to be a mul. That would be
3774     // problematic if we've sunk a different mul for the scale, because then
3775     // we'd end up sinking both muls.
3776     if (AddrMode.BaseReg) {
3777       Value *V = AddrMode.BaseReg;
3778       if (V->getType()->isPointerTy())
3779         V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
3780       if (V->getType() != IntPtrTy)
3781         V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
3782       Result = V;
3783     }
3784
3785     // Add the scale value.
3786     if (AddrMode.Scale) {
3787       Value *V = AddrMode.ScaledReg;
3788       if (V->getType() == IntPtrTy) {
3789         // done.
3790       } else if (V->getType()->isPointerTy()) {
3791         V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
3792       } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
3793                  cast<IntegerType>(V->getType())->getBitWidth()) {
3794         V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
3795       } else {
3796         // It is only safe to sign extend the BaseReg if we know that the math
3797         // required to create it did not overflow before we extend it. Since
3798         // the original IR value was tossed in favor of a constant back when
3799         // the AddrMode was created we need to bail out gracefully if widths
3800         // do not match instead of extending it.
3801         Instruction *I = dyn_cast_or_null<Instruction>(Result);
3802         if (I && (Result != AddrMode.BaseReg))
3803           I->eraseFromParent();
3804         return false;
3805       }
3806       if (AddrMode.Scale != 1)
3807         V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
3808                               "sunkaddr");
3809       if (Result)
3810         Result = Builder.CreateAdd(Result, V, "sunkaddr");
3811       else
3812         Result = V;
3813     }
3814
3815     // Add in the BaseGV if present.
3816     if (AddrMode.BaseGV) {
3817       Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr");
3818       if (Result)
3819         Result = Builder.CreateAdd(Result, V, "sunkaddr");
3820       else
3821         Result = V;
3822     }
3823
3824     // Add in the Base Offset if present.
3825     if (AddrMode.BaseOffs) {
3826       Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
3827       if (Result)
3828         Result = Builder.CreateAdd(Result, V, "sunkaddr");
3829       else
3830         Result = V;
3831     }
3832
3833     if (!Result)
3834       SunkAddr = Constant::getNullValue(Addr->getType());
3835     else
3836       SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr");
3837   }
3838
3839   MemoryInst->replaceUsesOfWith(Repl, SunkAddr);
3840
3841   // If we have no uses, recursively delete the value and all dead instructions
3842   // using it.
3843   if (Repl->use_empty()) {
3844     // This can cause recursive deletion, which can invalidate our iterator.
3845     // Use a WeakVH to hold onto it in case this happens.
3846     WeakVH IterHandle(&*CurInstIterator);
3847     BasicBlock *BB = CurInstIterator->getParent();
3848
3849     RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo);
3850
3851     if (IterHandle != CurInstIterator.getNodePtrUnchecked()) {
3852       // If the iterator instruction was recursively deleted, start over at the
3853       // start of the block.
3854       CurInstIterator = BB->begin();
3855       SunkAddrs.clear();
3856     }
3857   }
3858   ++NumMemoryInsts;
3859   return true;
3860 }
3861
3862 /// If there are any memory operands, use OptimizeMemoryInst to sink their
3863 /// address computing into the block when possible / profitable.
3864 bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
3865   bool MadeChange = false;
3866
3867   const TargetRegisterInfo *TRI =
3868       TM->getSubtargetImpl(*CS->getParent()->getParent())->getRegisterInfo();
3869   TargetLowering::AsmOperandInfoVector TargetConstraints =
3870       TLI->ParseConstraints(*DL, TRI, CS);
3871   unsigned ArgNo = 0;
3872   for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
3873     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
3874
3875     // Compute the constraint code and ConstraintType to use.
3876     TLI->ComputeConstraintToUse(OpInfo, SDValue());
3877
3878     if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
3879         OpInfo.isIndirect) {
3880       Value *OpVal = CS->getArgOperand(ArgNo++);
3881       MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u);
3882     } else if (OpInfo.Type == InlineAsm::isInput)
3883       ArgNo++;
3884   }
3885
3886   return MadeChange;
3887 }
3888
3889 /// \brief Check if all the uses of \p Inst are equivalent (or free) zero or
3890 /// sign extensions.
3891 static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {
3892   assert(!Inst->use_empty() && "Input must have at least one use");
3893   const Instruction *FirstUser = cast<Instruction>(*Inst->user_begin());
3894   bool IsSExt = isa<SExtInst>(FirstUser);
3895   Type *ExtTy = FirstUser->getType();
3896   for (const User *U : Inst->users()) {
3897     const Instruction *UI = cast<Instruction>(U);
3898     if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))
3899       return false;
3900     Type *CurTy = UI->getType();
3901     // Same input and output types: Same instruction after CSE.
3902     if (CurTy == ExtTy)
3903       continue;
3904
3905     // If IsSExt is true, we are in this situation:
3906     // a = Inst
3907     // b = sext ty1 a to ty2
3908     // c = sext ty1 a to ty3
3909     // Assuming ty2 is shorter than ty3, this could be turned into:
3910     // a = Inst
3911     // b = sext ty1 a to ty2
3912     // c = sext ty2 b to ty3
3913     // However, the last sext is not free.
3914     if (IsSExt)
3915       return false;
3916
3917     // This is a ZExt, maybe this is free to extend from one type to another.
3918     // In that case, we would not account for a different use.
3919     Type *NarrowTy;
3920     Type *LargeTy;
3921     if (ExtTy->getScalarType()->getIntegerBitWidth() >
3922         CurTy->getScalarType()->getIntegerBitWidth()) {
3923       NarrowTy = CurTy;
3924       LargeTy = ExtTy;
3925     } else {
3926       NarrowTy = ExtTy;
3927       LargeTy = CurTy;
3928     }
3929
3930     if (!TLI.isZExtFree(NarrowTy, LargeTy))
3931       return false;
3932   }
3933   // All uses are the same or can be derived from one another for free.
3934   return true;
3935 }
3936
3937 /// \brief Try to form ExtLd by promoting \p Exts until they reach a
3938 /// load instruction.
3939 /// If an ext(load) can be formed, it is returned via \p LI for the load
3940 /// and \p Inst for the extension.
3941 /// Otherwise LI == nullptr and Inst == nullptr.
3942 /// When some promotion happened, \p TPT contains the proper state to
3943 /// revert them.
3944 ///
3945 /// \return true when promoting was necessary to expose the ext(load)
3946 /// opportunity, false otherwise.
3947 ///
3948 /// Example:
3949 /// \code
3950 /// %ld = load i32* %addr
3951 /// %add = add nuw i32 %ld, 4
3952 /// %zext = zext i32 %add to i64
3953 /// \endcode
3954 /// =>
3955 /// \code
3956 /// %ld = load i32* %addr
3957 /// %zext = zext i32 %ld to i64
3958 /// %add = add nuw i64 %zext, 4
3959 /// \encode
3960 /// Thanks to the promotion, we can match zext(load i32*) to i64.
3961 bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT,
3962                                     LoadInst *&LI, Instruction *&Inst,
3963                                     const SmallVectorImpl<Instruction *> &Exts,
3964                                     unsigned CreatedInstsCost = 0) {
3965   // Iterate over all the extensions to see if one form an ext(load).
3966   for (auto I : Exts) {
3967     // Check if we directly have ext(load).
3968     if ((LI = dyn_cast<LoadInst>(I->getOperand(0)))) {
3969       Inst = I;
3970       // No promotion happened here.
3971       return false;
3972     }
3973     // Check whether or not we want to do any promotion.
3974     if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion)
3975       continue;
3976     // Get the action to perform the promotion.
3977     TypePromotionHelper::Action TPH = TypePromotionHelper::getAction(
3978         I, InsertedInsts, *TLI, PromotedInsts);
3979     // Check if we can promote.
3980     if (!TPH)
3981       continue;
3982     // Save the current state.
3983     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
3984         TPT.getRestorationPoint();
3985     SmallVector<Instruction *, 4> NewExts;
3986     unsigned NewCreatedInstsCost = 0;
3987     unsigned ExtCost = !TLI->isExtFree(I);
3988     // Promote.
3989     Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost,
3990                              &NewExts, nullptr, *TLI);
3991     assert(PromotedVal &&
3992            "TypePromotionHelper should have filtered out those cases");
3993
3994     // We would be able to merge only one extension in a load.
3995     // Therefore, if we have more than 1 new extension we heuristically
3996     // cut this search path, because it means we degrade the code quality.
3997     // With exactly 2, the transformation is neutral, because we will merge
3998     // one extension but leave one. However, we optimistically keep going,
3999     // because the new extension may be removed too.
4000     long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost;
4001     TotalCreatedInstsCost -= ExtCost;
4002     if (!StressExtLdPromotion &&
4003         (TotalCreatedInstsCost > 1 ||
4004          !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) {
4005       // The promotion is not profitable, rollback to the previous state.
4006       TPT.rollback(LastKnownGood);
4007       continue;
4008     }
4009     // The promotion is profitable.
4010     // Check if it exposes an ext(load).
4011     (void)extLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInstsCost);
4012     if (LI && (StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
4013                // If we have created a new extension, i.e., now we have two
4014                // extensions. We must make sure one of them is merged with
4015                // the load, otherwise we may degrade the code quality.
4016                (LI->hasOneUse() || hasSameExtUse(LI, *TLI))))
4017       // Promotion happened.
4018       return true;
4019     // If this does not help to expose an ext(load) then, rollback.
4020     TPT.rollback(LastKnownGood);
4021   }
4022   // None of the extension can form an ext(load).
4023   LI = nullptr;
4024   Inst = nullptr;
4025   return false;
4026 }
4027
4028 /// Move a zext or sext fed by a load into the same basic block as the load,
4029 /// unless conditions are unfavorable. This allows SelectionDAG to fold the
4030 /// extend into the load.
4031 /// \p I[in/out] the extension may be modified during the process if some
4032 /// promotions apply.
4033 ///
4034 bool CodeGenPrepare::moveExtToFormExtLoad(Instruction *&I) {
4035   // Try to promote a chain of computation if it allows to form
4036   // an extended load.
4037   TypePromotionTransaction TPT;
4038   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4039     TPT.getRestorationPoint();
4040   SmallVector<Instruction *, 1> Exts;
4041   Exts.push_back(I);
4042   // Look for a load being extended.
4043   LoadInst *LI = nullptr;
4044   Instruction *OldExt = I;
4045   bool HasPromoted = extLdPromotion(TPT, LI, I, Exts);
4046   if (!LI || !I) {
4047     assert(!HasPromoted && !LI && "If we did not match any load instruction "
4048                                   "the code must remain the same");
4049     I = OldExt;
4050     return false;
4051   }
4052
4053   // If they're already in the same block, there's nothing to do.
4054   // Make the cheap checks first if we did not promote.
4055   // If we promoted, we need to check if it is indeed profitable.
4056   if (!HasPromoted && LI->getParent() == I->getParent())
4057     return false;
4058
4059   EVT VT = TLI->getValueType(*DL, I->getType());
4060   EVT LoadVT = TLI->getValueType(*DL, LI->getType());
4061
4062   // If the load has other users and the truncate is not free, this probably
4063   // isn't worthwhile.
4064   if (!LI->hasOneUse() && TLI &&
4065       (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) &&
4066       !TLI->isTruncateFree(I->getType(), LI->getType())) {
4067     I = OldExt;
4068     TPT.rollback(LastKnownGood);
4069     return false;
4070   }
4071
4072   // Check whether the target supports casts folded into loads.
4073   unsigned LType;
4074   if (isa<ZExtInst>(I))
4075     LType = ISD::ZEXTLOAD;
4076   else {
4077     assert(isa<SExtInst>(I) && "Unexpected ext type!");
4078     LType = ISD::SEXTLOAD;
4079   }
4080   if (TLI && !TLI->isLoadExtLegal(LType, VT, LoadVT)) {
4081     I = OldExt;
4082     TPT.rollback(LastKnownGood);
4083     return false;
4084   }
4085
4086   // Move the extend into the same block as the load, so that SelectionDAG
4087   // can fold it.
4088   TPT.commit();
4089   I->removeFromParent();
4090   I->insertAfter(LI);
4091   ++NumExtsMoved;
4092   return true;
4093 }
4094
4095 bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
4096   BasicBlock *DefBB = I->getParent();
4097
4098   // If the result of a {s|z}ext and its source are both live out, rewrite all
4099   // other uses of the source with result of extension.
4100   Value *Src = I->getOperand(0);
4101   if (Src->hasOneUse())
4102     return false;
4103
4104   // Only do this xform if truncating is free.
4105   if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType()))
4106     return false;
4107
4108   // Only safe to perform the optimization if the source is also defined in
4109   // this block.
4110   if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent())
4111     return false;
4112
4113   bool DefIsLiveOut = false;
4114   for (User *U : I->users()) {
4115     Instruction *UI = cast<Instruction>(U);
4116
4117     // Figure out which BB this ext is used in.
4118     BasicBlock *UserBB = UI->getParent();
4119     if (UserBB == DefBB) continue;
4120     DefIsLiveOut = true;
4121     break;
4122   }
4123   if (!DefIsLiveOut)
4124     return false;
4125
4126   // Make sure none of the uses are PHI nodes.
4127   for (User *U : Src->users()) {
4128     Instruction *UI = cast<Instruction>(U);
4129     BasicBlock *UserBB = UI->getParent();
4130     if (UserBB == DefBB) continue;
4131     // Be conservative. We don't want this xform to end up introducing
4132     // reloads just before load / store instructions.
4133     if (isa<PHINode>(UI) || isa<LoadInst>(UI) || isa<StoreInst>(UI))
4134       return false;
4135   }
4136
4137   // InsertedTruncs - Only insert one trunc in each block once.
4138   DenseMap<BasicBlock*, Instruction*> InsertedTruncs;
4139
4140   bool MadeChange = false;
4141   for (Use &U : Src->uses()) {
4142     Instruction *User = cast<Instruction>(U.getUser());
4143
4144     // Figure out which BB this ext is used in.
4145     BasicBlock *UserBB = User->getParent();
4146     if (UserBB == DefBB) continue;
4147
4148     // Both src and def are live in this block. Rewrite the use.
4149     Instruction *&InsertedTrunc = InsertedTruncs[UserBB];
4150
4151     if (!InsertedTrunc) {
4152       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
4153       assert(InsertPt != UserBB->end());
4154       InsertedTrunc = new TruncInst(I, Src->getType(), "", &*InsertPt);
4155       InsertedInsts.insert(InsertedTrunc);
4156     }
4157
4158     // Replace a use of the {s|z}ext source with a use of the result.
4159     U = InsertedTrunc;
4160     ++NumExtUses;
4161     MadeChange = true;
4162   }
4163
4164   return MadeChange;
4165 }
4166
4167 /// Check if V (an operand of a select instruction) is an expensive instruction
4168 /// that is only used once.
4169 static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
4170   auto *I = dyn_cast<Instruction>(V);
4171   // If it's safe to speculatively execute, then it should not have side
4172   // effects; therefore, it's safe to sink and possibly *not* execute.
4173   return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&
4174          TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive;
4175 }
4176
4177 /// Returns true if a SelectInst should be turned into an explicit branch.
4178 static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,
4179                                                 SelectInst *SI) {
4180   // FIXME: This should use the same heuristics as IfConversion to determine
4181   // whether a select is better represented as a branch.  This requires that
4182   // branch probability metadata is preserved for the select, which is not the
4183   // case currently.
4184
4185   CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
4186
4187   // If a branch is predictable, an out-of-order CPU can avoid blocking on its
4188   // comparison condition. If the compare has more than one use, there's
4189   // probably another cmov or setcc around, so it's not worth emitting a branch.
4190   if (!Cmp || !Cmp->hasOneUse())
4191     return false;
4192
4193   Value *CmpOp0 = Cmp->getOperand(0);
4194   Value *CmpOp1 = Cmp->getOperand(1);
4195
4196   // Emit "cmov on compare with a memory operand" as a branch to avoid stalls
4197   // on a load from memory. But if the load is used more than once, do not
4198   // change the select to a branch because the load is probably needed
4199   // regardless of whether the branch is taken or not.
4200   if ((isa<LoadInst>(CmpOp0) && CmpOp0->hasOneUse()) ||
4201       (isa<LoadInst>(CmpOp1) && CmpOp1->hasOneUse()))
4202     return true;
4203
4204   // If either operand of the select is expensive and only needed on one side
4205   // of the select, we should form a branch.
4206   if (sinkSelectOperand(TTI, SI->getTrueValue()) ||
4207       sinkSelectOperand(TTI, SI->getFalseValue()))
4208     return true;
4209
4210   return false;
4211 }
4212
4213
4214 /// If we have a SelectInst that will likely profit from branch prediction,
4215 /// turn it into a branch.
4216 bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
4217   bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
4218
4219   // Can we convert the 'select' to CF ?
4220   if (DisableSelectToBranch || OptSize || !TLI || VectorCond)
4221     return false;
4222
4223   TargetLowering::SelectSupportKind SelectKind;
4224   if (VectorCond)
4225     SelectKind = TargetLowering::VectorMaskSelect;
4226   else if (SI->getType()->isVectorTy())
4227     SelectKind = TargetLowering::ScalarCondVectorVal;
4228   else
4229     SelectKind = TargetLowering::ScalarValSelect;
4230
4231   // Do we have efficient codegen support for this kind of 'selects' ?
4232   if (TLI->isSelectSupported(SelectKind)) {
4233     // We have efficient codegen support for the select instruction.
4234     // Check if it is profitable to keep this 'select'.
4235     if (!TLI->isPredictableSelectExpensive() ||
4236         !isFormingBranchFromSelectProfitable(TTI, SI))
4237       return false;
4238   }
4239
4240   ModifiedDT = true;
4241
4242   // Transform a sequence like this:
4243   //    start:
4244   //       %cmp = cmp uge i32 %a, %b
4245   //       %sel = select i1 %cmp, i32 %c, i32 %d
4246   //
4247   // Into:
4248   //    start:
4249   //       %cmp = cmp uge i32 %a, %b
4250   //       br i1 %cmp, label %select.true, label %select.false
4251   //    select.true:
4252   //       br label %select.end
4253   //    select.false:
4254   //       br label %select.end
4255   //    select.end:
4256   //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
4257   //
4258   // In addition, we may sink instructions that produce %c or %d from
4259   // the entry block into the destination(s) of the new branch.
4260   // If the true or false blocks do not contain a sunken instruction, that
4261   // block and its branch may be optimized away. In that case, one side of the
4262   // first branch will point directly to select.end, and the corresponding PHI
4263   // predecessor block will be the start block.
4264
4265   // First, we split the block containing the select into 2 blocks.
4266   BasicBlock *StartBlock = SI->getParent();
4267   BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(SI));
4268   BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
4269
4270   // Delete the unconditional branch that was just created by the split.
4271   StartBlock->getTerminator()->eraseFromParent();
4272
4273   // These are the new basic blocks for the conditional branch.
4274   // At least one will become an actual new basic block.
4275   BasicBlock *TrueBlock = nullptr;
4276   BasicBlock *FalseBlock = nullptr;
4277
4278   // Sink expensive instructions into the conditional blocks to avoid executing
4279   // them speculatively.
4280   if (sinkSelectOperand(TTI, SI->getTrueValue())) {
4281     TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink",
4282                                    EndBlock->getParent(), EndBlock);
4283     auto *TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
4284     auto *TrueInst = cast<Instruction>(SI->getTrueValue());
4285     TrueInst->moveBefore(TrueBranch);
4286   }
4287   if (sinkSelectOperand(TTI, SI->getFalseValue())) {
4288     FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink",
4289                                     EndBlock->getParent(), EndBlock);
4290     auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
4291     auto *FalseInst = cast<Instruction>(SI->getFalseValue());
4292     FalseInst->moveBefore(FalseBranch);
4293   }
4294
4295   // If there was nothing to sink, then arbitrarily choose the 'false' side
4296   // for a new input value to the PHI.
4297   if (TrueBlock == FalseBlock) {
4298     assert(TrueBlock == nullptr &&
4299            "Unexpected basic block transform while optimizing select");
4300
4301     FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
4302                                     EndBlock->getParent(), EndBlock);
4303     BranchInst::Create(EndBlock, FalseBlock);
4304   }
4305
4306   // Insert the real conditional branch based on the original condition.
4307   // If we did not create a new block for one of the 'true' or 'false' paths
4308   // of the condition, it means that side of the branch goes to the end block
4309   // directly and the path originates from the start block from the point of
4310   // view of the new PHI.
4311   if (TrueBlock == nullptr) {
4312     BranchInst::Create(EndBlock, FalseBlock, SI->getCondition(), SI);
4313     TrueBlock = StartBlock;
4314   } else if (FalseBlock == nullptr) {
4315     BranchInst::Create(TrueBlock, EndBlock, SI->getCondition(), SI);
4316     FalseBlock = StartBlock;
4317   } else {
4318     BranchInst::Create(TrueBlock, FalseBlock, SI->getCondition(), SI);
4319   }
4320
4321   // The select itself is replaced with a PHI Node.
4322   PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front());
4323   PN->takeName(SI);
4324   PN->addIncoming(SI->getTrueValue(), TrueBlock);
4325   PN->addIncoming(SI->getFalseValue(), FalseBlock);
4326
4327   SI->replaceAllUsesWith(PN);
4328   SI->eraseFromParent();
4329
4330   // Instruct OptimizeBlock to skip to the next block.
4331   CurInstIterator = StartBlock->end();
4332   ++NumSelectsExpanded;
4333   return true;
4334 }
4335
4336 static bool isBroadcastShuffle(ShuffleVectorInst *SVI) {
4337   SmallVector<int, 16> Mask(SVI->getShuffleMask());
4338   int SplatElem = -1;
4339   for (unsigned i = 0; i < Mask.size(); ++i) {
4340     if (SplatElem != -1 && Mask[i] != -1 && Mask[i] != SplatElem)
4341       return false;
4342     SplatElem = Mask[i];
4343   }
4344
4345   return true;
4346 }
4347
4348 /// Some targets have expensive vector shifts if the lanes aren't all the same
4349 /// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases
4350 /// it's often worth sinking a shufflevector splat down to its use so that
4351 /// codegen can spot all lanes are identical.
4352 bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
4353   BasicBlock *DefBB = SVI->getParent();
4354
4355   // Only do this xform if variable vector shifts are particularly expensive.
4356   if (!TLI || !TLI->isVectorShiftByScalarCheap(SVI->getType()))
4357     return false;
4358
4359   // We only expect better codegen by sinking a shuffle if we can recognise a
4360   // constant splat.
4361   if (!isBroadcastShuffle(SVI))
4362     return false;
4363
4364   // InsertedShuffles - Only insert a shuffle in each block once.
4365   DenseMap<BasicBlock*, Instruction*> InsertedShuffles;
4366
4367   bool MadeChange = false;
4368   for (User *U : SVI->users()) {
4369     Instruction *UI = cast<Instruction>(U);
4370
4371     // Figure out which BB this ext is used in.
4372     BasicBlock *UserBB = UI->getParent();
4373     if (UserBB == DefBB) continue;
4374
4375     // For now only apply this when the splat is used by a shift instruction.
4376     if (!UI->isShift()) continue;
4377
4378     // Everything checks out, sink the shuffle if the user's block doesn't
4379     // already have a copy.
4380     Instruction *&InsertedShuffle = InsertedShuffles[UserBB];
4381
4382     if (!InsertedShuffle) {
4383       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
4384       assert(InsertPt != UserBB->end());
4385       InsertedShuffle =
4386           new ShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1),
4387                                 SVI->getOperand(2), "", &*InsertPt);
4388     }
4389
4390     UI->replaceUsesOfWith(SVI, InsertedShuffle);
4391     MadeChange = true;
4392   }
4393
4394   // If we removed all uses, nuke the shuffle.
4395   if (SVI->use_empty()) {
4396     SVI->eraseFromParent();
4397     MadeChange = true;
4398   }
4399
4400   return MadeChange;
4401 }
4402
4403 bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
4404   if (!TLI || !DL)
4405     return false;
4406
4407   Value *Cond = SI->getCondition();
4408   Type *OldType = Cond->getType();
4409   LLVMContext &Context = Cond->getContext();
4410   MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType));
4411   unsigned RegWidth = RegType.getSizeInBits();
4412
4413   if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth())
4414     return false;
4415
4416   // If the register width is greater than the type width, expand the condition
4417   // of the switch instruction and each case constant to the width of the
4418   // register. By widening the type of the switch condition, subsequent
4419   // comparisons (for case comparisons) will not need to be extended to the
4420   // preferred register width, so we will potentially eliminate N-1 extends,
4421   // where N is the number of cases in the switch.
4422   auto *NewType = Type::getIntNTy(Context, RegWidth);
4423
4424   // Zero-extend the switch condition and case constants unless the switch
4425   // condition is a function argument that is already being sign-extended.
4426   // In that case, we can avoid an unnecessary mask/extension by sign-extending
4427   // everything instead.
4428   Instruction::CastOps ExtType = Instruction::ZExt;
4429   if (auto *Arg = dyn_cast<Argument>(Cond))
4430     if (Arg->hasSExtAttr())
4431       ExtType = Instruction::SExt;
4432
4433   auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
4434   ExtInst->insertBefore(SI);
4435   SI->setCondition(ExtInst);
4436   for (SwitchInst::CaseIt Case : SI->cases()) {
4437     APInt NarrowConst = Case.getCaseValue()->getValue();
4438     APInt WideConst = (ExtType == Instruction::ZExt) ?
4439                       NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
4440     Case.setValue(ConstantInt::get(Context, WideConst));
4441   }
4442
4443   return true;
4444 }
4445
4446 namespace {
4447 /// \brief Helper class to promote a scalar operation to a vector one.
4448 /// This class is used to move downward extractelement transition.
4449 /// E.g.,
4450 /// a = vector_op <2 x i32>
4451 /// b = extractelement <2 x i32> a, i32 0
4452 /// c = scalar_op b
4453 /// store c
4454 ///
4455 /// =>
4456 /// a = vector_op <2 x i32>
4457 /// c = vector_op a (equivalent to scalar_op on the related lane)
4458 /// * d = extractelement <2 x i32> c, i32 0
4459 /// * store d
4460 /// Assuming both extractelement and store can be combine, we get rid of the
4461 /// transition.
4462 class VectorPromoteHelper {
4463   /// DataLayout associated with the current module.
4464   const DataLayout &DL;
4465
4466   /// Used to perform some checks on the legality of vector operations.
4467   const TargetLowering &TLI;
4468
4469   /// Used to estimated the cost of the promoted chain.
4470   const TargetTransformInfo &TTI;
4471
4472   /// The transition being moved downwards.
4473   Instruction *Transition;
4474   /// The sequence of instructions to be promoted.
4475   SmallVector<Instruction *, 4> InstsToBePromoted;
4476   /// Cost of combining a store and an extract.
4477   unsigned StoreExtractCombineCost;
4478   /// Instruction that will be combined with the transition.
4479   Instruction *CombineInst;
4480
4481   /// \brief The instruction that represents the current end of the transition.
4482   /// Since we are faking the promotion until we reach the end of the chain
4483   /// of computation, we need a way to get the current end of the transition.
4484   Instruction *getEndOfTransition() const {
4485     if (InstsToBePromoted.empty())
4486       return Transition;
4487     return InstsToBePromoted.back();
4488   }
4489
4490   /// \brief Return the index of the original value in the transition.
4491   /// E.g., for "extractelement <2 x i32> c, i32 1" the original value,
4492   /// c, is at index 0.
4493   unsigned getTransitionOriginalValueIdx() const {
4494     assert(isa<ExtractElementInst>(Transition) &&
4495            "Other kind of transitions are not supported yet");
4496     return 0;
4497   }
4498
4499   /// \brief Return the index of the index in the transition.
4500   /// E.g., for "extractelement <2 x i32> c, i32 0" the index
4501   /// is at index 1.
4502   unsigned getTransitionIdx() const {
4503     assert(isa<ExtractElementInst>(Transition) &&
4504            "Other kind of transitions are not supported yet");
4505     return 1;
4506   }
4507
4508   /// \brief Get the type of the transition.
4509   /// This is the type of the original value.
4510   /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the
4511   /// transition is <2 x i32>.
4512   Type *getTransitionType() const {
4513     return Transition->getOperand(getTransitionOriginalValueIdx())->getType();
4514   }
4515
4516   /// \brief Promote \p ToBePromoted by moving \p Def downward through.
4517   /// I.e., we have the following sequence:
4518   /// Def = Transition <ty1> a to <ty2>
4519   /// b = ToBePromoted <ty2> Def, ...
4520   /// =>
4521   /// b = ToBePromoted <ty1> a, ...
4522   /// Def = Transition <ty1> ToBePromoted to <ty2>
4523   void promoteImpl(Instruction *ToBePromoted);
4524
4525   /// \brief Check whether or not it is profitable to promote all the
4526   /// instructions enqueued to be promoted.
4527   bool isProfitableToPromote() {
4528     Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx());
4529     unsigned Index = isa<ConstantInt>(ValIdx)
4530                          ? cast<ConstantInt>(ValIdx)->getZExtValue()
4531                          : -1;
4532     Type *PromotedType = getTransitionType();
4533
4534     StoreInst *ST = cast<StoreInst>(CombineInst);
4535     unsigned AS = ST->getPointerAddressSpace();
4536     unsigned Align = ST->getAlignment();
4537     // Check if this store is supported.
4538     if (!TLI.allowsMisalignedMemoryAccesses(
4539             TLI.getValueType(DL, ST->getValueOperand()->getType()), AS,
4540             Align)) {
4541       // If this is not supported, there is no way we can combine
4542       // the extract with the store.
4543       return false;
4544     }
4545
4546     // The scalar chain of computation has to pay for the transition
4547     // scalar to vector.
4548     // The vector chain has to account for the combining cost.
4549     uint64_t ScalarCost =
4550         TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
4551     uint64_t VectorCost = StoreExtractCombineCost;
4552     for (const auto &Inst : InstsToBePromoted) {
4553       // Compute the cost.
4554       // By construction, all instructions being promoted are arithmetic ones.
4555       // Moreover, one argument is a constant that can be viewed as a splat
4556       // constant.
4557       Value *Arg0 = Inst->getOperand(0);
4558       bool IsArg0Constant = isa<UndefValue>(Arg0) || isa<ConstantInt>(Arg0) ||
4559                             isa<ConstantFP>(Arg0);
4560       TargetTransformInfo::OperandValueKind Arg0OVK =
4561           IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
4562                          : TargetTransformInfo::OK_AnyValue;
4563       TargetTransformInfo::OperandValueKind Arg1OVK =
4564           !IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
4565                           : TargetTransformInfo::OK_AnyValue;
4566       ScalarCost += TTI.getArithmeticInstrCost(
4567           Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK);
4568       VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
4569                                                Arg0OVK, Arg1OVK);
4570     }
4571     DEBUG(dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
4572                  << ScalarCost << "\nVector: " << VectorCost << '\n');
4573     return ScalarCost > VectorCost;
4574   }
4575
4576   /// \brief Generate a constant vector with \p Val with the same
4577   /// number of elements as the transition.
4578   /// \p UseSplat defines whether or not \p Val should be replicated
4579   /// across the whole vector.
4580   /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
4581   /// otherwise we generate a vector with as many undef as possible:
4582   /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
4583   /// used at the index of the extract.
4584   Value *getConstantVector(Constant *Val, bool UseSplat) const {
4585     unsigned ExtractIdx = UINT_MAX;
4586     if (!UseSplat) {
4587       // If we cannot determine where the constant must be, we have to
4588       // use a splat constant.
4589       Value *ValExtractIdx = Transition->getOperand(getTransitionIdx());
4590       if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx))
4591         ExtractIdx = CstVal->getSExtValue();
4592       else
4593         UseSplat = true;
4594     }
4595
4596     unsigned End = getTransitionType()->getVectorNumElements();
4597     if (UseSplat)
4598       return ConstantVector::getSplat(End, Val);
4599
4600     SmallVector<Constant *, 4> ConstVec;
4601     UndefValue *UndefVal = UndefValue::get(Val->getType());
4602     for (unsigned Idx = 0; Idx != End; ++Idx) {
4603       if (Idx == ExtractIdx)
4604         ConstVec.push_back(Val);
4605       else
4606         ConstVec.push_back(UndefVal);
4607     }
4608     return ConstantVector::get(ConstVec);
4609   }
4610
4611   /// \brief Check if promoting to a vector type an operand at \p OperandIdx
4612   /// in \p Use can trigger undefined behavior.
4613   static bool canCauseUndefinedBehavior(const Instruction *Use,
4614                                         unsigned OperandIdx) {
4615     // This is not safe to introduce undef when the operand is on
4616     // the right hand side of a division-like instruction.
4617     if (OperandIdx != 1)
4618       return false;
4619     switch (Use->getOpcode()) {
4620     default:
4621       return false;
4622     case Instruction::SDiv:
4623     case Instruction::UDiv:
4624     case Instruction::SRem:
4625     case Instruction::URem:
4626       return true;
4627     case Instruction::FDiv:
4628     case Instruction::FRem:
4629       return !Use->hasNoNaNs();
4630     }
4631     llvm_unreachable(nullptr);
4632   }
4633
4634 public:
4635   VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI,
4636                       const TargetTransformInfo &TTI, Instruction *Transition,
4637                       unsigned CombineCost)
4638       : DL(DL), TLI(TLI), TTI(TTI), Transition(Transition),
4639         StoreExtractCombineCost(CombineCost), CombineInst(nullptr) {
4640     assert(Transition && "Do not know how to promote null");
4641   }
4642
4643   /// \brief Check if we can promote \p ToBePromoted to \p Type.
4644   bool canPromote(const Instruction *ToBePromoted) const {
4645     // We could support CastInst too.
4646     return isa<BinaryOperator>(ToBePromoted);
4647   }
4648
4649   /// \brief Check if it is profitable to promote \p ToBePromoted
4650   /// by moving downward the transition through.
4651   bool shouldPromote(const Instruction *ToBePromoted) const {
4652     // Promote only if all the operands can be statically expanded.
4653     // Indeed, we do not want to introduce any new kind of transitions.
4654     for (const Use &U : ToBePromoted->operands()) {
4655       const Value *Val = U.get();
4656       if (Val == getEndOfTransition()) {
4657         // If the use is a division and the transition is on the rhs,
4658         // we cannot promote the operation, otherwise we may create a
4659         // division by zero.
4660         if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()))
4661           return false;
4662         continue;
4663       }
4664       if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) &&
4665           !isa<ConstantFP>(Val))
4666         return false;
4667     }
4668     // Check that the resulting operation is legal.
4669     int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode());
4670     if (!ISDOpcode)
4671       return false;
4672     return StressStoreExtract ||
4673            TLI.isOperationLegalOrCustom(
4674                ISDOpcode, TLI.getValueType(DL, getTransitionType(), true));
4675   }
4676
4677   /// \brief Check whether or not \p Use can be combined
4678   /// with the transition.
4679   /// I.e., is it possible to do Use(Transition) => AnotherUse?
4680   bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); }
4681
4682   /// \brief Record \p ToBePromoted as part of the chain to be promoted.
4683   void enqueueForPromotion(Instruction *ToBePromoted) {
4684     InstsToBePromoted.push_back(ToBePromoted);
4685   }
4686
4687   /// \brief Set the instruction that will be combined with the transition.
4688   void recordCombineInstruction(Instruction *ToBeCombined) {
4689     assert(canCombine(ToBeCombined) && "Unsupported instruction to combine");
4690     CombineInst = ToBeCombined;
4691   }
4692
4693   /// \brief Promote all the instructions enqueued for promotion if it is
4694   /// is profitable.
4695   /// \return True if the promotion happened, false otherwise.
4696   bool promote() {
4697     // Check if there is something to promote.
4698     // Right now, if we do not have anything to combine with,
4699     // we assume the promotion is not profitable.
4700     if (InstsToBePromoted.empty() || !CombineInst)
4701       return false;
4702
4703     // Check cost.
4704     if (!StressStoreExtract && !isProfitableToPromote())
4705       return false;
4706
4707     // Promote.
4708     for (auto &ToBePromoted : InstsToBePromoted)
4709       promoteImpl(ToBePromoted);
4710     InstsToBePromoted.clear();
4711     return true;
4712   }
4713 };
4714 } // End of anonymous namespace.
4715
4716 void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
4717   // At this point, we know that all the operands of ToBePromoted but Def
4718   // can be statically promoted.
4719   // For Def, we need to use its parameter in ToBePromoted:
4720   // b = ToBePromoted ty1 a
4721   // Def = Transition ty1 b to ty2
4722   // Move the transition down.
4723   // 1. Replace all uses of the promoted operation by the transition.
4724   // = ... b => = ... Def.
4725   assert(ToBePromoted->getType() == Transition->getType() &&
4726          "The type of the result of the transition does not match "
4727          "the final type");
4728   ToBePromoted->replaceAllUsesWith(Transition);
4729   // 2. Update the type of the uses.
4730   // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def.
4731   Type *TransitionTy = getTransitionType();
4732   ToBePromoted->mutateType(TransitionTy);
4733   // 3. Update all the operands of the promoted operation with promoted
4734   // operands.
4735   // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a.
4736   for (Use &U : ToBePromoted->operands()) {
4737     Value *Val = U.get();
4738     Value *NewVal = nullptr;
4739     if (Val == Transition)
4740       NewVal = Transition->getOperand(getTransitionOriginalValueIdx());
4741     else if (isa<UndefValue>(Val) || isa<ConstantInt>(Val) ||
4742              isa<ConstantFP>(Val)) {
4743       // Use a splat constant if it is not safe to use undef.
4744       NewVal = getConstantVector(
4745           cast<Constant>(Val),
4746           isa<UndefValue>(Val) ||
4747               canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()));
4748     } else
4749       llvm_unreachable("Did you modified shouldPromote and forgot to update "
4750                        "this?");
4751     ToBePromoted->setOperand(U.getOperandNo(), NewVal);
4752   }
4753   Transition->removeFromParent();
4754   Transition->insertAfter(ToBePromoted);
4755   Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
4756 }
4757
4758 /// Some targets can do store(extractelement) with one instruction.
4759 /// Try to push the extractelement towards the stores when the target
4760 /// has this feature and this is profitable.
4761 bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
4762   unsigned CombineCost = UINT_MAX;
4763   if (DisableStoreExtract || !TLI ||
4764       (!StressStoreExtract &&
4765        !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(),
4766                                        Inst->getOperand(1), CombineCost)))
4767     return false;
4768
4769   // At this point we know that Inst is a vector to scalar transition.
4770   // Try to move it down the def-use chain, until:
4771   // - We can combine the transition with its single use
4772   //   => we got rid of the transition.
4773   // - We escape the current basic block
4774   //   => we would need to check that we are moving it at a cheaper place and
4775   //      we do not do that for now.
4776   BasicBlock *Parent = Inst->getParent();
4777   DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
4778   VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost);
4779   // If the transition has more than one use, assume this is not going to be
4780   // beneficial.
4781   while (Inst->hasOneUse()) {
4782     Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin());
4783     DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');
4784
4785     if (ToBePromoted->getParent() != Parent) {
4786       DEBUG(dbgs() << "Instruction to promote is in a different block ("
4787                    << ToBePromoted->getParent()->getName()
4788                    << ") than the transition (" << Parent->getName() << ").\n");
4789       return false;
4790     }
4791
4792     if (VPH.canCombine(ToBePromoted)) {
4793       DEBUG(dbgs() << "Assume " << *Inst << '\n'
4794                    << "will be combined with: " << *ToBePromoted << '\n');
4795       VPH.recordCombineInstruction(ToBePromoted);
4796       bool Changed = VPH.promote();
4797       NumStoreExtractExposed += Changed;
4798       return Changed;
4799     }
4800
4801     DEBUG(dbgs() << "Try promoting.\n");
4802     if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
4803       return false;
4804
4805     DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");
4806
4807     VPH.enqueueForPromotion(ToBePromoted);
4808     Inst = ToBePromoted;
4809   }
4810   return false;
4811 }
4812
4813 bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
4814   // Bail out if we inserted the instruction to prevent optimizations from
4815   // stepping on each other's toes.
4816   if (InsertedInsts.count(I))
4817     return false;
4818
4819   if (PHINode *P = dyn_cast<PHINode>(I)) {
4820     // It is possible for very late stage optimizations (such as SimplifyCFG)
4821     // to introduce PHI nodes too late to be cleaned up.  If we detect such a
4822     // trivial PHI, go ahead and zap it here.
4823     if (Value *V = SimplifyInstruction(P, *DL, TLInfo, nullptr)) {
4824       P->replaceAllUsesWith(V);
4825       P->eraseFromParent();
4826       ++NumPHIsElim;
4827       return true;
4828     }
4829     return false;
4830   }
4831
4832   if (CastInst *CI = dyn_cast<CastInst>(I)) {
4833     // If the source of the cast is a constant, then this should have
4834     // already been constant folded.  The only reason NOT to constant fold
4835     // it is if something (e.g. LSR) was careful to place the constant
4836     // evaluation in a block other than then one that uses it (e.g. to hoist
4837     // the address of globals out of a loop).  If this is the case, we don't
4838     // want to forward-subst the cast.
4839     if (isa<Constant>(CI->getOperand(0)))
4840       return false;
4841
4842     if (TLI && OptimizeNoopCopyExpression(CI, *TLI, *DL))
4843       return true;
4844
4845     if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
4846       /// Sink a zext or sext into its user blocks if the target type doesn't
4847       /// fit in one register
4848       if (TLI &&
4849           TLI->getTypeAction(CI->getContext(),
4850                              TLI->getValueType(*DL, CI->getType())) ==
4851               TargetLowering::TypeExpandInteger) {
4852         return SinkCast(CI);
4853       } else {
4854         bool MadeChange = moveExtToFormExtLoad(I);
4855         return MadeChange | optimizeExtUses(I);
4856       }
4857     }
4858     return false;
4859   }
4860
4861   if (CmpInst *CI = dyn_cast<CmpInst>(I))
4862     if (!TLI || !TLI->hasMultipleConditionRegisters())
4863       return OptimizeCmpExpression(CI);
4864
4865   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
4866     stripInvariantGroupMetadata(*LI);
4867     if (TLI) {
4868       unsigned AS = LI->getPointerAddressSpace();
4869       return optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
4870     }
4871     return false;
4872   }
4873
4874   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
4875     stripInvariantGroupMetadata(*SI);
4876     if (TLI) {
4877       unsigned AS = SI->getPointerAddressSpace();
4878       return optimizeMemoryInst(I, SI->getOperand(1),
4879                                 SI->getOperand(0)->getType(), AS);
4880     }
4881     return false;
4882   }
4883
4884   BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
4885
4886   if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
4887                 BinOp->getOpcode() == Instruction::LShr)) {
4888     ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
4889     if (TLI && CI && TLI->hasExtractBitsInsn())
4890       return OptimizeExtractBits(BinOp, CI, *TLI, *DL);
4891
4892     return false;
4893   }
4894
4895   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
4896     if (GEPI->hasAllZeroIndices()) {
4897       /// The GEP operand must be a pointer, so must its result -> BitCast
4898       Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
4899                                         GEPI->getName(), GEPI);
4900       GEPI->replaceAllUsesWith(NC);
4901       GEPI->eraseFromParent();
4902       ++NumGEPsElim;
4903       optimizeInst(NC, ModifiedDT);
4904       return true;
4905     }
4906     return false;
4907   }
4908
4909   if (CallInst *CI = dyn_cast<CallInst>(I))
4910     return optimizeCallInst(CI, ModifiedDT);
4911
4912   if (SelectInst *SI = dyn_cast<SelectInst>(I))
4913     return optimizeSelectInst(SI);
4914
4915   if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
4916     return optimizeShuffleVectorInst(SVI);
4917
4918   if (auto *Switch = dyn_cast<SwitchInst>(I))
4919     return optimizeSwitchInst(Switch);
4920
4921   if (isa<ExtractElementInst>(I))
4922     return optimizeExtractElementInst(I);
4923
4924   return false;
4925 }
4926
4927 // In this pass we look for GEP and cast instructions that are used
4928 // across basic blocks and rewrite them to improve basic-block-at-a-time
4929 // selection.
4930 bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
4931   SunkAddrs.clear();
4932   bool MadeChange = false;
4933
4934   CurInstIterator = BB.begin();
4935   while (CurInstIterator != BB.end()) {
4936     MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
4937     if (ModifiedDT)
4938       return true;
4939   }
4940   MadeChange |= dupRetToEnableTailCallOpts(&BB);
4941
4942   return MadeChange;
4943 }
4944
4945 // llvm.dbg.value is far away from the value then iSel may not be able
4946 // handle it properly. iSel will drop llvm.dbg.value if it can not
4947 // find a node corresponding to the value.
4948 bool CodeGenPrepare::placeDbgValues(Function &F) {
4949   bool MadeChange = false;
4950   for (BasicBlock &BB : F) {
4951     Instruction *PrevNonDbgInst = nullptr;
4952     for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
4953       Instruction *Insn = &*BI++;
4954       DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
4955       // Leave dbg.values that refer to an alloca alone. These
4956       // instrinsics describe the address of a variable (= the alloca)
4957       // being taken.  They should not be moved next to the alloca
4958       // (and to the beginning of the scope), but rather stay close to
4959       // where said address is used.
4960       if (!DVI || (DVI->getValue() && isa<AllocaInst>(DVI->getValue()))) {
4961         PrevNonDbgInst = Insn;
4962         continue;
4963       }
4964
4965       Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());
4966       if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) {
4967         DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI);
4968         DVI->removeFromParent();
4969         if (isa<PHINode>(VI))
4970           DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt());
4971         else
4972           DVI->insertAfter(VI);
4973         MadeChange = true;
4974         ++NumDbgValueMoved;
4975       }
4976     }
4977   }
4978   return MadeChange;
4979 }
4980
4981 // If there is a sequence that branches based on comparing a single bit
4982 // against zero that can be combined into a single instruction, and the
4983 // target supports folding these into a single instruction, sink the
4984 // mask and compare into the branch uses. Do this before OptimizeBlock ->
4985 // OptimizeInst -> OptimizeCmpExpression, which perturbs the pattern being
4986 // searched for.
4987 bool CodeGenPrepare::sinkAndCmp(Function &F) {
4988   if (!EnableAndCmpSinking)
4989     return false;
4990   if (!TLI || !TLI->isMaskAndBranchFoldingLegal())
4991     return false;
4992   bool MadeChange = false;
4993   for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
4994     BasicBlock *BB = &*I++;
4995
4996     // Does this BB end with the following?
4997     //   %andVal = and %val, #single-bit-set
4998     //   %icmpVal = icmp %andResult, 0
4999     //   br i1 %cmpVal label %dest1, label %dest2"
5000     BranchInst *Brcc = dyn_cast<BranchInst>(BB->getTerminator());
5001     if (!Brcc || !Brcc->isConditional())
5002       continue;
5003     ICmpInst *Cmp = dyn_cast<ICmpInst>(Brcc->getOperand(0));
5004     if (!Cmp || Cmp->getParent() != BB)
5005       continue;
5006     ConstantInt *Zero = dyn_cast<ConstantInt>(Cmp->getOperand(1));
5007     if (!Zero || !Zero->isZero())
5008       continue;
5009     Instruction *And = dyn_cast<Instruction>(Cmp->getOperand(0));
5010     if (!And || And->getOpcode() != Instruction::And || And->getParent() != BB)
5011       continue;
5012     ConstantInt* Mask = dyn_cast<ConstantInt>(And->getOperand(1));
5013     if (!Mask || !Mask->getUniqueInteger().isPowerOf2())
5014       continue;
5015     DEBUG(dbgs() << "found and; icmp ?,0; brcc\n"); DEBUG(BB->dump());
5016
5017     // Push the "and; icmp" for any users that are conditional branches.
5018     // Since there can only be one branch use per BB, we don't need to keep
5019     // track of which BBs we insert into.
5020     for (Value::use_iterator UI = Cmp->use_begin(), E = Cmp->use_end();
5021          UI != E; ) {
5022       Use &TheUse = *UI;
5023       // Find brcc use.
5024       BranchInst *BrccUser = dyn_cast<BranchInst>(*UI);
5025       ++UI;
5026       if (!BrccUser || !BrccUser->isConditional())
5027         continue;
5028       BasicBlock *UserBB = BrccUser->getParent();
5029       if (UserBB == BB) continue;
5030       DEBUG(dbgs() << "found Brcc use\n");
5031
5032       // Sink the "and; icmp" to use.
5033       MadeChange = true;
5034       BinaryOperator *NewAnd =
5035         BinaryOperator::CreateAnd(And->getOperand(0), And->getOperand(1), "",
5036                                   BrccUser);
5037       CmpInst *NewCmp =
5038         CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(), NewAnd, Zero,
5039                         "", BrccUser);
5040       TheUse = NewCmp;
5041       ++NumAndCmpsMoved;
5042       DEBUG(BrccUser->getParent()->dump());
5043     }
5044   }
5045   return MadeChange;
5046 }
5047
5048 /// \brief Retrieve the probabilities of a conditional branch. Returns true on
5049 /// success, or returns false if no or invalid metadata was found.
5050 static bool extractBranchMetadata(BranchInst *BI,
5051                                   uint64_t &ProbTrue, uint64_t &ProbFalse) {
5052   assert(BI->isConditional() &&
5053          "Looking for probabilities on unconditional branch?");
5054   auto *ProfileData = BI->getMetadata(LLVMContext::MD_prof);
5055   if (!ProfileData || ProfileData->getNumOperands() != 3)
5056     return false;
5057
5058   const auto *CITrue =
5059       mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(1));
5060   const auto *CIFalse =
5061       mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(2));
5062   if (!CITrue || !CIFalse)
5063     return false;
5064
5065   ProbTrue = CITrue->getValue().getZExtValue();
5066   ProbFalse = CIFalse->getValue().getZExtValue();
5067
5068   return true;
5069 }
5070
5071 /// \brief Scale down both weights to fit into uint32_t.
5072 static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
5073   uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
5074   uint32_t Scale = (NewMax / UINT32_MAX) + 1;
5075   NewTrue = NewTrue / Scale;
5076   NewFalse = NewFalse / Scale;
5077 }
5078
5079 /// \brief Some targets prefer to split a conditional branch like:
5080 /// \code
5081 ///   %0 = icmp ne i32 %a, 0
5082 ///   %1 = icmp ne i32 %b, 0
5083 ///   %or.cond = or i1 %0, %1
5084 ///   br i1 %or.cond, label %TrueBB, label %FalseBB
5085 /// \endcode
5086 /// into multiple branch instructions like:
5087 /// \code
5088 ///   bb1:
5089 ///     %0 = icmp ne i32 %a, 0
5090 ///     br i1 %0, label %TrueBB, label %bb2
5091 ///   bb2:
5092 ///     %1 = icmp ne i32 %b, 0
5093 ///     br i1 %1, label %TrueBB, label %FalseBB
5094 /// \endcode
5095 /// This usually allows instruction selection to do even further optimizations
5096 /// and combine the compare with the branch instruction. Currently this is
5097 /// applied for targets which have "cheap" jump instructions.
5098 ///
5099 /// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
5100 ///
5101 bool CodeGenPrepare::splitBranchCondition(Function &F) {
5102   if (!TM || !TM->Options.EnableFastISel || !TLI || TLI->isJumpExpensive())
5103     return false;
5104
5105   bool MadeChange = false;
5106   for (auto &BB : F) {
5107     // Does this BB end with the following?
5108     //   %cond1 = icmp|fcmp|binary instruction ...
5109     //   %cond2 = icmp|fcmp|binary instruction ...
5110     //   %cond.or = or|and i1 %cond1, cond2
5111     //   br i1 %cond.or label %dest1, label %dest2"
5112     BinaryOperator *LogicOp;
5113     BasicBlock *TBB, *FBB;
5114     if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
5115       continue;
5116
5117     auto *Br1 = cast<BranchInst>(BB.getTerminator());
5118     if (Br1->getMetadata(LLVMContext::MD_unpredictable))
5119       continue;
5120
5121     unsigned Opc;
5122     Value *Cond1, *Cond2;
5123     if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
5124                              m_OneUse(m_Value(Cond2)))))
5125       Opc = Instruction::And;
5126     else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)),
5127                                  m_OneUse(m_Value(Cond2)))))
5128       Opc = Instruction::Or;
5129     else
5130       continue;
5131
5132     if (!match(Cond1, m_CombineOr(m_Cmp(), m_BinOp())) ||
5133         !match(Cond2, m_CombineOr(m_Cmp(), m_BinOp()))   )
5134       continue;
5135
5136     DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());
5137
5138     // Create a new BB.
5139     auto *InsertBefore = std::next(Function::iterator(BB))
5140         .getNodePtrUnchecked();
5141     auto TmpBB = BasicBlock::Create(BB.getContext(),
5142                                     BB.getName() + ".cond.split",
5143                                     BB.getParent(), InsertBefore);
5144
5145     // Update original basic block by using the first condition directly by the
5146     // branch instruction and removing the no longer needed and/or instruction.
5147     Br1->setCondition(Cond1);
5148     LogicOp->eraseFromParent();
5149
5150     // Depending on the conditon we have to either replace the true or the false
5151     // successor of the original branch instruction.
5152     if (Opc == Instruction::And)
5153       Br1->setSuccessor(0, TmpBB);
5154     else
5155       Br1->setSuccessor(1, TmpBB);
5156
5157     // Fill in the new basic block.
5158     auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB);
5159     if (auto *I = dyn_cast<Instruction>(Cond2)) {
5160       I->removeFromParent();
5161       I->insertBefore(Br2);
5162     }
5163
5164     // Update PHI nodes in both successors. The original BB needs to be
5165     // replaced in one succesor's PHI nodes, because the branch comes now from
5166     // the newly generated BB (NewBB). In the other successor we need to add one
5167     // incoming edge to the PHI nodes, because both branch instructions target
5168     // now the same successor. Depending on the original branch condition
5169     // (and/or) we have to swap the successors (TrueDest, FalseDest), so that
5170     // we perfrom the correct update for the PHI nodes.
5171     // This doesn't change the successor order of the just created branch
5172     // instruction (or any other instruction).
5173     if (Opc == Instruction::Or)
5174       std::swap(TBB, FBB);
5175
5176     // Replace the old BB with the new BB.
5177     for (auto &I : *TBB) {
5178       PHINode *PN = dyn_cast<PHINode>(&I);
5179       if (!PN)
5180         break;
5181       int i;
5182       while ((i = PN->getBasicBlockIndex(&BB)) >= 0)
5183         PN->setIncomingBlock(i, TmpBB);
5184     }
5185
5186     // Add another incoming edge form the new BB.
5187     for (auto &I : *FBB) {
5188       PHINode *PN = dyn_cast<PHINode>(&I);
5189       if (!PN)
5190         break;
5191       auto *Val = PN->getIncomingValueForBlock(&BB);
5192       PN->addIncoming(Val, TmpBB);
5193     }
5194
5195     // Update the branch weights (from SelectionDAGBuilder::
5196     // FindMergedConditions).
5197     if (Opc == Instruction::Or) {
5198       // Codegen X | Y as:
5199       // BB1:
5200       //   jmp_if_X TBB
5201       //   jmp TmpBB
5202       // TmpBB:
5203       //   jmp_if_Y TBB
5204       //   jmp FBB
5205       //
5206
5207       // We have flexibility in setting Prob for BB1 and Prob for NewBB.
5208       // The requirement is that
5209       //   TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
5210       //     = TrueProb for orignal BB.
5211       // Assuming the orignal weights are A and B, one choice is to set BB1's
5212       // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice
5213       // assumes that
5214       //   TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
5215       // Another choice is to assume TrueProb for BB1 equals to TrueProb for
5216       // TmpBB, but the math is more complicated.
5217       uint64_t TrueWeight, FalseWeight;
5218       if (extractBranchMetadata(Br1, TrueWeight, FalseWeight)) {
5219         uint64_t NewTrueWeight = TrueWeight;
5220         uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
5221         scaleWeights(NewTrueWeight, NewFalseWeight);
5222         Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
5223                          .createBranchWeights(TrueWeight, FalseWeight));
5224
5225         NewTrueWeight = TrueWeight;
5226         NewFalseWeight = 2 * FalseWeight;
5227         scaleWeights(NewTrueWeight, NewFalseWeight);
5228         Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
5229                          .createBranchWeights(TrueWeight, FalseWeight));
5230       }
5231     } else {
5232       // Codegen X & Y as:
5233       // BB1:
5234       //   jmp_if_X TmpBB
5235       //   jmp FBB
5236       // TmpBB:
5237       //   jmp_if_Y TBB
5238       //   jmp FBB
5239       //
5240       //  This requires creation of TmpBB after CurBB.
5241
5242       // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
5243       // The requirement is that
5244       //   FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
5245       //     = FalseProb for orignal BB.
5246       // Assuming the orignal weights are A and B, one choice is to set BB1's
5247       // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice
5248       // assumes that
5249       //   FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
5250       uint64_t TrueWeight, FalseWeight;
5251       if (extractBranchMetadata(Br1, TrueWeight, FalseWeight)) {
5252         uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
5253         uint64_t NewFalseWeight = FalseWeight;
5254         scaleWeights(NewTrueWeight, NewFalseWeight);
5255         Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
5256                          .createBranchWeights(TrueWeight, FalseWeight));
5257
5258         NewTrueWeight = 2 * TrueWeight;
5259         NewFalseWeight = FalseWeight;
5260         scaleWeights(NewTrueWeight, NewFalseWeight);
5261         Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
5262                          .createBranchWeights(TrueWeight, FalseWeight));
5263       }
5264     }
5265
5266     // Note: No point in getting fancy here, since the DT info is never
5267     // available to CodeGenPrepare.
5268     ModifiedDT = true;
5269
5270     MadeChange = true;
5271
5272     DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
5273           TmpBB->dump());
5274   }
5275   return MadeChange;
5276 }
5277
5278 void CodeGenPrepare::stripInvariantGroupMetadata(Instruction &I) {
5279   if (auto *InvariantMD = I.getMetadata(LLVMContext::MD_invariant_group))
5280     I.dropUnknownNonDebugMetadata(InvariantMD->getMetadataID());
5281 }