[IndVars] Extract out eliminateIdentitySCEV, NFC

[oota-llvm.git] / lib / CodeGen / CodeGenPrepare.cpp
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp

index 82cd3801f5832ee5e4b1babeb2838f9dc0faedae..c4c97ff0a3d0425e36e09b24f86e28691ba1d678 100644 (file)
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -18,6 +18,7 @@
  #include "llvm/ADT/SmallSet.h"
  #include "llvm/ADT/Statistic.h"
  #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/CallSite.h"
  #include "llvm/IR/Constants.h"
@@ -32,13 +33,13 @@
  #include "llvm/IR/IntrinsicInst.h"
  #include "llvm/IR/MDBuilder.h"
  #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Statepoint.h"
  #include "llvm/IR/ValueHandle.h"
  #include "llvm/IR/ValueMap.h"
  #include "llvm/Pass.h"
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
  #include "llvm/Target/TargetLowering.h"
  #include "llvm/Target/TargetSubtargetInfo.h"
  #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -72,6 +73,10 @@ static cl::opt<bool> DisableBranchOpts(
    "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
    cl::desc("Disable branch optimizations in CodeGenPrepare"));
  
+static cl::opt<bool>
+    DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),
+                  cl::desc("Disable GC optimizations in CodeGenPrepare"));
+
  static cl::opt<bool> DisableSelectToBranch(
    "disable-cgp-select2branch", cl::Hidden, cl::init(false),
    cl::desc("Disable select to branch conversion."));
@@ -104,26 +109,18 @@ static cl::opt<bool> StressExtLdPromotion(
  
  namespace {
  typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
-struct TypeIsSExt {
-  Type *Ty;
-  bool IsSExt;
-  TypeIsSExt(Type *Ty, bool IsSExt) : Ty(Ty), IsSExt(IsSExt) {}
-};
+typedef PointerIntPair<Type *, 1, bool> TypeIsSExt;
  typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy;
  class TypePromotionTransaction;
  
    class CodeGenPrepare : public FunctionPass {
-    /// TLI - Keep a pointer of a TargetLowering to consult for determining
-    /// transformation profitability.
      const TargetMachine *TM;
      const TargetLowering *TLI;
      const TargetTransformInfo *TTI;
      const TargetLibraryInfo *TLInfo;
-    DominatorTree *DT;
  
-    /// CurInstIterator - As we scan instructions optimizing them, this is the
-    /// next instruction to optimize.  Xforms that can invalidate this should
-    /// update it.
+    /// As we scan instructions optimizing them, this is the next instruction
+    /// to optimize. Transforms that can invalidate this should update it.
      BasicBlock::iterator CurInstIterator;
  
      /// Keeps track of non-local addresses that have been sunk into a block.
@@ -131,23 +128,25 @@ class TypePromotionTransaction;
      /// multiple load/stores of the same address.
      ValueMap<Value*, Value*> SunkAddrs;
  
-    /// Keeps track of all truncates inserted for the current function.
-    SetOfInstrs InsertedTruncsSet;
+    /// Keeps track of all instructions inserted for the current function.
+    SetOfInstrs InsertedInsts;
      /// Keeps track of the type of the related instruction before their
      /// promotion for the current function.
      InstrToOrigTy PromotedInsts;
  
-    /// ModifiedDT - If CFG is modified in anyway, dominator tree may need to
-    /// be updated.
+    /// True if CFG is modified in any way.
      bool ModifiedDT;
  
-    /// OptSize - True if optimizing for size.
+    /// True if optimizing for size.
      bool OptSize;
  
+    /// DataLayout for the Function being processed.
+    const DataLayout *DL;
+
    public:
      static char ID; // Pass identification, replacement for typeid
      explicit CodeGenPrepare(const TargetMachine *TM = nullptr)
-        : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr) {
+        : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr), DL(nullptr) {
          initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
        }
      bool runOnFunction(Function &F) override;
@@ -156,33 +155,36 @@ class TypePromotionTransaction;
  
      void getAnalysisUsage(AnalysisUsage &AU) const override {
        AU.addPreserved<DominatorTreeWrapperPass>();
-      AU.addRequired<TargetLibraryInfo>();
-      AU.addRequired<TargetTransformInfo>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+      AU.addRequired<TargetTransformInfoWrapperPass>();
      }
  
    private:
-    bool EliminateFallThrough(Function &F);
-    bool EliminateMostlyEmptyBlocks(Function &F);
-    bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
-    void EliminateMostlyEmptyBlock(BasicBlock *BB);
-    bool OptimizeBlock(BasicBlock &BB, bool& ModifiedDT);
-    bool OptimizeInst(Instruction *I, bool& ModifiedDT);
-    bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy);
-    bool OptimizeInlineAsmInst(CallInst *CS);
-    bool OptimizeCallInst(CallInst *CI, bool& ModifiedDT);
-    bool MoveExtToFormExtLoad(Instruction *&I);
-    bool OptimizeExtUses(Instruction *I);
-    bool OptimizeSelectInst(SelectInst *SI);
-    bool OptimizeShuffleVectorInst(ShuffleVectorInst *SI);
-    bool OptimizeExtractElementInst(Instruction *Inst);
-    bool DupRetToEnableTailCallOpts(BasicBlock *BB);
-    bool PlaceDbgValues(Function &F);
+    bool eliminateFallThrough(Function &F);
+    bool eliminateMostlyEmptyBlocks(Function &F);
+    bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
+    void eliminateMostlyEmptyBlock(BasicBlock *BB);
+    bool optimizeBlock(BasicBlock &BB, bool& ModifiedDT);
+    bool optimizeInst(Instruction *I, bool& ModifiedDT);
+    bool optimizeMemoryInst(Instruction *I, Value *Addr,
+                            Type *AccessTy, unsigned AS);
+    bool optimizeInlineAsmInst(CallInst *CS);
+    bool optimizeCallInst(CallInst *CI, bool& ModifiedDT);
+    bool moveExtToFormExtLoad(Instruction *&I);
+    bool optimizeExtUses(Instruction *I);
+    bool optimizeSelectInst(SelectInst *SI);
+    bool optimizeShuffleVectorInst(ShuffleVectorInst *SI);
+    bool optimizeExtractElementInst(Instruction *Inst);
+    bool dupRetToEnableTailCallOpts(BasicBlock *BB);
+    bool placeDbgValues(Function &F);
      bool sinkAndCmp(Function &F);
-    bool ExtLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI,
+    bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI,
                          Instruction *&Inst,
                          const SmallVectorImpl<Instruction *> &Exts,
-                        unsigned CreatedInst);
+                        unsigned CreatedInstCost);
      bool splitBranchCondition(Function &F);
+    bool simplifyOffsetableRelocate(Instruction &I);
+    void stripInvariantGroupMetadata(Instruction &I);
    };
  }
  
@@ -198,21 +200,19 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
    if (skipOptnoneFunction(F))
      return false;
  
+  DL = &F.getParent()->getDataLayout();
+
    bool EverMadeChange = false;
    // Clear per function information.
-  InsertedTruncsSet.clear();
+  InsertedInsts.clear();
    PromotedInsts.clear();
  
    ModifiedDT = false;
    if (TM)
-    TLI = TM->getSubtargetImpl()->getTargetLowering();
-  TLInfo = &getAnalysis<TargetLibraryInfo>();
-  TTI = &getAnalysis<TargetTransformInfo>();
-  DominatorTreeWrapperPass *DTWP =
-      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
-  DT = DTWP ? &DTWP->getDomTree() : nullptr;
-  OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                           Attribute::OptimizeForSize);
+    TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+  TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  OptSize = F.optForSize();
  
    /// This optimization identifies DIV instructions that can be
    /// profitably bypassed and carried out with a shorter, faster divide.
@@ -225,12 +225,12 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
  
    // Eliminate blocks that contain only PHI nodes and an
    // unconditional branch.
-  EverMadeChange |= EliminateMostlyEmptyBlocks(F);
+  EverMadeChange |= eliminateMostlyEmptyBlocks(F);
  
    // llvm.dbg.value is far away from the value then iSel may not be able
    // handle it properly. iSel will drop llvm.dbg.value if it can not
    // find a node corresponding to the value.
-  EverMadeChange |= PlaceDbgValues(F);
+  EverMadeChange |= placeDbgValues(F);
  
    // If there is a mask, compare against zero, and branch that can be combined
    // into a single target instruction, push the mask and compare into branch
@@ -247,10 +247,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
      for (Function::iterator I = F.begin(); I != F.end(); ) {
        BasicBlock *BB = I++;
        bool ModifiedDTOnIteration = false;
-      MadeChange |= OptimizeBlock(*BB, ModifiedDTOnIteration);
-      
+      MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);
+
        // Restart BB iteration if the dominator tree of the Function was changed
-      ModifiedDT |= ModifiedDTOnIteration;
        if (ModifiedDTOnIteration)
          break;
      }
@@ -291,23 +290,28 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
      // Merge pairs of basic blocks with unconditional branches, connected by
      // a single edge.
      if (EverMadeChange || MadeChange)
-      MadeChange |= EliminateFallThrough(F);
+      MadeChange |= eliminateFallThrough(F);
  
-    if (MadeChange)
-      ModifiedDT = true;
      EverMadeChange |= MadeChange;
    }
  
-  if (ModifiedDT && DT)
-    DT->recalculate(F);
+  if (!DisableGCOpts) {
+    SmallVector<Instruction *, 2> Statepoints;
+    for (BasicBlock &BB : F)
+      for (Instruction &I : BB)
+        if (isStatepoint(I))
+          Statepoints.push_back(&I);
+    for (auto &I : Statepoints)
+      EverMadeChange |= simplifyOffsetableRelocate(*I);
+  }
  
    return EverMadeChange;
  }
  
-/// EliminateFallThrough - Merge basic blocks which are connected
-/// by a single edge, where one of the basic blocks has a single successor
-/// pointing to the other basic block, which has a single predecessor.
-bool CodeGenPrepare::EliminateFallThrough(Function &F) {
+/// Merge basic blocks which are connected by a single edge, where one of the
+/// basic blocks has a single successor pointing to the other basic block,
+/// which has a single predecessor.
+bool CodeGenPrepare::eliminateFallThrough(Function &F) {
    bool Changed = false;
    // Scan all of the blocks in the function, except for the entry block.
    for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) {
@@ -326,7 +330,7 @@ bool CodeGenPrepare::EliminateFallThrough(Function &F) {
        // Remember if SinglePred was the entry block of the function.
        // If so, we will need to move BB back to the entry position.
        bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
-      MergeBasicBlockIntoOnlyPred(BB, this);
+      MergeBasicBlockIntoOnlyPred(BB, nullptr);
  
        if (isEntry && BB != &BB->getParent()->getEntryBlock())
          BB->moveBefore(&BB->getParent()->getEntryBlock());
@@ -338,12 +342,11 @@ bool CodeGenPrepare::EliminateFallThrough(Function &F) {
    return Changed;
  }
  
-/// EliminateMostlyEmptyBlocks - eliminate blocks that contain only PHI nodes,
-/// debug info directives, and an unconditional branch.  Passes before isel
-/// (e.g. LSR/loopsimplify) often split edges in ways that are non-optimal for
-/// isel.  Start by eliminating these blocks so we can split them the way we
-/// want them.
-bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) {
+/// Eliminate blocks that contain only PHI nodes, debug info directives, and an
+/// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
+/// edges in ways that are non-optimal for isel. Start by eliminating these
+/// blocks so we can split them the way we want them.
+bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) {
    bool MadeChange = false;
    // Note that this intentionally skips the entry block.
    for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) {
@@ -373,19 +376,19 @@ bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) {
      if (DestBB == BB)
        continue;
  
-    if (!CanMergeBlocks(BB, DestBB))
+    if (!canMergeBlocks(BB, DestBB))
        continue;
  
-    EliminateMostlyEmptyBlock(BB);
+    eliminateMostlyEmptyBlock(BB);
      MadeChange = true;
    }
    return MadeChange;
  }
  
-/// CanMergeBlocks - Return true if we can merge BB into DestBB if there is a
-/// single uncond branch between them, and BB contains no other non-phi
+/// Return true if we can merge BB into DestBB if there is a single
+/// unconditional branch between them, and BB contains no other non-phi
  /// instructions.
-bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB,
+bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB,
                                      const BasicBlock *DestBB) const {
    // We only want to eliminate blocks whose phi nodes are used by phi nodes in
    // the successor.  If there are more complex condition (e.g. preheaders),
@@ -451,9 +454,9 @@ bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB,
  }
  
  
-/// EliminateMostlyEmptyBlock - Eliminate a basic block that have only phi's and
-/// an unconditional branch in it.
-void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) {
+/// Eliminate a basic block that has only phi's and an unconditional branch in
+/// it.
+void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
    BranchInst *BI = cast<BranchInst>(BB->getTerminator());
    BasicBlock *DestBB = BI->getSuccessor(0);
  
@@ -466,7 +469,7 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) {
        // Remember if SinglePred was the entry block of the function.  If so, we
        // will need to move BB back to the entry position.
        bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
-      MergeBasicBlockIntoOnlyPred(DestBB, this);
+      MergeBasicBlockIntoOnlyPred(DestBB, nullptr);
  
        if (isEntry && BB != &BB->getParent()->getEntryBlock())
          BB->moveBefore(&BB->getParent()->getEntryBlock());
@@ -508,19 +511,188 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) {
    // The PHIs are now updated, change everything that refers to BB to use
    // DestBB and remove BB.
    BB->replaceAllUsesWith(DestBB);
-  if (DT && !ModifiedDT) {
-    BasicBlock *BBIDom  = DT->getNode(BB)->getIDom()->getBlock();
-    BasicBlock *DestBBIDom = DT->getNode(DestBB)->getIDom()->getBlock();
-    BasicBlock *NewIDom = DT->findNearestCommonDominator(BBIDom, DestBBIDom);
-    DT->changeImmediateDominator(DestBB, NewIDom);
-    DT->eraseNode(BB);
-  }
    BB->eraseFromParent();
    ++NumBlocksElim;
  
    DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
  }
  
+// Computes a map of base pointer relocation instructions to corresponding
+// derived pointer relocation instructions given a vector of all relocate calls
+static void computeBaseDerivedRelocateMap(
+    const SmallVectorImpl<User *> &AllRelocateCalls,
+    DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> &
+        RelocateInstMap) {
+  // Collect information in two maps: one primarily for locating the base object
+  // while filling the second map; the second map is the final structure holding
+  // a mapping between Base and corresponding Derived relocate calls
+  DenseMap<std::pair<unsigned, unsigned>, IntrinsicInst *> RelocateIdxMap;
+  for (auto &U : AllRelocateCalls) {
+    GCRelocateOperands ThisRelocate(U);
+    IntrinsicInst *I = cast<IntrinsicInst>(U);
+    auto K = std::make_pair(ThisRelocate.getBasePtrIndex(),
+                            ThisRelocate.getDerivedPtrIndex());
+    RelocateIdxMap.insert(std::make_pair(K, I));
+  }
+  for (auto &Item : RelocateIdxMap) {
+    std::pair<unsigned, unsigned> Key = Item.first;
+    if (Key.first == Key.second)
+      // Base relocation: nothing to insert
+      continue;
+
+    IntrinsicInst *I = Item.second;
+    auto BaseKey = std::make_pair(Key.first, Key.first);
+
+    // We're iterating over RelocateIdxMap so we cannot modify it.
+    auto MaybeBase = RelocateIdxMap.find(BaseKey);
+    if (MaybeBase == RelocateIdxMap.end())
+      // TODO: We might want to insert a new base object relocate and gep off
+      // that, if there are enough derived object relocates.
+      continue;
+
+    RelocateInstMap[MaybeBase->second].push_back(I);
+  }
+}
+
+// Accepts a GEP and extracts the operands into a vector provided they're all
+// small integer constants
+static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
+                                          SmallVectorImpl<Value *> &OffsetV) {
+  for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
+    // Only accept small constant integer operands
+    auto Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
+    if (!Op || Op->getZExtValue() > 20)
+      return false;
+  }
+
+  for (unsigned i = 1; i < GEP->getNumOperands(); i++)
+    OffsetV.push_back(GEP->getOperand(i));
+  return true;
+}
+
+// Takes a RelocatedBase (base pointer relocation instruction) and Targets to
+// replace, computes a replacement, and affects it.
+static bool
+simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase,
+                          const SmallVectorImpl<IntrinsicInst *> &Targets) {
+  bool MadeChange = false;
+  for (auto &ToReplace : Targets) {
+    GCRelocateOperands MasterRelocate(RelocatedBase);
+    GCRelocateOperands ThisRelocate(ToReplace);
+
+    assert(ThisRelocate.getBasePtrIndex() == MasterRelocate.getBasePtrIndex() &&
+           "Not relocating a derived object of the original base object");
+    if (ThisRelocate.getBasePtrIndex() == ThisRelocate.getDerivedPtrIndex()) {
+      // A duplicate relocate call. TODO: coalesce duplicates.
+      continue;
+    }
+
+    Value *Base = ThisRelocate.getBasePtr();
+    auto Derived = dyn_cast<GetElementPtrInst>(ThisRelocate.getDerivedPtr());
+    if (!Derived || Derived->getPointerOperand() != Base)
+      continue;
+
+    SmallVector<Value *, 2> OffsetV;
+    if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV))
+      continue;
+
+    // Create a Builder and replace the target callsite with a gep
+    assert(RelocatedBase->getNextNode() && "Should always have one since it's not a terminator");
+
+    // Insert after RelocatedBase
+    IRBuilder<> Builder(RelocatedBase->getNextNode());
+    Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());
+
+    // If gc_relocate does not match the actual type, cast it to the right type.
+    // In theory, there must be a bitcast after gc_relocate if the type does not
+    // match, and we should reuse it to get the derived pointer. But it could be
+    // cases like this:
+    // bb1:
+    //  ...
+    //  %g1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
+    //  br label %merge
+    //
+    // bb2:
+    //  ...
+    //  %g2 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
+    //  br label %merge
+    //
+    // merge:
+    //  %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ]
+    //  %cast = bitcast i8 addrspace(1)* %p1 in to i32 addrspace(1)*
+    //
+    // In this case, we can not find the bitcast any more. So we insert a new bitcast
+    // no matter there is already one or not. In this way, we can handle all cases, and
+    // the extra bitcast should be optimized away in later passes.
+    Instruction *ActualRelocatedBase = RelocatedBase;
+    if (RelocatedBase->getType() != Base->getType()) {
+      ActualRelocatedBase =
+          cast<Instruction>(Builder.CreateBitCast(RelocatedBase, Base->getType()));
+    }
+    Value *Replacement = Builder.CreateGEP(
+        Derived->getSourceElementType(), ActualRelocatedBase, makeArrayRef(OffsetV));
+    Instruction *ReplacementInst = cast<Instruction>(Replacement);
+    Replacement->takeName(ToReplace);
+    // If the newly generated derived pointer's type does not match the original derived
+    // pointer's type, cast the new derived pointer to match it. Same reasoning as above.
+    Instruction *ActualReplacement = ReplacementInst;
+    if (ReplacementInst->getType() != ToReplace->getType()) {
+      ActualReplacement =
+          cast<Instruction>(Builder.CreateBitCast(ReplacementInst, ToReplace->getType()));
+    }
+    ToReplace->replaceAllUsesWith(ActualReplacement);
+    ToReplace->eraseFromParent();
+
+    MadeChange = true;
+  }
+  return MadeChange;
+}
+
+// Turns this:
+//
+// %base = ...
+// %ptr = gep %base + 15
+// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
+// %base' = relocate(%tok, i32 4, i32 4)
+// %ptr' = relocate(%tok, i32 4, i32 5)
+// %val = load %ptr'
+//
+// into this:
+//
+// %base = ...
+// %ptr = gep %base + 15
+// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
+// %base' = gc.relocate(%tok, i32 4, i32 4)
+// %ptr' = gep %base' + 15
+// %val = load %ptr'
+bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) {
+  bool MadeChange = false;
+  SmallVector<User *, 2> AllRelocateCalls;
+
+  for (auto *U : I.users())
+    if (isGCRelocate(dyn_cast<Instruction>(U)))
+      // Collect all the relocate calls associated with a statepoint
+      AllRelocateCalls.push_back(U);
+
+  // We need atleast one base pointer relocation + one derived pointer
+  // relocation to mangle
+  if (AllRelocateCalls.size() < 2)
+    return false;
+
+  // RelocateInstMap is a mapping from the base relocate instruction to the
+  // corresponding derived relocate instructions
+  DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> RelocateInstMap;
+  computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
+  if (RelocateInstMap.empty())
+    return false;
+
+  for (auto &Item : RelocateInstMap)
+    // Item.first is the RelocatedBase to offset against
+    // Item.second is the vector of Targets to replace
+    MadeChange = simplifyRelocatesOffABase(Item.first, Item.second);
+  return MadeChange;
+}
+
  /// SinkCast - Sink the specified cast instruction into its user blocks
  static bool SinkCast(CastInst *CI) {
    BasicBlock *DefBB = CI->getParent();
@@ -555,11 +727,11 @@ static bool SinkCast(CastInst *CI) {
        InsertedCast =
          CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "",
                           InsertPt);
-      MadeChange = true;
      }
  
      // Replace a use of the cast with a use of the new cast.
      TheUse = InsertedCast;
+    MadeChange = true;
      ++NumCastUses;
    }
  
@@ -572,17 +744,17 @@ static bool SinkCast(CastInst *CI) {
    return MadeChange;
  }
  
-/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop
-/// copy (e.g. it's casting from one pointer type to another, i32->i8 on PPC),
-/// sink it into user blocks to reduce the number of virtual
-/// registers that must be created and coalesced.
+/// If the specified cast instruction is a noop copy (e.g. it's casting from
+/// one pointer type to another, i32->i8 on PPC), sink it into user blocks to
+/// reduce the number of virtual registers that must be created and coalesced.
  ///
  /// Return true if any changes are made.
  ///
-static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
+static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
+                                       const DataLayout &DL) {
    // If this is a noop copy,
-  EVT SrcVT = TLI.getValueType(CI->getOperand(0)->getType());
-  EVT DstVT = TLI.getValueType(CI->getType());
+  EVT SrcVT = TLI.getValueType(DL, CI->getOperand(0)->getType());
+  EVT DstVT = TLI.getValueType(DL, CI->getType());
  
    // This is an fp<->int conversion?
    if (SrcVT.isInteger() != DstVT.isInteger())
@@ -609,16 +781,63 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
    return SinkCast(CI);
  }
  
-/// OptimizeCmpExpression - sink the given CmpInst into user blocks to reduce
-/// the number of virtual registers that must be created and coalesced.  This is
-/// a clear win except on targets with multiple condition code registers
-///  (PowerPC), where it might lose; some adjustment may be wanted there.
+/// Try to combine CI into a call to the llvm.uadd.with.overflow intrinsic if
+/// possible.
+///
+/// Return true if any changes were made.
+static bool CombineUAddWithOverflow(CmpInst *CI) {
+  Value *A, *B;
+  Instruction *AddI;
+  if (!match(CI,
+             m_UAddWithOverflow(m_Value(A), m_Value(B), m_Instruction(AddI))))
+    return false;
+
+  Type *Ty = AddI->getType();
+  if (!isa<IntegerType>(Ty))
+    return false;
+
+  // We don't want to move around uses of condition values this late, so we we
+  // check if it is legal to create the call to the intrinsic in the basic
+  // block containing the icmp:
+
+  if (AddI->getParent() != CI->getParent() && !AddI->hasOneUse())
+    return false;
+
+#ifndef NDEBUG
+  // Someday m_UAddWithOverflow may get smarter, but this is a safe assumption
+  // for now:
+  if (AddI->hasOneUse())
+    assert(*AddI->user_begin() == CI && "expected!");
+#endif
+
+  Module *M = CI->getParent()->getParent()->getParent();
+  Value *F = Intrinsic::getDeclaration(M, Intrinsic::uadd_with_overflow, Ty);
+
+  auto *InsertPt = AddI->hasOneUse() ? CI : AddI;
+
+  auto *UAddWithOverflow =
+      CallInst::Create(F, {A, B}, "uadd.overflow", InsertPt);
+  auto *UAdd = ExtractValueInst::Create(UAddWithOverflow, 0, "uadd", InsertPt);
+  auto *Overflow =
+      ExtractValueInst::Create(UAddWithOverflow, 1, "overflow", InsertPt);
+
+  CI->replaceAllUsesWith(Overflow);
+  AddI->replaceAllUsesWith(UAdd);
+  CI->eraseFromParent();
+  AddI->eraseFromParent();
+  return true;
+}
+
+/// Sink the given CmpInst into user blocks to reduce the number of virtual
+/// registers that must be created and coalesced. This is a clear win except on
+/// targets with multiple condition code registers (PowerPC), where it might
+/// lose; some adjustment may be wanted there.
  ///
  /// Return true if any changes are made.
-static bool OptimizeCmpExpression(CmpInst *CI) {
+static bool SinkCmpExpression(CmpInst *CI) {
    BasicBlock *DefBB = CI->getParent();
  
-  /// InsertedCmp - Only insert a cmp in each block once.
+  /// Only insert a cmp in each block once.
    DenseMap<BasicBlock*, CmpInst*> InsertedCmps;
  
    bool MadeChange = false;
@@ -649,23 +868,35 @@ static bool OptimizeCmpExpression(CmpInst *CI) {
          CmpInst::Create(CI->getOpcode(),
                          CI->getPredicate(),  CI->getOperand(0),
                          CI->getOperand(1), "", InsertPt);
-      MadeChange = true;
      }
  
      // Replace a use of the cmp with a use of the new cmp.
      TheUse = InsertedCmp;
+    MadeChange = true;
      ++NumCmpUses;
    }
  
    // If we removed all uses, nuke the cmp.
-  if (CI->use_empty())
+  if (CI->use_empty()) {
      CI->eraseFromParent();
+    MadeChange = true;
+  }
  
    return MadeChange;
  }
  
-/// isExtractBitsCandidateUse - Check if the candidates could
-/// be combined with shift instruction, which includes:
+static bool OptimizeCmpExpression(CmpInst *CI) {
+  if (SinkCmpExpression(CI))
+    return true;
+
+  if (CombineUAddWithOverflow(CI))
+    return true;
+
+  return false;
+}
+
+/// Check if the candidates could be combined with a shift instruction, which
+/// includes:
  /// 1. Truncate instruction
  /// 2. And instruction and the imm is a mask of the low bits:
  /// imm & (imm+1) == 0
@@ -683,12 +914,11 @@ static bool isExtractBitsCandidateUse(Instruction *User) {
    return true;
  }
  
-/// SinkShiftAndTruncate - sink both shift and truncate instruction
-/// to the use of truncate's BB.
+/// Sink both shift and truncate instruction to the use of truncate's BB.
  static bool
  SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
                       DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts,
-                     const TargetLowering &TLI) {
+                     const TargetLowering &TLI, const DataLayout &DL) {
    BasicBlock *UserBB = User->getParent();
    DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
    TruncInst *TruncI = dyn_cast<TruncInst>(User);
@@ -714,7 +944,7 @@ SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
      // approximation; some nodes' legality is determined by the
      // operand or other means. There's no good way to find out though.
      if (TLI.isOperationLegalOrCustom(
-            ISDOpcode, TLI.getValueType(TruncUser->getType(), true)))
+            ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true)))
        continue;
  
      // Don't bother for PHI nodes.
@@ -754,10 +984,10 @@ SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
    return MadeChange;
  }
  
-/// OptimizeExtractBits - sink the shift *right* instruction into user blocks if
-/// the uses could potentially be combined with this shift instruction and
-/// generate BitExtract instruction. It will only be applied if the architecture
-/// supports BitExtract instruction. Here is an example:
+/// Sink the shift *right* instruction into user blocks if the uses could
+/// potentially be combined with this shift instruction and generate BitExtract
+/// instruction. It will only be applied if the architecture supports BitExtract
+/// instruction. Here is an example:
  /// BB1:
  ///   %x.extract.shift = lshr i64 %arg1, 32
  /// BB2:
@@ -772,13 +1002,14 @@ SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
  /// instruction.
  /// Return true if any changes are made.
  static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
-                                const TargetLowering &TLI) {
+                                const TargetLowering &TLI,
+                                const DataLayout &DL) {
    BasicBlock *DefBB = ShiftI->getParent();
  
    /// Only insert instructions in each block once.
    DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;
  
-  bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(ShiftI->getType()));
+  bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType()));
  
    bool MadeChange = false;
    for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
@@ -815,9 +1046,10 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
        if (isa<TruncInst>(User) && shiftIsLegal
            // If the type of the truncate is legal, no trucate will be
            // introduced in other basic blocks.
-          && (!TLI.isTypeLegal(TLI.getValueType(User->getType()))))
+          &&
+          (!TLI.isTypeLegal(TLI.getValueType(DL, User->getType()))))
          MadeChange =
-            SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI);
+            SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL);
  
        continue;
      }
@@ -848,10 +1080,10 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
    return MadeChange;
  }
  
-//  ScalarizeMaskedLoad() translates masked load intrinsic, like 
+// Translate a masked load intrinsic like
  // <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
  //                               <16 x i1> %mask, <16 x i32> %passthru)
-// to a chain of basic blocks, whith loading element one-by-one if
+// to a chain of basic blocks, with loading element one-by-one if
  // the appropriate mask bit is set
  // 
  //  %1 = bitcast i8* %addr to i32*
@@ -943,8 +1175,9 @@ static void ScalarizeMaskedLoad(CallInst *CI) {
      //
      CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
      Builder.SetInsertPoint(InsertPt);
-    
-    Value* Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx));
+
+    Value *Gep =
+        Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
      LoadInst* Load = Builder.CreateLoad(Gep, false);
      VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
  
@@ -966,7 +1199,7 @@ static void ScalarizeMaskedLoad(CallInst *CI) {
    CI->eraseFromParent();
  }
  
-//  ScalarizeMaskedStore() translates masked store intrinsic, like
+// Translate a masked store intrinsic, like
  // void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
  //                               <16 x i1> %mask)
  // to a chain of basic blocks, that stores element one-by-one if
@@ -1038,7 +1271,8 @@ static void ScalarizeMaskedStore(CallInst *CI) {
      Builder.SetInsertPoint(InsertPt);
      
      Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
-    Value* Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx));
+    Value *Gep =
+        Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
      Builder.CreateStore(OneElt, Gep);
  
      // Create "else" block, fill it in the next iteration
@@ -1052,7 +1286,7 @@ static void ScalarizeMaskedStore(CallInst *CI) {
    CI->eraseFromParent();
  }
  
-bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) {
+bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
    BasicBlock *BB = CI->getParent();
  
    // Lower inline assembly if we can.
@@ -1068,10 +1302,54 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) {
        return true;
      }
      // Sink address computing for memory operands into the block.
-    if (OptimizeInlineAsmInst(CI))
+    if (optimizeInlineAsmInst(CI))
        return true;
    }
  
+  // Align the pointer arguments to this call if the target thinks it's a good
+  // idea
+  unsigned MinSize, PrefAlign;
+  if (TLI && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
+    for (auto &Arg : CI->arg_operands()) {
+      // We want to align both objects whose address is used directly and
+      // objects whose address is used in casts and GEPs, though it only makes
+      // sense for GEPs if the offset is a multiple of the desired alignment and
+      // if size - offset meets the size threshold.
+      if (!Arg->getType()->isPointerTy())
+        continue;
+      APInt Offset(DL->getPointerSizeInBits(
+                       cast<PointerType>(Arg->getType())->getAddressSpace()),
+                   0);
+      Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
+      uint64_t Offset2 = Offset.getLimitedValue();
+      if ((Offset2 & (PrefAlign-1)) != 0)
+        continue;
+      AllocaInst *AI;
+      if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign &&
+          DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
+        AI->setAlignment(PrefAlign);
+      // Global variables can only be aligned if they are defined in this
+      // object (i.e. they are uniquely initialized in this object), and
+      // over-aligning global variables that have an explicit section is
+      // forbidden.
+      GlobalVariable *GV;
+      if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->hasUniqueInitializer() &&
+          !GV->hasSection() && GV->getAlignment() < PrefAlign &&
+          DL->getTypeAllocSize(GV->getType()->getElementType()) >=
+              MinSize + Offset2)
+        GV->setAlignment(PrefAlign);
+    }
+    // If this is a memcpy (or similar) then we may be able to improve the
+    // alignment
+    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
+      unsigned Align = getKnownAlignment(MI->getDest(), *DL);
+      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI))
+        Align = std::min(Align, getKnownAlignment(MTI->getSource(), *DL));
+      if (Align > MI->getAlignment())
+        MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), Align));
+    }
+  }
+
    IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
    if (II) {
      switch (II->getIntrinsicID()) {
@@ -1088,8 +1366,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) {
        WeakVH IterHandle(CurInstIterator);
  
        replaceAndRecursivelySimplify(CI, RetVal,
-                                    TLI ? TLI->getDataLayout() : nullptr,
-                                    TLInfo, ModifiedDT ? nullptr : DT);
+                                    TLInfo, nullptr);
  
        // If the iterator instruction was recursively deleted, start over at the
        // start of the block.
@@ -1116,14 +1393,35 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) {
        }
        return false;
      }
+    case Intrinsic::aarch64_stlxr:
+    case Intrinsic::aarch64_stxr: {
+      ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
+      if (!ExtVal || !ExtVal->hasOneUse() ||
+          ExtVal->getParent() == CI->getParent())
+        return false;
+      // Sink a zext feeding stlxr/stxr before it, so it can be folded into it.
+      ExtVal->moveBefore(CI);
+      // Mark this instruction as "inserted by CGP", so that other
+      // optimizations don't touch it.
+      InsertedInsts.insert(ExtVal);
+      return true;
+    }
+    case Intrinsic::invariant_group_barrier:
+      II->replaceAllUsesWith(II->getArgOperand(0));
+      II->eraseFromParent();
+      return true;
      }
  
      if (TLI) {
+      // Unknown address space.
+      // TODO: Target hook to pick which address space the intrinsic cares
+      // about?
+      unsigned AddrSpace = ~0u;
        SmallVector<Value*, 2> PtrOps;
        Type *AccessTy;
-      if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy))
+      if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy, AddrSpace))
          while (!PtrOps.empty())
-          if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy))
+          if (optimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy, AddrSpace))
              return true;
      }
    }
@@ -1131,15 +1429,11 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) {
    // From here on out we're working with named functions.
    if (!CI->getCalledFunction()) return false;
  
-  // We'll need DataLayout from here on out.
-  const DataLayout *TD = TLI ? TLI->getDataLayout() : nullptr;
-  if (!TD) return false;
-
    // Lower all default uses of _chk calls.  This is very similar
    // to what InstCombineCalls does, but here we are only lowering calls
    // to fortified library functions (e.g. __memcpy_chk) that have the default
    // "don't know" as the objectsize.  Anything else should be left alone.
-  FortifiedLibCallSimplifier Simplifier(TD, TLInfo, true);
+  FortifiedLibCallSimplifier Simplifier(TLInfo, true);
    if (Value *V = Simplifier.optimizeCall(CI)) {
      CI->replaceAllUsesWith(V);
      CI->eraseFromParent();
@@ -1148,9 +1442,8 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) {
    return false;
  }
  
-/// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return
-/// instructions to the predecessor to enable tail call optimizations. The
-/// case it is currently looking for is:
+/// Look for opportunities to duplicate return instructions to the predecessor
+/// to enable tail call optimizations. The case it is currently looking for is:
  /// @code
  /// bb0:
  ///   %tmp0 = tail call i32 @f0()
@@ -1179,7 +1472,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) {
  ///   %tmp2 = tail call i32 @f2()
  ///   ret i32 %tmp2
  /// @endcode
-bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) {
+bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) {
    if (!TLI)
      return false;
  
@@ -1298,7 +1591,7 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) {
  
  namespace {
  
-/// ExtAddrMode - This is an extended version of TargetLowering::AddrMode
+/// This is an extended version of TargetLowering::AddrMode
  /// which holds actual Value*'s for register values.
  struct ExtAddrMode : public TargetLowering::AddrMode {
    Value *BaseReg;
@@ -1672,7 +1965,7 @@ class TypePromotionTransaction {
        Inst->removeFromParent();
      }
  
-    ~InstructionRemover() { delete Replacer; }
+    ~InstructionRemover() override { delete Replacer; }
  
      /// \brief Really remove the instruction.
      void commit() override { delete Inst; }
@@ -1802,86 +2095,91 @@ void TypePromotionTransaction::rollback(
  /// This encapsulates the logic for matching the target-legal addressing modes.
  class AddressingModeMatcher {
    SmallVectorImpl<Instruction*> &AddrModeInsts;
+  const TargetMachine &TM;
    const TargetLowering &TLI;
+  const DataLayout &DL;
  
    /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
    /// the memory instruction that we're computing this address for.
    Type *AccessTy;
+  unsigned AddrSpace;
    Instruction *MemoryInst;
  
-  /// AddrMode - This is the addressing mode that we're building up.  This is
+  /// This is the addressing mode that we're building up. This is
    /// part of the return value of this addressing mode matching stuff.
    ExtAddrMode &AddrMode;
  
-  /// The truncate instruction inserted by other CodeGenPrepare optimizations.
-  const SetOfInstrs &InsertedTruncs;
+  /// The instructions inserted by other CodeGenPrepare optimizations.
+  const SetOfInstrs &InsertedInsts;
    /// A map from the instructions to their type before promotion.
    InstrToOrigTy &PromotedInsts;
    /// The ongoing transaction where every action should be registered.
    TypePromotionTransaction &TPT;
  
-  /// IgnoreProfitability - This is set to true when we should not do
-  /// profitability checks.  When true, IsProfitableToFoldIntoAddressingMode
-  /// always returns true.
+  /// This is set to true when we should not do profitability checks.
+  /// When true, IsProfitableToFoldIntoAddressingMode always returns true.
    bool IgnoreProfitability;
  
-  AddressingModeMatcher(SmallVectorImpl<Instruction*> &AMI,
-                        const TargetLowering &T, Type *AT,
+  AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI,
+                        const TargetMachine &TM, Type *AT, unsigned AS,
                          Instruction *MI, ExtAddrMode &AM,
-                        const SetOfInstrs &InsertedTruncs,
+                        const SetOfInstrs &InsertedInsts,
                          InstrToOrigTy &PromotedInsts,
                          TypePromotionTransaction &TPT)
-      : AddrModeInsts(AMI), TLI(T), AccessTy(AT), MemoryInst(MI), AddrMode(AM),
-        InsertedTruncs(InsertedTruncs), PromotedInsts(PromotedInsts), TPT(TPT) {
+      : AddrModeInsts(AMI), TM(TM),
+        TLI(*TM.getSubtargetImpl(*MI->getParent()->getParent())
+                 ->getTargetLowering()),
+        DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
+        MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
+        PromotedInsts(PromotedInsts), TPT(TPT) {
      IgnoreProfitability = false;
    }
  public:
  
-  /// Match - Find the maximal addressing mode that a load/store of V can fold,
+  /// Find the maximal addressing mode that a load/store of V can fold,
    /// give an access type of AccessTy.  This returns a list of involved
    /// instructions in AddrModeInsts.
-  /// \p InsertedTruncs The truncate instruction inserted by other
-  /// CodeGenPrepare
+  /// \p InsertedInsts The instructions inserted by other CodeGenPrepare
    /// optimizations.
    /// \p PromotedInsts maps the instructions to their type before promotion.
    /// \p The ongoing transaction where every action should be registered.
-  static ExtAddrMode Match(Value *V, Type *AccessTy,
+  static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS,
                             Instruction *MemoryInst,
                             SmallVectorImpl<Instruction*> &AddrModeInsts,
-                           const TargetLowering &TLI,
-                           const SetOfInstrs &InsertedTruncs,
+                           const TargetMachine &TM,
+                           const SetOfInstrs &InsertedInsts,
                             InstrToOrigTy &PromotedInsts,
                             TypePromotionTransaction &TPT) {
      ExtAddrMode Result;
  
-    bool Success = AddressingModeMatcher(AddrModeInsts, TLI, AccessTy,
-                                         MemoryInst, Result, InsertedTruncs,
-                                         PromotedInsts, TPT).MatchAddr(V, 0);
+    bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy, AS,
+                                         MemoryInst, Result, InsertedInsts,
+                                         PromotedInsts, TPT).matchAddr(V, 0);
      (void)Success; assert(Success && "Couldn't select *anything*?");
      return Result;
    }
  private:
-  bool MatchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);
-  bool MatchAddr(Value *V, unsigned Depth);
-  bool MatchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth,
+  bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);
+  bool matchAddr(Value *V, unsigned Depth);
+  bool matchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth,
                            bool *MovedAway = nullptr);
-  bool IsProfitableToFoldIntoAddressingMode(Instruction *I,
+  bool isProfitableToFoldIntoAddressingMode(Instruction *I,
                                              ExtAddrMode &AMBefore,
                                              ExtAddrMode &AMAfter);
-  bool ValueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2);
-  bool IsPromotionProfitable(unsigned MatchedSize, unsigned SizeWithPromotion,
+  bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2);
+  bool isPromotionProfitable(unsigned NewCost, unsigned OldCost,
                               Value *PromotedOperand) const;
  };
  
-/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode.
+/// Try adding ScaleReg*Scale to the current addressing mode.
  /// Return true and update AddrMode if this addr mode is legal for the target,
  /// false if not.
-bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale,
+bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
                                               unsigned Depth) {
    // If Scale is 1, then this is the same as adding ScaleReg to the addressing
    // mode.  Just process that directly.
    if (Scale == 1)
-    return MatchAddr(ScaleReg, Depth);
+    return matchAddr(ScaleReg, Depth);
  
    // If the scale is 0, it takes nothing to add this.
    if (Scale == 0)
@@ -1900,7 +2198,7 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale,
    TestAddrMode.ScaledReg = ScaleReg;
  
    // If the new address isn't legal, bail out.
-  if (!TLI.isLegalAddressingMode(TestAddrMode, AccessTy))
+  if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace))
      return false;
  
    // It was legal, so commit it.
@@ -1917,7 +2215,7 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale,
  
      // If this addressing mode is legal, commit it and remember that we folded
      // this instruction.
-    if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) {
+    if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) {
        AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
        AddrMode = TestAddrMode;
        return true;
@@ -1928,9 +2226,9 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale,
    return true;
  }
  
-/// MightBeFoldableInst - This is a little filter, which returns true if an
-/// addressing computation involving I might be folded into a load/store
-/// accessing it.  This doesn't need to be perfect, but needs to accept at least
+/// This is a little filter, which returns true if an addressing computation
+/// involving I might be folded into a load/store accessing it.
+/// This doesn't need to be perfect, but needs to accept at least
  /// the set of instructions that MatchOperationAddr can.
  static bool MightBeFoldableInst(Instruction *I) {
    switch (I->getOpcode()) {
@@ -1963,7 +2261,8 @@ static bool MightBeFoldableInst(Instruction *I) {
  /// \note \p Val is assumed to be the product of some type promotion.
  /// Therefore if \p Val has an undefined state in \p TLI, this is assumed
  /// to be legal, as the non-promoted value would have had the same state.
-static bool isPromotedInstructionLegal(const TargetLowering &TLI, Value *Val) {
+static bool isPromotedInstructionLegal(const TargetLowering &TLI,
+                                       const DataLayout &DL, Value *Val) {
    Instruction *PromotedInst = dyn_cast<Instruction>(Val);
    if (!PromotedInst)
      return false;
@@ -1973,7 +2272,7 @@ static bool isPromotedInstructionLegal(const TargetLowering &TLI, Value *Val) {
      return true;
    // Otherwise, check if the promoted instruction is legal or not.
    return TLI.isOperationLegalOrCustom(
-      ISDOpcode, TLI.getValueType(PromotedInst->getType()));
+      ISDOpcode, TLI.getValueType(DL, PromotedInst->getType()));
  }
  
  /// \brief Hepler class to perform type promotion.
@@ -2003,7 +2302,7 @@ class TypePromotionHelper {
    /// \brief Utility function to promote the operand of \p Ext when this
    /// operand is a promotable trunc or sext or zext.
    /// \p PromotedInsts maps the instructions to their type before promotion.
-  /// \p CreatedInsts[out] contains how many non-free instructions have been
+  /// \p CreatedInstsCost[out] contains the cost of all instructions
    /// created to promote the operand of Ext.
    /// Newly added extensions are inserted in \p Exts.
    /// Newly added truncates are inserted in \p Truncs.
@@ -2011,63 +2310,65 @@ class TypePromotionHelper {
    /// \return The promoted value which is used instead of Ext.
    static Value *promoteOperandForTruncAndAnyExt(
        Instruction *Ext, TypePromotionTransaction &TPT,
-      InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts,
+      InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
        SmallVectorImpl<Instruction *> *Exts,
-      SmallVectorImpl<Instruction *> *Truncs);
+      SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI);
  
    /// \brief Utility function to promote the operand of \p Ext when this
    /// operand is promotable and is not a supported trunc or sext.
    /// \p PromotedInsts maps the instructions to their type before promotion.
-  /// \p CreatedInsts[out] contains how many non-free instructions have been
+  /// \p CreatedInstsCost[out] contains the cost of all the instructions
    /// created to promote the operand of Ext.
    /// Newly added extensions are inserted in \p Exts.
    /// Newly added truncates are inserted in \p Truncs.
    /// Should never be called directly.
    /// \return The promoted value which is used instead of Ext.
-  static Value *
-  promoteOperandForOther(Instruction *Ext, TypePromotionTransaction &TPT,
-                         InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts,
-                         SmallVectorImpl<Instruction *> *Exts,
-                         SmallVectorImpl<Instruction *> *Truncs, bool IsSExt);
+  static Value *promoteOperandForOther(Instruction *Ext,
+                                       TypePromotionTransaction &TPT,
+                                       InstrToOrigTy &PromotedInsts,
+                                       unsigned &CreatedInstsCost,
+                                       SmallVectorImpl<Instruction *> *Exts,
+                                       SmallVectorImpl<Instruction *> *Truncs,
+                                       const TargetLowering &TLI, bool IsSExt);
  
    /// \see promoteOperandForOther.
-  static Value *
-  signExtendOperandForOther(Instruction *Ext, TypePromotionTransaction &TPT,
-                            InstrToOrigTy &PromotedInsts,
-                            unsigned &CreatedInsts,
-                            SmallVectorImpl<Instruction *> *Exts,
-                            SmallVectorImpl<Instruction *> *Truncs) {
-    return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, Exts,
-                                  Truncs, true);
+  static Value *signExtendOperandForOther(
+      Instruction *Ext, TypePromotionTransaction &TPT,
+      InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
+      SmallVectorImpl<Instruction *> *Exts,
+      SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
+    return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
+                                  Exts, Truncs, TLI, true);
    }
  
    /// \see promoteOperandForOther.
-  static Value *
-  zeroExtendOperandForOther(Instruction *Ext, TypePromotionTransaction &TPT,
-                            InstrToOrigTy &PromotedInsts,
-                            unsigned &CreatedInsts,
-                            SmallVectorImpl<Instruction *> *Exts,
-                            SmallVectorImpl<Instruction *> *Truncs) {
-    return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, Exts,
-                                  Truncs, false);
+  static Value *zeroExtendOperandForOther(
+      Instruction *Ext, TypePromotionTransaction &TPT,
+      InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
+      SmallVectorImpl<Instruction *> *Exts,
+      SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
+    return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
+                                  Exts, Truncs, TLI, false);
    }
  
  public:
    /// Type for the utility function that promotes the operand of Ext.
    typedef Value *(*Action)(Instruction *Ext, TypePromotionTransaction &TPT,
-                           InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts,
+                           InstrToOrigTy &PromotedInsts,
+                           unsigned &CreatedInstsCost,
                             SmallVectorImpl<Instruction *> *Exts,
-                           SmallVectorImpl<Instruction *> *Truncs);
+                           SmallVectorImpl<Instruction *> *Truncs,
+                           const TargetLowering &TLI);
    /// \brief Given a sign/zero extend instruction \p Ext, return the approriate
    /// action to promote the operand of \p Ext instead of using Ext.
    /// \return NULL if no promotable action is possible with the current
    /// sign extension.
-  /// \p InsertedTruncs keeps track of all the truncate instructions inserted by
-  /// the others CodeGenPrepare optimizations. This information is important
+  /// \p InsertedInsts keeps track of all the instructions inserted by the
+  /// other CodeGenPrepare optimizations. This information is important
    /// because we do not want to promote these instructions as CodeGenPrepare
    /// will reinsert them later. Thus creating an infinite loop: create/remove.
    /// \p PromotedInsts maps the instructions to their type before promotion.
-  static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedTruncs,
+  static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts,
                            const TargetLowering &TLI,
                            const InstrToOrigTy &PromotedInsts);
  };
@@ -2125,8 +2426,8 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
    // #1 get the type of the operand and check the kind of the extended bits.
    const Type *OpndType;
    InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd);
-  if (It != PromotedInsts.end() && It->second.IsSExt == IsSExt)
-    OpndType = It->second.Ty;
+  if (It != PromotedInsts.end() && It->second.getInt() == IsSExt)
+    OpndType = It->second.getPointer();
    else if ((IsSExt && isa<SExtInst>(Opnd)) || (!IsSExt && isa<ZExtInst>(Opnd)))
      OpndType = Opnd->getOperand(0)->getType();
    else
@@ -2140,7 +2441,7 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
  }
  
  TypePromotionHelper::Action TypePromotionHelper::getAction(
-    Instruction *Ext, const SetOfInstrs &InsertedTruncs,
+    Instruction *Ext, const SetOfInstrs &InsertedInsts,
      const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) {
    assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
           "Unexpected instruction type");
@@ -2156,7 +2457,7 @@ TypePromotionHelper::Action TypePromotionHelper::getAction(
    // Do not promote if the operand has been added by codegenprepare.
    // Otherwise, it means we are undoing an optimization that is likely to be
    // redone, thus causing potential infinite loop.
-  if (isa<TruncInst>(ExtOpnd) && InsertedTruncs.count(ExtOpnd))
+  if (isa<TruncInst>(ExtOpnd) && InsertedInsts.count(ExtOpnd))
      return nullptr;
  
    // SExt or Trunc instructions.
@@ -2174,16 +2475,18 @@ TypePromotionHelper::Action TypePromotionHelper::getAction(
  
  Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
      llvm::Instruction *SExt, TypePromotionTransaction &TPT,
-    InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts,
+    InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
      SmallVectorImpl<Instruction *> *Exts,
-    SmallVectorImpl<Instruction *> *Truncs) {
+    SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
    // By construction, the operand of SExt is an instruction. Otherwise we cannot
    // get through it and this method should not be called.
    Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0));
    Value *ExtVal = SExt;
+  bool HasMergedNonFreeExt = false;
    if (isa<ZExtInst>(SExtOpnd)) {
      // Replace s|zext(zext(opnd))
      // => zext(opnd).
+    HasMergedNonFreeExt = !TLI.isExtFree(SExtOpnd);
      Value *ZExt =
          TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType());
      TPT.replaceAllUsesWith(SExt, ZExt);
@@ -2194,7 +2497,7 @@ Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
      // => z|sext(opnd).
      TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0));
    }
-  CreatedInsts = 0;
+  CreatedInstsCost = 0;
  
    // Remove dead code.
    if (SExtOpnd->use_empty())
@@ -2203,8 +2506,11 @@ Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
    // Check if the extension is still needed.
    Instruction *ExtInst = dyn_cast<Instruction>(ExtVal);
    if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) {
-    if (ExtInst && Exts)
-      Exts->push_back(ExtInst);
+    if (ExtInst) {
+      if (Exts)
+        Exts->push_back(ExtInst);
+      CreatedInstsCost = !TLI.isExtFree(ExtInst) && !HasMergedNonFreeExt;
+    }
      return ExtVal;
    }
  
@@ -2217,13 +2523,14 @@ Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
  
  Value *TypePromotionHelper::promoteOperandForOther(
      Instruction *Ext, TypePromotionTransaction &TPT,
-    InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts,
+    InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
      SmallVectorImpl<Instruction *> *Exts,
-    SmallVectorImpl<Instruction *> *Truncs, bool IsSExt) {
+    SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI,
+    bool IsSExt) {
    // By construction, the operand of Ext is an instruction. Otherwise we cannot
    // get through it and this method should not be called.
    Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0));
-  CreatedInsts = 0;
+  CreatedInstsCost = 0;
    if (!ExtOpnd->hasOneUse()) {
      // ExtOpnd will be promoted.
      // All its uses, but Ext, will need to use a truncated value of the
@@ -2298,7 +2605,6 @@ Value *TypePromotionHelper::promoteOperandForOther(
          continue;
        }
        ExtForOpnd = cast<Instruction>(ValForExtOpnd);
-      ++CreatedInsts;
      }
      if (Exts)
        Exts->push_back(ExtForOpnd);
@@ -2307,6 +2613,7 @@ Value *TypePromotionHelper::promoteOperandForOther(
      // Move the sign extension before the insertion point.
      TPT.moveBefore(ExtForOpnd, ExtOpnd);
      TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd);
+    CreatedInstsCost += !TLI.isExtFree(ExtForOpnd);
      // If more sext are required, new instructions will have to be created.
      ExtForOpnd = nullptr;
    }
@@ -2317,34 +2624,33 @@ Value *TypePromotionHelper::promoteOperandForOther(
    return ExtOpnd;
  }
  
-/// IsPromotionProfitable - Check whether or not promoting an instruction
-/// to a wider type was profitable.
-/// \p MatchedSize gives the number of instructions that have been matched
-/// in the addressing mode after the promotion was applied.
-/// \p SizeWithPromotion gives the number of created instructions for
-/// the promotion plus the number of instructions that have been
-/// matched in the addressing mode before the promotion.
+/// Check whether or not promoting an instruction to a wider type is profitable.
+/// \p NewCost gives the cost of extension instructions created by the
+/// promotion.
+/// \p OldCost gives the cost of extension instructions before the promotion
+/// plus the number of instructions that have been
+/// matched in the addressing mode the promotion.
  /// \p PromotedOperand is the value that has been promoted.
  /// \return True if the promotion is profitable, false otherwise.
-bool
-AddressingModeMatcher::IsPromotionProfitable(unsigned MatchedSize,
-                                             unsigned SizeWithPromotion,
-                                             Value *PromotedOperand) const {
-  // We folded less instructions than what we created to promote the operand.
+bool AddressingModeMatcher::isPromotionProfitable(
+    unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const {
+  DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost << '\n');
+  // The cost of the new extensions is greater than the cost of the
+  // old extension plus what we folded.
    // This is not profitable.
-  if (MatchedSize < SizeWithPromotion)
+  if (NewCost > OldCost)
      return false;
-  if (MatchedSize > SizeWithPromotion)
+  if (NewCost < OldCost)
      return true;
    // The promotion is neutral but it may help folding the sign extension in
    // loads for instance.
    // Check that we did not create an illegal instruction.
-  return isPromotedInstructionLegal(TLI, PromotedOperand);
+  return isPromotedInstructionLegal(TLI, DL, PromotedOperand);
  }
  
-/// MatchOperationAddr - Given an instruction or constant expr, see if we can
-/// fold the operation into the addressing mode.  If so, update the addressing
-/// mode and return true, otherwise return false without modifying AddrMode.
+/// Given an instruction or constant expr, see if we can fold the operation
+/// into the addressing mode. If so, update the addressing  mode and return
+/// true, otherwise return false without modifying AddrMode.
  /// If \p MovedAway is not NULL, it contains the information of whether or
  /// not AddrInst has to be folded into the addressing mode on success.
  /// If \p MovedAway == true, \p AddrInst will not be part of the addressing
@@ -2353,7 +2659,7 @@ AddressingModeMatcher::IsPromotionProfitable(unsigned MatchedSize,
  /// This state can happen when AddrInst is a sext, since it may be moved away.
  /// Therefore, AddrInst may not be valid when MovedAway is true and it must
  /// not be referenced anymore.
-bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
+bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
                                                 unsigned Depth,
                                                 bool *MovedAway) {
    // Avoid exponential behavior on extremely deep expression trees.
@@ -2366,15 +2672,16 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
    switch (Opcode) {
    case Instruction::PtrToInt:
      // PtrToInt is always a noop, as we know that the int type is pointer sized.
-    return MatchAddr(AddrInst->getOperand(0), Depth);
-  case Instruction::IntToPtr:
+    return matchAddr(AddrInst->getOperand(0), Depth);
+  case Instruction::IntToPtr: {
+    auto AS = AddrInst->getType()->getPointerAddressSpace();
+    auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS));
      // This inttoptr is a no-op if the integer type is pointer sized.
-    if (TLI.getValueType(AddrInst->getOperand(0)->getType()) ==
-        TLI.getPointerTy(AddrInst->getType()->getPointerAddressSpace()))
-      return MatchAddr(AddrInst->getOperand(0), Depth);
+    if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy)
+      return matchAddr(AddrInst->getOperand(0), Depth);
      return false;
+  }
    case Instruction::BitCast:
-  case Instruction::AddrSpaceCast:
      // BitCast is always a noop, and we can handle it as long as it is
      // int->int or pointer->pointer (we don't want int<->fp or something).
      if ((AddrInst->getOperand(0)->getType()->isPointerTy() ||
@@ -2383,8 +2690,16 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
          // and we don't want to mess around with them.  Assume it knows what it
          // is doing.
          AddrInst->getOperand(0)->getType() != AddrInst->getType())
-      return MatchAddr(AddrInst->getOperand(0), Depth);
+      return matchAddr(AddrInst->getOperand(0), Depth);
+    return false;
+  case Instruction::AddrSpaceCast: {
+    unsigned SrcAS
+      = AddrInst->getOperand(0)->getType()->getPointerAddressSpace();
+    unsigned DestAS = AddrInst->getType()->getPointerAddressSpace();
+    if (TLI.isNoopAddrSpaceCast(SrcAS, DestAS))
+      return matchAddr(AddrInst->getOperand(0), Depth);
      return false;
+  }
    case Instruction::Add: {
      // Check to see if we can merge in the RHS then the LHS.  If so, we win.
      ExtAddrMode BackupAddrMode = AddrMode;
@@ -2396,8 +2711,8 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
      TypePromotionTransaction::ConstRestorationPt LastKnownGood =
          TPT.getRestorationPoint();
  
-    if (MatchAddr(AddrInst->getOperand(1), Depth+1) &&
-        MatchAddr(AddrInst->getOperand(0), Depth+1))
+    if (matchAddr(AddrInst->getOperand(1), Depth+1) &&
+        matchAddr(AddrInst->getOperand(0), Depth+1))
        return true;
  
      // Restore the old addr mode info.
@@ -2406,8 +2721,8 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
      TPT.rollback(LastKnownGood);
  
      // Otherwise this was over-aggressive.  Try merging in the LHS then the RHS.
-    if (MatchAddr(AddrInst->getOperand(0), Depth+1) &&
-        MatchAddr(AddrInst->getOperand(1), Depth+1))
+    if (matchAddr(AddrInst->getOperand(0), Depth+1) &&
+        matchAddr(AddrInst->getOperand(1), Depth+1))
        return true;
  
      // Otherwise we definitely can't merge the ADD in.
@@ -2429,7 +2744,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
      if (Opcode == Instruction::Shl)
        Scale = 1LL << Scale;
  
-    return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth);
+    return matchScaledValue(AddrInst->getOperand(0), Scale, Depth);
    }
    case Instruction::GetElementPtr: {
      // Scan the GEP.  We check it if it contains constant offsets and at most
@@ -2438,16 +2753,15 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
      unsigned VariableScale = 0;
  
      int64_t ConstantOffset = 0;
-    const DataLayout *TD = TLI.getDataLayout();
      gep_type_iterator GTI = gep_type_begin(AddrInst);
      for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
        if (StructType *STy = dyn_cast<StructType>(*GTI)) {
-        const StructLayout *SL = TD->getStructLayout(STy);
+        const StructLayout *SL = DL.getStructLayout(STy);
          unsigned Idx =
            cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
          ConstantOffset += SL->getElementOffset(Idx);
        } else {
-        uint64_t TypeSize = TD->getTypeAllocSize(GTI.getIndexedType());
+        uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType());
          if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
            ConstantOffset += CI->getSExtValue()*TypeSize;
          } else if (TypeSize) {  // Scales of zero don't do anything.
@@ -2466,9 +2780,10 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
      // just add it to the disp field and check validity.
      if (VariableOperand == -1) {
        AddrMode.BaseOffs += ConstantOffset;
-      if (ConstantOffset == 0 || TLI.isLegalAddressingMode(AddrMode, AccessTy)){
+      if (ConstantOffset == 0 ||
+          TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) {
          // Check to see if we can fold the base pointer in too.
-        if (MatchAddr(AddrInst->getOperand(0), Depth+1))
+        if (matchAddr(AddrInst->getOperand(0), Depth+1))
            return true;
        }
        AddrMode.BaseOffs -= ConstantOffset;
@@ -2483,7 +2798,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
      AddrMode.BaseOffs += ConstantOffset;
  
      // Match the base operand of the GEP.
-    if (!MatchAddr(AddrInst->getOperand(0), Depth+1)) {
+    if (!matchAddr(AddrInst->getOperand(0), Depth+1)) {
        // If it couldn't be matched, just stuff the value in a register.
        if (AddrMode.HasBaseReg) {
          AddrMode = BackupAddrMode;
@@ -2495,7 +2810,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
      }
  
      // Match the remaining variable portion of the GEP.
-    if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
+    if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
                            Depth)) {
        // If it couldn't be matched, try stuffing the base into a register
        // instead of matching it, and retrying the match of the scale.
@@ -2506,7 +2821,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
        AddrMode.HasBaseReg = true;
        AddrMode.BaseReg = AddrInst->getOperand(0);
        AddrMode.BaseOffs += ConstantOffset;
-      if (!MatchScaledValue(AddrInst->getOperand(VariableOperand),
+      if (!matchScaledValue(AddrInst->getOperand(VariableOperand),
                              VariableScale, Depth)) {
          // If even that didn't work, bail.
          AddrMode = BackupAddrMode;
@@ -2526,15 +2841,16 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
      // Try to move this ext out of the way of the addressing mode.
      // Ask for a method for doing so.
      TypePromotionHelper::Action TPH =
-        TypePromotionHelper::getAction(Ext, InsertedTruncs, TLI, PromotedInsts);
+        TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts);
      if (!TPH)
        return false;
  
      TypePromotionTransaction::ConstRestorationPt LastKnownGood =
          TPT.getRestorationPoint();
-    unsigned CreatedInsts = 0;
+    unsigned CreatedInstsCost = 0;
+    unsigned ExtCost = !TLI.isExtFree(Ext);
      Value *PromotedOperand =
-        TPH(Ext, TPT, PromotedInsts, CreatedInsts, nullptr, nullptr);
+        TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI);
      // SExt has been moved away.
      // Thus either it will be rematched later in the recursive calls or it is
      // gone. Anyway, we must not fold it into the addressing mode at this point.
@@ -2555,8 +2871,13 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
      ExtAddrMode BackupAddrMode = AddrMode;
      unsigned OldSize = AddrModeInsts.size();
  
-    if (!MatchAddr(PromotedOperand, Depth) ||
-        !IsPromotionProfitable(AddrModeInsts.size(), OldSize + CreatedInsts,
+    if (!matchAddr(PromotedOperand, Depth) ||
+        // The total of the new cost is equals to the cost of the created
+        // instructions.
+        // The total of the old cost is equals to the cost of the extension plus
+        // what we have saved in the addressing mode.
+        !isPromotionProfitable(CreatedInstsCost,
+                               ExtCost + (AddrModeInsts.size() - OldSize),
                                 PromotedOperand)) {
        AddrMode = BackupAddrMode;
        AddrModeInsts.resize(OldSize);
@@ -2570,12 +2891,12 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
    return false;
  }
  
-/// MatchAddr - If we can, try to add the value of 'Addr' into the current
-/// addressing mode.  If Addr can't be added to AddrMode this returns false and
-/// leaves AddrMode unmodified.  This assumes that Addr is either a pointer type
-/// or intptr_t for the target.
+/// If we can, try to add the value of 'Addr' into the current addressing mode.
+/// If Addr can't be added to AddrMode this returns false and leaves AddrMode
+/// unmodified. This assumes that Addr is either a pointer type or intptr_t
+/// for the target.
  ///
-bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
+bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {
    // Start a transaction at this point that we will rollback if the matching
    // fails.
    TypePromotionTransaction::ConstRestorationPt LastKnownGood =
@@ -2583,14 +2904,14 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
    if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
      // Fold in immediates if legal for the target.
      AddrMode.BaseOffs += CI->getSExtValue();
-    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+    if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
        return true;
      AddrMode.BaseOffs -= CI->getSExtValue();
    } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
      // If this is a global variable, try to fold it into the addressing mode.
      if (!AddrMode.BaseGV) {
        AddrMode.BaseGV = GV;
-      if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+      if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
          return true;
        AddrMode.BaseGV = nullptr;
      }
@@ -2600,7 +2921,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
  
      // Check to see if it is possible to fold this operation.
      bool MovedAway = false;
-    if (MatchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) {
+    if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) {
        // This instruction may have been move away. If so, there is nothing
        // to check here.
        if (MovedAway)
@@ -2609,7 +2930,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
        // *profitable* to do so.  We use a simple cost model to avoid increasing
        // register pressure too much.
        if (I->hasOneUse() ||
-          IsProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
+          isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
          AddrModeInsts.push_back(I);
          return true;
        }
@@ -2621,7 +2942,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
        TPT.rollback(LastKnownGood);
      }
    } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
-    if (MatchOperationAddr(CE, CE->getOpcode(), Depth))
+    if (matchOperationAddr(CE, CE->getOpcode(), Depth))
        return true;
      TPT.rollback(LastKnownGood);
    } else if (isa<ConstantPointerNull>(Addr)) {
@@ -2634,7 +2955,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
      AddrMode.HasBaseReg = true;
      AddrMode.BaseReg = Addr;
      // Still check for legality in case the target supports [imm] but not [i+r].
-    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+    if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
        return true;
      AddrMode.HasBaseReg = false;
      AddrMode.BaseReg = nullptr;
@@ -2644,7 +2965,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
    if (AddrMode.Scale == 0) {
      AddrMode.Scale = 1;
      AddrMode.ScaledReg = Addr;
-    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+    if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
        return true;
      AddrMode.Scale = 0;
      AddrMode.ScaledReg = nullptr;
@@ -2654,17 +2975,21 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
    return false;
  }
  
-/// IsOperandAMemoryOperand - Check to see if all uses of OpVal by the specified
-/// inline asm call are due to memory operands.  If so, return true, otherwise
-/// return false.
+/// Check to see if all uses of OpVal by the specified inline asm call are due
+/// to memory operands. If so, return true, otherwise return false.
  static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
-                                    const TargetLowering &TLI) {
-  TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI));
+                                    const TargetMachine &TM) {
+  const Function *F = CI->getParent()->getParent();
+  const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering();
+  const TargetRegisterInfo *TRI = TM.getSubtargetImpl(*F)->getRegisterInfo();
+  TargetLowering::AsmOperandInfoVector TargetConstraints =
+      TLI->ParseConstraints(F->getParent()->getDataLayout(), TRI,
+                            ImmutableCallSite(CI));
    for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
      TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
  
      // Compute the constraint code and ConstraintType to use.
-    TLI.ComputeConstraintToUse(OpInfo, SDValue());
+    TLI->ComputeConstraintToUse(OpInfo, SDValue());
  
      // If this asm operand is our Value*, and if it isn't an indirect memory
      // operand, we can't fold it!
@@ -2677,13 +3002,13 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
    return true;
  }
  
-/// FindAllMemoryUses - Recursively walk all the uses of I until we find a
-/// memory use.  If we find an obviously non-foldable instruction, return true.
+/// Recursively walk all the uses of I until we find a memory use.
+/// If we find an obviously non-foldable instruction, return true.
  /// Add the ultimately found memory instructions to MemoryUses.
-static bool FindAllMemoryUses(Instruction *I,
-                SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses,
-                              SmallPtrSetImpl<Instruction*> &ConsideredInsts,
-                              const TargetLowering &TLI) {
+static bool FindAllMemoryUses(
+    Instruction *I,
+    SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
+    SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetMachine &TM) {
    // If we already considered this instruction, we're done.
    if (!ConsideredInsts.insert(I).second)
      return false;
@@ -2713,12 +3038,12 @@ static bool FindAllMemoryUses(Instruction *I,
        if (!IA) return true;
  
        // If this is a memory operand, we're cool, otherwise bail out.
-      if (!IsOperandAMemoryOperand(CI, IA, I, TLI))
+      if (!IsOperandAMemoryOperand(CI, IA, I, TM))
          return true;
        continue;
      }
  
-    if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI))
+    if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TM))
        return true;
    }
  
@@ -2729,7 +3054,7 @@ static bool FindAllMemoryUses(Instruction *I,
  /// the use site that we're folding it into.  If so, there is no cost to
  /// include it in the addressing mode.  KnownLive1 and KnownLive2 are two values
  /// that we know are live at the instruction already.
-bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
+bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
                                                     Value *KnownLive2) {
    // If Val is either of the known-live values, we know it is live!
    if (Val == nullptr || Val == KnownLive1 || Val == KnownLive2)
@@ -2751,11 +3076,11 @@ bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
    return Val->isUsedInBasicBlock(MemoryInst->getParent());
  }
  
-/// IsProfitableToFoldIntoAddressingMode - It is possible for the addressing
-/// mode of the machine to fold the specified instruction into a load or store
-/// that ultimately uses it.  However, the specified instruction has multiple
-/// uses.  Given this, it may actually increase register pressure to fold it
-/// into the load.  For example, consider this code:
+/// It is possible for the addressing mode of the machine to fold the specified
+/// instruction into a load or store that ultimately uses it.
+/// However, the specified instruction has multiple uses.
+/// Given this, it may actually increase register pressure to fold it
+/// into the load. For example, consider this code:
  ///
  ///     X = ...
  ///     Y = X+1
@@ -2773,7 +3098,7 @@ bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
  /// X was live across 'load Z' for other reasons, we actually *would* want to
  /// fold the addressing mode in the Z case.  This would make Y die earlier.
  bool AddressingModeMatcher::
-IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
+isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
                                       ExtAddrMode &AMAfter) {
    if (IgnoreProfitability) return true;
  
@@ -2790,9 +3115,9 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
  
    // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
    // lifetime wasn't extended by adding this instruction.
-  if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+  if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
      BaseReg = nullptr;
-  if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+  if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
      ScaledReg = nullptr;
  
    // If folding this instruction (and it's subexprs) didn't extend any live
@@ -2806,7 +3131,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
    // uses.
    SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
    SmallPtrSet<Instruction*, 16> ConsideredInsts;
-  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI))
+  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM))
      return false;  // Has a non-memory, non-foldable use!
  
    // Now that we know that all uses of this instruction are part of a chain of
@@ -2821,9 +3146,11 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
      // Get the access type of this use.  If the use isn't a pointer, we don't
      // know what it accesses.
      Value *Address = User->getOperand(OpNo);
-    if (!Address->getType()->isPointerTy())
+    PointerType *AddrTy = dyn_cast<PointerType>(Address->getType());
+    if (!AddrTy)
        return false;
-    Type *AddressAccessTy = Address->getType()->getPointerElementType();
+    Type *AddressAccessTy = AddrTy->getElementType();
+    unsigned AS = AddrTy->getAddressSpace();
  
      // Do a match against the root of this address, ignoring profitability. This
      // will tell us if the addressing mode for the memory operation will
@@ -2831,11 +3158,11 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
      ExtAddrMode Result;
      TypePromotionTransaction::ConstRestorationPt LastKnownGood =
          TPT.getRestorationPoint();
-    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy,
-                                  MemoryInst, Result, InsertedTruncs,
+    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TM, AddressAccessTy, AS,
+                                  MemoryInst, Result, InsertedInsts,
                                    PromotedInsts, TPT);
      Matcher.IgnoreProfitability = true;
-    bool Success = Matcher.MatchAddr(Address, 0);
+    bool Success = Matcher.matchAddr(Address, 0);
      (void)Success; assert(Success && "Couldn't select *anything*?");
  
      // The match was to check the profitability, the changes made are not
@@ -2856,7 +3183,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
  
  } // end anonymous namespace
  
-/// IsNonLocalValue - Return true if the specified values are defined in a
+/// Return true if the specified values are defined in a
  /// different basic block than BB.
  static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
    if (Instruction *I = dyn_cast<Instruction>(V))
@@ -2864,17 +3191,16 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
    return false;
  }
  
-/// OptimizeMemoryInst - Load and Store Instructions often have
-/// addressing modes that can do significant amounts of computation.  As such,
-/// instruction selection will try to get the load or store to do as much
-/// computation as possible for the program.  The problem is that isel can only
-/// see within a single block.  As such, we sink as much legal addressing mode
-/// stuff into the block as possible.
+/// Load and Store Instructions often have addressing modes that can do
+/// significant amounts of computation. As such, instruction selection will try
+/// to get the load or store to do as much computation as possible for the
+/// program. The problem is that isel can only see within a single block. As
+/// such, we sink as much legal addressing mode work into the block as possible.
  ///
  /// This method is used to optimize both load/store and inline asms with memory
  /// operands.
-bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
-                                        Type *AccessTy) {
+bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
+                                        Type *AccessTy, unsigned AddrSpace) {
    Value *Repl = Addr;
  
    // Try to collapse single-value PHI nodes.  This is necessary to undo
@@ -2906,16 +3232,16 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
  
      // For a PHI node, push all of its incoming values.
      if (PHINode *P = dyn_cast<PHINode>(V)) {
-      for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i)
-        worklist.push_back(P->getIncomingValue(i));
+      for (Value *IncValue : P->incoming_values())
+        worklist.push_back(IncValue);
        continue;
      }
  
      // For non-PHIs, determine the addressing mode being computed.
      SmallVector<Instruction*, 16> NewAddrModeInsts;
      ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
-        V, AccessTy, MemoryInst, NewAddrModeInsts, *TLI, InsertedTruncsSet,
-        PromotedInsts, TPT);
+      V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM,
+      InsertedInsts, PromotedInsts, TPT);
  
      // This check is broken into two cases with very similar code to avoid using
      // getNumUses() as much as possible. Some values have a lot of uses, so
@@ -2989,13 +3315,15 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
                   << *MemoryInst << "\n");
      if (SunkAddr->getType() != Addr->getType())
        SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType());
-  } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() &&
-               TM && TM->getSubtarget<TargetSubtargetInfo>().useAA())) {
+  } else if (AddrSinkUsingGEPs ||
+             (!AddrSinkUsingGEPs.getNumOccurrences() && TM &&
+              TM->getSubtargetImpl(*MemoryInst->getParent()->getParent())
+                  ->useAA())) {
      // By default, we use the GEP-based method when AA is used later. This
      // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
      DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
                   << *MemoryInst << "\n");
-    Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType());
+    Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
      Value *ResultPtr = nullptr, *ResultIndex = nullptr;
  
      // First, find the pointer.
@@ -3041,7 +3369,8 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
        return false;
      } else {
        Type *I8PtrTy =
-        Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace());
+          Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace());
+      Type *I8Ty = Builder.getInt8Ty();
  
        // Start with the base register. Do this first so that subsequent address
        // matching finds it last, which will prevent it from trying to match it
@@ -3093,7 +3422,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
            // SDAG consecutive load/store merging.
            if (ResultPtr->getType() != I8PtrTy)
              ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
-          ResultPtr = Builder.CreateGEP(ResultPtr, ResultIndex, "sunkaddr");
+          ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
          }
  
          ResultIndex = V;
@@ -3104,7 +3433,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
        } else {
          if (ResultPtr->getType() != I8PtrTy)
            ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
-        SunkAddr = Builder.CreateGEP(ResultPtr, ResultIndex, "sunkaddr");
+        SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
        }
  
        if (SunkAddr->getType() != Addr->getType())
@@ -3113,7 +3442,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
    } else {
      DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
                   << *MemoryInst << "\n");
-    Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType());
+    Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
      Value *Result = nullptr;
  
      // Start with the base register. Do this first so that subsequent address
@@ -3207,14 +3536,15 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
    return true;
  }
  
-/// OptimizeInlineAsmInst - If there are any memory operands, use
-/// OptimizeMemoryInst to sink their address computing into the block when
-/// possible / profitable.
-bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) {
+/// If there are any memory operands, use OptimizeMemoryInst to sink their
+/// address computing into the block when possible / profitable.
+bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
    bool MadeChange = false;
  
-  TargetLowering::AsmOperandInfoVector
-    TargetConstraints = TLI->ParseConstraints(CS);
+  const TargetRegisterInfo *TRI =
+      TM->getSubtargetImpl(*CS->getParent()->getParent())->getRegisterInfo();
+  TargetLowering::AsmOperandInfoVector TargetConstraints =
+      TLI->ParseConstraints(*DL, TRI, CS);
    unsigned ArgNo = 0;
    for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
      TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
@@ -3225,7 +3555,7 @@ bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) {
      if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
          OpInfo.isIndirect) {
        Value *OpVal = CS->getArgOperand(ArgNo++);
-      MadeChange |= OptimizeMemoryInst(CS, OpVal, OpVal->getType());
+      MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u);
      } else if (OpInfo.Type == InlineAsm::isInput)
        ArgNo++;
    }
@@ -3305,10 +3635,10 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {
  /// %add = add nuw i64 %zext, 4
  /// \encode
  /// Thanks to the promotion, we can match zext(load i32*) to i64.
-bool CodeGenPrepare::ExtLdPromotion(TypePromotionTransaction &TPT,
+bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT,
                                      LoadInst *&LI, Instruction *&Inst,
                                      const SmallVectorImpl<Instruction *> &Exts,
-                                    unsigned CreatedInsts = 0) {
+                                    unsigned CreatedInstsCost = 0) {
    // Iterate over all the extensions to see if one form an ext(load).
    for (auto I : Exts) {
      // Check if we directly have ext(load).
@@ -3322,7 +3652,7 @@ bool CodeGenPrepare::ExtLdPromotion(TypePromotionTransaction &TPT,
        continue;
      // Get the action to perform the promotion.
      TypePromotionHelper::Action TPH = TypePromotionHelper::getAction(
-        I, InsertedTruncsSet, *TLI, PromotedInsts);
+        I, InsertedInsts, *TLI, PromotedInsts);
      // Check if we can promote.
      if (!TPH)
        continue;
@@ -3330,10 +3660,11 @@ bool CodeGenPrepare::ExtLdPromotion(TypePromotionTransaction &TPT,
      TypePromotionTransaction::ConstRestorationPt LastKnownGood =
          TPT.getRestorationPoint();
      SmallVector<Instruction *, 4> NewExts;
-    unsigned NewCreatedInsts = 0;
+    unsigned NewCreatedInstsCost = 0;
+    unsigned ExtCost = !TLI->isExtFree(I);
      // Promote.
-    Value *PromotedVal =
-        TPH(I, TPT, PromotedInsts, NewCreatedInsts, &NewExts, nullptr);
+    Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost,
+                             &NewExts, nullptr, *TLI);
      assert(PromotedVal &&
             "TypePromotionHelper should have filtered out those cases");
  
@@ -3343,18 +3674,19 @@ bool CodeGenPrepare::ExtLdPromotion(TypePromotionTransaction &TPT,
      // With exactly 2, the transformation is neutral, because we will merge
      // one extension but leave one. However, we optimistically keep going,
      // because the new extension may be removed too.
-    unsigned TotalCreatedInsts = CreatedInsts + NewCreatedInsts;
+    long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost;
+    TotalCreatedInstsCost -= ExtCost;
      if (!StressExtLdPromotion &&
-        (TotalCreatedInsts > 1 ||
-         !isPromotedInstructionLegal(*TLI, PromotedVal))) {
+        (TotalCreatedInstsCost > 1 ||
+         !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) {
        // The promotion is not profitable, rollback to the previous state.
        TPT.rollback(LastKnownGood);
        continue;
      }
      // The promotion is profitable.
      // Check if it exposes an ext(load).
-    (void)ExtLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInsts);
-    if (LI && (StressExtLdPromotion || NewCreatedInsts == 0 ||
+    (void)extLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInstsCost);
+    if (LI && (StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
                 // If we have created a new extension, i.e., now we have two
                 // extensions. We must make sure one of them is merged with
                 // the load, otherwise we may degrade the code quality.
@@ -3370,13 +3702,13 @@ bool CodeGenPrepare::ExtLdPromotion(TypePromotionTransaction &TPT,
    return false;
  }
  
-/// MoveExtToFormExtLoad - Move a zext or sext fed by a load into the same
-/// basic block as the load, unless conditions are unfavorable. This allows
-/// SelectionDAG to fold the extend into the load.
+/// Move a zext or sext fed by a load into the same basic block as the load,
+/// unless conditions are unfavorable. This allows SelectionDAG to fold the
+/// extend into the load.
  /// \p I[in/out] the extension may be modified during the process if some
  /// promotions apply.
  ///
-bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *&I) {
+bool CodeGenPrepare::moveExtToFormExtLoad(Instruction *&I) {
    // Try to promote a chain of computation if it allows to form
    // an extended load.
    TypePromotionTransaction TPT;
@@ -3387,7 +3719,7 @@ bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *&I) {
    // Look for a load being extended.
    LoadInst *LI = nullptr;
    Instruction *OldExt = I;
-  bool HasPromoted = ExtLdPromotion(TPT, LI, I, Exts);
+  bool HasPromoted = extLdPromotion(TPT, LI, I, Exts);
    if (!LI || !I) {
      assert(!HasPromoted && !LI && "If we did not match any load instruction "
                                    "the code must remain the same");
@@ -3401,8 +3733,8 @@ bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *&I) {
    if (!HasPromoted && LI->getParent() == I->getParent())
      return false;
  
-  EVT VT = TLI->getValueType(I->getType());
-  EVT LoadVT = TLI->getValueType(LI->getType());
+  EVT VT = TLI->getValueType(*DL, I->getType());
+  EVT LoadVT = TLI->getValueType(*DL, LI->getType());
  
    // If the load has other users and the truncate is not free, this probably
    // isn't worthwhile.
@@ -3437,7 +3769,7 @@ bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *&I) {
    return true;
  }
  
-bool CodeGenPrepare::OptimizeExtUses(Instruction *I) {
+bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
    BasicBlock *DefBB = I->getParent();
  
    // If the result of a {s|z}ext and its source are both live out, rewrite all
@@ -3496,7 +3828,7 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) {
      if (!InsertedTrunc) {
        BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
        InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt);
-      InsertedTruncsSet.insert(InsertedTrunc);
+      InsertedInsts.insert(InsertedTrunc);
      }
  
      // Replace a use of the {s|z}ext source with a use of the result.
@@ -3508,8 +3840,7 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) {
    return MadeChange;
  }
  
-/// isFormingBranchFromSelectProfitable - Returns true if a SelectInst should be
-/// turned into an explicit branch.
+/// Returns true if a SelectInst should be turned into an explicit branch.
  static bool isFormingBranchFromSelectProfitable(SelectInst *SI) {
    // FIXME: This should use the same heuristics as IfConversion to determine
    // whether a select is better represented as a branch.  This requires that
@@ -3518,28 +3849,27 @@ static bool isFormingBranchFromSelectProfitable(SelectInst *SI) {
  
    CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
  
-  // If the branch is predicted right, an out of order CPU can avoid blocking on
-  // the compare.  Emit cmovs on compares with a memory operand as branches to
-  // avoid stalls on the load from memory.  If the compare has more than one use
-  // there's probably another cmov or setcc around so it's not worth emitting a
-  // branch.
-  if (!Cmp)
+  // If a branch is predictable, an out-of-order CPU can avoid blocking on its
+  // comparison condition. If the compare has more than one use, there's
+  // probably another cmov or setcc around, so it's not worth emitting a branch.
+  if (!Cmp || !Cmp->hasOneUse())
      return false;
  
    Value *CmpOp0 = Cmp->getOperand(0);
    Value *CmpOp1 = Cmp->getOperand(1);
  
-  // We check that the memory operand has one use to avoid uses of the loaded
-  // value directly after the compare, making branches unprofitable.
-  return Cmp->hasOneUse() &&
-         ((isa<LoadInst>(CmpOp0) && CmpOp0->hasOneUse()) ||
+  // Emit "cmov on compare with a memory operand" as a branch to avoid stalls
+  // on a load from memory. But if the load is used more than once, do not
+  // change the select to a branch because the load is probably needed
+  // regardless of whether the branch is taken or not.
+  return ((isa<LoadInst>(CmpOp0) && CmpOp0->hasOneUse()) ||
            (isa<LoadInst>(CmpOp1) && CmpOp1->hasOneUse()));
  }
  
  
  /// If we have a SelectInst that will likely profit from branch prediction,
  /// turn it into a branch.
-bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) {
+bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
    bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
  
    // Can we convert the 'select' to CF ?
@@ -3612,7 +3942,7 @@ static bool isBroadcastShuffle(ShuffleVectorInst *SVI) {
  /// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases
  /// it's often worth sinking a shufflevector splat down to its use so that
  /// codegen can spot all lanes are identical.
-bool CodeGenPrepare::OptimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
+bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
    BasicBlock *DefBB = SVI->getParent();
  
    // Only do this xform if variable vector shifts are particularly expensive.
@@ -3679,6 +4009,9 @@ namespace {
  /// Assuming both extractelement and store can be combine, we get rid of the
  /// transition.
  class VectorPromoteHelper {
+  /// DataLayout associated with the current module.
+  const DataLayout &DL;
+
    /// Used to perform some checks on the legality of vector operations.
    const TargetLowering &TLI;
  
@@ -3752,7 +4085,8 @@ class VectorPromoteHelper {
      unsigned Align = ST->getAlignment();
      // Check if this store is supported.
      if (!TLI.allowsMisalignedMemoryAccesses(
-            TLI.getValueType(ST->getValueOperand()->getType()), AS, Align)) {
+            TLI.getValueType(DL, ST->getValueOperand()->getType()), AS,
+            Align)) {
        // If this is not supported, there is no way we can combine
        // the extract with the store.
        return false;
@@ -3791,7 +4125,7 @@ class VectorPromoteHelper {
    /// \brief Generate a constant vector with \p Val with the same
    /// number of elements as the transition.
    /// \p UseSplat defines whether or not \p Val should be replicated
-  /// accross the whole vector.
+  /// across the whole vector.
    /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
    /// otherwise we generate a vector with as many undef as possible:
    /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
@@ -3847,9 +4181,10 @@ class VectorPromoteHelper {
    }
  
  public:
-  VectorPromoteHelper(const TargetLowering &TLI, const TargetTransformInfo &TTI,
-                      Instruction *Transition, unsigned CombineCost)
-      : TLI(TLI), TTI(TTI), Transition(Transition),
+  VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI,
+                      const TargetTransformInfo &TTI, Instruction *Transition,
+                      unsigned CombineCost)
+      : DL(DL), TLI(TLI), TTI(TTI), Transition(Transition),
          StoreExtractCombineCost(CombineCost), CombineInst(nullptr) {
      assert(Transition && "Do not know how to promote null");
    }
@@ -3885,7 +4220,7 @@ public:
        return false;
      return StressStoreExtract ||
             TLI.isOperationLegalOrCustom(
-               ISDOpcode, TLI.getValueType(getTransitionType(), true));
+               ISDOpcode, TLI.getValueType(DL, getTransitionType(), true));
    }
  
    /// \brief Check whether or not \p Use can be combined
@@ -3969,152 +4304,10 @@ void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
    Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
  }
  
-// See if we can speculate calls to intrinsic cttz/ctlz.
-//
-// Example:
-// entry:
-//   ...
-//   %cmp = icmp eq i64 %val, 0
-//   br i1 %cmp, label %end.bb, label %then.bb
-//
-// then.bb:
-//   %c = tail call i64 @llvm.cttz.i64(i64 %val, i1 true)
-//   br label %EndBB
-//
-// end.bb:
-//   %cond = phi i64 [ %c, %then.bb ], [ 64, %entry ]
-//
-// ==>
-//
-// entry:
-//   ...
-//   %c = tail call i64 @llvm.cttz.i64(i64 %val, i1 false)
-//
-static bool OptimizeBranchInst(BranchInst *BrInst, const TargetLowering &TLI) {
-  assert(BrInst->isConditional() && "Expected a conditional branch!");
-  BasicBlock *ThenBB = BrInst->getSuccessor(1);
-  BasicBlock *EndBB = BrInst->getSuccessor(0);
-
-  // See if ThenBB contains only one instruction (excluding the
-  // terminator and DbgInfoIntrinsic calls).
-  IntrinsicInst *II = nullptr;
-  CastInst *CI = nullptr;
-  for (BasicBlock::iterator I = ThenBB->begin(),
-                            E = std::prev(ThenBB->end()); I != E; ++I) {
-    // Skip debug info.
-    if (isa<DbgInfoIntrinsic>(I))
-      continue;
-
-    // Check if this is a zero extension or a truncate of a previously
-    // matched call to intrinsic cttz/ctlz.
-    if (II) {
-      // Early exit if we already found a "free" zero extend/truncate.
-      if (CI)
-        return false;
-
-      Type *SrcTy = II->getType();
-      Type *DestTy = I->getType();
-      Value *V;
- 
-      if (match(cast<Instruction>(I), m_ZExt(m_Value(V))) && V == II) {
-        // Speculate this zero extend only if it is "free" for the target.
-        if (TLI.isZExtFree(SrcTy, DestTy)) {
-          CI = cast<CastInst>(I);
-          continue;
-        }
-      } else if (match(cast<Instruction>(I), m_Trunc(m_Value(V))) && V == II) {
-        // Speculate this truncate only if it is "free" for the target.
-        if (TLI.isTruncateFree(SrcTy, DestTy)) {
-          CI = cast<CastInst>(I);
-          continue;
-        }
-      } else {
-        // Avoid speculating more than one instruction.
-        return false;
-      }
-    }
-
-    // See if this is a call to intrinsic cttz/ctlz.
-    if (match(cast<Instruction>(I), m_Intrinsic<Intrinsic::cttz>())) {
-      // Avoid speculating expensive intrinsic calls.
-      if (!TLI.isCheapToSpeculateCttz())
-        return false;
-    }
-    else if (match(cast<Instruction>(I), m_Intrinsic<Intrinsic::ctlz>())) {
-      // Avoid speculating expensive intrinsic calls.
-      if (!TLI.isCheapToSpeculateCtlz())
-        return false;
-    } else
-      return false;
-    
-    II = cast<IntrinsicInst>(I);
-  }
-
-  // Look for PHI nodes with 'II' as the incoming value from 'ThenBB'.
-  BasicBlock *EntryBB = BrInst->getParent();
-  for (BasicBlock::iterator I = EndBB->begin();
-       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
-    Value *ThenV = PN->getIncomingValueForBlock(ThenBB);
-    Value *OrigV = PN->getIncomingValueForBlock(EntryBB);
-
-    if (!OrigV)
-      return false;
-
-    if (ThenV != II && (!CI || ThenV != CI))
-      return false;
-    
-    if (ConstantInt *CInt = dyn_cast<ConstantInt>(OrigV)) {
-      unsigned BitWidth = II->getType()->getIntegerBitWidth();
-
-      // Don't try to simplify this phi node if 'ThenV' is a cttz/ctlz
-      // intrinsic call, but 'OrigV' is not equal to the 'size-of' in bits
-      // of the value in input to the cttz/ctlz.
-      if (CInt->getValue() != BitWidth)
-        return false;
-
-      // Hoist the call to cttz/ctlz from ThenBB into EntryBB.
-      EntryBB->getInstList().splice(BrInst, ThenBB->getInstList(),
-                                    ThenBB->begin(), std::prev(ThenBB->end()));
- 
-      // Update PN setting ThenV as the incoming value from both 'EntryBB'
-      // and 'ThenBB'. Eventually, method 'OptimizeInst' will fold this
-      // phi node if all the incoming values are the same.
-      PN->setIncomingValue(PN->getBasicBlockIndex(EntryBB), ThenV);
-      PN->setIncomingValue(PN->getBasicBlockIndex(ThenBB), ThenV);
-
-      // Clear the 'undef on zero' flag of the cttz/ctlz intrinsic call.
-      if (cast<ConstantInt>(II->getArgOperand(1))->isOne()) {
-        Type *Ty = II->getArgOperand(0)->getType();
-        Value *Args[] = { II->getArgOperand(0),
-                          ConstantInt::getFalse(II->getContext()) };
-        Module *M = EntryBB->getParent()->getParent();
-        Value *IF = Intrinsic::getDeclaration(M, II->getIntrinsicID(), Ty);
-        IRBuilder<> Builder(II);
-        Instruction *NewI = Builder.CreateCall(IF, Args);
-
-        // Replace the old call to cttz/ctlz.
-        II->replaceAllUsesWith(NewI);
-        II->eraseFromParent();
-      }
- 
-      // Update BrInst condition so that the branch to EndBB is always taken.
-      // Later on, method 'ConstantFoldTerminator' will simplify this branch
-      // replacing it with a direct branch to 'EndBB'.
-      // As a side effect, CodeGenPrepare will attempt to simplify the control
-      // flow graph by deleting basic block 'ThenBB' and merging 'EntryBB' into
-      // 'EndBB' (calling method 'EliminateFallThrough').
-      BrInst->setCondition(ConstantInt::getTrue(BrInst->getContext()));
-      return true;
-    }
-  }
-
-  return false;
-}
-
  /// Some targets can do store(extractelement) with one instruction.
  /// Try to push the extractelement towards the stores when the target
  /// has this feature and this is profitable.
-bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
+bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
    unsigned CombineCost = UINT_MAX;
    if (DisableStoreExtract || !TLI ||
        (!StressStoreExtract &&
@@ -4131,7 +4324,7 @@ bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
    //      we do not do that for now.
    BasicBlock *Parent = Inst->getParent();
    DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
-  VectorPromoteHelper VPH(*TLI, *TTI, Inst, CombineCost);
+  VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost);
    // If the transition has more than one use, assume this is not going to be
    // beneficial.
    while (Inst->hasOneUse()) {
@@ -4166,13 +4359,17 @@ bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) {
    return false;
  }
  
-bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
+bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
+  // Bail out if we inserted the instruction to prevent optimizations from
+  // stepping on each other's toes.
+  if (InsertedInsts.count(I))
+    return false;
+
    if (PHINode *P = dyn_cast<PHINode>(I)) {
      // It is possible for very late stage optimizations (such as SimplifyCFG)
      // to introduce PHI nodes too late to be cleaned up.  If we detect such a
      // trivial PHI, go ahead and zap it here.
-    if (Value *V = SimplifyInstruction(P, TLI ? TLI->getDataLayout() : nullptr,
-                                       TLInfo, DT)) {
+    if (Value *V = SimplifyInstruction(P, *DL, TLInfo, nullptr)) {
        P->replaceAllUsesWith(V);
        P->eraseFromParent();
        ++NumPHIsElim;
@@ -4191,19 +4388,20 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
      if (isa<Constant>(CI->getOperand(0)))
        return false;
  
-    if (TLI && OptimizeNoopCopyExpression(CI, *TLI))
+    if (TLI && OptimizeNoopCopyExpression(CI, *TLI, *DL))
        return true;
  
      if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
        /// Sink a zext or sext into its user blocks if the target type doesn't
        /// fit in one register
-      if (TLI && TLI->getTypeAction(CI->getContext(),
-                                    TLI->getValueType(CI->getType())) ==
-                     TargetLowering::TypeExpandInteger) {
+      if (TLI &&
+          TLI->getTypeAction(CI->getContext(),
+                             TLI->getValueType(*DL, CI->getType())) ==
+              TargetLowering::TypeExpandInteger) {
          return SinkCast(CI);
        } else {
-        bool MadeChange = MoveExtToFormExtLoad(I);
-        return MadeChange | OptimizeExtUses(I);
+        bool MadeChange = moveExtToFormExtLoad(I);
+        return MadeChange | optimizeExtUses(I);
        }
      }
      return false;
@@ -4214,15 +4412,21 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
        return OptimizeCmpExpression(CI);
  
    if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
-    if (TLI)
-      return OptimizeMemoryInst(I, I->getOperand(0), LI->getType());
+    stripInvariantGroupMetadata(*LI);
+    if (TLI) {
+      unsigned AS = LI->getPointerAddressSpace();
+      return optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+    }
      return false;
    }
  
    if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
-    if (TLI)
-      return OptimizeMemoryInst(I, SI->getOperand(1),
-                                SI->getOperand(0)->getType());
+    stripInvariantGroupMetadata(*SI);
+    if (TLI) {
+      unsigned AS = SI->getPointerAddressSpace();
+      return optimizeMemoryInst(I, SI->getOperand(1),
+                                SI->getOperand(0)->getType(), AS);
+    }
      return false;
    }
  
@@ -4232,7 +4436,7 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
                  BinOp->getOpcode() == Instruction::LShr)) {
      ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
      if (TLI && CI && TLI->hasExtractBitsInsn())
-      return OptimizeExtractBits(BinOp, CI, *TLI);
+      return OptimizeExtractBits(BinOp, CI, *TLI, *DL);
  
      return false;
    }
@@ -4245,51 +4449,23 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
        GEPI->replaceAllUsesWith(NC);
        GEPI->eraseFromParent();
        ++NumGEPsElim;
-      OptimizeInst(NC, ModifiedDT);
+      optimizeInst(NC, ModifiedDT);
        return true;
      }
      return false;
    }
  
    if (CallInst *CI = dyn_cast<CallInst>(I))
-    return OptimizeCallInst(CI, ModifiedDT);
+    return optimizeCallInst(CI, ModifiedDT);
  
    if (SelectInst *SI = dyn_cast<SelectInst>(I))
-    return OptimizeSelectInst(SI);
+    return optimizeSelectInst(SI);
  
    if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
-    return OptimizeShuffleVectorInst(SVI);
+    return optimizeShuffleVectorInst(SVI);
  
    if (isa<ExtractElementInst>(I))
-    return OptimizeExtractElementInst(I);
-
-  if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
-    if (TLI && BI->isConditional() && BI->getCondition()->hasOneUse()) {
-      // Check if the branch condition compares a value agaist zero.
-      if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) {
-        if (ICI->getPredicate() == ICmpInst::ICMP_EQ &&
-            match(ICI->getOperand(1), m_Zero())) {
-          BasicBlock *ThenBB = BI->getSuccessor(1);
-          BasicBlock *EndBB = BI->getSuccessor(0);
-
-          // Check if ThenBB is only reachable from this basic block; also,
-          // check if EndBB has more than one predecessor.
-          if (ThenBB->getSinglePredecessor() &&
-              !EndBB->getSinglePredecessor()) {
-            TerminatorInst *TI = ThenBB->getTerminator();
-
-            if (TI->getNumSuccessors() == 1 && TI->getSuccessor(0) == EndBB &&
-                // Try to speculate calls to intrinsic cttz/ctlz from 'ThenBB'.
-                OptimizeBranchInst(BI, *TLI)) {
-              ModifiedDT = true;
-              return true;
-            }
-          }
-        }
-      }
-    }
-    return false;
-  }
+    return optimizeExtractElementInst(I);
  
    return false;
  }
@@ -4297,17 +4473,17 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
  // In this pass we look for GEP and cast instructions that are used
  // across basic blocks and rewrite them to improve basic-block-at-a-time
  // selection.
-bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
+bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
    SunkAddrs.clear();
    bool MadeChange = false;
  
    CurInstIterator = BB.begin();
    while (CurInstIterator != BB.end()) {
-    MadeChange |= OptimizeInst(CurInstIterator++, ModifiedDT);
+    MadeChange |= optimizeInst(CurInstIterator++, ModifiedDT);
      if (ModifiedDT)
        return true;
    }
-  MadeChange |= DupRetToEnableTailCallOpts(&BB);
+  MadeChange |= dupRetToEnableTailCallOpts(&BB);
  
    return MadeChange;
  }
@@ -4315,7 +4491,7 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
  // llvm.dbg.value is far away from the value then iSel may not be able
  // handle it properly. iSel will drop llvm.dbg.value if it can not
  // find a node corresponding to the value.
-bool CodeGenPrepare::PlaceDbgValues(Function &F) {
+bool CodeGenPrepare::placeDbgValues(Function &F) {
    bool MadeChange = false;
    for (BasicBlock &BB : F) {
      Instruction *PrevNonDbgInst = nullptr;
@@ -4469,8 +4645,7 @@ static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
  /// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
  ///
  bool CodeGenPrepare::splitBranchCondition(Function &F) {
-  if (!TM || TM->Options.EnableFastISel != true ||
-      !TLI || TLI->isJumpExpensive())
+  if (!TM || !TM->Options.EnableFastISel || !TLI || TLI->isJumpExpensive())
      return false;
  
    bool MadeChange = false;
@@ -4485,6 +4660,10 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
      if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
        continue;
  
+    auto *Br1 = cast<BranchInst>(BB.getTerminator());
+    if (Br1->getMetadata(LLVMContext::MD_unpredictable))
+      continue;
+
      unsigned Opc;
      Value *Cond1, *Cond2;
      if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
@@ -4511,7 +4690,6 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
  
      // Update original basic block by using the first condition directly by the
      // branch instruction and removing the no longer needed and/or instruction.
-    auto *Br1 = cast<BranchInst>(BB.getTerminator());
      Br1->setCondition(Cond1);
      LogicOp->eraseFromParent();
  
@@ -4631,10 +4809,8 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
        }
      }
  
-    // Request DOM Tree update.
      // Note: No point in getting fancy here, since the DT info is never
-    // available to CodeGenPrepare and the existing update code is broken
-    // anyways.
+    // available to CodeGenPrepare.
      ModifiedDT = true;
  
      MadeChange = true;
@@ -4644,3 +4820,8 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
    }
    return MadeChange;
  }
+
+void CodeGenPrepare::stripInvariantGroupMetadata(Instruction &I) {
+  if (auto *InvariantMD = I.getMetadata(LLVMContext::MD_invariant_group))
+    I.dropUnknownNonDebugMetadata(InvariantMD->getMetadataID());
+}