Taints the non-acquire RMW's store address with the load part
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 5c9858dcec2f3cc2cb0975322800992b670f96a1..c9d22a298654908fd40fe06d9980fd0ca32b38f6 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -154,10 +154,15 @@ class TypePromotionTransaction;
     /// DataLayout for the Function being processed.
     const DataLayout *DL;
 
+    // XXX-comment: We need the DominatorTree to figure out which instruction
+    // to taint.
+    DominatorTree *DT;
+
   public:
     static char ID; // Pass identification, replacement for typeid
     explicit CodeGenPrepare(const TargetMachine *TM = nullptr)
-        : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr), DL(nullptr) {
+        : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr), DL(nullptr),
+          DT(nullptr) {
         initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
       }
     bool runOnFunction(Function &F) override;
@@ -168,6 +173,7 @@ class TypePromotionTransaction;
       AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addRequired<TargetLibraryInfoWrapperPass>();
       AU.addRequired<TargetTransformInfoWrapperPass>();
+      AU.addRequired<DominatorTreeWrapperPass>();
     }
 
   private:
@@ -202,7 +208,10 @@ class TypePromotionTransaction;
 }
 
 char CodeGenPrepare::ID = 0;
-INITIALIZE_TM_PASS(CodeGenPrepare, "codegenprepare",
+INITIALIZE_TM_PASS_BEGIN(CodeGenPrepare, "codegenprepare",
+                   "Optimize for code generation", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_TM_PASS_END(CodeGenPrepare, "codegenprepare",
                    "Optimize for code generation", false, false)
 
 FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) {
@@ -249,26 +258,29 @@ void recursivelyFindDependence(SetType* DepSet, Value* Val,
     } else if (I->isBinaryOp()) {
       BinaryOperator* I = dyn_cast<BinaryOperator>(Val);
       Value *Op0 = I->getOperand(0), *Op1 = I->getOperand(1);
-      recursivelyFindDependence(DepSet, Op0, Depth - 1);
-      recursivelyFindDependence(DepSet, Op1, Depth - 1);
+      recursivelyFindDependence(DepSet, Op0, InsertOnlyLeafNodes, Depth - 1);
+      recursivelyFindDependence(DepSet, Op1, InsertOnlyLeafNodes, Depth - 1);
     } else if (I->isCast()) {
       Value* Op0 = I->getOperand(0);
-      recursivelyFindDependence(DepSet, Op0, Depth - 1);
+      recursivelyFindDependence(DepSet, Op0, InsertOnlyLeafNodes, Depth - 1);
     } else if (I->getOpcode() == Instruction::Select) {
       Value* Op0 = I->getOperand(0);
       Value* Op1 = I->getOperand(1);
       Value* Op2 = I->getOperand(2);
-      recursivelyFindDependence(DepSet, Op0, Depth - 1);
-      recursivelyFindDependence(DepSet, Op1, Depth - 1);
-      recursivelyFindDependence(DepSet, Op2, Depth - 1);
+      recursivelyFindDependence(DepSet, Op0, InsertOnlyLeafNodes, Depth - 1);
+      recursivelyFindDependence(DepSet, Op1, InsertOnlyLeafNodes, Depth - 1);
+      recursivelyFindDependence(DepSet, Op2, InsertOnlyLeafNodes, Depth - 1);
     } else if (I->getOpcode() == Instruction::GetElementPtr) {
       for (unsigned i = 0; i < I->getNumOperands(); i++) {
-        recursivelyFindDependence(DepSet, I->getOperand(i), Depth - 1);
+        recursivelyFindDependence(DepSet, I->getOperand(i), InsertOnlyLeafNodes,
+                                  Depth - 1);
       }
     } else if (I->getOpcode() == Instruction::Store) {
       auto* SI = dyn_cast<StoreInst>(Val);
-      recursivelyFindDependence(DepSet, SI->getPointerOperand(), Depth - 1);
-      recursivelyFindDependence(DepSet, SI->getValueOperand(), Depth - 1);
+      recursivelyFindDependence(DepSet, SI->getPointerOperand(),
+                                InsertOnlyLeafNodes, Depth - 1);
+      recursivelyFindDependence(DepSet, SI->getValueOperand(),
+                                InsertOnlyLeafNodes, Depth - 1);
     } else {
       Value* Op0 = nullptr;
       Value* Op1 = nullptr;
@@ -277,8 +289,20 @@ void recursivelyFindDependence(SetType* DepSet, Value* Val,
         case Instruction::FCmp: {
           Op0 = I->getOperand(0);
           Op1 = I->getOperand(1);
-          recursivelyFindDependence(DepSet, Op0, Depth - 1);
-          recursivelyFindDependence(DepSet, Op1, Depth - 1);
+          recursivelyFindDependence(DepSet, Op0, InsertOnlyLeafNodes,
+                                    Depth - 1);
+          recursivelyFindDependence(DepSet, Op1, InsertOnlyLeafNodes,
+                                    Depth - 1);
+          break;
+        }
+        case Instruction::PHI: {
+          for (unsigned i = 0; i < I->getNumOperands(); i++) {
+            auto* Op = I->getOperand(i);
+            if (DepSet->count(Op) == 0) {
+              recursivelyFindDependence(DepSet, Op, InsertOnlyLeafNodes,
+                                        Depth - 1);
+            }
+          }
           break;
         }
         default: {
@@ -304,7 +328,15 @@ Value* createCast(IRBuilder<true, NoFolder>& Builder, Value* DepVal,
   Instruction::CastOps CastOp = Instruction::BitCast;
   switch (DepVal->getType()->getTypeID()) {
     case Type::IntegerTyID: {
-      CastOp = Instruction::SExt;
+      assert(TargetIntegerType->getTypeID() == Type::IntegerTyID);
+      auto* FromType = dyn_cast<IntegerType>(DepVal->getType());
+      auto* ToType = dyn_cast<IntegerType>(TargetIntegerType);
+      assert(FromType && ToType);
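+      // Note (informal): widen with 'zext' when the source integer is no
+      // wider than the target; otherwise narrow with 'trunc'. Either way the
+      // artificial dependence on 'DepVal' is preserved.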
+      if (FromType->getBitWidth() <= ToType->getBitWidth()) {
+        CastOp = Instruction::ZExt;
+      } else {
+        CastOp = Instruction::Trunc;
+      }
       break;
     }
     case Type::FloatTyID:
@@ -515,9 +547,7 @@ Value* getRootDependence(Value* DepVal) {
 // %4 = or i32 %3, %2
 // %5 = inttoptr i32 %4 to i32*
 // store i32 1, i32* %5, align 4
-bool taintStoreAddress(StoreInst* SI, Value* DepVal,
-                       const char* calling_func = __builtin_FUNCTION()) {
-  DEBUG(dbgs() << "Called from " << calling_func << '\n');
+bool taintStoreAddress(StoreInst* SI, Value* DepVal) {
   // Set the insertion point right after the 'DepVal'.
   Instruction* Inst = nullptr;
   IRBuilder<true, NoFolder> Builder(SI);
@@ -535,7 +565,13 @@ bool taintStoreAddress(StoreInst* SI, Value* DepVal,
   // Figure out if there's a root variable 'DepVal' depends on. For example, we
   // can extract "getelementptr inbounds %struct, %struct* %0, i64 0, i32 123"
   // to be "%struct* %0" since all other operands are constant.
-  DepVal = getRootDependence(DepVal);
+  auto* RootVal = getRootDependence(DepVal);
+  auto* RootInst = dyn_cast<Instruction>(RootVal);
+  auto* DepValInst = dyn_cast<Instruction>(DepVal);
+  if (RootInst && DepValInst &&
+      RootInst->getParent() == DepValInst->getParent()) {
+    DepVal = RootVal;
+  }
 
   // Is this already a dependence-tainted store?
   Value* OldDep = getDependence(Address);
@@ -648,8 +684,11 @@ bool ConditionalBranchDependsOnValue(BranchInst* BI, Value* DepVal) {
 // XXX-update: For a relaxed load 'LI', find the first immediate atomic store or
-// the first conditional branch. Returns nullptr if there's no such immediately
-// following store/branch instructions, which we can only enforce the load with
-// 'acquire'.
-Instruction* findFirstStoreCondBranchInst(LoadInst* LI) {
+// the first conditional branch. Returns nullptr if there is no such
+// immediately following store/branch instruction, in which case we can only
+// enforce the load with 'acquire'. 'ChainedBB' collects all the blocks chained
+// together with unconditional branches from LI's parent block to the block
+// containing the first store/cond branch.
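+// A small example (informal sketch):
+//   bb0:  %v = load atomic i32, i32* %p monotonic, align 4
+//         br label %bb1
+//   bb1:  store atomic i32 %v, i32* %q seq_cst, align 4
+// Here the store in 'bb1' is returned and ChainedBB becomes {bb0, bb1}. The
+// search also stops at an aarch64 'stlxr' intrinsic call (see below).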
+template <typename Vector>
+Instruction* findFirstStoreCondBranchInst(LoadInst* LI, Vector* ChainedBB) {
   // In some situations, relaxed loads can be left as is:
   // 1. The relaxed load is used to calculate the address of the immediate
   // following store;
@@ -663,17 +702,19 @@ Instruction* findFirstStoreCondBranchInst(LoadInst* LI) {
   // However, in this function, we don't deal with them directly. Instead, we
   // just find the immediate following store/condition branch and return it.
 
+  assert(ChainedBB != nullptr && "Chained BB should not be nullptr");
   auto* BB = LI->getParent();
+  ChainedBB->push_back(BB);
   auto BE = BB->end();
   auto BBI = BasicBlock::iterator(LI);
   BBI++;
   while (true) {
     for (; BBI != BE; BBI++) {
-      auto* Inst = dyn_cast<Instruction>(&*BBI);
-      if (Inst == nullptr) {
-        continue;
-      }
-      if (Inst->getOpcode() == Instruction::Store) {
+      Instruction* Inst = &*BBI;
+      IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst);
+      if (II && II->getIntrinsicID() == Intrinsic::aarch64_stlxr) {
+        return II;
+      } else if (Inst->getOpcode() == Instruction::Store) {
         return Inst;
       } else if (Inst->getOpcode() == Instruction::Br) {
         auto* BrInst = dyn_cast<BranchInst>(Inst);
@@ -683,6 +724,7 @@ Instruction* findFirstStoreCondBranchInst(LoadInst* LI) {
           // Reinitialize iterators with the destination of the unconditional
           // branch.
           BB = BrInst->getSuccessor(0);
+          ChainedBB->push_back(BB);
           BBI = BB->begin();
           BE = BB->end();
           break;
@@ -695,32 +737,26 @@ Instruction* findFirstStoreCondBranchInst(LoadInst* LI) {
   }
 }
 
-// XXX-comment: Returns whether the code has been changed.
-bool taintMonotonicLoads(const SmallVector<LoadInst*, 1>& MonotonicLoadInsts) {
-  bool Changed = false;
-  for (auto* LI : MonotonicLoadInsts) {
-    auto* FirstInst = findFirstStoreCondBranchInst(LI);
-    if (FirstInst == nullptr) {
-      // We don't seem to be able to taint a following store/conditional branch
-      // instruction. Simply make it acquire.
-      DEBUG(dbgs() << "[RelaxedLoad]: Transformed to acquire load\n"
-                   << *LI << "\n");
-      LI->setOrdering(Acquire);
-      Changed = true;
+// XXX-update: Find the instruction right after the last relaxed (monotonic)
+// load between 'FromInst' and 'ToInst'. If there is none, return 'ToInst'.
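+// For example, given 'FromInst; ...; %x = load atomic ... monotonic; %y = ...;
+// ToInst' within one block, the instruction right after that last relaxed load
+// (here '%y = ...') is returned.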
+Instruction* findLastLoadNext(Instruction* FromInst, Instruction* ToInst) {
+  if (FromInst == ToInst) {
+    return ToInst;
+  }
+  Instruction* LastLoad = ToInst;
+  auto* BB = FromInst->getParent();
+  auto BE = BB->end();
+  auto BBI = BasicBlock::iterator(FromInst);
+  BBI++;
+  for (; BBI != BE && &*BBI != ToInst; BBI++) {
+    auto* LI = dyn_cast<LoadInst>(&*BBI);
+    if (LI == nullptr || !LI->isAtomic() || LI->getOrdering() != Monotonic) {
       continue;
     }
-    // Taint 'FirstInst', which could be a store or a condition branch
-    // instruction.
-    if (FirstInst->getOpcode() == Instruction::Store) {
-      Changed |= taintStoreAddress(dyn_cast<StoreInst>(FirstInst), LI);
-    } else if (FirstInst->getOpcode() == Instruction::Br) {
-      Changed |= taintConditionalBranch(dyn_cast<BranchInst>(FirstInst), LI);
-    } else {
-      assert(false && "findFirstStoreCondBranchInst() should return a "
-                    "store/condition branch instruction");
-    }
+    LastLoad = LI->getNextNode();
   }
-  return Changed;
+  return LastLoad;
 }
 
 // Inserts a fake conditional branch right after the instruction 'SplitInst',
@@ -750,31 +786,164 @@ void AddFakeConditionalBranch(Instruction* SplitInst, Value* Condition) {
 }
 
-// Returns true if the code is changed, and false otherwise.
+// Taints 'UsageInst' by inserting a fake conditional branch at 'InsertPoint'
+// (or right after 'UsageInst' when 'InsertPoint' is null).
-void TaintRelaxedLoads(LoadInst* LI) {
+void TaintRelaxedLoads(Instruction* UsageInst, Instruction* InsertPoint) {
   // For better performance, we can add a "AND X 0" instruction before the
   // condition.
-  auto* FirstInst = findFirstStoreCondBranchInst(LI);
-  Instruction* InsertPoint = nullptr;
-  if (FirstInst == nullptr) {
-    InsertPoint = LI->getParent()->getTerminator();
-    InsertPoint = LI->getNextNode();
-  } else {
-    InsertPoint = LI->getNextNode();
+  auto* BB = UsageInst->getParent();
+  if (InsertPoint == nullptr) {
+    InsertPoint = UsageInst->getNextNode();
+  }
+  // Skip any leading PHI nodes so that new instructions are inserted after
+  // them.
+  while (isa<PHINode>(InsertPoint)) {
+    InsertPoint = InsertPoint->getNextNode();
+  }
+  // First, cast 'UsageInst' to an integer type if necessary.
+  Value* AndTarget = nullptr;
+  Type* TargetIntegerType =
+      IntegerType::get(UsageInst->getContext(),
+                       BB->getModule()->getDataLayout().getPointerSizeInBits());
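+  // A pointer-width integer is used as the common type of the fake "and" and
+  // "icmp" below, so that integer- and pointer-typed 'UsageInst' values can be
+  // handled uniformly.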
+
+  // Check whether 'InsertPoint' is an already-added fake conditional branch.
+  BranchInst* BI = nullptr;
+  if ((BI = dyn_cast<BranchInst>(InsertPoint)) && BI->isConditional()) {
+    auto* Cond = dyn_cast<Instruction>(BI->getOperand(0));
+    if (Cond && Cond->getOpcode() == Instruction::ICmp) {
+      auto* ICmp = dyn_cast<ICmpInst>(Cond);
+      auto* Op0 = dyn_cast<Instruction>(Cond->getOperand(0));
+      auto* Op1 = dyn_cast<ConstantInt>(Cond->getOperand(1));
+      // %tmp = And X, 0
+      // %cmp = ICMP_NE %tmp, 0
+      // Br %cmp
+      // =>
+      // %tmp1 = And X, NewTaintedVal
+      // %tmp2 = And %tmp1, 0
+      // %cmp = ICMP_NE %tmp2, 0
+      // Br %cmp
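+      // Roughly: rather than stacking a second fake branch, fold the newly
+      // tainted value into the existing condition by and'ing it with the old
+      // "And" operand and re-zeroing the result.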
+      if (ICmp && ICmp->getPredicate() == CmpInst::ICMP_NE && Op0 &&
+          Op0->getOpcode() == Instruction::And && Op1 && Op1->isZero()) {
+        auto* Op01 = dyn_cast<ConstantInt>(Op0->getOperand(1));
+        if (Op01 && Op01->isZero()) {
+          // Now we have a previously added fake cond branch.
+          auto* Op00 = Op0->getOperand(0);
+          IRBuilder<true, NoFolder> Builder(ICmp);
+          if (Op00->getType() == UsageInst->getType()) {
+            AndTarget = UsageInst;
+          } else {
+            AndTarget = createCast(Builder, UsageInst, Op00->getType());
+          }
+          AndTarget = Builder.CreateAnd(Op00, AndTarget);
+          auto* AndZero = dyn_cast<Instruction>(Builder.CreateAnd(
+              AndTarget, Constant::getNullValue(AndTarget->getType())));
+          ICmp->setOperand(0, AndZero);
+          return;
+        }
+      }
+    }
   }
+
   IRBuilder<true, NoFolder> Builder(InsertPoint);
+  if (isa<IntegerType>(UsageInst->getType())) {
+    AndTarget = UsageInst;
+  } else {
+    AndTarget = createCast(Builder, UsageInst, TargetIntegerType);
+  }
   auto* AndZero = dyn_cast<Instruction>(
-      Builder.CreateAnd(LI, Constant::getNullValue(LI->getType())));
+      Builder.CreateAnd(AndTarget, Constant::getNullValue(AndTarget->getType())));
   auto* FakeCondition = dyn_cast<Instruction>(Builder.CreateICmp(
-      CmpInst::ICMP_NE, AndZero, Constant::getNullValue(LI->getType())));
+      CmpInst::ICMP_NE, AndZero, Constant::getNullValue(AndTarget->getType())));
   AddFakeConditionalBranch(FakeCondition->getNextNode(), FakeCondition);
 }
 
+// XXX-comment: Finds a usage of (i.e., a value derived from) the relaxed load
+// 'LI' that is in the block of 'LaterInst' and dominates it. 'ChainedBB'
+// contains all the blocks chained together with unconditional branches from
+// LI's parent BB to the block with the first store/cond branch. If we don't
+// find any such usage (e.g., 'LI' is not used along the chain at all), the
+// caller can simply make 'LI' acquire just to be safe.
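+// Roughly, the search proceeds block by block: starting from 'LI', its users
+// are collected transitively within the blocks of 'ChainedBB' (one worklist
+// per block); a collected usage living in the block of 'LaterInst' that
+// dominates 'LaterInst' (per 'DT') is then returned.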
+template <typename Vector>
+Instruction* findMostRecentDependenceUsage(LoadInst* LI, Instruction* LaterInst,
+                                           Vector* ChainedBB,
+                                           DominatorTree* DT) {
+  typedef SmallSet<Instruction*, 8> UsageSet;
+  typedef DenseMap<BasicBlock*, std::unique_ptr<UsageSet>> UsageMap;
+  assert(ChainedBB->size() >= 1 && "ChainedBB must contain at least one block");
+  // Mapping from basic block in 'ChainedBB' to the set of dependence usage of
+  // 'LI' in each block.
+  UsageMap usage_map;
+  auto* LoadBB = LI->getParent();
+  usage_map[LoadBB] = make_unique<UsageSet>();
+  usage_map[LoadBB]->insert(LI);
+
+  for (auto* BB : *ChainedBB) {
+    if (usage_map[BB] == nullptr) {
+      usage_map[BB] = make_unique<UsageSet>();
+    }
+    auto& usage_set = usage_map[BB];
+    if (usage_set->empty()) {
+      // No derived value of 'LI' reaches this block; give up.
+      return nullptr;
+    }
+    // Calculate the usage in the current BB first.
+    std::list<Value*> bb_usage_list;
+    std::copy(usage_set->begin(), usage_set->end(),
+              std::back_inserter(bb_usage_list));
+    for (auto list_iter = bb_usage_list.begin();
+         list_iter != bb_usage_list.end(); list_iter++) {
+      auto* val = *list_iter;
+      for (auto* U : val->users()) {
+        auto* Inst = dyn_cast<Instruction>(U);
+        if (!Inst) {
+          continue;
+        }
+        auto iter =
+            std::find(ChainedBB->begin(), ChainedBB->end(), Inst->getParent());
+        if (iter == ChainedBB->end()) {
+          // Only care about usage within ChainedBB.
+          continue;
+        }
+        auto* UsageBB = *iter;
+        if (UsageBB == BB) {
+          // Current BB.
+          if (!usage_set->count(Inst)) {
+            bb_usage_list.push_back(Inst);
+            usage_set->insert(Inst);
+          }
+        } else {
+          // A following BB.
+          if (usage_map[UsageBB] == nullptr) {
+            usage_map[UsageBB] = make_unique<UsageSet>();
+          }
+          usage_map[UsageBB]->insert(Inst);
+        }
+      }
+    }
+  }
+
+  // Pick one usage that is in LaterInst's block and that dominates 'LaterInst'.
+  auto* LaterBB = LaterInst->getParent();
+  auto& usage_set = usage_map[LaterBB];
+  Instruction* usage_inst = nullptr;
+  for (auto* inst : *usage_set) {
+    if (DT->dominates(inst, LaterInst)) {
+      usage_inst = inst;
+      break;
+    }
+  }
+
+  assert(usage_inst && "Expected a usage of the load that dominates "
+                       "the later instruction");
+  return usage_inst;
+}
+
 // XXX-comment: Returns whether the code has been changed.
 bool AddFakeConditionalBranchAfterMonotonicLoads(
-    const SmallVector<LoadInst*, 1>& MonotonicLoadInsts) {
+    SmallSet<LoadInst*, 1>& MonotonicLoadInsts, DominatorTree* DT) {
   bool Changed = false;
-  for (auto* LI : MonotonicLoadInsts) {
-    auto* FirstInst = findFirstStoreCondBranchInst(LI);
+  while (!MonotonicLoadInsts.empty()) {
+    auto* LI = *MonotonicLoadInsts.begin();
+    MonotonicLoadInsts.erase(LI);
+    SmallVector<BasicBlock*, 2> ChainedBB;
+    auto* FirstInst = findFirstStoreCondBranchInst(LI, &ChainedBB);
     if (FirstInst != nullptr) {
       if (FirstInst->getOpcode() == Instruction::Store) {
         if (StoreAddressDependOnValue(dyn_cast<StoreInst>(FirstInst), LI)) {
@@ -786,21 +955,62 @@ bool AddFakeConditionalBranchAfterMonotonicLoads(
           continue;
         }
       } else {
-        dbgs() << "FirstInst=" << *FirstInst << "\n";
-        assert(false && "findFirstStoreCondBranchInst() should return a "
-                        "store/condition branch instruction");
+        IntrinsicInst* II = dyn_cast<IntrinsicInst>(FirstInst);
+        if (!II || II->getIntrinsicID() != Intrinsic::aarch64_stlxr) {
+          dbgs() << "FirstInst=" << *FirstInst << "\n";
+          assert(false && "findFirstStoreCondBranchInst() should return a "
+                          "store/condition branch instruction");
+        }
       }
     }
 
-    // We really need to process the relaxed load now.
-    StoreInst* SI = nullptr;;
-    if (FirstInst && (SI = dyn_cast<StoreInst>(FirstInst))) {
+    // We really need to process the relaxed load now. Note that if the next
+    // instruction is an RMW, it will be transformed into a control block, so
+    // it is safe here to only taint upcoming store instructions.
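+    // Dispatch (sketch): an upcoming store gets its address tainted; a
+    // conditional branch, an stlxr intrinsic, or no following instruction at
+    // all gets a fake conditional branch; if no dominating usage of the load
+    // can be found to taint, the load is conservatively made acquire.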
+    StoreInst* SI = nullptr;
+    IntrinsicInst* II = nullptr;
+    if (FirstInst) {
+      SI = dyn_cast<StoreInst>(FirstInst);
+      II = dyn_cast<IntrinsicInst>(FirstInst);
+    }
+    if (FirstInst && SI) {
       // For immediately coming stores, taint the address of the store.
-      taintStoreAddress(SI, LI);
+      if (FirstInst->getParent() == LI->getParent() ||
+          DT->dominates(LI, FirstInst)) {
+        Changed |= taintStoreAddress(SI, LI);
+      } else {
+        auto* Inst =
+            findMostRecentDependenceUsage(LI, FirstInst, &ChainedBB, DT);
+        if (!Inst) {
+          LI->setOrdering(Acquire);
+          Changed = true;
+        } else {
+          Changed |= taintStoreAddress(SI, Inst);
+        }
+      }
     } else {
-      // For immediately coming branch, directly add a fake branch.
-      TaintRelaxedLoads(LI);
-      Changed = true;
+      // There is no upcoming store/branch at all; insert the fake conditional
+      // branch right after the load.
+      if (!FirstInst) {
+        TaintRelaxedLoads(LI, nullptr);
+        Changed = true;
+      } else {
+        // For an immediately following branch (or stlxr intrinsic), directly
+        // add a fake conditional branch before it.
+        if (FirstInst->getParent() == LI->getParent() ||
+            DT->dominates(LI, FirstInst)) {
+          TaintRelaxedLoads(LI, FirstInst);
+          Changed = true;
+        } else {
+          auto* Inst =
+              findMostRecentDependenceUsage(LI, FirstInst, &ChainedBB, DT);
+          if (Inst) {
+            TaintRelaxedLoads(Inst, FirstInst);
+          } else {
+            LI->setOrdering(Acquire);
+          }
+          Changed = true;
+        }
+      }
     }
   }
   return Changed;
@@ -1075,6 +1285,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
     TLI = TM->getSubtargetImpl(F)->getTargetLowering();
   TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   OptSize = F.optForSize();
 
   /// This optimization identifies DIV instructions that can be
@@ -1177,14 +1388,15 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
   // XXX-comment: Delay dealing with relaxed loads in this function to avoid
   // further changes done by other passes (e.g., SimplifyCFG).
   // Collect all the relaxed loads.
-  SmallVector<LoadInst*, 1> MonotonicLoadInsts;
+  SmallSet<LoadInst*, 1> MonotonicLoadInsts;
   for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
     if (I->isAtomic()) {
       switch (I->getOpcode()) {
         case Instruction::Load: {
           auto* LI = dyn_cast<LoadInst>(&*I);
-          if (LI->getOrdering() == Monotonic) {
-            MonotonicLoadInsts.push_back(LI);
+          if (LI->getOrdering() == Monotonic &&
+              !LI->getHasSubsequentAcqlRMW()) {
+            MonotonicLoadInsts.insert(LI);
           }
           break;
         }
@@ -1195,7 +1407,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
     }
   }
   EverMadeChange |=
-      AddFakeConditionalBranchAfterMonotonicLoads(MonotonicLoadInsts);
+      AddFakeConditionalBranchAfterMonotonicLoads(MonotonicLoadInsts, DT);
 
   return EverMadeChange;
 }
@@ -1290,7 +1502,7 @@ bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB,
       const Instruction *UI = cast<Instruction>(U);
       if (UI->getParent() != DestBB || !isa<PHINode>(UI))
         return false;
       // If User is inside DestBB block and it is a PHINode then check
       // incoming value. If incoming value is not from BB then this is
       // a complex condition (e.g. preheaders) we want to avoid here.
       if (UI->getParent() == DestBB) {