Taints the non-acquire RMW's store address with the load part
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index ef377116939ee35abb06b206b548584295ce2711..6230d1e28b74f66a5c1fca18439b6d4047565d57 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -45,6 +45,7 @@ public:
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
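
The only functional change in this hunk is the addPreserved<> call: the pass already requires MachineDominatorTree, and it now also declares that it leaves the tree intact, so the legacy pass manager can hand the same analysis to later passes instead of recomputing it. Below is a minimal sketch of such a downstream consumer, assuming the legacy pass-manager API used in this revision; the pass name ExampleLatePass is hypothetical and not part of the patch.

    #include "llvm/CodeGen/MachineDominators.h"
    #include "llvm/CodeGen/MachineFunctionPass.h"

    using namespace llvm;

    namespace {
    // Hypothetical pass scheduled after SIFoldOperands. Because the dominator
    // tree is preserved, requiring it here does not force a recomputation.
    class ExampleLatePass : public MachineFunctionPass {
    public:
      static char ID;
      ExampleLatePass() : MachineFunctionPass(ID) {}

      void getAnalysisUsage(AnalysisUsage &AU) const override {
        AU.addRequired<MachineDominatorTree>();
        AU.setPreservesCFG();
        MachineFunctionPass::getAnalysisUsage(AU);
      }

      bool runOnMachineFunction(MachineFunction &MF) override {
        MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
        (void)MDT; // dominance queries over MF would go here
        return false;
      }
    };
    } // end anonymous namespace

    char ExampleLatePass::ID = 0;
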
@@ -164,8 +165,8 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
 
     // Operand is not legal, so try to commute the instruction to
     // see if this makes it possible to fold.
-    unsigned CommuteIdx0;
-    unsigned CommuteIdx1;
+    unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
+    unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
     bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1);
 
     if (CanCommute) {
@@ -175,7 +176,16 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
         OpNo = CommuteIdx0;
     }
 
-    if (!CanCommute || !TII->commuteInstruction(MI))
+    // One of the operands might be an Imm operand, and OpNo may refer to it
+    // after the call to commuteInstruction() below. Such situations are
+    // avoided here explicitly, as OpNo must be a register operand to be a
+    // candidate for memory folding.
+    if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
+                       !MI->getOperand(CommuteIdx1).isReg()))
+      return false;
+
+    if (!CanCommute ||
+        !TII->commuteInstruction(MI, false, CommuteIdx0, CommuteIdx1))
       return false;
 
     if (!TII->isOperandLegal(MI, OpNo, OpToFold))
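
Taken together, the two hunks above rework the commute path in tryAddToFoldList(): the indices start out as CommuteAnyOperandIndex so the target may pick any commutable pair, both picked operands must be registers (otherwise OpNo could end up naming an immediate after the swap), and commuteInstruction() is told exactly which pair to exchange. A condensed sketch of the resulting control flow follows; the helper name tryCommuteThenFold is hypothetical, and the SIInstrInfo/TargetInstrInfo signatures are assumed to be those already used in this file.

    // Hypothetical helper, not part of the patch: the commute-then-fold logic
    // that tryAddToFoldList() ends up with after this change.
    static bool tryCommuteThenFold(const SIInstrInfo *TII, MachineInstr *MI,
                                   unsigned &OpNo, MachineOperand *OpToFold) {
      unsigned Idx0 = TargetInstrInfo::CommuteAnyOperandIndex;
      unsigned Idx1 = TargetInstrInfo::CommuteAnyOperandIndex;

      // Let the target choose any commutable operand pair of MI.
      if (!TII->findCommutedOpIndices(MI, Idx0, Idx1))
        return false;

      // If the fold target is one of the commuted operands, it will occupy
      // the other slot after the swap, so retarget OpNo accordingly.
      if (OpNo == Idx0)
        OpNo = Idx1;
      else if (OpNo == Idx1)
        OpNo = Idx0;

      // Both commuted operands must be registers; otherwise OpNo could end up
      // pointing at an immediate, which can never be a fold candidate.
      if (!MI->getOperand(Idx0).isReg() || !MI->getOperand(Idx1).isReg())
        return false;

      // Swap exactly the pair inspected above, then re-check legality.
      if (!TII->commuteInstruction(MI, /*NewMI=*/false, Idx0, Idx1))
        return false;

      return TII->isOperandLegal(MI, OpNo, OpToFold);
    }
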
@@ -324,12 +334,20 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
           !MRI.hasOneUse(MI.getOperand(0).getReg()))
         continue;
 
-      // FIXME: Fold operands with subregs.
       if (OpToFold.isReg() &&
-          (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) ||
-           OpToFold.getSubReg()))
+          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
         continue;
 
+      // Prevent folding operands backwards in the function. For example,
+      // the source of the COPY below must not be replaced by the immediate 1:
+      //
+      //    %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3
+      //    ...
+      //    %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>
+      MachineOperand &Dst = MI.getOperand(0);
+      if (Dst.isReg() &&
+          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+        continue;
 
       // We need to mutate the operands of new mov instructions to add implicit
       // uses of EXEC, but adding them invalidates the use_iterator, so defer
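
The new early-continue checks in this hunk restrict folding to virtual registers on both ends. Physical registers are not in SSA form, so a use found through the use lists can sit earlier in the function than the def being folded; in the %VGPR0 example above, rewriting the COPY would make it read a value that has not been produced yet. A standalone restatement of the guard, with a hypothetical helper name and the TargetRegisterInfo interface of this revision assumed:

    // Hypothetical predicate, not part of the patch: mirrors the two checks
    // added above. MovMI is the instruction whose source may be folded into
    // the users of its destination register.
    static bool isSafeFoldSource(const MachineInstr &MovMI,
                                 const MachineOperand &OpToFold) {
      // The value being propagated must live in a virtual register (or be an
      // immediate); physical sources can be clobbered between def and use.
      if (OpToFold.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
        return false;

      // The destination must be virtual as well: uses of a physical register
      // may precede this def, so folding into them would go backwards.
      const MachineOperand &Dst = MovMI.getOperand(0);
      if (Dst.isReg() && !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
        return false;

      return true;
    }
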
@@ -356,7 +374,10 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
           // Clear kill flags.
           if (!Fold.isImm()) {
             assert(Fold.OpToFold && Fold.OpToFold->isReg());
-            Fold.OpToFold->setIsKill(false);
+            // FIXME: Probably shouldn't bother trying to fold if not an
+            // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
+            // copies.
+            MRI.clearKillFlags(Fold.OpToFold->getReg());
           }
           DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
                 Fold.UseOpNo << " of " << *Fold.UseMI << '\n');
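
The kill-flag change above is subtler than it looks: setIsKill(false) cleared the flag only on the operand being folded, while MachineRegisterInfo::clearKillFlags() drops it from every use of the register. That matters because the fold gives the source register an extra user, so any other operand still marked <kill> would now be wrong. A one-line restatement, with a hypothetical helper name and the file's existing includes assumed:

    // Hypothetical helper, not part of the patch: once folding adds another
    // user of FoldedSrc's register, no remaining use may keep its kill flag,
    // hence the register-wide clearKillFlags() instead of a per-operand
    // setIsKill(false).
    static void dropStaleKillFlags(MachineRegisterInfo &MRI,
                                   const MachineOperand &FoldedSrc) {
      assert(FoldedSrc.isReg() && "kill flags only exist on register operands");
      MRI.clearKillFlags(FoldedSrc.getReg());
    }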