R600/SI: Fix mad*k definitions

[oota-llvm.git] / lib / Target / R600 / SIInsertWaits.cpp
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp

index 67fbdf7be1646769e2cb5d1ff7f3e0425c4f2b68..50f20ac3619e5e6cdd34b2c27ce89e8372ce7e74 100644 (file)
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -17,6 +17,8 @@
  //===----------------------------------------------------------------------===//
  
  #include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
  #include "SIInstrInfo.h"
  #include "SIMachineFunctionInfo.h"
  #include "llvm/CodeGen/MachineFunction.h"
@@ -39,6 +41,12 @@ typedef union {
  
  } Counters;
  
+typedef enum {
+  OTHER,
+  SMEM,
+  VMEM
+} InstType;
+
  typedef Counters RegCounters[512];
  typedef std::pair<unsigned, unsigned> RegInterval;
  
@@ -47,7 +55,7 @@ class SIInsertWaits : public MachineFunctionPass {
  private:
    static char ID;
    const SIInstrInfo *TII;
-  const SIRegisterInfo &TRI;
+  const SIRegisterInfo *TRI;
    const MachineRegisterInfo *MRI;
  
    /// \brief Constant hardware limits
@@ -71,6 +79,11 @@ private:
    /// \brief Different export instruction types seen since last wait.
    unsigned ExpInstrTypesSeen;
  
+  /// \brief Type of the last opcode.
+  InstType LastOpcodeType;
+
+  bool LastInstWritesM0;
+
    /// \brief Get increment/decrement amount for this instruction.
    Counters getHwCounts(MachineInstr &MI);
  
@@ -81,7 +94,8 @@ private:
    RegInterval getRegInterval(MachineOperand &Op);
  
    /// \brief Handle instructions async components
-  void pushInstruction(MachineInstr &MI);
+  void pushInstruction(MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator I);
  
    /// \brief Insert the actual wait instruction
    bool insertWait(MachineBasicBlock &MBB,
@@ -94,15 +108,19 @@ private:
    /// \brief Resolve all operand dependencies to counter requirements
    Counters handleOperands(MachineInstr &MI);
  
+  /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
+  void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+
  public:
    SIInsertWaits(TargetMachine &tm) :
      MachineFunctionPass(ID),
-    TII(static_cast<const SIInstrInfo*>(tm.getInstrInfo())),
-    TRI(TII->getRegisterInfo()) { }
+    TII(nullptr),
+    TRI(nullptr),
+    ExpInstrTypesSeen(0) { }
  
-  virtual bool runOnMachineFunction(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &MF) override;
  
-  const char *getPassName() const {
+  const char *getPassName() const override {
      return "SI insert wait  instructions";
    }
  
@@ -133,12 +151,19 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
    // LGKM may uses larger values
    if (TSFlags & SIInstrFlags::LGKM_CNT) {
  
-    MachineOperand &Op = MI.getOperand(0);
-    assert(Op.isReg() && "First LGKM operand must be a register!");
+    if (TII->isSMRD(MI.getOpcode())) {
+
+      MachineOperand &Op = MI.getOperand(0);
+      assert(Op.isReg() && "First LGKM operand must be a register!");
+
+      unsigned Reg = Op.getReg();
+      unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
+      Result.Named.LGKM = Size > 4 ? 2 : 1;
  
-    unsigned Reg = Op.getReg();
-    unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
-    Result.Named.LGKM = Size > 4 ? 2 : 1;
+    } else {
+      // DS
+      Result.Named.LGKM = 1;
+    }
  
    } else {
      Result.Named.LGKM = 0;
@@ -166,6 +191,29 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
    if (!MI.getDesc().mayStore())
      return false;
  
+  // Check if this operand is the value being stored.
+  // Special case for DS instructions, since the address
+  // operand comes before the value operand and it may have
+  // multiple data operands.
+
+  if (TII->isDS(MI.getOpcode())) {
+    MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data);
+    if (Data && Op.isIdenticalTo(*Data))
+      return true;
+
+    MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
+    if (Data0 && Op.isIdenticalTo(*Data0))
+      return true;
+
+    MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
+    if (Data1 && Op.isIdenticalTo(*Data1))
+      return true;
+
+    return false;
+  }
+
+  // NOTE: This assumes that the value operand is before the
+  // address operand, and that there is only one value operand.
    for (MachineInstr::mop_iterator I = MI.operands_begin(),
         E = MI.operands_end(); I != E; ++I) {
  
@@ -178,25 +226,26 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
  
  RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
  
-  if (!Op.isReg())
+  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
      return std::make_pair(0, 0);
  
    unsigned Reg = Op.getReg();
-  unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
+  unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
  
    assert(Size >= 4);
  
    RegInterval Result;
-  Result.first = TRI.getEncodingValue(Reg);
+  Result.first = TRI->getEncodingValue(Reg);
    Result.second = Result.first + Size / 4;
  
    return Result;
  }
  
-void SIInsertWaits::pushInstruction(MachineInstr &MI) {
+void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I) {
  
    // Get the hardware counter increments and sum them up
-  Counters Increment = getHwCounts(MI);
+  Counters Increment = getHwCounts(*I);
    unsigned Sum = 0;
  
    for (unsigned i = 0; i < 3; ++i) {
@@ -205,17 +254,43 @@ void SIInsertWaits::pushInstruction(MachineInstr &MI) {
    }
  
    // If we don't increase anything then that's it
-  if (Sum == 0)
+  if (Sum == 0) {
+    LastOpcodeType = OTHER;
      return;
+  }
+
+  if (TRI->ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
+    // or SMEM clause, respectively.
+    //
+    // The temporary workaround is to break the clauses with S_NOP.
+    //
+    // The proper solution would be to allocate registers such that all source
+    // and destination registers don't overlap, e.g. this is illegal:
+    //   r0 = load r2
+    //   r2 = load r0
+    if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) ||
+        (LastOpcodeType == VMEM && Increment.Named.VM)) {
+      // Insert a NOP to break the clause.
+      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
+          .addImm(0);
+      LastInstWritesM0 = false;
+    }
+
+    if (TII->isSMRD(I->getOpcode()))
+      LastOpcodeType = SMEM;
+    else if (Increment.Named.VM)
+      LastOpcodeType = VMEM;
+  }
  
    // Remember which export instructions we have seen
    if (Increment.Named.EXP) {
-    ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 1 : 2;
+    ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2;
    }
  
-  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
  
-    MachineOperand &Op = MI.getOperand(i);
+    MachineOperand &Op = I->getOperand(i);
      if (!isOpRelevant(Op))
        continue;
  
@@ -265,17 +340,17 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
        continue;
  
      NeedWait = true;
-    
+
      if (Ordered[i]) {
        unsigned Value = LastIssued.Array[i] - Required.Array[i];
  
-      // adjust the value to the real hardware posibilities
+      // Adjust the value to the real hardware possibilities.
        Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);
  
      } else
        Counts.Array[i] = 0;
  
-    // Remember on what we have waited on
+    // Remember what we have waited on.
      WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
    }
  
@@ -292,6 +367,8 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
                    ((Counts.Named.EXP & 0x7) << 4) |
                    ((Counts.Named.LGKM & 0x7) << 8));
  
+  LastOpcodeType = OTHER;
+  LastInstWritesM0 = false;
    return true;
  }
  
@@ -302,23 +379,16 @@ static void increaseCounters(Counters &Dst, const Counters &Src) {
      Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
  }
  
-bool SIInsertWaits::unorderedDefines(MachineInstr &MI) {
-
-  uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags;
-  if (TSFlags & SIInstrFlags::LGKM_CNT)
-    return true;
-
-  if (TSFlags & SIInstrFlags::EXP_CNT)
-    return ExpInstrTypesSeen == 3;
-
-  return false;
-}
-
  Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
  
-  bool UnorderedDefines = unorderedDefines(MI);
    Counters Result = ZeroCounts;
  
+  // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
+  // but we also want to wait for any other outstanding transfers before
+  // signalling other hardware blocks
+  if (MI.getOpcode() == AMDGPU::S_SENDMSG)
+    return LastIssued;
+
    // For each register affected by this
    // instruction increase the result sequence
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
@@ -329,8 +399,7 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
  
        if (Op.isDef()) {
          increaseCounters(Result, UsedRegs[j]);
-        if (UnorderedDefines)
-          increaseCounters(Result, DefinedRegs[j]);
+        increaseCounters(Result, DefinedRegs[j]);
        }
  
        if (Op.isUse())
@@ -341,14 +410,45 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
    return Result;
  }
  
-bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
+void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) {
+  if (TRI->ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return;
  
+  // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
+  if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) {
+    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
+    LastInstWritesM0 = false;
+    return;
+  }
+
+  // Set whether this instruction sets M0
+  LastInstWritesM0 = false;
+
+  unsigned NumOperands = I->getNumOperands();
+  for (unsigned i = 0; i < NumOperands; i++) {
+    const MachineOperand &Op = I->getOperand(i);
+
+    if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
+      LastInstWritesM0 = true;
+  }
+}
+
+// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
+// around other non-memory instructions.
+bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
    bool Changes = false;
  
+  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  TRI =
+      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+
    MRI = &MF.getRegInfo();
  
    WaitedOn = ZeroCounts;
    LastIssued = ZeroCounts;
+  LastOpcodeType = OTHER;
+  LastInstWritesM0 = false;
  
    memset(&UsedRegs, 0, sizeof(UsedRegs));
    memset(&DefinedRegs, 0, sizeof(DefinedRegs));
@@ -360,8 +460,14 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
           I != E; ++I) {
  
-      Changes |= insertWait(MBB, I, handleOperands(*I));
-      pushInstruction(*I);
+      // Wait for everything before a barrier.
+      if (I->getOpcode() == AMDGPU::S_BARRIER)
+        Changes |= insertWait(MBB, I, LastIssued);
+      else
+        Changes |= insertWait(MBB, I, handleOperands(*I));
+
+      pushInstruction(MBB, I);
+      handleSendMsg(MBB, I);
      }
  
      // Wait for everything at the end of the MBB