lib/Target/AMDGPU/SIInsertWaits.cpp

   1 //===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Insert wait instructions for memory reads and writes.
  12 ///
  13 /// Memory reads and writes are issued asynchronously, so we need to insert
  14 /// S_WAITCNT instructions when we want to access any of their results or
  15 /// overwrite any register that's used asynchronously.
  16 //
  17 //===----------------------------------------------------------------------===//
  18
  19 #include "AMDGPU.h"
  20 #include "AMDGPUSubtarget.h"
  21 #include "SIDefines.h"
  22 #include "SIInstrInfo.h"
  23 #include "SIMachineFunctionInfo.h"
  24 #include "llvm/CodeGen/MachineFunction.h"
  25 #include "llvm/CodeGen/MachineFunctionPass.h"
  26 #include "llvm/CodeGen/MachineInstrBuilder.h"
  27 #include "llvm/CodeGen/MachineRegisterInfo.h"
  28
  29 using namespace llvm;
  30
  31 namespace {
  32
  33 /// \brief One variable for each of the hardware counters
  34 typedef union {
  35   struct {
  36     unsigned VM;
  37     unsigned EXP;
  38     unsigned LGKM;
  39   } Named;
  40   unsigned Array[3];
  41
  42 } Counters;
  43
  44 typedef enum {
  45   OTHER,
  46   SMEM,
  47   VMEM
  48 } InstType;
  49
  50 typedef Counters RegCounters[512];
  51 typedef std::pair<unsigned, unsigned> RegInterval;
  52
  53 class SIInsertWaits : public MachineFunctionPass {
  54
  55 private:
  56   static char ID;
  57   const SIInstrInfo *TII;
  58   const SIRegisterInfo *TRI;
  59   const MachineRegisterInfo *MRI;
  60
  61   /// \brief Constant hardware limits
  62   static const Counters WaitCounts;
  63
  64   /// \brief Constant zero value
  65   static const Counters ZeroCounts;
  66
  67   /// \brief Counter values we have already waited on.
  68   Counters WaitedOn;
  69
  70   /// \brief Counter values for last instruction issued.
  71   Counters LastIssued;
  72
  73   /// \brief Registers used by async instructions.
  74   RegCounters UsedRegs;
  75
  76   /// \brief Registers defined by async instructions.
  77   RegCounters DefinedRegs;
  78
  79   /// \brief Different export instruction types seen since last wait.
  80   unsigned ExpInstrTypesSeen;
  81
  82   /// \brief Type of the last opcode.
  83   InstType LastOpcodeType;
  84
  85   bool LastInstWritesM0;
  86
  87   /// \brief Whether the machine function returns void
  88   bool ReturnsVoid;
  89
  90   /// \brief Get increment/decrement amount for this instruction.
  91   Counters getHwCounts(MachineInstr &MI);
  92
  93   /// \brief Is operand relevant for async execution?
  94   bool isOpRelevant(MachineOperand &Op);
  95
  96   /// \brief Get register interval an operand affects.
  97   RegInterval getRegInterval(const TargetRegisterClass *RC,
  98                              const MachineOperand &Reg) const;
  99
 100   /// \brief Handle instructions async components
 101   void pushInstruction(MachineBasicBlock &MBB,
 102                        MachineBasicBlock::iterator I);
 103
 104   /// \brief Insert the actual wait instruction
 105   bool insertWait(MachineBasicBlock &MBB,
 106                   MachineBasicBlock::iterator I,
 107                   const Counters &Counts);
 108
 109   /// \brief Do we need def2def checks?
 110   bool unorderedDefines(MachineInstr &MI);
 111
 112   /// \brief Resolve all operand dependencies to counter requirements
 113   Counters handleOperands(MachineInstr &MI);
 114
 115   /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
 116   void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
 117
 118 public:
 119   SIInsertWaits(TargetMachine &tm) :
 120     MachineFunctionPass(ID),
 121     TII(nullptr),
 122     TRI(nullptr),
 123     ExpInstrTypesSeen(0) { }
 124
 125   bool runOnMachineFunction(MachineFunction &MF) override;
 126
 127   const char *getPassName() const override {
 128     return "SI insert wait instructions";
 129   }
 130
 131   void getAnalysisUsage(AnalysisUsage &AU) const override {
 132     AU.setPreservesCFG();
 133     MachineFunctionPass::getAnalysisUsage(AU);
 134   }
 135 };
 136
 137 } // End anonymous namespace
 138
 139 char SIInsertWaits::ID = 0;
 140
 141 const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };
 142 const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
 143
 144 FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
 145   return new SIInsertWaits(tm);
 146 }
 147
 148 Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
 149   uint64_t TSFlags = MI.getDesc().TSFlags;
 150   Counters Result = { { 0, 0, 0 } };
 151
 152   Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
 153
 154   // Only consider stores or EXP for EXP_CNT
 155   Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
 156       (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore()));
 157
 158   // LGKM may uses larger values
 159   if (TSFlags & SIInstrFlags::LGKM_CNT) {
 160
 161     if (TII->isSMRD(MI)) {
 162
 163       if (MI.getNumOperands() != 0) {
 164         assert(MI.getOperand(0).isReg() &&
 165                "First LGKM operand must be a register!");
 166
 167         // XXX - What if this is a write into a super register?
 168         const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
 169         unsigned Size = RC->getSize();
 170         Result.Named.LGKM = Size > 4 ? 2 : 1;
 171       } else {
 172         // s_dcache_inv etc. do not have a a destination register. Assume we
 173         // want a wait on these.
 174         // XXX - What is the right value?
 175         Result.Named.LGKM = 1;
 176       }
 177     } else {
 178       // DS
 179       Result.Named.LGKM = 1;
 180     }
 181
 182   } else {
 183     Result.Named.LGKM = 0;
 184   }
 185
 186   return Result;
 187 }
 188
 189 bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
 190   // Constants are always irrelevant
 191   if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
 192     return false;
 193
 194   // Defines are always relevant
 195   if (Op.isDef())
 196     return true;
 197
 198   // For exports all registers are relevant
 199   MachineInstr &MI = *Op.getParent();
 200   if (MI.getOpcode() == AMDGPU::EXP)
 201     return true;
 202
 203   // For stores the stored value is also relevant
 204   if (!MI.getDesc().mayStore())
 205     return false;
 206
 207   // Check if this operand is the value being stored.
 208   // Special case for DS instructions, since the address
 209   // operand comes before the value operand and it may have
 210   // multiple data operands.
 211
 212   if (TII->isDS(MI)) {
 213     MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data);
 214     if (Data && Op.isIdenticalTo(*Data))
 215       return true;
 216
 217     MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
 218     if (Data0 && Op.isIdenticalTo(*Data0))
 219       return true;
 220
 221     MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
 222     if (Data1 && Op.isIdenticalTo(*Data1))
 223       return true;
 224
 225     return false;
 226   }
 227
 228   // NOTE: This assumes that the value operand is before the
 229   // address operand, and that there is only one value operand.
 230   for (MachineInstr::mop_iterator I = MI.operands_begin(),
 231        E = MI.operands_end(); I != E; ++I) {
 232
 233     if (I->isReg() && I->isUse())
 234       return Op.isIdenticalTo(*I);
 235   }
 236
 237   return false;
 238 }
 239
 240 RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
 241                                           const MachineOperand &Reg) const {
 242   unsigned Size = RC->getSize();
 243   assert(Size >= 4);
 244
 245   RegInterval Result;
 246   Result.first = TRI->getEncodingValue(Reg.getReg());
 247   Result.second = Result.first + Size / 4;
 248
 249   return Result;
 250 }
 251
 252 void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
 253                                     MachineBasicBlock::iterator I) {
 254
 255   // Get the hardware counter increments and sum them up
 256   Counters Increment = getHwCounts(*I);
 257   Counters Limit = ZeroCounts;
 258   unsigned Sum = 0;
 259
 260   for (unsigned i = 0; i < 3; ++i) {
 261     LastIssued.Array[i] += Increment.Array[i];
 262     if (Increment.Array[i])
 263       Limit.Array[i] = LastIssued.Array[i];
 264     Sum += Increment.Array[i];
 265   }
 266
 267   // If we don't increase anything then that's it
 268   if (Sum == 0) {
 269     LastOpcodeType = OTHER;
 270     return;
 271   }
 272
 273   if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >=
 274       AMDGPUSubtarget::VOLCANIC_ISLANDS) {
 275     // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
 276     // or SMEM clause, respectively.
 277     //
 278     // The temporary workaround is to break the clauses with S_NOP.
 279     //
 280     // The proper solution would be to allocate registers such that all source
 281     // and destination registers don't overlap, e.g. this is illegal:
 282     //   r0 = load r2
 283     //   r2 = load r0
 284     if ((LastOpcodeType == SMEM && TII->isSMRD(*I)) ||
 285         (LastOpcodeType == VMEM && Increment.Named.VM)) {
 286       // Insert a NOP to break the clause.
 287       BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
 288           .addImm(0);
 289       LastInstWritesM0 = false;
 290     }
 291
 292     if (TII->isSMRD(*I))
 293       LastOpcodeType = SMEM;
 294     else if (Increment.Named.VM)
 295       LastOpcodeType = VMEM;
 296   }
 297
 298   // Remember which export instructions we have seen
 299   if (Increment.Named.EXP) {
 300     ExpInstrTypesSeen |= I->getOpcode() == AMDGPU::EXP ? 1 : 2;
 301   }
 302
 303   for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
 304     MachineOperand &Op = I->getOperand(i);
 305     if (!isOpRelevant(Op))
 306       continue;
 307
 308     const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
 309     RegInterval Interval = getRegInterval(RC, Op);
 310     for (unsigned j = Interval.first; j < Interval.second; ++j) {
 311
 312       // Remember which registers we define
 313       if (Op.isDef())
 314         DefinedRegs[j] = Limit;
 315
 316       // and which one we are using
 317       if (Op.isUse())
 318         UsedRegs[j] = Limit;
 319     }
 320   }
 321 }
 322
 323 bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
 324                                MachineBasicBlock::iterator I,
 325                                const Counters &Required) {
 326
 327   // End of program? No need to wait on anything
 328   // A function not returning void needs to wait, because other bytecode will
 329   // be appended after it and we don't know what it will be.
 330   if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
 331     return false;
 332
 333   // Figure out if the async instructions execute in order
 334   bool Ordered[3];
 335
 336   // VM_CNT is always ordered
 337   Ordered[0] = true;
 338
 339   // EXP_CNT is unordered if we have both EXP & VM-writes
 340   Ordered[1] = ExpInstrTypesSeen == 3;
 341
 342   // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
 343   Ordered[2] = false;
 344
 345   // The values we are going to put into the S_WAITCNT instruction
 346   Counters Counts = WaitCounts;
 347
 348   // Do we really need to wait?
 349   bool NeedWait = false;
 350
 351   for (unsigned i = 0; i < 3; ++i) {
 352
 353     if (Required.Array[i] <= WaitedOn.Array[i])
 354       continue;
 355
 356     NeedWait = true;
 357
 358     if (Ordered[i]) {
 359       unsigned Value = LastIssued.Array[i] - Required.Array[i];
 360
 361       // Adjust the value to the real hardware possibilities.
 362       Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);
 363
 364     } else
 365       Counts.Array[i] = 0;
 366
 367     // Remember on what we have waited on.
 368     WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
 369   }
 370
 371   if (!NeedWait)
 372     return false;
 373
 374   // Reset EXP_CNT instruction types
 375   if (Counts.Named.EXP == 0)
 376     ExpInstrTypesSeen = 0;
 377
 378   // Build the wait instruction
 379   BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
 380           .addImm((Counts.Named.VM & 0xF) |
 381                   ((Counts.Named.EXP & 0x7) << 4) |
 382                   ((Counts.Named.LGKM & 0x7) << 8));
 383
 384   LastOpcodeType = OTHER;
 385   LastInstWritesM0 = false;
 386   return true;
 387 }
 388
 389 /// \brief helper function for handleOperands
 390 static void increaseCounters(Counters &Dst, const Counters &Src) {
 391
 392   for (unsigned i = 0; i < 3; ++i)
 393     Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
 394 }
 395
 396 Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
 397
 398   Counters Result = ZeroCounts;
 399
 400   // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
 401   // but we also want to wait for any other outstanding transfers before
 402   // signalling other hardware blocks
 403   if (MI.getOpcode() == AMDGPU::S_SENDMSG)
 404     return LastIssued;
 405
 406   // For each register affected by this instruction increase the result
 407   // sequence.
 408   //
 409   // TODO: We could probably just look at explicit operands if we removed VCC /
 410   // EXEC from SMRD dest reg classes.
 411   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
 412     MachineOperand &Op = MI.getOperand(i);
 413     if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
 414       continue;
 415
 416     const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
 417     RegInterval Interval = getRegInterval(RC, Op);
 418     for (unsigned j = Interval.first; j < Interval.second; ++j) {
 419
 420       if (Op.isDef()) {
 421         increaseCounters(Result, UsedRegs[j]);
 422         increaseCounters(Result, DefinedRegs[j]);
 423       }
 424
 425       if (Op.isUse())
 426         increaseCounters(Result, DefinedRegs[j]);
 427     }
 428   }
 429
 430   return Result;
 431 }
 432
 433 void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
 434                                   MachineBasicBlock::iterator I) {
 435   if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <
 436       AMDGPUSubtarget::VOLCANIC_ISLANDS)
 437     return;
 438
 439   // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
 440   if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) {
 441     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
 442     LastInstWritesM0 = false;
 443     return;
 444   }
 445
 446   // Set whether this instruction sets M0
 447   LastInstWritesM0 = false;
 448
 449   unsigned NumOperands = I->getNumOperands();
 450   for (unsigned i = 0; i < NumOperands; i++) {
 451     const MachineOperand &Op = I->getOperand(i);
 452
 453     if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
 454       LastInstWritesM0 = true;
 455   }
 456 }
 457
 458 // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
 459 // around other non-memory instructions.
 460 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
 461   bool Changes = false;
 462
 463   TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
 464   TRI =
 465       static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
 466
 467   MRI = &MF.getRegInfo();
 468
 469   WaitedOn = ZeroCounts;
 470   LastIssued = ZeroCounts;
 471   LastOpcodeType = OTHER;
 472   LastInstWritesM0 = false;
 473   ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid();
 474
 475   memset(&UsedRegs, 0, sizeof(UsedRegs));
 476   memset(&DefinedRegs, 0, sizeof(DefinedRegs));
 477
 478   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
 479        BI != BE; ++BI) {
 480
 481     MachineBasicBlock &MBB = *BI;
 482     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
 483          I != E; ++I) {
 484
 485       // Wait for everything before a barrier.
 486       if (I->getOpcode() == AMDGPU::S_BARRIER)
 487         Changes |= insertWait(MBB, I, LastIssued);
 488       else
 489         Changes |= insertWait(MBB, I, handleOperands(*I));
 490
 491       pushInstruction(MBB, I);
 492       handleSendMsg(MBB, I);
 493     }
 494
 495     // Wait for everything at the end of the MBB
 496     Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
 497   }
 498
 499   return Changes;
 500 }