lib/Target/R600/AMDGPUAsmPrinter.cpp

   1 //===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer  -------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 ///
  12 /// The AMDGPUAsmPrinter is used to print both assembly string and also binary
  13 /// code.  When passed an MCAsmStreamer it prints assembly and when passed
  14 /// an MCObjectStreamer it outputs binary code.
  15 //
  16 //===----------------------------------------------------------------------===//
  17 //
  18
  19 #include "AMDGPUAsmPrinter.h"
  20 #include "AMDGPU.h"
  21 #include "AMDGPUSubtarget.h"
  22 #include "R600Defines.h"
  23 #include "R600MachineFunctionInfo.h"
  24 #include "R600RegisterInfo.h"
  25 #include "SIDefines.h"
  26 #include "SIMachineFunctionInfo.h"
  27 #include "SIRegisterInfo.h"
  28 #include "llvm/CodeGen/MachineFrameInfo.h"
  29 #include "llvm/MC/MCContext.h"
  30 #include "llvm/MC/MCSectionELF.h"
  31 #include "llvm/MC/MCStreamer.h"
  32 #include "llvm/Support/ELF.h"
  33 #include "llvm/Support/MathExtras.h"
  34 #include "llvm/Support/TargetRegistry.h"
  35 #include "llvm/Target/TargetLoweringObjectFile.h"
  36
  37 using namespace llvm;
  38
  39 // TODO: This should get the default rounding mode from the kernel. We just set
  40 // the default here, but this could change if the OpenCL rounding mode pragmas
  41 // are used.
  42 //
  43 // The denormal mode here should match what is reported by the OpenCL runtime
  44 // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
  45 // can also be overridden to flush via the -cl-denorms-are-zero compiler flag.
  46 //
  47 // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
  48 // precision, and leaves single precision to flush all and does not report
  49 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
  50 // CL_FP_DENORM for both.
  51 //
  52 // FIXME: It seems some instructions do not support single precision denormals
  53 // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
  54 // and sin_f32, cos_f32 on most parts).
  55
  56 // We want to use these instructions, and using fp32 denormals also causes
  57 // instructions to run at the double precision rate for the device so it's
  58 // probably best to just report no single precision denormals.
  59 static uint32_t getFPMode(const MachineFunction &F) {
  60   const AMDGPUSubtarget& ST = F.getTarget().getSubtarget<AMDGPUSubtarget>();
  61   // TODO: Is there any real use for the flush in only / flush out only modes?
  62
  63   uint32_t FP32Denormals =
  64     ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
  65
  66   uint32_t FP64Denormals =
  67     ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
  68
  69   return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
  70          FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
  71          FP_DENORM_MODE_SP(FP32Denormals) |
  72          FP_DENORM_MODE_DP(FP64Denormals);
  73 }
  74
  75 static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm,
  76                                               MCStreamer &Streamer) {
  77   return new AMDGPUAsmPrinter(tm, Streamer);
  78 }
  79
  80 extern "C" void LLVMInitializeR600AsmPrinter() {
  81   TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
  82 }
  83
  84 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
  85     : AsmPrinter(TM, Streamer) {
  86   DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode();
  87 }
  88
  89 void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
  90
  91   // This label is used to mark the end of the .text section.
  92   const TargetLoweringObjectFile &TLOF = getObjFileLowering();
  93   OutStreamer.SwitchSection(TLOF.getTextSection());
  94   MCSymbol *EndOfTextLabel =
  95       OutContext.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
  96   OutStreamer.EmitLabel(EndOfTextLabel);
  97 }
  98
  99 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 100
 101   // The starting address of all shader programs must be 256 bytes aligned.
 102   MF.setAlignment(8);
 103
 104   SetupMachineFunction(MF);
 105
 106   EmitFunctionHeader();
 107
 108   MCContext &Context = getObjFileLowering().getContext();
 109   const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config",
 110                                               ELF::SHT_PROGBITS, 0,
 111                                               SectionKind::getReadOnly());
 112   OutStreamer.SwitchSection(ConfigSection);
 113
 114   const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
 115   SIProgramInfo KernelInfo;
 116   if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
 117     getSIProgramInfo(KernelInfo, MF);
 118     EmitProgramInfoSI(MF, KernelInfo);
 119   } else {
 120     EmitProgramInfoR600(MF);
 121   }
 122
 123   DisasmLines.clear();
 124   HexLines.clear();
 125   DisasmLineMaxLen = 0;
 126
 127   OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
 128   EmitFunctionBody();
 129
 130   if (isVerbose()) {
 131     const MCSectionELF *CommentSection
 132       = Context.getELFSection(".AMDGPU.csdata",
 133                               ELF::SHT_PROGBITS, 0,
 134                               SectionKind::getReadOnly());
 135     OutStreamer.SwitchSection(CommentSection);
 136
 137     if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
 138       OutStreamer.emitRawComment(" Kernel info:", false);
 139       OutStreamer.emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
 140                                  false);
 141       OutStreamer.emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
 142                                  false);
 143       OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
 144                                  false);
 145       OutStreamer.emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
 146                                  false);
 147       OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
 148                                  false);
 149       OutStreamer.emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
 150                                  false);
 151     } else {
 152       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 153       OutStreamer.emitRawComment(
 154         Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize)));
 155     }
 156   }
 157
 158   if (STM.dumpCode()) {
 159 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 160     MF.dump();
 161 #endif
 162
 163     if (DisasmEnabled) {
 164       OutStreamer.SwitchSection(Context.getELFSection(".AMDGPU.disasm",
 165                                                   ELF::SHT_NOTE, 0,
 166                                                   SectionKind::getReadOnly()));
 167
 168       for (size_t i = 0; i < DisasmLines.size(); ++i) {
 169         std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
 170         Comment += " ; " + HexLines[i] + "\n";
 171
 172         OutStreamer.EmitBytes(StringRef(DisasmLines[i]));
 173         OutStreamer.EmitBytes(StringRef(Comment));
 174       }
 175     }
 176   }
 177
 178   return false;
 179 }
 180
 181 void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
 182   unsigned MaxGPR = 0;
 183   bool killPixel = false;
 184   const R600RegisterInfo *RI = static_cast<const R600RegisterInfo *>(
 185       TM.getSubtargetImpl()->getRegisterInfo());
 186   const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 187   const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
 188
 189   for (const MachineBasicBlock &MBB : MF) {
 190     for (const MachineInstr &MI : MBB) {
 191       if (MI.getOpcode() == AMDGPU::KILLGT)
 192         killPixel = true;
 193       unsigned numOperands = MI.getNumOperands();
 194       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
 195         const MachineOperand &MO = MI.getOperand(op_idx);
 196         if (!MO.isReg())
 197           continue;
 198         unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff;
 199
 200         // Register with value > 127 aren't GPR
 201         if (HWReg > 127)
 202           continue;
 203         MaxGPR = std::max(MaxGPR, HWReg);
 204       }
 205     }
 206   }
 207
 208   unsigned RsrcReg;
 209   if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
 210     // Evergreen / Northern Islands
 211     switch (MFI->getShaderType()) {
 212     default: // Fall through
 213     case ShaderType::COMPUTE:  RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
 214     case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
 215     case ShaderType::PIXEL:    RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
 216     case ShaderType::VERTEX:   RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
 217     }
 218   } else {
 219     // R600 / R700
 220     switch (MFI->getShaderType()) {
 221     default: // Fall through
 222     case ShaderType::GEOMETRY: // Fall through
 223     case ShaderType::COMPUTE:  // Fall through
 224     case ShaderType::VERTEX:   RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
 225     case ShaderType::PIXEL:    RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
 226     }
 227   }
 228
 229   OutStreamer.EmitIntValue(RsrcReg, 4);
 230   OutStreamer.EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
 231                            S_STACK_SIZE(MFI->StackSize), 4);
 232   OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
 233   OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
 234
 235   if (MFI->getShaderType() == ShaderType::COMPUTE) {
 236     OutStreamer.EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
 237     OutStreamer.EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
 238   }
 239 }
 240
 241 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
 242                                         const MachineFunction &MF) const {
 243   const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
 244   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 245   uint64_t CodeSize = 0;
 246   unsigned MaxSGPR = 0;
 247   unsigned MaxVGPR = 0;
 248   bool VCCUsed = false;
 249   bool FlatUsed = false;
 250   const SIRegisterInfo *RI = static_cast<const SIRegisterInfo *>(
 251       TM.getSubtargetImpl()->getRegisterInfo());
 252
 253   for (const MachineBasicBlock &MBB : MF) {
 254     for (const MachineInstr &MI : MBB) {
 255       // TODO: CodeSize should account for multiple functions.
 256       CodeSize += MI.getDesc().Size;
 257
 258       unsigned numOperands = MI.getNumOperands();
 259       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
 260         const MachineOperand &MO = MI.getOperand(op_idx);
 261         unsigned width = 0;
 262         bool isSGPR = false;
 263
 264         if (!MO.isReg()) {
 265           continue;
 266         }
 267         unsigned reg = MO.getReg();
 268         if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO ||
 269             reg == AMDGPU::VCC_HI) {
 270           VCCUsed = true;
 271           continue;
 272         } else if (reg == AMDGPU::FLAT_SCR ||
 273                    reg == AMDGPU::FLAT_SCR_LO ||
 274                    reg == AMDGPU::FLAT_SCR_HI) {
 275           FlatUsed = true;
 276           continue;
 277         }
 278
 279         switch (reg) {
 280         default: break;
 281         case AMDGPU::SCC:
 282         case AMDGPU::EXEC:
 283         case AMDGPU::M0:
 284           continue;
 285         }
 286
 287         if (AMDGPU::SReg_32RegClass.contains(reg)) {
 288           isSGPR = true;
 289           width = 1;
 290         } else if (AMDGPU::VReg_32RegClass.contains(reg)) {
 291           isSGPR = false;
 292           width = 1;
 293         } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
 294           isSGPR = true;
 295           width = 2;
 296         } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
 297           isSGPR = false;
 298           width = 2;
 299         } else if (AMDGPU::VReg_96RegClass.contains(reg)) {
 300           isSGPR = false;
 301           width = 3;
 302         } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
 303           isSGPR = true;
 304           width = 4;
 305         } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
 306           isSGPR = false;
 307           width = 4;
 308         } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
 309           isSGPR = true;
 310           width = 8;
 311         } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
 312           isSGPR = false;
 313           width = 8;
 314         } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
 315           isSGPR = true;
 316           width = 16;
 317         } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
 318           isSGPR = false;
 319           width = 16;
 320         } else {
 321           llvm_unreachable("Unknown register class");
 322         }
 323         unsigned hwReg = RI->getEncodingValue(reg) & 0xff;
 324         unsigned maxUsed = hwReg + width - 1;
 325         if (isSGPR) {
 326           MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
 327         } else {
 328           MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
 329         }
 330       }
 331     }
 332   }
 333
 334   if (VCCUsed)
 335     MaxSGPR += 2;
 336
 337   if (FlatUsed)
 338     MaxSGPR += 2;
 339
 340   // We found the maximum register index. They start at 0, so add one to get the
 341   // number of registers.
 342   ProgInfo.NumVGPR = MaxVGPR + 1;
 343   ProgInfo.NumSGPR = MaxSGPR + 1;
 344
 345   ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
 346   ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
 347   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
 348   // register.
 349   ProgInfo.FloatMode = getFPMode(MF);
 350
 351   // XXX: Not quite sure what this does, but sc seems to unset this.
 352   ProgInfo.IEEEMode = 0;
 353
 354   // Do not clamp NAN to 0.
 355   ProgInfo.DX10Clamp = 0;
 356
 357   const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
 358   ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
 359
 360   ProgInfo.FlatUsed = FlatUsed;
 361   ProgInfo.VCCUsed = VCCUsed;
 362   ProgInfo.CodeLen = CodeSize;
 363
 364   unsigned LDSAlignShift;
 365   if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
 366     // LDS is allocated in 64 dword blocks.
 367     LDSAlignShift = 8;
 368   } else {
 369     // LDS is allocated in 128 dword blocks.
 370     LDSAlignShift = 9;
 371   }
 372
 373   unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
 374                           MFI->getMaximumWorkGroupSize(MF);
 375
 376   ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize;
 377   ProgInfo.LDSBlocks =
 378      RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
 379
 380   // Scratch is allocated in 256 dword blocks.
 381   unsigned ScratchAlignShift = 10;
 382   // We need to program the hardware with the amount of scratch memory that
 383   // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
 384   // scratch memory used per thread.
 385   ProgInfo.ScratchBlocks =
 386     RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(),
 387                        1 << ScratchAlignShift) >> ScratchAlignShift;
 388
 389   ProgInfo.ComputePGMRSrc1 =
 390       S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
 391       S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
 392       S_00B848_PRIORITY(ProgInfo.Priority) |
 393       S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
 394       S_00B848_PRIV(ProgInfo.Priv) |
 395       S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
 396       S_00B848_IEEE_MODE(ProgInfo.DebugMode) |
 397       S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
 398
 399   ProgInfo.ComputePGMRSrc2 =
 400       S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
 401       S_00B84C_USER_SGPR(MFI->NumUserSGPRs) |
 402       S_00B84C_TGID_X_EN(1) |
 403       S_00B84C_TGID_Y_EN(1) |
 404       S_00B84C_TGID_Z_EN(1) |
 405       S_00B84C_TG_SIZE_EN(1) |
 406       S_00B84C_TIDIG_COMP_CNT(2) |
 407       S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks);
 408 }
 409
 410 static unsigned getRsrcReg(unsigned ShaderType) {
 411   switch (ShaderType) {
 412   default: // Fall through
 413   case ShaderType::COMPUTE:  return R_00B848_COMPUTE_PGM_RSRC1;
 414   case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
 415   case ShaderType::PIXEL:    return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
 416   case ShaderType::VERTEX:   return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
 417   }
 418 }
 419
 420 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
 421                                          const SIProgramInfo &KernelInfo) {
 422   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 423   unsigned RsrcReg = getRsrcReg(MFI->getShaderType());
 424
 425   if (MFI->getShaderType() == ShaderType::COMPUTE) {
 426     OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
 427
 428     OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
 429
 430     OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
 431     OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
 432
 433     OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
 434     OutStreamer.EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
 435
 436     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
 437     // 0" comment but I don't see a corresponding field in the register spec.
 438   } else {
 439     OutStreamer.EmitIntValue(RsrcReg, 4);
 440     OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
 441                              S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
 442   }
 443
 444   if (MFI->getShaderType() == ShaderType::PIXEL) {
 445     OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
 446     OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
 447     OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
 448     OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
 449   }
 450 }