lib/Target/R600/AMDGPUAsmPrinter.cpp

   1 //===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer  -------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 ///
  12 /// The AMDGPUAsmPrinter is used to print both assembly string and also binary
  13 /// code.  When passed an MCAsmStreamer it prints assembly and when passed
  14 /// an MCObjectStreamer it outputs binary code.
  15 //
  16 //===----------------------------------------------------------------------===//
  17 //
  18
  19 #include "AMDGPUAsmPrinter.h"
  20 #include "AMDGPU.h"
  21 #include "AMDGPUSubtarget.h"
  22 #include "R600Defines.h"
  23 #include "R600MachineFunctionInfo.h"
  24 #include "R600RegisterInfo.h"
  25 #include "SIDefines.h"
  26 #include "SIMachineFunctionInfo.h"
  27 #include "SIRegisterInfo.h"
  28 #include "llvm/CodeGen/MachineFrameInfo.h"
  29 #include "llvm/MC/MCContext.h"
  30 #include "llvm/MC/MCSectionELF.h"
  31 #include "llvm/MC/MCStreamer.h"
  32 #include "llvm/Support/ELF.h"
  33 #include "llvm/Support/MathExtras.h"
  34 #include "llvm/Support/TargetRegistry.h"
  35 #include "llvm/Target/TargetLoweringObjectFile.h"
  36
  37 using namespace llvm;
  38
  39 // TODO: This should get the default rounding mode from the kernel. We just set
  40 // the default here, but this could change if the OpenCL rounding mode pragmas
  41 // are used.
  42 //
  43 // The denormal mode here should match what is reported by the OpenCL runtime
  44 // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
  45 // can also be override to flush with the -cl-denorms-are-zero compiler flag.
  46 //
  47 // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
  48 // precision, and leaves single precision to flush all and does not report
  49 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
  50 // CL_FP_DENORM for both.
  51 //
  52 // FIXME: It seems some instructions do not support single precision denormals
  53 // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
  54 // and sin_f32, cos_f32 on most parts).
  55
  56 // We want to use these instructions, and using fp32 denormals also causes
  57 // instructions to run at the double precision rate for the device so it's
  58 // probably best to just report no single precision denormals.
  59 static uint32_t getFPMode(const MachineFunction &F) {
  60   const AMDGPUSubtarget& ST = F.getTarget().getSubtarget<AMDGPUSubtarget>();
  61   // TODO: Is there any real use for the flush in only / flush out only modes?
  62
  63   uint32_t FP32Denormals =
  64     ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
  65
  66   uint32_t FP64Denormals =
  67     ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
  68
  69   return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
  70          FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
  71          FP_DENORM_MODE_SP(FP32Denormals) |
  72          FP_DENORM_MODE_DP(FP64Denormals);
  73 }
  74
  75 static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm,
  76                                               MCStreamer &Streamer) {
  77   return new AMDGPUAsmPrinter(tm, Streamer);
  78 }
  79
  80 extern "C" void LLVMInitializeR600AsmPrinter() {
  81   TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
  82 }
  83
  84 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
  85     : AsmPrinter(TM, Streamer) {
  86   DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode();
  87 }
  88
  89 void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
  90
  91   // This label is used to mark the end of the .text section.
  92   const TargetLoweringObjectFile &TLOF = getObjFileLowering();
  93   OutStreamer.SwitchSection(TLOF.getTextSection());
  94   MCSymbol *EndOfTextLabel =
  95       OutContext.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
  96   OutStreamer.EmitLabel(EndOfTextLabel);
  97 }
  98
  99 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 100
 101   // The starting address of all shader programs must be 256 bytes aligned.
 102   MF.setAlignment(8);
 103
 104   SetupMachineFunction(MF);
 105
 106   EmitFunctionHeader();
 107
 108   MCContext &Context = getObjFileLowering().getContext();
 109   const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config",
 110                                               ELF::SHT_PROGBITS, 0,
 111                                               SectionKind::getReadOnly());
 112   OutStreamer.SwitchSection(ConfigSection);
 113
 114   const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
 115   SIProgramInfo KernelInfo;
 116   if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
 117     getSIProgramInfo(KernelInfo, MF);
 118     EmitProgramInfoSI(MF, KernelInfo);
 119   } else {
 120     EmitProgramInfoR600(MF);
 121   }
 122
 123   DisasmLines.clear();
 124   HexLines.clear();
 125   DisasmLineMaxLen = 0;
 126
 127   OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
 128   EmitFunctionBody();
 129
 130   if (isVerbose()) {
 131     const MCSectionELF *CommentSection
 132       = Context.getELFSection(".AMDGPU.csdata",
 133                               ELF::SHT_PROGBITS, 0,
 134                               SectionKind::getReadOnly());
 135     OutStreamer.SwitchSection(CommentSection);
 136
 137     if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
 138       OutStreamer.emitRawComment(" Kernel info:", false);
 139       OutStreamer.emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
 140                                  false);
 141       OutStreamer.emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
 142                                  false);
 143       OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
 144                                  false);
 145       OutStreamer.emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
 146                                  false);
 147       OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
 148                                  false);
 149       OutStreamer.emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
 150                                  false);
 151     } else {
 152       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 153       OutStreamer.emitRawComment(
 154         Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize)));
 155     }
 156   }
 157
 158   if (STM.dumpCode()) {
 159 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 160     MF.dump();
 161 #endif
 162
 163     if (DisasmEnabled) {
 164       OutStreamer.SwitchSection(Context.getELFSection(".AMDGPU.disasm",
 165                                                   ELF::SHT_NOTE, 0,
 166                                                   SectionKind::getReadOnly()));
 167
 168       for (size_t i = 0; i < DisasmLines.size(); ++i) {
 169         std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
 170         Comment += " ; " + HexLines[i] + "\n";
 171
 172         OutStreamer.EmitBytes(StringRef(DisasmLines[i]));
 173         OutStreamer.EmitBytes(StringRef(Comment));
 174       }
 175     }
 176   }
 177
 178   return false;
 179 }
 180
 181 void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
 182   unsigned MaxGPR = 0;
 183   bool killPixel = false;
 184   const R600RegisterInfo *RI = static_cast<const R600RegisterInfo *>(
 185       TM.getSubtargetImpl()->getRegisterInfo());
 186   const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 187   const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
 188
 189   for (const MachineBasicBlock &MBB : MF) {
 190     for (const MachineInstr &MI : MBB) {
 191       if (MI.getOpcode() == AMDGPU::KILLGT)
 192         killPixel = true;
 193       unsigned numOperands = MI.getNumOperands();
 194       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
 195         const MachineOperand &MO = MI.getOperand(op_idx);
 196         if (!MO.isReg())
 197           continue;
 198         unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff;
 199
 200         // Register with value > 127 aren't GPR
 201         if (HWReg > 127)
 202           continue;
 203         MaxGPR = std::max(MaxGPR, HWReg);
 204       }
 205     }
 206   }
 207
 208   unsigned RsrcReg;
 209   if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
 210     // Evergreen / Northern Islands
 211     switch (MFI->getShaderType()) {
 212     default: // Fall through
 213     case ShaderType::COMPUTE:  RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
 214     case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
 215     case ShaderType::PIXEL:    RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
 216     case ShaderType::VERTEX:   RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
 217     }
 218   } else {
 219     // R600 / R700
 220     switch (MFI->getShaderType()) {
 221     default: // Fall through
 222     case ShaderType::GEOMETRY: // Fall through
 223     case ShaderType::COMPUTE:  // Fall through
 224     case ShaderType::VERTEX:   RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
 225     case ShaderType::PIXEL:    RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
 226     }
 227   }
 228
 229   OutStreamer.EmitIntValue(RsrcReg, 4);
 230   OutStreamer.EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
 231                            S_STACK_SIZE(MFI->StackSize), 4);
 232   OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
 233   OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
 234
 235   if (MFI->getShaderType() == ShaderType::COMPUTE) {
 236     OutStreamer.EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
 237     OutStreamer.EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
 238   }
 239 }
 240
 241 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
 242                                         const MachineFunction &MF) const {
 243   uint64_t CodeSize = 0;
 244   unsigned MaxSGPR = 0;
 245   unsigned MaxVGPR = 0;
 246   bool VCCUsed = false;
 247   bool FlatUsed = false;
 248   const SIRegisterInfo *RI = static_cast<const SIRegisterInfo *>(
 249       TM.getSubtargetImpl()->getRegisterInfo());
 250
 251   for (const MachineBasicBlock &MBB : MF) {
 252     for (const MachineInstr &MI : MBB) {
 253       // TODO: CodeSize should account for multiple functions.
 254       CodeSize += MI.getDesc().Size;
 255
 256       unsigned numOperands = MI.getNumOperands();
 257       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
 258         const MachineOperand &MO = MI.getOperand(op_idx);
 259         unsigned width = 0;
 260         bool isSGPR = false;
 261
 262         if (!MO.isReg()) {
 263           continue;
 264         }
 265         unsigned reg = MO.getReg();
 266         if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO ||
 267             reg == AMDGPU::VCC_HI) {
 268           VCCUsed = true;
 269           continue;
 270         } else if (reg == AMDGPU::FLAT_SCR ||
 271                    reg == AMDGPU::FLAT_SCR_LO ||
 272                    reg == AMDGPU::FLAT_SCR_HI) {
 273           FlatUsed = true;
 274           continue;
 275         }
 276
 277         switch (reg) {
 278         default: break;
 279         case AMDGPU::SCC:
 280         case AMDGPU::EXEC:
 281         case AMDGPU::M0:
 282           continue;
 283         }
 284
 285         if (AMDGPU::SReg_32RegClass.contains(reg)) {
 286           isSGPR = true;
 287           width = 1;
 288         } else if (AMDGPU::VReg_32RegClass.contains(reg)) {
 289           isSGPR = false;
 290           width = 1;
 291         } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
 292           isSGPR = true;
 293           width = 2;
 294         } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
 295           isSGPR = false;
 296           width = 2;
 297         } else if (AMDGPU::VReg_96RegClass.contains(reg)) {
 298           isSGPR = false;
 299           width = 3;
 300         } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
 301           isSGPR = true;
 302           width = 4;
 303         } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
 304           isSGPR = false;
 305           width = 4;
 306         } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
 307           isSGPR = true;
 308           width = 8;
 309         } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
 310           isSGPR = false;
 311           width = 8;
 312         } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
 313           isSGPR = true;
 314           width = 16;
 315         } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
 316           isSGPR = false;
 317           width = 16;
 318         } else {
 319           llvm_unreachable("Unknown register class");
 320         }
 321         unsigned hwReg = RI->getEncodingValue(reg) & 0xff;
 322         unsigned maxUsed = hwReg + width - 1;
 323         if (isSGPR) {
 324           MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
 325         } else {
 326           MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
 327         }
 328       }
 329     }
 330   }
 331
 332   if (VCCUsed)
 333     MaxSGPR += 2;
 334
 335   if (FlatUsed)
 336     MaxSGPR += 2;
 337
 338   // We found the maximum register index. They start at 0, so add one to get the
 339   // number of registers.
 340   ProgInfo.NumVGPR = MaxVGPR + 1;
 341   ProgInfo.NumSGPR = MaxSGPR + 1;
 342
 343   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
 344   // register.
 345   ProgInfo.FloatMode = getFPMode(MF);
 346
 347   // XXX: Not quite sure what this does, but sc seems to unset this.
 348   ProgInfo.IEEEMode = 0;
 349
 350   // Do not clamp NAN to 0.
 351   ProgInfo.DX10Clamp = 0;
 352
 353   const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
 354   ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
 355
 356   ProgInfo.FlatUsed = FlatUsed;
 357   ProgInfo.VCCUsed = VCCUsed;
 358   ProgInfo.CodeLen = CodeSize;
 359 }
 360
 361 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
 362                                          const SIProgramInfo &KernelInfo) {
 363   const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
 364   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 365
 366   unsigned RsrcReg;
 367   switch (MFI->getShaderType()) {
 368   default: // Fall through
 369   case ShaderType::COMPUTE:  RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break;
 370   case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break;
 371   case ShaderType::PIXEL:    RsrcReg = R_00B028_SPI_SHADER_PGM_RSRC1_PS; break;
 372   case ShaderType::VERTEX:   RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break;
 373   }
 374
 375   unsigned LDSAlignShift;
 376   if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
 377     // LDS is allocated in 64 dword blocks.
 378     LDSAlignShift = 8;
 379   } else {
 380     // LDS is allocated in 128 dword blocks.
 381     LDSAlignShift = 9;
 382   }
 383
 384   unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
 385                           MFI->getMaximumWorkGroupSize(MF);
 386
 387   unsigned LDSBlocks =
 388      RoundUpToAlignment(MFI->LDSSize + LDSSpillSize,
 389                               1 << LDSAlignShift) >> LDSAlignShift;
 390
 391   // Scratch is allocated in 256 dword blocks.
 392   unsigned ScratchAlignShift = 10;
 393   // We need to program the hardware with the amount of scratch memory that
 394   // is used by the entire wave.  KernelInfo.ScratchSize is the amount of
 395   // scratch memory used per thread.
 396   unsigned ScratchBlocks =
 397     RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(),
 398                        1 << ScratchAlignShift) >> ScratchAlignShift;
 399
 400   unsigned VGPRBlocks = (KernelInfo.NumVGPR - 1) / 4;
 401   unsigned SGPRBlocks = (KernelInfo.NumSGPR - 1) / 8;
 402
 403   if (MFI->getShaderType() == ShaderType::COMPUTE) {
 404     OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
 405
 406     const uint32_t ComputePGMRSrc1 =
 407       S_00B848_VGPRS(VGPRBlocks) |
 408       S_00B848_SGPRS(SGPRBlocks) |
 409       S_00B848_PRIORITY(KernelInfo.Priority) |
 410       S_00B848_FLOAT_MODE(KernelInfo.FloatMode) |
 411       S_00B848_PRIV(KernelInfo.Priv) |
 412       S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) |
 413       S_00B848_IEEE_MODE(KernelInfo.DebugMode) |
 414       S_00B848_IEEE_MODE(KernelInfo.IEEEMode);
 415
 416     OutStreamer.EmitIntValue(ComputePGMRSrc1, 4);
 417
 418     OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
 419     const uint32_t ComputePGMRSrc2 =
 420       S_00B84C_LDS_SIZE(LDSBlocks) |
 421       S_00B02C_SCRATCH_EN(ScratchBlocks > 0);
 422
 423     OutStreamer.EmitIntValue(ComputePGMRSrc2, 4);
 424
 425     OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
 426     OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4);
 427
 428     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
 429     // 0" comment but I don't see a corresponding field in the register spec.
 430   } else {
 431     OutStreamer.EmitIntValue(RsrcReg, 4);
 432     OutStreamer.EmitIntValue(S_00B028_VGPRS(VGPRBlocks) |
 433                              S_00B028_SGPRS(SGPRBlocks), 4);
 434   }
 435
 436   if (MFI->getShaderType() == ShaderType::PIXEL) {
 437     OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
 438     OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4);
 439     OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
 440     OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
 441   }
 442 }