using namespace llvm;
+// TODO: This should get the default rounding mode from the kernel. We just set
+// the default here, but this could change if the OpenCL rounding mode pragmas
+// are used.
+//
+// The denormal mode here should match what is reported by the OpenCL runtime
+// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
+// can also be override to flush with the -cl-denorms-are-zero compiler flag.
+//
+// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
+// precision, and leaves single precision to flush all and does not report
+// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
+// CL_FP_DENORM for both.
+static uint32_t getFPMode(MachineFunction &) {
+ return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
+ FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
+ FP_DENORM_MODE_SP(FP_DENORM_FLUSH_NONE) |
+ FP_DENORM_MODE_DP(FP_DENORM_FLUSH_NONE);
+}
static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm,
MCStreamer &Streamer) {
false);
OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
false);
+ OutStreamer.emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
+ false);
+ OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
+ false);
} else {
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
OutStreamer.emitRawComment(
if (VCCUsed)
MaxSGPR += 2;
- ProgInfo.CodeLen = CodeSize;
- ProgInfo.NumSGPR = MaxSGPR;
ProgInfo.NumVGPR = MaxVGPR;
+ ProgInfo.NumSGPR = MaxSGPR;
+
+ // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
+ // register.
+ ProgInfo.FloatMode = getFPMode(MF);
+
+ // XXX: Not quite sure what this does, but sc seems to unset this.
+ ProgInfo.IEEEMode = 0;
+
+ // Do not clamp NAN to 0.
+ ProgInfo.DX10Clamp = 0;
+
+ ProgInfo.CodeLen = CodeSize;
}
void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF,
const SIProgramInfo &KernelInfo) {
const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
-
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
unsigned RsrcReg;
switch (MFI->ShaderType) {
default: // Fall through
case ShaderType::VERTEX: RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break;
}
- OutStreamer.EmitIntValue(RsrcReg, 4);
- OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |
- S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4);
-
unsigned LDSAlignShift;
if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
- // LDS is allocated in 64 dword blocks
+ // LDS is allocated in 64 dword blocks.
LDSAlignShift = 8;
} else {
- // LDS is allocated in 128 dword blocks
+ // LDS is allocated in 128 dword blocks.
LDSAlignShift = 9;
}
+
unsigned LDSBlocks =
- RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
+ RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
if (MFI->ShaderType == ShaderType::COMPUTE) {
+ OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
+
+ const uint32_t ComputePGMRSrc1 =
+ S_00B848_VGPRS(KernelInfo.NumVGPR / 4) |
+ S_00B848_SGPRS(KernelInfo.NumSGPR / 8) |
+ S_00B848_PRIORITY(KernelInfo.Priority) |
+ S_00B848_FLOAT_MODE(KernelInfo.FloatMode) |
+ S_00B848_PRIV(KernelInfo.Priv) |
+ S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) |
+ S_00B848_IEEE_MODE(KernelInfo.DebugMode) |
+ S_00B848_IEEE_MODE(KernelInfo.IEEEMode);
+
+ OutStreamer.EmitIntValue(ComputePGMRSrc1, 4);
+
OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4);
+ } else {
+ OutStreamer.EmitIntValue(RsrcReg, 4);
+ OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) |
+ S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4);
}
+
if (MFI->ShaderType == ShaderType::PIXEL) {
OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4);
#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15)
#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
+
+#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848
+#define S_00B848_VGPRS(x) (((x) & 0x3F) << 0)
+#define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F)
+#define C_00B848_VGPRS 0xFFFFFFC0
+#define S_00B848_SGPRS(x) (((x) & 0x0F) << 6)
+#define G_00B848_SGPRS(x) (((x) >> 6) & 0x0F)
+#define C_00B848_SGPRS 0xFFFFFC3F
+#define S_00B848_PRIORITY(x) (((x) & 0x03) << 10)
+#define G_00B848_PRIORITY(x) (((x) >> 10) & 0x03)
+#define C_00B848_PRIORITY 0xFFFFF3FF
+#define S_00B848_FLOAT_MODE(x) (((x) & 0xFF) << 12)
+#define G_00B848_FLOAT_MODE(x) (((x) >> 12) & 0xFF)
+#define C_00B848_FLOAT_MODE 0xFFF00FFF
+#define S_00B848_PRIV(x) (((x) & 0x1) << 20)
+#define G_00B848_PRIV(x) (((x) >> 20) & 0x1)
+#define C_00B848_PRIV 0xFFEFFFFF
+#define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21)
+#define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1)
+#define C_00B848_DX10_CLAMP 0xFFDFFFFF
+#define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22)
+#define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1)
+#define C_00B848_DEBUG_MODE 0xFFBFFFFF
+#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23)
+#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1)
+#define C_00B848_IEEE_MODE 0xFF7FFFFF
+
+
+// Helpers for setting FLOAT_MODE
+#define FP_ROUND_ROUND_TO_NEAREST 0
+#define FP_ROUND_ROUND_TO_INF 1
+#define FP_ROUND_ROUND_TO_NEGINF 2
+#define FP_ROUND_ROUND_TO_ZERO 3
+
+// Bits 3:0 control rounding mode. 1:0 control single precision, 3:2 double
+// precision.
+#define FP_ROUND_MODE_SP(x) ((x) & 0x3)
+#define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2)
+
+#define FP_DENORM_FLUSH_IN_FLUSH_OUT 0
+#define FP_DENORM_FLUSH_OUT 1
+#define FP_DENORM_FLUSH_IN 2
+#define FP_DENORM_FLUSH_NONE 3
+
+
+// Bits 7:4 control denormal handling. 5:4 control single precision, 6:7 double
+// precision.
+#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4)
+#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6)
+
#endif // SIDEFINES_H_