#define DEBUG_TYPE "nvptx-isel"
-unsigned FMAContractLevel = 0;
-
-static cl::opt<unsigned, true>
-FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
- cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
- " 1: do it 2: do it aggressively"),
- cl::location(FMAContractLevel),
- cl::init(2));
-
static cl::opt<int> UsePrecDivF32(
"nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
CodeGenOpt::Level OptLevel)
: SelectionDAGISel(tm, OptLevel),
Subtarget(tm.getSubtarget<NVPTXSubtarget>()) {
-
- doFMAF32 = (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel >= 1);
- doFMAF64 = (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel >= 1);
- doFMAF32AGG =
- (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel == 2);
- doFMAF64AGG =
- (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel == 2);
-
- allowFMA = (FMAContractLevel >= 1);
-
doMulWide = (OptLevel > 0);
}
}
}
+bool NVPTXDAGToDAGISel::allowFMA() const {
+ const NVPTXTargetLowering *TL = (NVPTXTargetLowering *)getTargetLowering();
+ return TL->allowFMA(*MF, OptLevel);
+}
+
/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
- // If true, generate corresponding FPCONTRACT. This is
- // language dependent (i.e. CUDA and OpenCL works differently).
- bool doFMAF64;
- bool doFMAF32;
- bool doFMAF64AGG;
- bool doFMAF32AGG;
- bool allowFMA;
-
// If true, generate mul.wide from sext and mul
bool doMulWide;
int getDivF32Level() const;
bool usePrecSqrtF32() const;
bool useF32FTZ() const;
+ bool allowFMA() const;
public:
explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
"nvptx-sched4reg",
cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
+static cl::opt<unsigned>
+FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
+ " 1: do it 2: do it aggressively"),
+ cl::init(2));
+
static bool IsPTXVectorType(MVT VT) {
switch (VT.SimpleTy) {
default:
// NVPTX DAG Combining
//===----------------------------------------------------------------------===//
-extern unsigned FMAContractLevel;
+bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
+ CodeGenOpt::Level OptLevel) const {
+ const Function *F = MF.getFunction();
+ const TargetOptions &TO = MF.getTarget().Options;
+
+ // Always honor command-line argument
+ if (FMAContractLevelOpt.getNumOccurrences() > 0) {
+ return FMAContractLevelOpt > 0;
+ } else if (OptLevel == 0) {
+ // Do not contract if we're not optimizing the code
+ return false;
+ } else if (TO.AllowFPOpFusion == FPOpFusion::Fast || TO.UnsafeFPMath) {
+ // Honor TargetOptions flags that explicitly say fusion is okay
+ return true;
+ } else if (F->hasFnAttribute("unsafe-fp-math")) {
+ // Check for unsafe-fp-math=true coming from Clang
+ Attribute Attr = F->getFnAttribute("unsafe-fp-math");
+ StringRef Val = Attr.getValueAsString();
+ if (Val == "true")
+ return true;
+ }
+
+ // We did not have a clear indication that fusion is allowed, so assume not
+ return false;
+}
/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
}
else if (N0.getOpcode() == ISD::FMUL) {
if (VT == MVT::f32 || VT == MVT::f64) {
- if (FMAContractLevel == 0)
+ NVPTXTargetLowering *TLI =
+ (NVPTXTargetLowering *)&DAG.getTargetLoweringInfo();
+ if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
return SDValue();
// For floating point:
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(EVT VT) const override;
+ bool allowFMA(MachineFunction &MF, CodeGenOpt::Level OptLevel) const;
+
+ virtual bool isFMAFasterThanFMulAndFAdd(EVT) const {
+ return true;
+ }
+
private:
const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here
def doF32FTZ : Predicate<"useF32FTZ()">;
def doNoF32FTZ : Predicate<"!useF32FTZ()">;
-def doFMAF32 : Predicate<"doFMAF32">;
-def doFMAF32_ftz : Predicate<"(doFMAF32 && useF32FTZ())">;
-def doFMAF32AGG : Predicate<"doFMAF32AGG">;
-def doFMAF32AGG_ftz : Predicate<"(doFMAF32AGG && useF32FTZ())">;
-def doFMAF64 : Predicate<"doFMAF64">;
-def doFMAF64AGG : Predicate<"doFMAF64AGG">;
-
def doMulWide : Predicate<"doMulWide">;
-def allowFMA : Predicate<"allowFMA">;
-def allowFMA_ftz : Predicate<"(allowFMA && useF32FTZ())">;
+def allowFMA : Predicate<"allowFMA()">;
+def noFMA : Predicate<"!allowFMA()">;
def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
!strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
[(set Float32Regs:$dst,
(OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[allowFMA_ftz]>;
+ Requires<[allowFMA, doF32FTZ]>;
def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
!strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
[(set Float32Regs:$dst,
(OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[allowFMA_ftz]>;
+ Requires<[allowFMA, doF32FTZ]>;
def f32rr : NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
!strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
(ins Float64Regs:$a, Float64Regs:$b),
!strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
[(set Float64Regs:$dst,
- (OpNode Float64Regs:$a, Float64Regs:$b))]>;
+ (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+ Requires<[noFMA]>;
def f64ri : NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, f64imm:$b),
!strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
[(set Float64Regs:$dst,
- (OpNode Float64Regs:$a, fpimm:$b))]>;
+ (OpNode Float64Regs:$a, fpimm:$b))]>,
+ Requires<[noFMA]>;
def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
!strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
[(set Float32Regs:$dst,
(OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[doF32FTZ]>;
+ Requires<[noFMA, doF32FTZ]>;
def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
!strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
[(set Float32Regs:$dst,
(OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[doF32FTZ]>;
+ Requires<[noFMA, doF32FTZ]>;
def f32rr : NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, Float32Regs:$b),
!strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
[(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+ (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[noFMA]>;
def f32ri : NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),
!strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
[(set Float32Regs:$dst,
- (OpNode Float32Regs:$a, fpimm:$b))]>;
+ (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[noFMA]>;
}
multiclass F2<string OpcStr, SDNode OpNode> {
}
defm FMA32_ftz : FPCONTRACT32<"fma.rn.ftz.f32", doF32FTZ>;
-defm FMA32 : FPCONTRACT32<"fma.rn.f32", doNoF32FTZ>;
-defm FMA64 : FPCONTRACT64<"fma.rn.f64", doNoF32FTZ>;
+defm FMA32 : FPCONTRACT32<"fma.rn.f32", true>;
+defm FMA64 : FPCONTRACT64<"fma.rn.f64", true>;
def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
"sin.approx.f32 \t$dst, $src;",
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s
;; These tests should run for all targets
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
define ptx_device float @t1_f32(float %x, float %y, float %z) {
; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
--- /dev/null
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefix=DEFAULT
+
+target triple = "nvptx64-unknown-cuda"
+
+;; Make sure we are generating proper instruction sequences for fused ops
+;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit
+;; add.f32 otherwise. Without an explicit rounding mode on add.f32, ptxas
+;; is free to fuse with a multiply if it is able. If fusion is not allowed,
+;; we do not form fma.rn at the PTX level and explicitly generate add.rn
+;; for all adds to prevent ptxas from fusion the ops.
+
+;; FAST-LABEL: @t0
+;; DEFAULT-LABEL: @t0
+define float @t0(float %a, float %b, float %c) {
+;; FAST: fma.rn.f32
+;; DEFAULT: mul.rn.f32
+;; DEFAULT: add.rn.f32
+ %v0 = fmul float %a, %b
+ %v1 = fadd float %v0, %c
+ ret float %v1
+}
+
+;; FAST-LABEL: @t1
+;; DEFAULT-LABEL: @t1
+define float @t1(float %a, float %b) {
+;; We cannot form an fma here, but make sure we explicitly emit add.rn.f32
+;; to prevent ptxas from fusing this with anything else.
+;; FAST: add.f32
+;; DEFAULT: add.rn.f32
+ %v1 = fadd float %a, %b
+ ret float %v1
+}
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
+
+target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
; Make sure we can properly differentiate between single-precision and
; double-precision FP literals.
; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 -asm-verbose=1 | FileCheck %s
; CHECK: // implicit-def: %f[[F0:[0-9]+]]
-; CHECK: add.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]];
+; CHECK: add.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]];
define float @foo(float %a) {
%ret = fadd float %a, undef
ret float %ret