[NVPTX] Improve handling of FP fusion

author Justin Holewinski <jholewinski@nvidia.com>

Thu, 17 Jul 2014 18:10:09 +0000 (18:10 +0000)

committer Justin Holewinski <jholewinski@nvidia.com>

Thu, 17 Jul 2014 18:10:09 +0000 (18:10 +0000)
author Justin Holewinski <jholewinski@nvidia.com>
Thu, 17 Jul 2014 18:10:09 +0000 (18:10 +0000)
committer Justin Holewinski <jholewinski@nvidia.com>
Thu, 17 Jul 2014 18:10:09 +0000 (18:10 +0000)
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

index 35d553227e88308b2a66b47047b36b3c78038ac2..aeda71ff0e9034869731ff3315d14ca8e0f17c9e 100644 (file)
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -24,15 +24,6 @@ using namespace llvm;
  
  #define DEBUG_TYPE "nvptx-isel"
  
-unsigned FMAContractLevel = 0;
-
-static cl::opt<unsigned, true>
-FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
-                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
-                             " 1: do it  2: do it aggressively"),
-                    cl::location(FMAContractLevel),
-                    cl::init(2));
-
  static cl::opt<int> UsePrecDivF32(
      "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
      cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
@@ -61,16 +52,6 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                       CodeGenOpt::Level OptLevel)
      : SelectionDAGISel(tm, OptLevel),
        Subtarget(tm.getSubtarget<NVPTXSubtarget>()) {
-
-  doFMAF32 = (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel >= 1);
-  doFMAF64 = (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel >= 1);
-  doFMAF32AGG =
-      (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel == 2);
-  doFMAF64AGG =
-      (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel == 2);
-
-  allowFMA = (FMAContractLevel >= 1);
-
    doMulWide = (OptLevel > 0);
  }
  
@@ -116,6 +97,11 @@ bool NVPTXDAGToDAGISel::useF32FTZ() const {
    }
  }
  
+/// Ask the target lowering whether FP contraction into FMA is permitted for
+/// the function currently being selected.
+bool NVPTXDAGToDAGISel::allowFMA() const {
+  // getTargetLowering() hands back a const pointer; use a named cast that
+  // keeps the const qualifier instead of a C-style cast that strips it.
+  const NVPTXTargetLowering *TL =
+      static_cast<const NVPTXTargetLowering *>(getTargetLowering());
+  return TL->allowFMA(*MF, OptLevel);
+}
+
  /// Select - Select instructions not customized! Used for
  /// expanded, promoted and normal instructions.
  SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h

index c44ccb20ed88fd74aae2b8b5c25cd855d83e1828..c62fc253c33d192eefa49dbb529ef431e726a1e8 100644 (file)
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -24,20 +24,13 @@ namespace {
  
  class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
  
-  // If true, generate corresponding FPCONTRACT. This is
-  // language dependent (i.e. CUDA and OpenCL works differently).
-  bool doFMAF64;
-  bool doFMAF32;
-  bool doFMAF64AGG;
-  bool doFMAF32AGG;
-  bool allowFMA;
-
    // If true, generate mul.wide from sext and mul
    bool doMulWide;
  
    int getDivF32Level() const;
    bool usePrecSqrtF32() const;
    bool useF32FTZ() const;
+  bool allowFMA() const;
  
  public:
    explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp

index 258c57b17c59f7d9553159953e31714e692ebe31..645a9bb5c5e3ea4477d0c027be8fc508ea21185b 100644 (file)
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -48,6 +48,12 @@ static cl::opt<bool> sched4reg(
      "nvptx-sched4reg",
      cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
  
+// -nvptx-fma-level: explicit override for FMA contraction.
+// NVPTXTargetLowering::allowFMA() honors this flag only when it appears on
+// the command line, and treats every non-zero level the same way.
+static cl::opt<unsigned>
+FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
+                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
+                             " 1: do it, 2: do it aggressively)"),
+                    cl::init(2));
+
  static bool IsPTXVectorType(MVT VT) {
    switch (VT.SimpleTy) {
    default:
@@ -3799,7 +3805,31 @@ unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const {
  //                         NVPTX DAG Combining
  //===----------------------------------------------------------------------===//
  
-extern unsigned FMAContractLevel;
+bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
+                                   CodeGenOpt::Level OptLevel) const {
+  const Function *F = MF.getFunction();
+  const TargetOptions &TO = MF.getTarget().Options;
+
+  // Always honor command-line argument
+  if (FMAContractLevelOpt.getNumOccurrences() > 0) {
+    return FMAContractLevelOpt > 0;
+  } else if (OptLevel == 0) {
+    // Do not contract if we're not optimizing the code
+    return false;
+  } else if (TO.AllowFPOpFusion == FPOpFusion::Fast || TO.UnsafeFPMath) {
+    // Honor TargetOptions flags that explicitly say fusion is okay
+    return true;
+  } else if (F->hasFnAttribute("unsafe-fp-math")) {
+    // Check for unsafe-fp-math=true coming from Clang
+    Attribute Attr = F->getFnAttribute("unsafe-fp-math");
+    StringRef Val = Attr.getValueAsString();
+    if (Val == "true")
+      return true;
+  }
+
+  // We did not have a clear indication that fusion is allowed, so assume not
+  return false;
+}
  
  /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
  /// operands N0 and N1.  This is a helper for PerformADDCombine that is
@@ -3833,7 +3863,9 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
    }
    else if (N0.getOpcode() == ISD::FMUL) {
      if (VT == MVT::f32 || VT == MVT::f64) {
-      if (FMAContractLevel == 0)
+      NVPTXTargetLowering *TLI =
+        (NVPTXTargetLowering *)&DAG.getTargetLoweringInfo();
+      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
          return SDValue();
  
        // For floating point:
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h

index d25fc7844d3781c65dae696efbd2e2fb452230b3..bef6ed9faad6184caa1a4df38693c539712bb6a5 100644 (file)
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -503,6 +503,12 @@ public:
    TargetLoweringBase::LegalizeTypeAction
    getPreferredVectorAction(EVT VT) const override;
  
+  /// Return true if FP multiply/add pairs in \p MF may be contracted into
+  /// FMA.  An explicit -nvptx-fma-level always wins; otherwise contraction is
+  /// never done at OptLevel 0 and needs fast FP fusion or unsafe FP math.
+  bool allowFMA(MachineFunction &MF, CodeGenOpt::Level OptLevel) const;
+
+  /// Report FMA as faster than a separate FMUL + FADD for every type.
+  /// Marked 'override' (like the other overrides in this class) so that a
+  /// signature mismatch with the base-class hook is diagnosed.
+  bool isFMAFasterThanFMulAndFAdd(EVT) const override {
+    return true;
+  }
+
  private:
    const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here
  
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td

index 5919fe1c01652a78cc519782dd61374290e83216..9900b8c8433fe77b1ca4965955f8ac8c1503a8e8 100644 (file)
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -139,17 +139,10 @@ def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">;
  def doF32FTZ : Predicate<"useF32FTZ()">;
  def doNoF32FTZ : Predicate<"!useF32FTZ()">;
  
-def doFMAF32      : Predicate<"doFMAF32">;
-def doFMAF32_ftz  : Predicate<"(doFMAF32 && useF32FTZ())">;
-def doFMAF32AGG      : Predicate<"doFMAF32AGG">;
-def doFMAF32AGG_ftz  : Predicate<"(doFMAF32AGG && useF32FTZ())">;
-def doFMAF64      : Predicate<"doFMAF64">;
-def doFMAF64AGG      : Predicate<"doFMAF64AGG">;
-
  def doMulWide      : Predicate<"doMulWide">;
  
-def allowFMA : Predicate<"allowFMA">;
-def allowFMA_ftz : Predicate<"(allowFMA && useF32FTZ())">;
+def allowFMA : Predicate<"allowFMA()">;
+def noFMA : Predicate<"!allowFMA()">;
  
  def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
  def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
@@ -222,13 +215,13 @@ multiclass F3<string OpcStr, SDNode OpNode> {
                        !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
                        [(set Float32Regs:$dst,
                          (OpNode Float32Regs:$a, Float32Regs:$b))]>,
-                      Requires<[allowFMA_ftz]>;
+                      Requires<[allowFMA, doF32FTZ]>;
     def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
                        (ins Float32Regs:$a, f32imm:$b),
                        !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
                        [(set Float32Regs:$dst,
                          (OpNode Float32Regs:$a, fpimm:$b))]>,
-                      Requires<[allowFMA_ftz]>;
+                      Requires<[allowFMA, doF32FTZ]>;
     def f32rr : NVPTXInst<(outs Float32Regs:$dst),
                        (ins Float32Regs:$a, Float32Regs:$b),
                        !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
@@ -248,34 +241,38 @@ multiclass F3_rn<string OpcStr, SDNode OpNode> {
                        (ins Float64Regs:$a, Float64Regs:$b),
                        !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
                        [(set Float64Regs:$dst,
-                        (OpNode Float64Regs:$a, Float64Regs:$b))]>;
+                        (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+                      Requires<[noFMA]>;
     def f64ri : NVPTXInst<(outs Float64Regs:$dst),
                        (ins Float64Regs:$a, f64imm:$b),
                        !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
                        [(set Float64Regs:$dst,
-                        (OpNode Float64Regs:$a, fpimm:$b))]>;
+                        (OpNode Float64Regs:$a, fpimm:$b))]>,
+                      Requires<[noFMA]>;
     def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
                        (ins Float32Regs:$a, Float32Regs:$b),
                        !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
                        [(set Float32Regs:$dst,
                          (OpNode Float32Regs:$a, Float32Regs:$b))]>,
-                      Requires<[doF32FTZ]>;
+                      Requires<[noFMA, doF32FTZ]>;
     def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
                        (ins Float32Regs:$a, f32imm:$b),
                        !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
                        [(set Float32Regs:$dst,
                          (OpNode Float32Regs:$a, fpimm:$b))]>,
-                      Requires<[doF32FTZ]>;
+                      Requires<[noFMA, doF32FTZ]>;
     def f32rr : NVPTXInst<(outs Float32Regs:$dst),
                        (ins Float32Regs:$a, Float32Regs:$b),
                        !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
                        [(set Float32Regs:$dst,
-                        (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+                        (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[noFMA]>;
     def f32ri : NVPTXInst<(outs Float32Regs:$dst),
                        (ins Float32Regs:$a, f32imm:$b),
                        !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
                        [(set Float32Regs:$dst,
-                        (OpNode Float32Regs:$a, fpimm:$b))]>;
+                        (OpNode Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[noFMA]>;
  }
  
  multiclass F2<string OpcStr, SDNode OpNode> {
@@ -919,8 +916,8 @@ multiclass FPCONTRACT64<string OpcStr, Predicate Pred> {
  }
  
  defm FMA32_ftz  : FPCONTRACT32<"fma.rn.ftz.f32", doF32FTZ>;
-defm FMA32  : FPCONTRACT32<"fma.rn.f32", doNoF32FTZ>;
-defm FMA64  : FPCONTRACT64<"fma.rn.f64", doNoF32FTZ>;
+defm FMA32  : FPCONTRACT32<"fma.rn.f32", true>;
+defm FMA64  : FPCONTRACT64<"fma.rn.f64", true>;
  
  def SINF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                        "sin.approx.f32 \t$dst, $src;",
diff --git a/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll b/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll

index 8cf83d449a3224943b160ae3e719a16b8618959c..c167db4b46dcd9757a1544d6d4e10484aef162d0 100644 (file)
--- a/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
+++ b/test/CodeGen/NVPTX/arithmetic-fp-sm20.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s
  
  ;; These tests should run for all targets
  
diff --git a/test/CodeGen/NVPTX/fma.ll b/test/CodeGen/NVPTX/fma.ll

index 7716f77ef37b22fcb216e493603b2e84ebe4a8f4..14b5c45b87d80bd3d47c67762fedda8cdf3a519e 100644 (file)
--- a/test/CodeGen/NVPTX/fma.ll
+++ b/test/CodeGen/NVPTX/fma.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
  
  define ptx_device float @t1_f32(float %x, float %y, float %z) {
  ; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
diff --git a/test/CodeGen/NVPTX/fp-contract.ll b/test/CodeGen/NVPTX/fp-contract.ll

new file mode 100644 (file)

index 0000000..3f68b18
--- /dev/null
+++ b/test/CodeGen/NVPTX/fp-contract.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -fp-contract=fast | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefix=DEFAULT
+
+target triple = "nvptx64-unknown-cuda"
+
+;; Make sure we are generating proper instruction sequences for fused ops.
+;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit
+;; add.f32 otherwise.  Without an explicit rounding mode on add.f32, ptxas
+;; is free to fuse with a multiply if it is able.  If fusion is not allowed,
+;; we do not form fma.rn at the PTX level and explicitly generate add.rn
+;; for all adds to prevent ptxas from fusing the ops.
+
+;; FAST-LABEL: @t0
+;; DEFAULT-LABEL: @t0
+define float @t0(float %a, float %b, float %c) {
+;; FAST: fma.rn.f32
+;; DEFAULT: mul.rn.f32
+;; DEFAULT: add.rn.f32
+  %v0 = fmul float %a, %b
+  %v1 = fadd float %v0, %c
+  ret float %v1
+}
+
+;; FAST-LABEL: @t1
+;; DEFAULT-LABEL: @t1
+define float @t1(float %a, float %b) {
+;; We cannot form an fma here.  When fusion is not allowed, make sure we
+;; explicitly emit add.rn.f32 to prevent ptxas from fusing this with anything.
+;; FAST: add.f32
+;; DEFAULT: add.rn.f32
+  %v1 = fadd float %a, %b
+  ret float %v1
+}
diff --git a/test/CodeGen/NVPTX/fp-literals.ll b/test/CodeGen/NVPTX/fp-literals.ll

index 8a0285b70f80393db35e9070b46bbe0708eae057..755e0f9250a1409c53d975f009b0d316dbb5a66f 100644 (file)
--- a/test/CodeGen/NVPTX/fp-literals.ll
+++ b/test/CodeGen/NVPTX/fp-literals.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
+
+target triple = "nvptx64-unknown-cuda"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
  
  ; Make sure we can properly differentiate between single-precision and
  ; double-precision FP literals.
diff --git a/test/CodeGen/NVPTX/implicit-def.ll b/test/CodeGen/NVPTX/implicit-def.ll

index 06d3d562046ea2783eabc366e7f2d3c3fa8791a2..2d2c6e527f6d8e07dbd7440c725e38f2af6ca46d 100644 (file)
--- a/test/CodeGen/NVPTX/implicit-def.ll
+++ b/test/CodeGen/NVPTX/implicit-def.ll
@@ -1,7 +1,7 @@
  ; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 -asm-verbose=1 | FileCheck %s
  
  ; CHECK: // implicit-def: %f[[F0:[0-9]+]]
-; CHECK: add.f32         %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]];
+; CHECK: add.rn.f32         %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]];
  define float @foo(float %a) {
    %ret = fadd float %a, undef
    ret float %ret
author	Justin Holewinski <jholewinski@nvidia.com>
	Thu, 17 Jul 2014 18:10:09 +0000 (18:10 +0000)
committer	Justin Holewinski <jholewinski@nvidia.com>
	Thu, 17 Jul 2014 18:10:09 +0000 (18:10 +0000)
lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp		patch \| blob \| history
lib/Target/NVPTX/NVPTXISelDAGToDAG.h		patch \| blob \| history
lib/Target/NVPTX/NVPTXISelLowering.cpp		patch \| blob \| history
lib/Target/NVPTX/NVPTXISelLowering.h		patch \| blob \| history
lib/Target/NVPTX/NVPTXInstrInfo.td		patch \| blob \| history
test/CodeGen/NVPTX/arithmetic-fp-sm20.ll		patch \| blob \| history
test/CodeGen/NVPTX/fma.ll		patch \| blob \| history
test/CodeGen/NVPTX/fp-contract.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/NVPTX/fp-literals.ll		patch \| blob \| history
test/CodeGen/NVPTX/implicit-def.ll		patch \| blob \| history