Removing LLVM_DELETED_FUNCTION, as MSVC 2012 was the last reason for requiring the...

[oota-llvm.git] / lib / Target / R600 / AMDGPUISelLowering.cpp
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp

index 2f23de0cb2ec0af5216f6581f905abae12bc12f1..d96f03aaeb0545c2e4018d4780178014e9e27152 100644 (file)
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -102,11 +102,9 @@ EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) {
    return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
  }
  
-AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
-  TargetLowering(TM) {
-
-  Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();
-
+AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
+                                           const AMDGPUSubtarget &STI)
+    : TargetLowering(TM), Subtarget(&STI) {
    setOperationAction(ISD::Constant, MVT::i32, Legal);
    setOperationAction(ISD::Constant, MVT::i64, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
@@ -127,9 +125,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
    setOperationAction(ISD::FABS,   MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FRINT,  MVT::f32, Legal);
-  setOperationAction(ISD::FROUND, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
  
+  setOperationAction(ISD::FROUND, MVT::f32, Custom);
+  setOperationAction(ISD::FROUND, MVT::f64, Custom);
+
    setOperationAction(ISD::FREM, MVT::f32, Custom);
    setOperationAction(ISD::FREM, MVT::f64, Custom);
  
@@ -141,9 +141,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
    setOperationAction(ISD::STORE, MVT::v2f32, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
  
-  setOperationAction(ISD::STORE, MVT::i64, Promote);
-  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
-
    setOperationAction(ISD::STORE, MVT::v4f32, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
  
@@ -162,9 +159,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
    // Custom lowering of vector stores is required for local address space
    // stores.
    setOperationAction(ISD::STORE, MVT::v4i32, Custom);
-  // XXX: Native v2i32 local address space stores are possible, but not
-  // currently implemented.
-  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
    setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
@@ -187,9 +181,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
    setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
  
-  setOperationAction(ISD::LOAD, MVT::i64, Promote);
-  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
-
    setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
  
@@ -216,18 +207,28 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
  
-  setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);
+  // There are no 64-bit extloads. These should be done as a 32-bit extload and
+  // an extension to 64-bit.
+  for (MVT VT : MVT::integer_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
+  }
+
+  for (MVT VT : MVT::integer_vector_valuetypes()) {
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
+  }
  
    setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  
@@ -246,7 +247,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
  
    setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  
-  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
    setTruncStoreAction(MVT::f32, MVT::f16, Expand);
    setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  
@@ -400,6 +402,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
    // large sequence of instructions.
    setIntDivIsCheap(false);
    setPow2SDivIsCheap(false);
+  setFsqrtIsCheap(true);
  
    // FIXME: Need to really handle these.
    MaxStoresPerMemcpy  = 4096;
@@ -468,6 +471,18 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
            (LScalarSize < 32));
  }
  
+// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
+// profitable with the expansion for 64-bit since it's generally good to
+// speculate things.
+// FIXME: These should really have the size as a parameter.
+bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
+  return true;
+}
+
+bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
+  return true;
+}
+
  //===---------------------------------------------------------------------===//
  // Target Properties
  //===---------------------------------------------------------------------===//
@@ -586,6 +601,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
    case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
    case ISD::FRINT: return LowerFRINT(Op, DAG);
    case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
+  case ISD::FROUND: return LowerFROUND(Op, DAG);
    case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
    case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
    case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
@@ -645,7 +661,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
                                                         const SDValue &InitPtr,
                                                         SDValue Chain,
                                                         SelectionDAG &DAG) const {
-  const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout();
+  const DataLayout *TD = getDataLayout();
    SDLoc DL(InitPtr);
    Type *InitTy = Init->getType();
  
@@ -733,7 +749,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                   SDValue Op,
                                                   SelectionDAG &DAG) const {
  
-  const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout();
+  const DataLayout *TD = getDataLayout();
    GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
    const GlobalValue *GV = G->getGlobal();
  
@@ -836,8 +852,7 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
                                                SelectionDAG &DAG) const {
  
    MachineFunction &MF = DAG.getMachineFunction();
-  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
-      getTargetMachine().getSubtargetImpl()->getFrameLowering());
+  const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering();
  
    FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
  
@@ -892,10 +907,9 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
      }
  
      case Intrinsic::AMDGPU_div_fmas:
-      // FIXME: Dropping bool parameter. Work is needed to support the implicit
-      // read from VCC.
        return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
-                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+                         Op.getOperand(4));
  
      case Intrinsic::AMDGPU_div_fixup:
        return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
@@ -1396,24 +1410,6 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
    EVT VT = Op.getValueType();
    EVT MemVT = Load->getMemoryVT();
  
-  if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) {
-    // We can do the extload to 32-bits, and then need to separately extend to
-    // 64-bits.
-
-    SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32,
-                                       Load->getChain(),
-                                       Load->getBasePtr(),
-                                       MemVT,
-                                       Load->getMemOperand());
-
-    SDValue Ops[] = {
-      DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32),
-      ExtLoad32.getValue(1)
-    };
-
-    return DAG.getMergeValues(Ops, DL);
-  }
-
    if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
      assert(VT == MVT::i1 && "Only i1 non-extloads expected");
      // FIXME: Copied from PPC
@@ -1652,12 +1648,26 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
    SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
  
+  if (VT == MVT::i64 &&
+    DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
+    DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
+
+    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
+                              LHS_Lo, RHS_Lo);
+
+    SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero);
+    SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero);
+    Results.push_back(DIV);
+    Results.push_back(REM);
+    return;
+  }
+
    // Get Speculative values
    SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
    SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
  
-  SDValue REM_Hi = zero;
    SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
+  SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero);
  
    SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
    SDValue DIV_Lo = zero;
@@ -1665,8 +1675,10 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
    const unsigned halfBitWidth = HalfVT.getSizeInBits();
  
    for (unsigned i = 0; i < halfBitWidth; ++i) {
-    SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
-    // Get Value of high bit
+    const unsigned bitPos = halfBitWidth - i - 1;
+    SDValue POS = DAG.getConstant(bitPos, HalfVT);
+    // Get value of high bit
+    // TODO: Remove the BFE part when the optimization is fixed
      SDValue HBit;
      if (halfBitWidth == 32 && Subtarget->hasBFE()) {
        HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
@@ -1674,33 +1686,23 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
        HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
        HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
      }
+    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
  
-    SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
-      DAG.getConstant(halfBitWidth - 1, HalfVT));
-    REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
-    REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
-
-    REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
-    REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
-
+    // Shift
+    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, VT));
+    // Add LHS high bit
+    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
  
-    SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
-
-    SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
+    SDValue BIT = DAG.getConstant(1 << bitPos, HalfVT);
      SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
  
      DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
  
      // Update REM
-
      SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
-
      REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
-    REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
-    REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
    }
  
-  SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
    SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
    Results.push_back(DIV);
    Results.push_back(REM);
@@ -1721,8 +1723,8 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
    SDValue Den = Op.getOperand(1);
  
    if (VT == MVT::i32) {
-    if (DAG.MaskedValueIsZero(Op.getOperand(0), APInt(32, 0xff << 24)) &&
-        DAG.MaskedValueIsZero(Op.getOperand(1), APInt(32, 0xff << 24))) {
+    if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) &&
+        DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) {
        // TODO: We technically could do this for i64, but shouldn't that just be
        // handled by something generally reducing 64-bit division on 32-bit
        // values to 32-bit?
@@ -1834,19 +1836,31 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
    SDValue LHS = Op.getOperand(0);
    SDValue RHS = Op.getOperand(1);
  
-  if (VT == MVT::i32) {
-    if (DAG.ComputeNumSignBits(Op.getOperand(0)) > 8 &&
-        DAG.ComputeNumSignBits(Op.getOperand(1)) > 8) {
-      // TODO: We technically could do this for i64, but shouldn't that just be
-      // handled by something generally reducing 64-bit division on 32-bit
-      // values to 32-bit?
-      return LowerDIVREM24(Op, DAG, true);
-    }
-  }
-
    SDValue Zero = DAG.getConstant(0, VT);
    SDValue NegOne = DAG.getConstant(-1, VT);
  
+  if (VT == MVT::i32 &&
+      DAG.ComputeNumSignBits(LHS) > 8 &&
+      DAG.ComputeNumSignBits(RHS) > 8) {
+    return LowerDIVREM24(Op, DAG, true);
+  }
+  if (VT == MVT::i64 &&
+      DAG.ComputeNumSignBits(LHS) > 32 &&
+      DAG.ComputeNumSignBits(RHS) > 32) {
+    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+
+    // Hi/Lo split
+    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
+    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
+    SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
+                                 LHS_Lo, RHS_Lo);
+    SDValue Res[2] = {
+      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
+      DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
+    };
+    return DAG.getMergeValues(Res, DL);
+  }
+
    SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
    SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
    SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
@@ -1911,6 +1925,20 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
    return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
  }
  
+static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) {
+  const unsigned FractBits = 52;
+  const unsigned ExpBits = 11;
+
+  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+                                Hi,
+                                DAG.getConstant(FractBits - 32, MVT::i32),
+                                DAG.getConstant(ExpBits, MVT::i32));
+  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
+                            DAG.getConstant(1023, MVT::i32));
+
+  return Exp;
+}
+
  SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
    SDLoc SL(Op);
    SDValue Src = Op.getOperand(0);
@@ -1926,16 +1954,9 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
    // exponent.
    SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
  
-  const unsigned FractBits = 52;
-  const unsigned ExpBits = 11;
+  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
  
-  // Extract the exponent.
-  SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
-                                Hi,
-                                DAG.getConstant(FractBits - 32, MVT::i32),
-                                DAG.getConstant(ExpBits, MVT::i32));
-  SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
-                            DAG.getConstant(1023, MVT::i32));
+  const unsigned FractBits = 52;
  
    // Extract the sign bit.
    const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32);
@@ -1998,6 +2019,99 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con
    return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
  }
  
+// XXX - May require not supporting f32 denormals?
+SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue X = Op.getOperand(0);
+
+  SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
+
+  SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
+
+  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
+
+  const SDValue Zero = DAG.getConstantFP(0.0, MVT::f32);
+  const SDValue One = DAG.getConstantFP(1.0, MVT::f32);
+  const SDValue Half = DAG.getConstantFP(0.5, MVT::f32);
+
+  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+
+  SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
+
+  SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero);
+
+  return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel);
+}
+
+SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue X = Op.getOperand(0);
+
+  SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
+
+  const SDValue Zero = DAG.getConstant(0, MVT::i32);
+  const SDValue One = DAG.getConstant(1, MVT::i32);
+  const SDValue NegOne = DAG.getConstant(-1, MVT::i32);
+  const SDValue FiftyOne = DAG.getConstant(51, MVT::i32);
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
+
+
+  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
+
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
+
+  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
+
+  const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), MVT::i64);
+
+  SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
+  SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
+                          DAG.getConstant(INT64_C(0x0008000000000000), MVT::i64),
+                          Exp);
+
+  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
+  SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
+                              DAG.getConstant(0, MVT::i64), Tmp0,
+                              ISD::SETNE);
+
+  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
+                             D, DAG.getConstant(0, MVT::i64));
+  SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
+
+  K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
+  K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
+
+  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
+  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
+  SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
+
+  SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
+                            ExpEqNegOne,
+                            DAG.getConstantFP(1.0, MVT::f64),
+                            DAG.getConstantFP(0.0, MVT::f64));
+
+  SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
+
+  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
+  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
+
+  return K;
+}
+
+SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  if (VT == MVT::f32)
+    return LowerFROUND32(Op, DAG);
+
+  if (VT == MVT::f64)
+    return LowerFROUND64(Op, DAG);
+
+  llvm_unreachable("unhandled type");
+}
+
  SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
    SDLoc SL(Op);
    SDValue Src = Op.getOperand(0);
@@ -2552,6 +2666,46 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
    }
  }
  
+SDValue AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand,
+                                               DAGCombinerInfo &DCI,
+                                               unsigned &RefinementSteps,
+                                               bool &UseOneConstNR) const {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = Operand.getValueType();
+
+  if (VT == MVT::f32) {
+    RefinementSteps = 0;
+    return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
+  }
+
+  // TODO: There is also an f64 rsq instruction, but the documentation is less
+  // clear on its precision.
+
+  return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
+                                               DAGCombinerInfo &DCI,
+                                               unsigned &RefinementSteps) const {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = Operand.getValueType();
+
+  if (VT == MVT::f32) {
+    // Reciprocal, < 1 ulp error.
+    //
+    // This reciprocal approximation converges to < 0.5 ulp error with one
+    // Newton-Raphson step performed with two fused multiply-adds (FMAs).
+
+    RefinementSteps = 0;
+    return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
+  }
+
+  // TODO: There is also an f64 rcp instruction, but the documentation is less
+  // clear on its precision.
+
+  return SDValue();
+}
+
  static void computeKnownBitsForMinMax(const SDValue Op0,
                                        const SDValue Op1,
                                        APInt &KnownZero,