R600: Promote i64 loads to v2i32

[oota-llvm.git] / lib / Target / R600 / SIISelLowering.cpp
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp

index e68804850785a5f046056b294ea8646380ea7d90..f4911169d2a9a0d46c8305dcfdb6d58b52610132 100644 (file)
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -14,8 +14,8 @@
  
  #include "SIISelLowering.h"
  #include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
  #include "AMDGPUSubtarget.h"
-#include "AMDILIntrinsicInfo.h"
  #include "SIInstrInfo.h"
  #include "SIMachineFunctionInfo.h"
  #include "SIRegisterInfo.h"
@@ -24,26 +24,27 @@
  #include "llvm/CodeGen/MachineRegisterInfo.h"
  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/IR/Function.h"
+#include "llvm/ADT/SmallString.h"
  
  using namespace llvm;
  
  SITargetLowering::SITargetLowering(TargetMachine &TM) :
      AMDGPUTargetLowering(TM) {
    addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
-  addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass);
+  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
  
    addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
    addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
  
    addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
-  addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass);
+  addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
  
-  addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass);
-  addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass);
-  addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass);
+  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
+  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
+  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
  
-  addRegisterClass(MVT::v4i32, &AMDGPU::VSrc_128RegClass);
-  addRegisterClass(MVT::v4f32, &AMDGPU::VSrc_128RegClass);
+  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
+  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
  
    addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
    addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
@@ -76,6 +77,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
    setOperationAction(ISD::ADD, MVT::i32, Legal);
    setOperationAction(ISD::ADDC, MVT::i32, Legal);
    setOperationAction(ISD::ADDE, MVT::i32, Legal);
+  setOperationAction(ISD::SUBC, MVT::i32, Legal);
+  setOperationAction(ISD::SUBE, MVT::i32, Legal);
  
    // We need to custom lower vector stores from local memory
    setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
@@ -88,33 +91,29 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
  
    // We need to custom lower loads/stores from private memory
    setOperationAction(ISD::LOAD, MVT::i32, Custom);
-  setOperationAction(ISD::LOAD, MVT::i64, Custom);
    setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
    setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  
    setOperationAction(ISD::STORE, MVT::i1, Custom);
    setOperationAction(ISD::STORE, MVT::i32, Custom);
-  setOperationAction(ISD::STORE, MVT::i64, Custom);
    setOperationAction(ISD::STORE, MVT::v2i32, Custom);
    setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  
+  setOperationAction(ISD::SELECT, MVT::f32, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32);
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SELECT, MVT::f64, Promote);
    AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
  
-  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
-  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
-
-  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  
    setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
    setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
  
-  setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom);
-  setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
-  setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);
-
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
@@ -137,6 +136,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  
    setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  
    setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
@@ -179,8 +179,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
      MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32
    };
  
-  const size_t NumVecTypes = array_lengthof(VecTypes);
-  for (unsigned Type = 0; Type < NumVecTypes; ++Type) {
+  for (MVT VT : VecTypes) {
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch(Op) {
        case ISD::LOAD:
@@ -194,7 +193,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
        case ISD::EXTRACT_SUBVECTOR:
          break;
        default:
-        setOperationAction(Op, VecTypes[Type], Expand);
+        setOperationAction(Op, VT, Expand);
          break;
        }
      }
@@ -214,9 +213,16 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
      setOperationAction(ISD::FRINT, MVT::f64, Legal);
    }
  
+  // FIXME: These should be removed and handled the same way as f32 fneg. Source
+  // modifiers also work for the double instructions.
+  setOperationAction(ISD::FNEG, MVT::f64, Expand);
+  setOperationAction(ISD::FABS, MVT::f64, Expand);
+
    setTargetDAGCombine(ISD::SELECT_CC);
    setTargetDAGCombine(ISD::SETCC);
  
+  setTargetDAGCombine(ISD::UINT_TO_FP);
+
    setSchedulingPreference(Sched::RegPressure);
  }
  
@@ -444,35 +450,6 @@ SDValue SITargetLowering::LowerFormalArguments(
    return Chain;
  }
  
-/// Usually ISel will insert a copy between terminator insturction that output
-/// a value and the S_BRANCH* at the end of the block.  This causes
-/// MachineBasicBlock::getFirstTerminator() to return the incorrect value,
-/// so we want to make sure there are no copies between terminators at the
-/// end of blocks.
-static void LowerTerminatorWithOutput(unsigned Opcode, MachineBasicBlock *BB,
-                                      MachineInstr *MI,
-                                      const TargetInstrInfo *TII,
-                                      MachineRegisterInfo &MRI) {
-  unsigned DstReg = MI->getOperand(0).getReg();
-  // Usually ISel will insert a copy between the SI_IF_NON_TERM instruction
-  // and the S_BRANCH* terminator.  We want to replace SI_IF_NO_TERM with
-  // SI_IF and we can't have any instructions between S_BRANCH* and SI_IF,
-  // since they are both terminators
-  assert(MRI.hasOneUse(DstReg));
-  MachineOperand &Use = *MRI.use_begin(DstReg);
-  MachineInstr *UseMI = Use.getParent();
-  assert(UseMI->getOpcode() == AMDGPU::COPY);
-
-  MRI.replaceRegWith(UseMI->getOperand(0).getReg(), DstReg);
-  UseMI->eraseFromParent();
-  BuildMI(*BB, BB->getFirstTerminator(), MI->getDebugLoc(),
-          TII->get(Opcode))
-          .addOperand(MI->getOperand(0))
-          .addOperand(MI->getOperand(1))
-          .addOperand(MI->getOperand(2));
-  MI->eraseFromParent();
-}
-
  MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
      MachineInstr * MI, MachineBasicBlock * BB) const {
  
@@ -510,25 +487,20 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
      MI->eraseFromParent();
      break;
    }
-  case AMDGPU::SI_IF_NON_TERM:
-    LowerTerminatorWithOutput(AMDGPU::SI_IF, BB, MI, TII, MRI);
-    break;
-  case AMDGPU::SI_ELSE_NON_TERM:
-    LowerTerminatorWithOutput(AMDGPU::SI_ELSE, BB, MI, TII, MRI);
-    break;
-  case AMDGPU::V_SUB_F64:
-    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64),
-            MI->getOperand(0).getReg())
-            .addReg(MI->getOperand(1).getReg())
-            .addReg(MI->getOperand(2).getReg())
-            .addImm(0)  /* src2 */
-            .addImm(0)  /* ABS */
-            .addImm(0)  /* CLAMP */
-            .addImm(0)  /* OMOD */
-            .addImm(2); /* NEG */
+  case AMDGPU::V_SUB_F64: {
+    unsigned DestReg = MI->getOperand(0).getReg();
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
+      .addImm(0)  // SRC0 modifiers
+      .addReg(MI->getOperand(1).getReg())
+      .addImm(1)  // SRC1 modifiers
+      .addReg(MI->getOperand(2).getReg())
+      .addImm(0)  // SRC2 modifiers
+      .addImm(0)  // src2
+      .addImm(0)  // CLAMP
+      .addImm(0); // OMOD
      MI->eraseFromParent();
      break;
-
+  }
    case AMDGPU::SI_RegisterStorePseudo: {
      MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
      unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -539,6 +511,50 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
        MIB.addOperand(MI->getOperand(i));
  
      MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::FABS_SI: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
+            Reg)
+            .addImm(0x7fffffff);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_AND_B32_e32),
+            MI->getOperand(0).getReg())
+            .addReg(MI->getOperand(1).getReg())
+            .addReg(Reg);
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::FNEG_SI: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
+            Reg)
+            .addImm(0x80000000);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_XOR_B32_e32),
+            MI->getOperand(0).getReg())
+            .addReg(MI->getOperand(1).getReg())
+            .addReg(Reg);
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::FCLAMP_SI: {
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F32_e64),
+            MI->getOperand(0).getReg())
+            .addImm(0) // SRC0 modifiers
+            .addOperand(MI->getOperand(1))
+            .addImm(0) // SRC1 modifiers
+            .addImm(0) // SRC1
+            .addImm(1) // CLAMP
+            .addImm(0); // OMOD
+    MI->eraseFromParent();
    }
    }
    return BB;
@@ -585,6 +601,14 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::BRCOND: return LowerBRCOND(Op, DAG);
    case ISD::LOAD: {
      LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
+    EVT VT = Op.getValueType();
+
+    // These loads are legal.
+    if (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+        VT.isVector() && VT.getVectorNumElements() == 2 &&
+        VT.getVectorElementType() == MVT::i32)
+      return SDValue();
+
      if (Op.getValueType().isVector() &&
          (Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
           Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
@@ -601,11 +625,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    }
  
    case ISD::SELECT: return LowerSELECT(Op, DAG);
-  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
-  case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
    case ISD::STORE: return LowerSTORE(Op, DAG);
-  case ISD::ANY_EXTEND: // Fall-through
-  case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
    case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
    case ISD::INTRINSIC_WO_CHAIN: {
      unsigned IntrinsicID =
@@ -893,45 +913,17 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
    return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
  }
  
-SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  SDValue True = Op.getOperand(2);
-  SDValue False = Op.getOperand(3);
-  SDValue CC = Op.getOperand(4);
-  EVT VT = Op.getValueType();
-  SDLoc DL(Op);
-
-  // Possible Min/Max pattern
-  SDValue MinMax = LowerMinMax(Op, DAG);
-  if (MinMax.getNode()) {
-    return MinMax;
-  }
-
-  SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
-  return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
-}
-
-SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  EVT VT = Op.getValueType();
-  SDLoc DL(Op);
-
-  if (VT != MVT::i64) {
-    return SDValue();
-  }
-
-  SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0),
-                                                 DAG.getConstant(31, MVT::i32));
-
-  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi);
-}
-
  SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
    SDLoc DL(Op);
    StoreSDNode *Store = cast<StoreSDNode>(Op);
    EVT VT = Store->getMemoryVT();
  
+  // These stores are legal.
+  if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+      VT.isVector() && VT.getVectorNumElements() == 2 &&
+      VT.getVectorElementType() == MVT::i32)
+    return SDValue();
+
    SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
    if (Ret.getNode())
      return Ret;
@@ -1007,27 +999,99 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
    return Chain;
  }
  
+//===----------------------------------------------------------------------===//
+// Custom DAG optimizations
+//===----------------------------------------------------------------------===//
  
-SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  EVT VT = Op.getValueType();
-  SDLoc DL(Op);
+SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
+                                                     DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
+  EVT ScalarVT = VT.getScalarType();
+  if (ScalarVT != MVT::f32)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+
+  // TODO: We could try to match extracting the higher bytes, which would be
+  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
+  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
+  // about in practice.
+  if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
+    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
+      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
+      DCI.AddToWorklist(Cvt.getNode());
+      return Cvt;
+    }
+  }
  
-  if (VT != MVT::i64) {
+  // We are primarily trying to catch operations on illegal vector types
+  // before they are expanded.
+  // For scalars, we can use the more flexible method of checking masked bits
+  // after legalization.
+  if (!DCI.isBeforeLegalize() ||
+      !SrcVT.isVector() ||
+      SrcVT.getVectorElementType() != MVT::i8) {
      return SDValue();
    }
  
-  SDValue Src = Op.getOperand(0);
-  if (Src.getValueType() != MVT::i32)
-    Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
+  assert(DCI.isBeforeLegalize() && "Unexpected legal type");
  
-  SDValue Zero = DAG.getConstant(0, MVT::i32);
-  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Src, Zero);
-}
+  // Weird sized vectors are a pain to handle, but we know 3 is really the same
+  // size as 4.
+  unsigned NElts = SrcVT.getVectorNumElements();
+  if (!SrcVT.isSimple() && NElts != 3)
+    return SDValue();
  
-//===----------------------------------------------------------------------===//
-// Custom DAG optimizations
-//===----------------------------------------------------------------------===//
+  // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
+  // prevent a mess from expanding to v4i32 and repacking.
+  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+    EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
+    EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
+    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
+
+    LoadSDNode *Load = cast<LoadSDNode>(Src);
+    SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
+                                     Load->getChain(),
+                                     Load->getBasePtr(),
+                                     LoadVT,
+                                     Load->getMemOperand());
+
+    // Make sure successors of the original load stay after it by updating
+    // them to use the new Chain.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));
+
+    SmallVector<SDValue, 4> Elts;
+    if (RegVT.isVector())
+      DAG.ExtractVectorElements(NewLoad, Elts);
+    else
+      Elts.push_back(NewLoad);
+
+    SmallVector<SDValue, 4> Ops;
+
+    unsigned EltIdx = 0;
+    for (SDValue Elt : Elts) {
+      unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
+      for (unsigned I = 0; I < ComponentsInElt; ++I) {
+        unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
+        SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
+        DCI.AddToWorklist(Cvt.getNode());
+        Ops.push_back(Cvt);
+      }
+
+      ++EltIdx;
+    }
+
+    assert(Ops.size() == NElts);
+
+    return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops);
+  }
+
+  return SDValue();
+}
  
  SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
@@ -1070,8 +1134,34 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
        }
        break;
      }
+
+  case AMDGPUISD::CVT_F32_UBYTE0:
+  case AMDGPUISD::CVT_F32_UBYTE1:
+  case AMDGPUISD::CVT_F32_UBYTE2:
+  case AMDGPUISD::CVT_F32_UBYTE3: {
+    unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
+
+    SDValue Src = N->getOperand(0);
+    APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
+
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                          !DCI.isBeforeLegalizeOps());
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
+        TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
+      DCI.CommitTargetLoweringOpt(TLO);
+    }
+
+    break;
    }
-  return SDValue();
+
+  case ISD::UINT_TO_FP: {
+    return performUCharToFloatCombine(N, DCI);
+  }
+  }
+
+  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  }
  
  /// \brief Test if RegClass is one of the VSrc classes
@@ -1285,14 +1375,14 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
    // e64 version if available, -1 otherwise
    int OpcodeE64 = AMDGPU::getVOPe64(Opcode);
    const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? nullptr : &TII->get(OpcodeE64);
+  int InputModifiers[3] = {0};
  
    assert(!DescE64 || DescE64->getNumDefs() == NumDefs);
-  assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4));
  
    int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
    bool HaveVSrc = false, HaveSSrc = false;
  
-  // First figure out what we alread have in this instruction
+  // First figure out what we already have in this instruction.
    for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
         i != e && Op < NumOps; ++i, ++Op) {
  
@@ -1311,7 +1401,7 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
      }
    }
  
-  // If we neither have VSrc nor SSrc it makes no sense to continue
+  // If we neither have VSrc nor SSrc, it makes no sense to continue.
    if (!HaveVSrc && !HaveSSrc)
      return Node;
  
@@ -1327,17 +1417,17 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
      const SDValue &Operand = Node->getOperand(i);
      Ops.push_back(Operand);
  
-    // Already folded immediate ?
+    // Already folded immediate?
      if (isa<ConstantSDNode>(Operand.getNode()) ||
          isa<ConstantFPSDNode>(Operand.getNode()))
        continue;
  
-    // Is this a VSrc or SSrc operand ?
+    // Is this a VSrc or SSrc operand?
      unsigned RegClass = Desc->OpInfo[Op].RegClass;
      if (isVSrc(RegClass) || isSSrc(RegClass)) {
        // Try to fold the immediates
        if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
-        // Folding didn't worked, make sure we don't hit the SReg limit
+        // Folding didn't work, make sure we don't hit the SReg limit.
          ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
        }
        continue;
@@ -1362,8 +1452,10 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
        }
      }
  
-    if (DescE64 && !Immediate) {
+    if (Immediate)
+      continue;
  
+    if (DescE64) {
        // Test if it makes sense to switch to e64 encoding
        unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass;
        if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass))
@@ -1381,11 +1473,43 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
          DescE64 = nullptr;
        }
      }
+
+    if (!DescE64 && !Promote2e64)
+      continue;
+    if (!Operand.isMachineOpcode())
+      continue;
+    if (Operand.getMachineOpcode() == AMDGPU::FNEG_SI) {
+      Ops.pop_back();
+      Ops.push_back(Operand.getOperand(0));
+      InputModifiers[i] = 1;
+      Promote2e64 = true;
+      if (!DescE64)
+        continue;
+      Desc = DescE64;
+      DescE64 = nullptr;
+    }
+    else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) {
+      Ops.pop_back();
+      Ops.push_back(Operand.getOperand(0));
+      InputModifiers[i] = 2;
+      Promote2e64 = true;
+      if (!DescE64)
+        continue;
+      Desc = DescE64;
+      DescE64 = nullptr;
+    }
    }
  
    if (Promote2e64) {
+    std::vector<SDValue> OldOps(Ops);
+    Ops.clear();
+    for (unsigned i = 0; i < OldOps.size(); ++i) {
+      // src_modifier
+      Ops.push_back(DAG.getTargetConstant(InputModifiers[i], MVT::i32));
+      Ops.push_back(OldOps[i]);
+    }
      // Add the modifier flags while promoting
-    for (unsigned i = 0; i < 4; ++i)
+    for (unsigned i = 0; i < 2; ++i)
        Ops.push_back(DAG.getTargetConstant(0, MVT::i32));
    }
  
@@ -1495,7 +1619,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
    }
  }
  
-/// \brief Fold the instructions after slecting them
+/// \brief Fold the instructions after selecting them.
  SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
    const SIInstrInfo *TII =