Remove the TargetMachine forwards for TargetSubtargetInfo based

[oota-llvm.git] / lib / Target / R600 / AMDGPUISelLowering.cpp
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp

index 3bde6e1c8f26a4424384d74b287f20219eb4a695..3cc135c98d7afc30581f386ccdf164f2db7102aa 100644 (file)
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -21,7 +21,6 @@
  #include "AMDGPUSubtarget.h"
  #include "R600MachineFunctionInfo.h"
  #include "SIMachineFunctionInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
  #include "llvm/CodeGen/CallingConvLower.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -108,9 +107,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
  
    Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();
  
-  // Initialize target lowering borrowed from AMDIL
-  InitAMDILLowering();
-
    setOperationAction(ISD::Constant, MVT::i32, Legal);
    setOperationAction(ISD::Constant, MVT::i64, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
@@ -142,6 +138,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
    setOperationAction(ISD::STORE, MVT::v2f32, Promote);
    AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
  
+  setOperationAction(ISD::STORE, MVT::i64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
+
    setOperationAction(ISD::STORE, MVT::v4f32, Promote);
    AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
  
@@ -185,6 +184,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
    setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
  
+  setOperationAction(ISD::LOAD, MVT::i64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
+
    setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
  
@@ -239,10 +241,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
      setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    }
  
+  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+
    const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
    for (MVT VT : ScalarIntVTs) {
      setOperationAction(ISD::SREM, VT, Expand);
-    setOperationAction(ISD::SDIV, VT, Expand);
+    setOperationAction(ISD::SDIV, VT, Custom);
  
      // GPU does not have divrem function for signed or unsigned.
      setOperationAction(ISD::SDIVREM, VT, Custom);
@@ -268,7 +276,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
    setOperationAction(ISD::ROTL, MVT::i64, Expand);
    setOperationAction(ISD::ROTR, MVT::i64, Expand);
  
-  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
    setOperationAction(ISD::MUL, MVT::i64, Expand);
    setOperationAction(ISD::MULHU, MVT::i64, Expand);
    setOperationAction(ISD::MULHS, MVT::i64, Expand);
@@ -277,6 +284,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  
+  if (!Subtarget->hasFFBH())
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+
+  if (!Subtarget->hasFFBL())
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+
    static const MVT::SimpleValueType VectorIntTypes[] = {
      MVT::v2i32, MVT::v4i32
    };
@@ -339,6 +352,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
      setOperationAction(ISD::FFLOOR, VT, Expand);
      setOperationAction(ISD::FTRUNC, VT, Expand);
      setOperationAction(ISD::FMUL, VT, Expand);
+    setOperationAction(ISD::FMA, VT, Expand);
      setOperationAction(ISD::FRINT, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
@@ -357,10 +371,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
  
    setTargetDAGCombine(ISD::MUL);
    setTargetDAGCombine(ISD::SELECT_CC);
+  setTargetDAGCombine(ISD::STORE);
  
    setSchedulingPreference(Sched::RegPressure);
    setJumpIsExpensive(true);
  
+  setSelectIsExpensive(false);
+  PredictableSelectIsExpensive = false;
+
    // There are no integer divide instructions, and these expand to a pretty
    // large sequence of instructions.
    setIntDivIsCheap(false);
@@ -383,6 +401,10 @@ MVT AMDGPUTargetLowering::getVectorIdxTy() const {
    return MVT::i32;
  }
  
+bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
+  return true;
+}
+
  // The backend supports 32 and 64 bit floating point immediates.
  // FIXME: Why are we reporting vectors of FP immediates as legal?
  bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
@@ -450,6 +472,10 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
    return Src == MVT::i32 && Dest == MVT::i64;
  }
  
+bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  return isZExtFree(Val.getValueType(), VT2);
+}
+
  bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
    // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
    // limited number of native 64-bit operations. Shrinking an operation to fit
@@ -511,7 +537,6 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
      llvm_unreachable("Custom lowering code for this"
                       "instruction is not implemented yet!");
      break;
-  // AMDGPU DAG lowering.
    case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
    case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
    case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
@@ -527,9 +552,6 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
    case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
    case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
    case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
-
-  // AMDIL DAG lowering.
-  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
    }
    return Op;
  }
@@ -546,6 +568,24 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
      // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
      // nothing here and let the illegal result integer be handled normally.
      return;
+  case ISD::LOAD: {
+    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
+    if (!Node)
+      return;
+
+    Results.push_back(SDValue(Node, 0));
+    Results.push_back(SDValue(Node, 1));
+    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
+    // function
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
+    return;
+  }
+  case ISD::STORE: {
+    SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG);
+    if (Lowered.getNode())
+      Results.push_back(Lowered);
+    return;
+  }
    default:
      return;
    }
@@ -566,7 +606,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
                                                         const SDValue &InitPtr,
                                                         SDValue Chain,
                                                         SelectionDAG &DAG) const {
-  const DataLayout *TD = getTargetMachine().getDataLayout();
+  const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout();
    SDLoc DL(InitPtr);
    Type *InitTy = Init->getType();
  
@@ -643,7 +683,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                   SDValue Op,
                                                   SelectionDAG &DAG) const {
  
-  const DataLayout *TD = getTargetMachine().getDataLayout();
+  const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout();
    GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
    const GlobalValue *GV = G->getGlobal();
  
@@ -666,7 +706,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
        Offset = MFI->LocalMemoryObjects[GV];
      }
  
-    return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
+    return DAG.getConstant(Offset, getPointerTy(AMDGPUAS::LOCAL_ADDRESS));
    }
    case AMDGPUAS::CONSTANT_ADDRESS: {
      MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
@@ -738,8 +778,8 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
                                                SelectionDAG &DAG) const {
  
    MachineFunction &MF = DAG.getMachineFunction();
-  const AMDGPUFrameLowering *TFL =
-   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
+  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
+      getTargetMachine().getSubtargetImpl()->getFrameLowering());
  
    FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
  
@@ -771,9 +811,21 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
        return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
                           Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
  
-    case Intrinsic::AMDGPU_div_scale:
-      return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT,
-                         Op.getOperand(1), Op.getOperand(2));
+    case Intrinsic::AMDGPU_div_scale: {
+      // 3rd parameter required to be a constant.
+      const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+      if (!Param)
+        return DAG.getUNDEF(VT);
+
+      // Translate to the operands expected by the machine instruction. The
+      // first parameter must be the same as the first instruction.
+      SDValue Numerator = Op.getOperand(1);
+      SDValue Denominator = Op.getOperand(2);
+      SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
+
+      return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
+                         Denominator, Numerator);
+    }
  
      case Intrinsic::AMDGPU_div_fmas:
        return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
@@ -793,6 +845,12 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
      case Intrinsic::AMDGPU_rsq:
        return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
  
+    case AMDGPUIntrinsic::AMDGPU_legacy_rsq:
+      return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
+
+    case Intrinsic::AMDGPU_rsq_clamped:
+      return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
+
      case AMDGPUIntrinsic::AMDGPU_imax:
        return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
                                                    Op.getOperand(2));
@@ -865,7 +923,7 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
  
      case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name.
        return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
-    case AMDGPUIntrinsic::AMDGPU_trunc:
+    case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name.
        return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1));
    }
  }
@@ -953,26 +1011,93 @@ SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N,
    return SDValue();
  }
  
-SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
-                                              SelectionDAG &DAG) const {
-  LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
-  EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
-  EVT EltVT = Op.getValueType().getVectorElementType();
+SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
+  EVT MemVT = Load->getMemoryVT();
+  EVT MemEltVT = MemVT.getVectorElementType();
+
+  EVT LoadVT = Op.getValueType();
+  EVT EltVT = LoadVT.getVectorElementType();
    EVT PtrVT = Load->getBasePtr().getValueType();
+
    unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
    SmallVector<SDValue, 8> Loads;
+  SmallVector<SDValue, 8> Chains;
+
    SDLoc SL(Op);
+  unsigned MemEltSize = MemEltVT.getStoreSize();
+  MachinePointerInfo SrcValue(Load->getMemOperand()->getValue());
  
-  for (unsigned i = 0, e = NumElts; i != e; ++i) {
+  for (unsigned i = 0; i < NumElts; ++i) {
      SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
-                    DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));
-    Loads.push_back(DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
-                        Load->getChain(), Ptr,
-                        MachinePointerInfo(Load->getMemOperand()->getValue()),
-                        MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
-                        Load->getAlignment()));
+                              DAG.getConstant(i * MemEltSize, PtrVT));
+
+    SDValue NewLoad
+      = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
+                       Load->getChain(), Ptr,
+                       SrcValue.getWithOffset(i * MemEltSize),
+                       MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
+                       Load->isInvariant(), Load->getAlignment());
+    Loads.push_back(NewLoad.getValue(0));
+    Chains.push_back(NewLoad.getValue(1));
    }
-  return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), Loads);
+
+  SDValue Ops[] = {
+    DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads),
+    DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains)
+  };
+
+  return DAG.getMergeValues(Ops, SL);
+}
+
+SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
+                                              SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  // If this is a 2 element vector, we really want to scalarize and not create
+  // weird 1 element vectors.
+  if (VT.getVectorNumElements() == 2)
+    return ScalarizeVectorLoad(Op, DAG);
+
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
+  SDValue BasePtr = Load->getBasePtr();
+  EVT PtrVT = BasePtr.getValueType();
+  EVT MemVT = Load->getMemoryVT();
+  SDLoc SL(Op);
+  MachinePointerInfo SrcValue(Load->getMemOperand()->getValue());
+
+  EVT LoVT, HiVT;
+  EVT LoMemVT, HiMemVT;
+  SDValue Lo, Hi;
+
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
+  std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
+  SDValue LoLoad
+    = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
+                     Load->getChain(), BasePtr,
+                     SrcValue,
+                     LoMemVT, Load->isVolatile(), Load->isNonTemporal(),
+                     Load->isInvariant(), Load->getAlignment());
+
+  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
+                              DAG.getConstant(LoMemVT.getStoreSize(), PtrVT));
+
+  SDValue HiLoad
+    = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT,
+                     Load->getChain(), HiPtr,
+                     SrcValue.getWithOffset(LoMemVT.getStoreSize()),
+                     HiMemVT, Load->isVolatile(), Load->isNonTemporal(),
+                     Load->isInvariant(), Load->getAlignment());
+
+  SDValue Ops[] = {
+    DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
+    DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
+                LoLoad.getValue(1), HiLoad.getValue(1))
+  };
+
+  return DAG.getMergeValues(Ops, SL);
  }
  
  SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
@@ -1033,8 +1158,8 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
                        Store->getAlignment());
  }
  
-SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
-                                            SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op,
+                                                   SelectionDAG &DAG) const {
    StoreSDNode *Store = cast<StoreSDNode>(Op);
    EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
    EVT EltVT = Store->getValue().getValueType().getVectorElementType();
@@ -1044,21 +1169,77 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
  
    SmallVector<SDValue, 8> Chains;
  
+  unsigned EltSize = MemEltVT.getStoreSize();
+  MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
+
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
-                              Store->getValue(), DAG.getConstant(i, MVT::i32));
-    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT,
-                              Store->getBasePtr(),
-                            DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8),
-                                            PtrVT));
-    Chains.push_back(DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
-                         MachinePointerInfo(Store->getMemOperand()->getValue()),
-                         MemEltVT, Store->isVolatile(), Store->isNonTemporal(),
-                         Store->getAlignment()));
+                              Store->getValue(),
+                              DAG.getConstant(i, MVT::i32));
+
+    SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), PtrVT);
+    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset);
+    SDValue NewStore =
+      DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
+                        SrcValue.getWithOffset(i * EltSize),
+                        MemEltVT, Store->isNonTemporal(), Store->isVolatile(),
+                        Store->getAlignment());
+    Chains.push_back(NewStore);
    }
+
    return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
  }
  
+SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  SDValue Val = Store->getValue();
+  EVT VT = Val.getValueType();
+
+  // If this is a 2 element vector, we really want to scalarize and not create
+  // weird 1 element vectors.
+  if (VT.getVectorNumElements() == 2)
+    return ScalarizeVectorStore(Op, DAG);
+
+  EVT MemVT = Store->getMemoryVT();
+  SDValue Chain = Store->getChain();
+  SDValue BasePtr = Store->getBasePtr();
+  SDLoc SL(Op);
+
+  EVT LoVT, HiVT;
+  EVT LoMemVT, HiMemVT;
+  SDValue Lo, Hi;
+
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
+  std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
+
+  EVT PtrVT = BasePtr.getValueType();
+  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
+                              DAG.getConstant(LoMemVT.getStoreSize(), PtrVT));
+
+  MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
+  SDValue LoStore
+    = DAG.getTruncStore(Chain, SL, Lo,
+                        BasePtr,
+                        SrcValue,
+                        LoMemVT,
+                        Store->isNonTemporal(),
+                        Store->isVolatile(),
+                        Store->getAlignment());
+  SDValue HiStore
+    = DAG.getTruncStore(Chain, SL, Hi,
+                        HiPtr,
+                        SrcValue.getWithOffset(LoMemVT.getStoreSize()),
+                        HiMemVT,
+                        Store->isNonTemporal(),
+                        Store->isVolatile(),
+                        Store->getAlignment());
+
+  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
+}
+
+
  SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
    SDLoc DL(Op);
    LoadSDNode *Load = cast<LoadSDNode>(Op);
@@ -1075,7 +1256,13 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
                                         Load->getBasePtr(),
                                         MemVT,
                                         Load->getMemOperand());
-    return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32);
+
+    SDValue Ops[] = {
+      DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32),
+      ExtLoad32.getValue(1)
+    };
+
+    return DAG.getMergeValues(Ops, DL);
    }
  
    if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
@@ -1089,21 +1276,13 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  
      SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
                                     BasePtr, MVT::i8, MMO);
-    return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
-  }
  
-  // Lower loads constant address space global variable loads
-  if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
-      isa<GlobalVariable>(
-          GetUnderlyingObject(Load->getMemOperand()->getValue()))) {
+    SDValue Ops[] = {
+      DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
+      NewLD.getValue(1)
+    };
  
-    SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
-        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
-    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
-        DAG.getConstant(2, MVT::i32));
-    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
-                       Load->getChain(), Ptr,
-                       DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
+    return DAG.getMergeValues(Ops, DL);
    }
  
    if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
@@ -1128,10 +1307,21 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
    EVT MemEltVT = MemVT.getScalarType();
    if (ExtType == ISD::SEXTLOAD) {
      SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
-    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
+
+    SDValue Ops[] = {
+      DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
+      Load->getChain()
+    };
+
+    return DAG.getMergeValues(Ops, DL);
    }
  
-  return DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
+  SDValue Ops[] = {
+    DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
+    Load->getChain()
+  };
+
+  return DAG.getMergeValues(Ops, DL);
  }
  
  SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
@@ -1146,7 +1336,7 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
    if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
         Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
        Store->getValue().getValueType().isVector()) {
-    return SplitVectorStore(Op, DAG);
+    return ScalarizeVectorStore(Op, DAG);
    }
  
    EVT MemVT = Store->getMemoryVT();
@@ -1191,84 +1381,83 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
    return SDValue();
  }
  
+// This is a shortcut for integer division because we have fast i32<->f32
+// conversions, and fast f32 reciprocal instructions. The fractional part of a
+// float is enough to accurately represent up to a 24-bit integer.
  SDValue AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
    SDLoc DL(Op);
-  EVT OVT = Op.getValueType();
+  EVT VT = Op.getValueType();
    SDValue LHS = Op.getOperand(0);
    SDValue RHS = Op.getOperand(1);
-  MVT INTTY;
-  MVT FLTTY;
-  if (!OVT.isVector()) {
-    INTTY = MVT::i32;
-    FLTTY = MVT::f32;
-  } else if (OVT.getVectorNumElements() == 2) {
-    INTTY = MVT::v2i32;
-    FLTTY = MVT::v2f32;
-  } else if (OVT.getVectorNumElements() == 4) {
-    INTTY = MVT::v4i32;
-    FLTTY = MVT::v4f32;
+  MVT IntVT = MVT::i32;
+  MVT FltVT = MVT::f32;
+
+  if (VT.isVector()) {
+    unsigned NElts = VT.getVectorNumElements();
+    IntVT = MVT::getVectorVT(MVT::i32, NElts);
+    FltVT = MVT::getVectorVT(MVT::f32, NElts);
    }
-  unsigned bitsize = OVT.getScalarType().getSizeInBits();
+
+  unsigned BitSize = VT.getScalarType().getSizeInBits();
+
    // char|short jq = ia ^ ib;
-  SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);
+  SDValue jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
  
    // jq = jq >> (bitsize - 2)
-  jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT));
+  jq = DAG.getNode(ISD::SRA, DL, VT, jq, DAG.getConstant(BitSize - 2, VT));
  
    // jq = jq | 0x1
-  jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));
+  jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, VT));
  
    // jq = (int)jq
-  jq = DAG.getSExtOrTrunc(jq, DL, INTTY);
+  jq = DAG.getSExtOrTrunc(jq, DL, IntVT);
  
    // int ia = (int)LHS;
-  SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);
+  SDValue ia = DAG.getSExtOrTrunc(LHS, DL, IntVT);
  
    // int ib, (int)RHS;
-  SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);
+  SDValue ib = DAG.getSExtOrTrunc(RHS, DL, IntVT);
  
    // float fa = (float)ia;
-  SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
+  SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FltVT, ia);
  
    // float fb = (float)ib;
-  SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
+  SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FltVT, ib);
  
    // float fq = native_divide(fa, fb);
-  SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb);
+  SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
+                           fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
  
    // fq = trunc(fq);
-  fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
+  fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
  
    // float fqneg = -fq;
-  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);
+  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
  
    // float fr = mad(fqneg, fb, fa);
-  SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY,
-      DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa);
+  SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT,
+                           DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa);
  
    // int iq = (int)fq;
-  SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
+  SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, IntVT, fq);
  
    // fr = fabs(fr);
-  fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);
+  fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
  
    // fb = fabs(fb);
-  fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);
+  fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT);
  
    // int cv = fr >= fb;
-  SDValue cv;
-  if (INTTY == MVT::i32) {
-    cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
-  } else {
-    cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
-  }
+  SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
+
    // jq = (cv ? jq : 0);
-  jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq,
-      DAG.getConstant(0, OVT));
+  jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, VT));
+
    // dst = iq + jq;
-  iq = DAG.getSExtOrTrunc(iq, DL, OVT);
-  iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
-  return iq;
+  iq = DAG.getSExtOrTrunc(iq, DL, VT);
+  return DAG.getNode(ISD::ADD, DL, VT, iq, jq);
  }
  
  SDValue AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
@@ -1343,19 +1532,20 @@ SDValue AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
  SDValue AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
    EVT OVT = Op.getValueType().getScalarType();
  
-  if (OVT == MVT::i64)
-    return LowerSDIV64(Op, DAG);
+  if (OVT == MVT::i32) {
+    if (DAG.ComputeNumSignBits(Op.getOperand(0)) > 8 &&
+        DAG.ComputeNumSignBits(Op.getOperand(1)) > 8) {
+      // TODO: We technically could do this for i64, but shouldn't that just be
+      // handled by something generally reducing 64-bit division on 32-bit
+      // values to 32-bit?
+      return LowerSDIV24(Op, DAG);
+    }
  
-  if (OVT.getScalarType() == MVT::i32)
      return LowerSDIV32(Op, DAG);
-
-  if (OVT == MVT::i16 || OVT == MVT::i8) {
-    // FIXME: We should be checking for the masked bits. This isn't reached
-    // because i8 and i16 are not legal types.
-    return LowerSDIV24(Op, DAG);
    }
  
-  return SDValue(Op.getNode(), 0);
+  assert(OVT == MVT::i64);
+  return LowerSDIV64(Op, DAG);
  }
  
  SDValue AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
@@ -1811,41 +2001,96 @@ static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
    return DAG.getConstant(Src0 >> Offset, MVT::i32);
  }
  
+static bool usesAllNormalStores(SDNode *LoadVal) {
+  for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) {
+    if (!ISD::isNormalStore(*I))
+      return false;
+  }
+
+  return true;
+}
+
+// If we have a copy of an illegal type, replace it with a load / store of an
+// equivalently sized legal type. This avoids intermediate bit pack / unpack
+// instructions emitted when handling extloads and truncstores. Ideally we could
+// recognize the pack / unpack pattern to eliminate it.
+SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  StoreSDNode *SN = cast<StoreSDNode>(N);
+  SDValue Value = SN->getValue();
+  EVT VT = Value.getValueType();
+
+  if (isTypeLegal(VT) || SN->isVolatile() || !ISD::isNormalLoad(Value.getNode()))
+    return SDValue();
+
+  LoadSDNode *LoadVal = cast<LoadSDNode>(Value);
+  if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal))
+    return SDValue();
+
+  EVT MemVT = LoadVal->getMemoryVT();
+
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+  EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT);
+
+  SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
+                                LoadVT, SL,
+                                LoadVal->getChain(),
+                                LoadVal->getBasePtr(),
+                                LoadVal->getOffset(),
+                                LoadVT,
+                                LoadVal->getMemOperand());
+
+  SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0));
+  DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false);
+
+  return DAG.getStore(SN->getChain(), SL, NewLoad,
+                      SN->getBasePtr(), SN->getMemOperand());
+}
+
+SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  EVT VT = N->getValueType(0);
+
+  if (VT.isVector() || VT.getSizeInBits() > 32)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue Mul;
+
+  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
+    N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
+    N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
+    Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
+  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
+    N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
+    N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
+    Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
+  } else {
+    return SDValue();
+  }
+
+  // We need to use sext even for MUL_U24, because MUL_U24 is used
+  // for signed multiply of 8 and 16-bit types.
+  return DAG.getSExtOrTrunc(Mul, DL, VT);
+}
+
  SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
-                                            DAGCombinerInfo &DCI) const {
+                                                DAGCombinerInfo &DCI) const {
    SelectionDAG &DAG = DCI.DAG;
    SDLoc DL(N);
  
    switch(N->getOpcode()) {
      default: break;
-    case ISD::MUL: {
-      EVT VT = N->getValueType(0);
-      SDValue N0 = N->getOperand(0);
-      SDValue N1 = N->getOperand(1);
-      SDValue Mul;
-
-      // FIXME: Add support for 24-bit multiply with 64-bit output on SI.
-      if (VT.isVector() || VT.getSizeInBits() > 32)
-        break;
-
-      if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
-        N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
-        N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
-        Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
-      } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
-        N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
-        N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
-        Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
-      } else {
-        break;
-      }
-
-      // We need to use sext even for MUL_U24, because MUL_U24 is used
-      // for signed multiply of 8 and 16-bit types.
-      SDValue Reg = DAG.getSExtOrTrunc(Mul, DL, VT);
-
-      return Reg;
-    }
+    case ISD::MUL:
+      return performMulCombine(N, DCI);
      case AMDGPUISD::MUL_I24:
      case AMDGPUISD::MUL_U24: {
        SDValue N0 = N->getOperand(0);
@@ -1936,6 +2181,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
  
      break;
    }
+
+  case ISD::STORE:
+    return performStoreCombine(N, DCI);
    }
    return SDValue();
  }
@@ -2019,7 +2267,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
    // AMDIL DAG nodes
    NODE_NAME_CASE(CALL);
    NODE_NAME_CASE(UMUL);
-  NODE_NAME_CASE(DIV_INF);
    NODE_NAME_CASE(RET_FLAG);
    NODE_NAME_CASE(BRANCH_COND);
  
@@ -2040,6 +2287,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
    NODE_NAME_CASE(TRIG_PREOP)
    NODE_NAME_CASE(RCP)
    NODE_NAME_CASE(RSQ)
+  NODE_NAME_CASE(RSQ_LEGACY)
+  NODE_NAME_CASE(RSQ_CLAMPED)
    NODE_NAME_CASE(DOT4)
    NODE_NAME_CASE(BFE_U32)
    NODE_NAME_CASE(BFE_I32)
@@ -2065,6 +2314,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
    NODE_NAME_CASE(CVT_F32_UBYTE2)
    NODE_NAME_CASE(CVT_F32_UBYTE3)
    NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
+  NODE_NAME_CASE(CONST_DATA_PTR)
    NODE_NAME_CASE(STORE_MSKOR)
    NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
    }