[X86][SSE] Update the cost table for integer-integer conversions on SSE2/SSE4.1.

[oota-llvm.git] / lib / Target / AMDGPU / AMDGPUISelLowering.cpp
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

index 9479d233cf9f58b5f9412038c50fa2a61526e897..6f673f4f395fcac6dd848beca93a5d494ea825c3 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -15,6 +15,7 @@
  
  #include "AMDGPUISelLowering.h"
  #include "AMDGPU.h"
+#include "AMDGPUDiagnosticInfoUnsupported.h"
  #include "AMDGPUFrameLowering.h"
  #include "AMDGPUIntrinsicInfo.h"
  #include "AMDGPURegisterInfo.h"
@@ -27,50 +28,9 @@
  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
  #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/DiagnosticPrinter.h"
  
  using namespace llvm;
  
-namespace {
-
-/// Diagnostic information for unimplemented or unsupported feature reporting.
-class DiagnosticInfoUnsupported : public DiagnosticInfo {
-private:
-  const Twine &Description;
-  const Function &Fn;
-
-  static int KindID;
-
-  static int getKindID() {
-    if (KindID == 0)
-      KindID = llvm::getNextAvailablePluginDiagnosticKind();
-    return KindID;
-  }
-
-public:
-  DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
-                          DiagnosticSeverity Severity = DS_Error)
-    : DiagnosticInfo(getKindID(), Severity),
-      Description(Desc),
-      Fn(Fn) { }
-
-  const Function &getFunction() const { return Fn; }
-  const Twine &getDescription() const { return Description; }
-
-  void print(DiagnosticPrinter &DP) const override {
-    DP << "unsupported " << getDescription() << " in " << Fn.getName();
-  }
-
-  static bool classof(const DiagnosticInfo *DI) {
-    return DI->getKind() == getKindID();
-  }
-};
-
-int DiagnosticInfoUnsupported::KindID = 0;
-}
-
-
  static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
                        CCValAssign::LocInfo LocInfo,
                        ISD::ArgFlagsTy ArgFlags, CCState &State) {
@@ -113,6 +73,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
    setOperationAction(ISD::BR_JT, MVT::Other, Expand);
    setOperationAction(ISD::BRIND, MVT::Other, Expand);
  
+  // This is totally unsupported; just custom lower to produce an error.
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+
    // We need to custom lower some of the intrinsics
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  
@@ -352,7 +315,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SDIVREM, VT, Custom);
-    setOperationAction(ISD::UDIVREM, VT, Custom);
+    setOperationAction(ISD::UDIVREM, VT, Expand);
      setOperationAction(ISD::ADDC, VT, Expand);
      setOperationAction(ISD::SUBC, VT, Expand);
      setOperationAction(ISD::ADDE, VT, Expand);
@@ -406,6 +369,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
  
+  setTargetDAGCombine(ISD::SHL);
    setTargetDAGCombine(ISD::MUL);
    setTargetDAGCombine(ISD::SELECT);
    setTargetDAGCombine(ISD::SELECT_CC);
@@ -428,12 +392,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
    setSelectIsExpensive(false);
    PredictableSelectIsExpensive = false;
  
-  // There are no integer divide instructions, and these expand to a pretty
-  // large sequence of instructions.
-  setIntDivIsCheap(false);
-  setPow2SDivIsCheap(false);
    setFsqrtIsCheap(true);
  
+  // We want to find all load dependencies for long chains of stores to enable
+  // merging into very wide vectors. The problem is with vectors with > 4
+  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
+  // vectors are a legal type, even though we usually have to split the
+  // loads. When we can more precisely specify load legality per address
+  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
+  // smarter so that they can figure out what to do in 2 iterations without all
+  // N > 4 stores on the same chain.
+  GatherAllAliasesMaxDepth = 16;
+
    // FIXME: Need to really handle these.
    MaxStoresPerMemcpy  = 4096;
    MaxStoresPerMemmove = 4096;
@@ -444,7 +414,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
  // Target Information
  //===----------------------------------------------------------------------===//
  
-MVT AMDGPUTargetLowering::getVectorIdxTy() const {
+MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
    return MVT::i32;
  }
  
@@ -533,6 +503,18 @@ bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
    return true;
  }
  
+bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
+  // Always prefer a build_vector's source operands over elements extracted
+  // from it. Few operations truly have vector input operands: any vector
+  // operation is going to involve operations on each component, and a
+  // build_vector will be a copy per element, so using the build_vector input
+  // directly avoids a copy into a super register.
+  //
+  // We should probably only do this if all users are extracts only, but this
+  // should be the common case.
+  return true; // VecVT is not consulted: same answer for every type
+}
+
  bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
    // Truncate is just accessing a subregister.
    return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
@@ -545,9 +527,8 @@ bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  }
  
  bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
-  const DataLayout *DL = getDataLayout();
-  unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType());
-  unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType());
+  unsigned SrcSize = Src->getScalarSizeInBits();
+  unsigned DestSize = Dest->getScalarSizeInBits();
  
    return SrcSize == 32 && DestSize == 64;
  }
@@ -617,6 +598,15 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
    return SDValue();
  }
  
+SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  const Function &Fn = *DAG.getMachineFunction().getFunction();
+
+  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "dynamic alloca");
+  DAG.getContext()->diagnose(NoDynamicAlloca); // report via the LLVMContext
+  return SDValue(); // no replacement node is built; Op is never inspected
+}
+
  SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                               SelectionDAG &DAG) const {
    switch (Op.getOpcode()) {
@@ -643,6 +633,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
    case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
    case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
    case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
    }
    return Op;
  }
@@ -697,7 +688,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
                                                         const SDValue &InitPtr,
                                                         SDValue Chain,
                                                         SelectionDAG &DAG) const {
-  const DataLayout *TD = getDataLayout();
+  const DataLayout &TD = DAG.getDataLayout();
    SDLoc DL(InitPtr);
    Type *InitTy = Init->getType();
  
@@ -705,20 +696,20 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
      EVT VT = EVT::getEVT(InitTy);
      PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
      return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr,
-                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
-                        TD->getPrefTypeAlignment(InitTy));
+                        MachinePointerInfo(UndefValue::get(PtrTy)), false,
+                        false, TD.getPrefTypeAlignment(InitTy));
    }
  
    if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
      EVT VT = EVT::getEVT(CFP->getType());
      PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
      return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr,
-                 MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
-                 TD->getPrefTypeAlignment(CFP->getType()));
+                        MachinePointerInfo(UndefValue::get(PtrTy)), false,
+                        false, TD.getPrefTypeAlignment(CFP->getType()));
    }
  
    if (StructType *ST = dyn_cast<StructType>(InitTy)) {
-    const StructLayout *SL = TD->getStructLayout(ST);
+    const StructLayout *SL = TD.getStructLayout(ST);
  
      EVT PtrVT = InitPtr.getValueType();
      SmallVector<SDValue, 8> Chains;
@@ -745,7 +736,7 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
      else
        llvm_unreachable("Unexpected type");
  
-    unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType());
+    unsigned EltSize = TD.getTypeAllocSize(SeqTy->getElementType());
      SmallVector<SDValue, 8> Chains;
      for (unsigned i = 0; i < NumElements; ++i) {
        SDValue Offset = DAG.getConstant(i * EltSize, DL, PtrVT);
@@ -762,8 +753,8 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
      EVT VT = EVT::getEVT(InitTy);
      PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
      return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
-                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
-                        TD->getPrefTypeAlignment(InitTy));
+                        MachinePointerInfo(UndefValue::get(PtrTy)), false,
+                        false, TD.getPrefTypeAlignment(InitTy));
    }
  
    Init->dump();
@@ -785,7 +776,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                   SDValue Op,
                                                   SelectionDAG &DAG) const {
  
-  const DataLayout *TD = getDataLayout();
+  const DataLayout &DL = DAG.getDataLayout();
    GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
    const GlobalValue *GV = G->getGlobal();
  
@@ -801,7 +792,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
  
      unsigned Offset;
      if (MFI->LocalMemoryObjects.count(GV) == 0) {
-      uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+      uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
        Offset = MFI->LDSSize;
        MFI->LocalMemoryObjects[GV] = Offset;
        // XXX: Account for alignment?
@@ -811,16 +802,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
      }
  
      return DAG.getConstant(Offset, SDLoc(Op),
-                           getPointerTy(AMDGPUAS::LOCAL_ADDRESS));
+                           getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS));
    }
    case AMDGPUAS::CONSTANT_ADDRESS: {
      MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
      Type *EltType = GV->getType()->getElementType();
-    unsigned Size = TD->getTypeAllocSize(EltType);
-    unsigned Alignment = TD->getPrefTypeAlignment(EltType);
+    unsigned Size = DL.getTypeAllocSize(EltType);
+    unsigned Alignment = DL.getPrefTypeAlignment(EltType);
  
-    MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS);
-    MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);
+    MVT PrivPtrVT = getPointerTy(DL, AMDGPUAS::PRIVATE_ADDRESS);
+    MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
  
      int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
      SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT);
@@ -892,7 +883,9 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
    FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
  
    unsigned FrameIndex = FIN->getIndex();
-  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
+  unsigned IgnoredFrameReg;
+  unsigned Offset =
+      TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
    return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
                           Op.getValueType());
  }
@@ -1077,6 +1070,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
                                                  SelectionDAG &DAG) const {
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
+  // TODO: Should this propagate fast-math-flags?
    SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
                                  DAG.getConstantFP(1.0f, DL, MVT::f32),
                                  Op.getOperand(1));
@@ -1167,45 +1161,6 @@ SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL,
    return SDValue();
  }
  
-// FIXME: Remove this when combines added to DAGCombiner.
-SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL,
-                                             EVT VT,
-                                             SDValue LHS,
-                                             SDValue RHS,
-                                             SDValue True,
-                                             SDValue False,
-                                             SDValue CC,
-                                             SelectionDAG &DAG) const {
-  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
-    return SDValue();
-
-  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
-  switch (CCOpcode) {
-  case ISD::SETULE:
-  case ISD::SETULT: {
-    unsigned Opc = (LHS == True) ? ISD::UMIN : ISD::UMAX;
-    return DAG.getNode(Opc, DL, VT, LHS, RHS);
-  }
-  case ISD::SETLE:
-  case ISD::SETLT: {
-    unsigned Opc = (LHS == True) ? ISD::SMIN : ISD::SMAX;
-    return DAG.getNode(Opc, DL, VT, LHS, RHS);
-  }
-  case ISD::SETGT:
-  case ISD::SETGE: {
-    unsigned Opc = (LHS == True) ? ISD::SMAX : ISD::SMIN;
-    return DAG.getNode(Opc, DL, VT, LHS, RHS);
-  }
-  case ISD::SETUGE:
-  case ISD::SETUGT: {
-    unsigned Opc = (LHS == True) ? ISD::UMAX : ISD::UMIN;
-    return DAG.getNode(Opc, DL, VT, LHS, RHS);
-  }
-  default:
-    return SDValue();
-  }
-}
-
  SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op,
                                                    SelectionDAG &DAG) const {
    LoadSDNode *Load = cast<LoadSDNode>(Op);
@@ -1529,7 +1484,7 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
    if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
         Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
        Store->getValue().getValueType().isVector()) {
-    return ScalarizeVectorStore(Op, DAG);
+    return SplitVectorStore(Op, DAG);
    }
  
    EVT MemVT = Store->getMemoryVT();
@@ -1630,6 +1585,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
    // float fb = (float)ib;
    SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
  
+  // TODO: Should this propagate fast-math-flags?
    // float fq = native_divide(fa, fb);
    SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
                             fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
@@ -1653,7 +1609,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
    // fb = fabs(fb);
    fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
  
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT);
+  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
  
    // int cv = fr >= fb;
    SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
@@ -1940,6 +1896,8 @@ SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
    SDValue X = Op.getOperand(0);
    SDValue Y = Op.getOperand(1);
  
+  // TODO: Should this propagate fast-math-flags?
+
    SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
    SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
    SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
@@ -1960,13 +1918,15 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
    const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
    const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
  
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
  
    SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
    SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
    SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
  
    SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
+  // TODO: Should this propagate fast-math-flags?
    return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
  }
  
@@ -2020,7 +1980,8 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
    SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
    SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
  
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
  
    const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
  
@@ -2043,6 +2004,8 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
    SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
    SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
  
+  // TODO: Should this propagate fast-math-flags?
+
    SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
    SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
  
@@ -2051,7 +2014,8 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
    APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
    SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
  
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
    SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
  
    return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
@@ -2071,6 +2035,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const
  
    SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
  
+  // TODO: Should this propagate fast-math-flags?
+
    SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
  
    SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
@@ -2081,7 +2047,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const
  
    SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
  
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
  
    SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
  
@@ -2100,8 +2067,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const
    const SDValue One = DAG.getConstant(1, SL, MVT::i32);
    const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
    const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
-
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
  
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
  
@@ -2172,13 +2139,15 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
    const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
    const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
  
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
  
    SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
    SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
    SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
  
    SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
+  // TODO: Should this propagate fast-math-flags?
    return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
  }
  
@@ -2201,7 +2170,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
  
    SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
                                DAG.getConstant(32, SL, MVT::i32));
-
+  // TODO: Should this propagate fast-math-flags?
    return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
  }
  
@@ -2226,6 +2195,7 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
                             DAG.getConstant(1, DL, MVT::i32));
    SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi);
+  // TODO: Should this propagate fast-math-flags?
    FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
                          DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32
    return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
@@ -2252,7 +2222,7 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
                                   MVT::f64);
    SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
                                   MVT::f64);
-
+  // TODO: Should this propagate fast-math-flags?
    SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
  
    SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
@@ -2411,6 +2381,33 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                        SN->getBasePtr(), SN->getMemOperand());
  }
  
+SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  if (N->getValueType(0) != MVT::i64)
+    return SDValue(); // only i64 results are rewritten
+
+  // i64 (shl x, 32) -> (build_pair 0, (i32 (trunc x)))
+
+  // Doing this with moves theoretically helps MI optimizations that understand
+  // copies. 2 v_mov_b32_e32 will have the same code size / cycle count as
+  // v_lshl_b64. In the SALU case, I think this is slightly worse since it
+  // doubles the code size and I'm unsure about cycle count.
+  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!RHS || RHS->getZExtValue() != 32)
+    return SDValue(); // shift amount must be the constant 32
+
+  SDValue LHS = N->getOperand(0);
+
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+
+  // Truncate the shifted operand to its low 32 bits.
+  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+
+  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Zero, Lo);
+}
+
  SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
    EVT VT = N->getValueType(0);
@@ -2450,6 +2447,12 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
    switch(N->getOpcode()) {
    default:
      break;
+  case ISD::SHL: {
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+      break;
+
+    return performShlCombine(N, DCI);
+  }
    case ISD::MUL:
      return performMulCombine(N, DCI);
    case AMDGPUISD::MUL_I24:
@@ -2473,12 +2476,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
  
        if (VT == MVT::f32)
          return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
-
-      // TODO: Implement min / max Evergreen instructions.
-      if (VT == MVT::i32 &&
-          Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
-        return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG);
-      }
      }
  
      break;
@@ -2614,20 +2611,14 @@ bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
    if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
      return CFP->isExactlyValue(1.0);
    }
-  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
-    return C->isAllOnesValue();
-  }
-  return false;
+  return isAllOnesConstant(Op);
  }
  
  bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
    if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
      return CFP->getValueAPF().isZero();
    }
-  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
-    return C->isNullValue();
-  }
-  return false;
+  return isNullConstant(Op);
  }
  
  SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
@@ -2645,6 +2636,18 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
    return DAG.getRegister(VirtualRegister, VT);
  }
  
+uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
+    const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
+  uint64_t ArgOffset = MFI->ABIArgOffset; // narrowed to uint32_t on return
+  switch (Param) {
+  case GRID_DIM:
+    return ArgOffset; // GRID_DIM is at the base offset itself
+  case GRID_OFFSET:
+    return ArgOffset + 4; // GRID_OFFSET is 4 past the base offset
+  }
+  llvm_unreachable("unexpected implicit parameter type");
+}
+
  #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
  
  const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -2843,8 +2846,7 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
        return 1;
  
      unsigned SignBits = 32 - Width->getZExtValue() + 1;
-    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1));
-    if (!Offset || !Offset->isNullValue())
+    if (!isNullConstant(Op.getOperand(1)))
        return SignBits;
  
      // TODO: Could probably figure something out with non-0 offsets.