#include "SIISelLowering.h"
#include "AMDGPU.h"
+#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
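+ // v2i64 / v2f64 are 128 bits wide, so they share the 128-bit scalar register
+ // class with v4i32.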
+ addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
+ addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
+
addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
for (MVT VT : MVT::fp_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+
+ setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
+
+ setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
+ setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
+
setOperationAction(ISD::LOAD, MVT::i1, Custom);
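+ // Loads and stores of v2i64 are promoted to the already-legal v4i32 forms.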
+ setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
+
+ setOperationAction(ISD::STORE, MVT::v2i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
+
+ setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
+
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
setOperationAction(ISD::SELECT, MVT::i1, Promote);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
+
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
- for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) {
+ for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
+       MVT::v2i64, MVT::v2f64}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch(Op) {
case ISD::LOAD:
case ISD::INSERT_VECTOR_ELT:
case ISD::INSERT_SUBVECTOR:
case ISD::EXTRACT_SUBVECTOR:
+ case ISD::SCALAR_TO_VECTOR:
break;
case ISD::CONCAT_VECTORS:
setOperationAction(Op, VT, Custom);
}
}
+ // Most operations are naturally 32-bit vector operations. We only support
+ // load and store of i64 vectors, so promote v2i64 vector operations to
+ // v4i32.
+ for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
+ }
+
if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
return false;
}
+bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
+ // Flat instructions do not have offsets, and only have the register
+ // address.
+ return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
+}
+
+bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
+ // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
+ // additionally can do r + r + i with addr64. 32-bit has more addressing
+ // mode options. Depending on the resource constant, it can also do
+ // (i64 r0) + (i32 r1) * (i14 i).
+ //
+ // Private arrays end up using a scratch buffer most of the time, so also
+ // assume those use MUBUF instructions. Scratch loads / stores are currently
+ // implemented as mubuf instructions with offen bit set, so slightly
+ // different than the normal addr64.
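+ // A 12-bit unsigned immediate covers byte offsets 0..4095.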
+ if (!isUInt<12>(AM.BaseOffs))
+ return false;
+
+ // FIXME: Since we can split immediate into soffset and immediate offset,
+ // would it make sense to allow any immediate?
+
+ switch (AM.Scale) {
+ case 0: // r + i or just i, depending on HasBaseReg.
+ return true;
+ case 1:
+ return true; // We have r + r or r + i.
+ case 2:
+ if (AM.HasBaseReg) {
+ // Reject 2 * r + r.
+ return false;
+ }
+
+ // Allow 2 * r as r + r
+ // Or 2 * r + i is allowed as r + r + i.
+ return true;
+ default: // Don't allow n * r
+ return false;
+ }
+}
+
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
return false;
switch (AS) {
- case AMDGPUAS::GLOBAL_ADDRESS:
- case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
- case AMDGPUAS::PRIVATE_ADDRESS:
- case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: {
- // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
- // additionally can do r + r + i with addr64. 32-bit has more addressing
- // mode options. Depending on the resource constant, it can also do
- // (i64 r0) + (i32 r1) * (i14 i).
- //
- // SMRD instructions have an 8-bit, dword offset.
- //
- // Assume nonunifom access, since the address space isn't enough to know
- // what instruction we will use, and since we don't know if this is a load
- // or store and scalar stores are only available on VI.
- //
- // We also know if we are doing an extload, we can't do a scalar load.
- //
- // Private arrays end up using a scratch buffer most of the time, so also
- // assume those use MUBUF instructions. Scratch loads / stores are currently
- // implemented as mubuf instructions with offen bit set, so slightly
- // different than the normal addr64.
- if (!isUInt<12>(AM.BaseOffs))
- return false;
+ case AMDGPUAS::GLOBAL_ADDRESS: {
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // Assume we will use FLAT for all global memory accesses
+ // on VI.
+ // FIXME: This assumption is currently wrong. On VI we still use
+ // MUBUF instructions for the r + i addressing mode. As currently
+ // implemented, the MUBUF instructions only work on buffers < 4GB.
+ // It may be possible to support > 4GB buffers with MUBUF instructions,
+ // by setting the stride value in the resource descriptor which would
+ // increase the size limit to (stride * 4GB). However, this is risky,
+ // because it has never been validated.
+ return isLegalFlatAddressingMode(AM);
+ }
- // FIXME: Since we can split immediate into soffset and immediate offset,
- // would it make sense to allow any immediate?
+ return isLegalMUBUFAddressingMode(AM);
+ }
+ case AMDGPUAS::CONSTANT_ADDRESS: {
+ // If the offset isn't a multiple of 4, it probably isn't going to be
+ // correctly aligned.
+ if (AM.BaseOffs % 4 != 0)
+ return isLegalMUBUFAddressingMode(AM);
+
+ // There are no SMRD extloads, so if we have to do a small type access we
+ // will use a MUBUF load.
+ // FIXME?: We also need to do this if unaligned, but we don't know the
+ // alignment here.
+ if (DL.getTypeStoreSize(Ty) < 4)
+ return isLegalMUBUFAddressingMode(AM);
+
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ // SMRD instructions have an 8-bit, dword offset on SI.
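+ // An 8-bit dword offset reaches byte offsets up to 255 * 4 = 1020.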
+ if (!isUInt<8>(AM.BaseOffs / 4))
+ return false;
+ } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
+ // On CI+, this can also be a 32-bit literal constant offset. If it fits
+ // in 8-bits, it can use a smaller encoding.
+ if (!isUInt<32>(AM.BaseOffs / 4))
+ return false;
+ } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // On VI, these use the SMEM format and the offset is 20-bit in bytes.
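+ // A 20-bit unsigned offset covers byte offsets 0..1048575.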
+ if (!isUInt<20>(AM.BaseOffs))
+ return false;
+ } else
+ llvm_unreachable("unhandled generation");
- switch (AM.Scale) {
- case 0: // r + i or just i, depending on HasBaseReg.
+ if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
return true;
- case 1:
- return true; // We have r + r or r + i.
- case 2:
- if (AM.HasBaseReg) {
- // Reject 2 * r + r.
- return false;
- }
- // Allow 2 * r as r + r
- // Or 2 * r + i is allowed as r + r + i.
+ if (AM.Scale == 1 && AM.HasBaseReg)
return true;
- default: // Don't allow n * r
- return false;
- }
+
+ return false;
}
+
+ case AMDGPUAS::PRIVATE_ADDRESS:
+ case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
+ return isLegalMUBUFAddressingMode(AM);
+
case AMDGPUAS::LOCAL_ADDRESS:
case AMDGPUAS::REGION_ADDRESS: {
// Basic, single offset DS instructions allow a 16-bit unsigned immediate
return false;
}
- case AMDGPUAS::FLAT_ADDRESS: {
- // Flat instructions do not have offsets, and only have the register
- // address.
- return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
- }
+ case AMDGPUAS::FLAT_ADDRESS:
+ return isLegalFlatAddressingMode(AM);
+
default:
llvm_unreachable("unhandled address space");
}
// ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
// aligned, 8 byte access in a single operation using ds_read2/write2_b32
// with adjacent offsets.
- return Align % 4 == 0;
+ bool AlignedBy4 = (Align % 4 == 0);
+ if (IsFast)
+ *IsFast = AlignedBy4;
+ return AlignedBy4;
}
// Smaller than dword value must be aligned.
return TII->isInlineConstant(Imm);
}
-static EVT toIntegerVT(EVT VT) {
- if (VT.isVector())
- return VT.changeVectorElementTypeToInteger();
- return MVT::getIntegerVT(VT.getSizeInBits());
-}
-
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
SDLoc SL, SDValue Chain,
unsigned Offset, bool Signed) const {
MachineFunction &MF = DAG.getMachineFunction();
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
- unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
+ unsigned InputPtrReg = TRI->getPreloadedValue(
+     MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
Type *Ty = VT.getTypeForEVT(*DAG.getContext());
unsigned Align = DL.getABITypeAlignment(Ty);
- if (VT != MemVT && VT.isFloatingPoint()) {
- // Do an integer load and convert.
- // FIXME: This is mostly because load legalization after type legalization
- // doesn't handle FP extloads.
- assert(VT.getScalarType() == MVT::f32 &&
- MemVT.getScalarType() == MVT::f16);
-
- EVT IVT = toIntegerVT(VT);
- EVT MemIVT = toIntegerVT(MemVT);
- SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD,
- IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT,
- false, // isVolatile
- true, // isNonTemporal
- true, // isInvariant
- Align); // Alignment
- SDValue Ops[] = {
- DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load),
- Load.getValue(1)
- };
-
- return DAG.getMergeValues(Ops, SL);
- }
-
ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
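+ // Extending FP loads have no signed/unsigned distinction, so they must use
+ // plain EXTLOAD rather than SEXTLOAD/ZEXTLOAD.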
+ if (MemVT.isFloatingPoint())
+ ExtTy = ISD::EXTLOAD;
+
return DAG.getLoad(ISD::UNINDEXED, ExtTy,
VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
false, // isVolatile
MachineFunction &MF = DAG.getMachineFunction();
FunctionType *FType = MF.getFunction()->getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+
+ if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) {
+ const Function *Fn = MF.getFunction();
+ DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA");
+ DAG.getContext()->diagnose(NoGraphicsHSA);
+ return SDValue();
+ }
- assert(CallConv == CallingConv::C);
+ // FIXME: We currently assume all calling conventions are kernels.
SmallVector<ISD::InputArg, 16> Splits;
BitVector Skipped(Ins.size());
assert((PSInputNum <= 15) && "Too many PS inputs!");
if (!Arg.Used) {
- // We can savely skip PS inputs
+ // We can safely skip PS inputs
Skipped.set(i);
++PSInputNum;
continue;
// We REALLY want the ORIGINAL number of vertex elements here, e.g. a
// three or five element vertex only needs three or five registers,
- // NOT four or eigth.
+ // NOT four or eight.
Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
unsigned NumElements = ParamType->getVectorNumElements();
CCInfo.AllocateReg(AMDGPU::VGPR1);
}
- // The pointer to the list of arguments is stored in SGPR0, SGPR1
- // The pointer to the scratch buffer is stored in SGPR2, SGPR3
- if (Info->getShaderType() == ShaderType::COMPUTE) {
- if (Subtarget->isAmdHsaOS())
- Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers.
- else
- Info->NumUserSGPRs = 4;
-
- unsigned InputPtrReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
- unsigned InputPtrRegLo =
- TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0);
- unsigned InputPtrRegHi =
- TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1);
-
- unsigned ScratchPtrReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
- unsigned ScratchPtrRegLo =
- TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0);
- unsigned ScratchPtrRegHi =
- TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1);
-
- CCInfo.AllocateReg(InputPtrRegLo);
- CCInfo.AllocateReg(InputPtrRegHi);
- CCInfo.AllocateReg(ScratchPtrRegLo);
- CCInfo.AllocateReg(ScratchPtrRegHi);
- MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
- MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass);
- }
-
if (Info->getShaderType() == ShaderType::COMPUTE) {
getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
Splits);
}
+ // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
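+ // User SGPR inputs are allocated in the order they are added here, which
+ // matches the HSA user SGPR layout.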
+ if (Info->hasPrivateSegmentBuffer()) {
+ unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
+ MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+ CCInfo.AllocateReg(PrivateSegmentBufferReg);
+ }
+
+ if (Info->hasDispatchPtr()) {
+ unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
+ MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(DispatchPtrReg);
+ }
+
+ if (Info->hasKernargSegmentPtr()) {
+ unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
+ MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(InputPtrReg);
+ }
+
AnalyzeFormalArguments(CCInfo, Splits);
SmallVector<SDValue, 16> Chains;
Offset, Ins[i].Flags.isSExt());
Chains.push_back(Arg.getValue(1));
- const PointerType *ParamTy =
+ auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
InVals.push_back(Val);
}
- if (Info->getShaderType() != ShaderType::COMPUTE) {
- unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>(
- AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
- Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
+ // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
+ // these from the dispatch pointer.
+
+ // Start adding system SGPRs.
+ if (Info->hasWorkGroupIDX()) {
+ unsigned Reg = Info->addWorkGroupIDX();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ } else
+ llvm_unreachable("work group id x is always enabled");
+
+ if (Info->hasWorkGroupIDY()) {
+ unsigned Reg = Info->addWorkGroupIDY();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkGroupIDZ()) {
+ unsigned Reg = Info->addWorkGroupIDZ();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkGroupInfo()) {
+ unsigned Reg = Info->addWorkGroupInfo();
+ MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasPrivateSegmentWaveByteOffset()) {
+ // Scratch wave offset passed in system SGPR.
+ unsigned PrivateSegmentWaveByteOffsetReg
+ = Info->addPrivateSegmentWaveByteOffset();
+
+ MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
+ }
+
+ // Now that we've figured out where the scratch register inputs are, see if
+ // we should reserve the arguments and use them directly.
+
+ bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
+
+ if (ST.isAmdHsaOS()) {
+ // TODO: Assume we will spill without optimizations.
+ if (HasStackObjects) {
+ // If we have stack objects, we unquestionably need the private buffer
+ // resource. For the HSA ABI, this will be the first 4 user SGPR
+ // inputs. We can reserve those and use them directly.
+
+ unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ Info->setScratchRSrcReg(PrivateSegmentBufferReg);
+
+ unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ } else {
+ unsigned ReservedBufferReg
+ = TRI->reservedPrivateSegmentBufferReg(MF);
+ unsigned ReservedOffsetReg
+ = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+
+ // We tentatively reserve the last registers (skipping the last two
+ // which may contain VCC). After register allocation, we'll replace
+ // these with the ones immediately after those which were really
+ // allocated. In the prologue, copies will be inserted from the argument
+ // to these reserved registers.
+ Info->setScratchRSrcReg(ReservedBufferReg);
+ Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+ }
+ } else {
+ unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
+
+ // Without HSA, relocations are used for the scratch pointer and the
+ // buffer resource setup is always inserted in the prologue. Scratch wave
+ // offset is still in an input SGPR.
+ Info->setScratchRSrcReg(ReservedBufferReg);
+
+ if (HasStackObjects) {
+ unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+ } else {
+ unsigned ReservedOffsetReg
+ = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+ Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+ }
+ }
+
+ if (Info->hasWorkItemIDX()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ } else
+ llvm_unreachable("workitem id x should always be enabled");
+
+ if (Info->hasWorkItemIDY()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ if (Info->hasWorkItemIDZ()) {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
}
if (Chains.empty())
SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FINode->getIndex();
- return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+ // A FrameIndex node represents a 32-bit offset into scratch memory. If
+ // the high bit of a frame index offset were to be set, this would mean
+ // that it represented an offset of ~2GB * 64 = ~128GB from the start of the
+ // scratch buffer, with 64 being the number of threads per wave.
+ //
+ // If we know the machine uses less than 128GB of scratch, then we can
+ // mark the high bit of the FrameIndex node as known zero,
+ // which is important, because it means in most situations we can
+ // prove that values derived from FrameIndex nodes are non-negative.
+ // This enables us to take advantage of more addressing modes when
+ // accessing scratch buffers, since for scratch reads/writes, the register
+ // offset must always be positive.
+
+ SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+ if (Subtarget->enableHugeScratchBuffer())
+ return TFI;
+
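+ // AssertZext to a 31-bit type records that bit 31 is known zero.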
+ return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
+ DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
}
/// This transforms the control flow intrinsics to get the branch destination as
// a glue result.
}
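+// Load a 32-bit kernel argument and assert that the bits above VT's width are
+// zero, so later combines can drop redundant zero-extensions and masks.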
+SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
+ SDValue Op,
+ MVT VT,
+ unsigned Offset) const {
+ SDLoc SL(Op);
+ SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL,
+ DAG.getEntryNode(), Offset, false);
+ // The local size values will have the high 16 bits as zero.
+ return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
+ DAG.getValueType(VT));
+}
+
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
SDLoc DL(Op);
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ // TODO: Should this propagate fast-math-flags?
+
switch (IntrinsicID) {
+ case Intrinsic::amdgcn_dispatch_ptr:
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
+ TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT);
+
case Intrinsic::r600_read_ngroups_x:
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::NGROUPS_X, false);
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
case Intrinsic::r600_read_local_size_x:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::LOCAL_SIZE_X, false);
+ return lowerImplicitZextParam(DAG, Op, MVT::i16,
+ SI::KernelInputOffsets::LOCAL_SIZE_X);
case Intrinsic::r600_read_local_size_y:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::LOCAL_SIZE_Y, false);
+ return lowerImplicitZextParam(DAG, Op, MVT::i16,
+ SI::KernelInputOffsets::LOCAL_SIZE_Y);
case Intrinsic::r600_read_local_size_z:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::LOCAL_SIZE_Z, false);
-
+ return lowerImplicitZextParam(DAG, Op, MVT::i16,
+ SI::KernelInputOffsets::LOCAL_SIZE_Z);
case Intrinsic::AMDGPU_read_workdim:
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- getImplicitParameterOffset(MFI, GRID_DIM), false);
-
+ // Really only 2 bits.
+ return lowerImplicitZextParam(DAG, Op, MVT::i8,
+ getImplicitParameterOffset(MFI, GRID_DIM));
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
case Intrinsic::r600_read_tgid_y:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
case Intrinsic::r600_read_tgid_z:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
case Intrinsic::r600_read_tidig_x:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
case Intrinsic::r600_read_tidig_y:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
case Intrinsic::r600_read_tidig_z:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT);
+ TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
case AMDGPUIntrinsic::SI_load_const: {
SDValue Ops[] = {
Op.getOperand(1),
DAG.getConstant(2, DL, MVT::i32), // P0
Op.getOperand(1), Op.getOperand(2), Glue);
}
+ case AMDGPUIntrinsic::SI_packf16:
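+ // A pack of two undefined inputs is itself undefined; fold it to undef.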
+ if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
+ return DAG.getUNDEF(MVT::i32);
+ return Op;
case AMDGPUIntrinsic::SI_fs_interp: {
SDValue IJ = Op.getOperand(4);
SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
"Custom lowering for non-i32 vectors hasn't been implemented.");
unsigned NumElements = Op.getValueType().getVectorNumElements();
assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
+
switch (Load->getAddressSpace()) {
default: break;
case AMDGPUAS::GLOBAL_ADDRESS:
case AMDGPUAS::PRIVATE_ADDRESS:
+ if (NumElements >= 8)
+ return SplitVectorLoad(Op, DAG);
+
// v4 loads are supported for private and global memory.
if (NumElements <= 4)
break;
// fall-through
case AMDGPUAS::LOCAL_ADDRESS:
- return ScalarizeVectorLoad(Op, DAG);
+ // If properly aligned, splitting might let us use ds_read_b64.
+ return SplitVectorLoad(Op, DAG);
}
}
if (Unsafe) {
// Turn into multiply by the reciprocal.
// x / y -> x * (1.0 / y)
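+ // Mark the new multiply as unsafe-algebra so it keeps the fast-math
+ // freedom of the division it replaces.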
+ SDNodeFlags Flags;
+ Flags.setUnsafeAlgebra(true);
SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
- return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip);
+ return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags);
}
return SDValue();
SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+ // TODO: Should this propagate fast-math-flags?
+
r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
return Ret;
if (VT.isVector() && VT.getVectorNumElements() >= 8)
- return ScalarizeVectorStore(Op, DAG);
+ return SplitVectorStore(Op, DAG);
if (VT == MVT::i1)
return DAG.getTruncStore(Store->getChain(), DL,
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Arg = Op.getOperand(0);
+ // TODO: Should this propagate fast-math-flags?
SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
DAG.getNode(ISD::FMUL, DL, VT, Arg,
DAG.getConstantFP(0.5/M_PI, DL,
}
}
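+// LowerFrameIndex may wrap the TargetFrameIndex in an AssertZext, so look
+// through that wrapper before testing for a frame index node.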
+static bool isFrameIndexOp(SDValue Op) {
+ if (Op.getOpcode() == ISD::AssertZext)
+ Op = Op.getOperand(0);
+
+ return isa<FrameIndexSDNode>(Op);
+}
+
/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
- if (!isa<FrameIndexSDNode>(Node->getOperand(i))) {
+ if (!isFrameIndexOp(Node->getOperand(i))) {
Ops.push_back(Node->getOperand(i));
continue;
}
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
- TII->legalizeOperands(MI);
- if (TII->isMIMG(MI->getOpcode())) {
+ if (TII->isVOP3(MI->getOpcode())) {
+ // Make sure constant bus requirements are respected.
+ TII->legalizeOperandsVOP3(MRI, MI);
+ return;
+ }
+
+ if (TII->isMIMG(*MI)) {
unsigned VReg = MI->getOperand(0).getReg();
unsigned Writemask = MI->getOperand(1).getImm();
unsigned BitsSet = 0;
SDLoc DL,
SDValue Ptr) const {
const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
-#if 1
- // XXX - Workaround for moveToVALU not handling different register class
- // inserts for REG_SEQUENCE.
-
- // Build the half of the subregister with the constants.
- const SDValue Ops0[] = {
- DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
- buildSMovImm32(DAG, DL, 0),
- DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
- buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
- DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
- };
-
- SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
- MVT::v2i32, Ops0), 0);
-
- // Combine the constants and the pointer.
- const SDValue Ops1[] = {
- DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
- Ptr,
- DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
- SubRegHi,
- DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
- };
+ static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+
+ // Build the half of the subregister with the constants before building the
+ // full 128-bit register. If we are building multiple resource descriptors,
+ // this will allow CSEing of the 2-component register.
+ const SDValue Ops0[] = {
+ DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
+ buildSMovImm32(DAG, DL, 0),
+ DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+ buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
+ DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
+ };
- return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
-#else
- const SDValue Ops[] = {
- DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
- Ptr,
- DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
- buildSMovImm32(DAG, DL, 0),
- DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
- buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32),
- DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
- };
+ SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::v2i32, Ops0), 0);
- return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
+ // Combine the constants and the pointer.
+ const SDValue Ops1[] = {
+ DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+ Ptr,
+ DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
+ SubRegHi,
+ DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
+ };
-#endif
+ return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}
/// \brief Return a resource descriptor with the 'Add TID' bit enabled
-/// The TID (Thread ID) is multipled by the stride value (bits [61:48]
-/// of the resource descriptor) to create an offset, which is added to the
-/// resource ponter.
+/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
+/// of the resource descriptor) to create an offset, which is added to
+/// the resource pointer.
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
SDLoc DL,
SDValue Ptr,
return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}
-MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG,
- SDLoc DL,
- SDValue Ptr) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
- uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE |
- 0xffffffff; // Size
-
- return buildRSRC(DAG, DL, Ptr, 0, Rsrc);
-}
-
SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const {