AMDGPU: Add MachineInstr overloads for instruction format tests

[oota-llvm.git] / lib / Target / AMDGPU / SIISelLowering.cpp
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp

index 32f452eeee4d15b3437d954fbd09cd80d826fdc8..0d11f3b6847f01b53b468ea14751728cd439de55 100644 (file)
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -157,6 +157,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
  
    setTruncStoreAction(MVT::i64, MVT::i32, Expand);
    setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
  
    setOperationAction(ISD::LOAD, MVT::i1, Custom);
@@ -254,58 +255,113 @@ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
    return false;
  }
  
-bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                             Type *Ty, unsigned AS) const {
+bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
+  // Flat instructions do not have offsets, and only have the register
+  // address.
+  return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
+}
+
+bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
+  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
+  // additionally can do r + r + i with addr64. 32-bit has more addressing
+  // mode options. Depending on the resource constant, it can also do
+  // (i64 r0) + (i32 r1) * (i14 i).
+  //
+  // Private arrays end up using a scratch buffer most of the time, so also
+  // assume those use MUBUF instructions. Scratch loads / stores are currently
+  // implemented as mubuf instructions with offen bit set, so slightly
+  // different than the normal addr64.
+  if (!isUInt<12>(AM.BaseOffs))
+    return false;
+
+  // FIXME: Since we can split immediate into soffset and immediate offset,
+  // would it make sense to allow any immediate?
+
+  switch (AM.Scale) {
+  case 0: // r + i or just i, depending on HasBaseReg.
+    return true;
+  case 1:
+    return true; // We have r + r or r + i.
+  case 2:
+    if (AM.HasBaseReg) {
+      // Reject 2 * r + r.
+      return false;
+    }
+
+    // Allow 2 * r as r + r
+    // Or  2 * r + i is allowed as r + r + i.
+    return true;
+  default: // Don't allow n * r
+    return false;
+  }
+}
+
+bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
+                                             const AddrMode &AM, Type *Ty,
+                                             unsigned AS) const {
    // No global is ever allowed as a base.
    if (AM.BaseGV)
      return false;
  
    switch (AS) {
-  case AMDGPUAS::GLOBAL_ADDRESS:
-  case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
-  case AMDGPUAS::PRIVATE_ADDRESS:
-  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: {
-    // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
-    // additionally can do r + r + i with addr64. 32-bit has more addressing
-    // mode options. Depending on the resource constant, it can also do
-    // (i64 r0) + (i32 r1) * (i14 i).
-    //
-    // SMRD instructions have an 8-bit, dword offset.
-    //
-    // Assume nonunifom access, since the address space isn't enough to know
-    // what instruction we will use, and since we don't know if this is a load
-    // or store and scalar stores are only available on VI.
-    //
-    // We also know if we are doing an extload, we can't do a scalar load.
-    //
-    // Private arrays end up using a scratch buffer most of the time, so also
-    // assume those use MUBUF instructions. Scratch loads / stores are currently
-    // implemented as mubuf instructions with offen bit set, so slightly
-    // different than the normal addr64.
-    if (!isUInt<12>(AM.BaseOffs))
-      return false;
+  case AMDGPUAS::GLOBAL_ADDRESS: {
+    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+      // Assume the we will use FLAT for all global memory accesses
+      // on VI.
+      // FIXME: This assumption is currently wrong.  On VI we still use
+      // MUBUF instructions for the r + i addressing mode.  As currently
+      // implemented, the MUBUF instructions only work on buffer < 4GB.
+      // It may be possible to support > 4GB buffers with MUBUF instructions,
+      // by setting the stride value in the resource descriptor which would
+      // increase the size limit to (stride * 4GB).  However, this is risky,
+      // because it has never been validated.
+      return isLegalFlatAddressingMode(AM);
+    }
  
-    // FIXME: Since we can split immediate into soffset and immediate offset,
-    // would it make sense to allow any immediate?
+    return isLegalMUBUFAddressingMode(AM);
+  }
+  case AMDGPUAS::CONSTANT_ADDRESS: {
+    // If the offset isn't a multiple of 4, it probably isn't going to be
+    // correctly aligned.
+    if (AM.BaseOffs % 4 != 0)
+      return isLegalMUBUFAddressingMode(AM);
+
+    // There are no SMRD extloads, so if we have to do a small type access we
+    // will use a MUBUF load.
+    // FIXME?: We also need to do this if unaligned, but we don't know the
+    // alignment here.
+    if (DL.getTypeStoreSize(Ty) < 4)
+      return isLegalMUBUFAddressingMode(AM);
+
+    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+      // SMRD instructions have an 8-bit, dword offset on SI.
+      if (!isUInt<8>(AM.BaseOffs / 4))
+        return false;
+    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
+      // On CI+, this can also be a 32-bit literal constant offset. If it fits
+      // in 8-bits, it can use a smaller encoding.
+      if (!isUInt<32>(AM.BaseOffs / 4))
+        return false;
+    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
+      if (!isUInt<20>(AM.BaseOffs))
+        return false;
+    } else
+      llvm_unreachable("unhandled generation");
  
-    switch (AM.Scale) {
-    case 0: // r + i or just i, depending on HasBaseReg.
+    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
        return true;
-    case 1:
-      return true; // We have r + r or r + i.
-    case 2:
-      if (AM.HasBaseReg) {
-        // Reject 2 * r + r.
-        return false;
-      }
  
-      // Allow 2 * r as r + r
-      // Or  2 * r + i is allowed as r + r + i.
+    if (AM.Scale == 1 && AM.HasBaseReg)
        return true;
-    default: // Don't allow n * r
-      return false;
-    }
+
+    return false;
    }
+
+  case AMDGPUAS::PRIVATE_ADDRESS:
+  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
+    return isLegalMUBUFAddressingMode(AM);
+
    case AMDGPUAS::LOCAL_ADDRESS:
    case AMDGPUAS::REGION_ADDRESS: {
      // Basic, single offset DS instructions allow a 16-bit unsigned immediate
@@ -323,11 +379,9 @@ bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
  
      return false;
    }
-  case AMDGPUAS::FLAT_ADDRESS: {
-    // Flat instructions do not have offsets, and only have the register
-    // address.
-    return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
-  }
+  case AMDGPUAS::FLAT_ADDRESS:
+    return isLegalFlatAddressingMode(AM);
+
    default:
      llvm_unreachable("unhandled address space");
    }
@@ -355,7 +409,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
      // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
      // aligned, 8 byte access in a single operation using ds_read2/write2_b32
      // with adjacent offsets.
-    return Align % 4 == 0;
+    bool AlignedBy4 = (Align % 4 == 0);
+    if (IsFast)
+      *IsFast = AlignedBy4;
+    return AlignedBy4;
    }
  
    // Smaller than dword value must be aligned.
@@ -407,16 +464,10 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
    return TII->isInlineConstant(Imm);
  }
  
-static EVT toIntegerVT(EVT VT) {
-  if (VT.isVector())
-    return VT.changeVectorElementTypeToInteger();
-  return MVT::getIntegerVT(VT.getSizeInBits());
-}
-
  SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                           SDLoc SL, SDValue Chain,
                                           unsigned Offset, bool Signed) const {
-  const DataLayout *DL = getDataLayout();
+  const DataLayout &DL = DAG.getDataLayout();
    MachineFunction &MF = DAG.getMachineFunction();
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
@@ -425,36 +476,21 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
    Type *Ty = VT.getTypeForEVT(*DAG.getContext());
  
    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
-  MVT PtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);
+  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
    PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
    SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
                                         MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
                              DAG.getConstant(Offset, SL, PtrVT));
-  SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
+  SDValue PtrOffset = DAG.getUNDEF(PtrVT);
    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
  
-  unsigned Align = DL->getABITypeAlignment(Ty);
-
-  if (VT != MemVT && VT.isFloatingPoint()) {
-    // Do an integer load and convert.
-    // FIXME: This is mostly because load legalization after type legalization
-    // doesn't handle FP extloads.
-    assert(VT.getScalarType() == MVT::f32 &&
-           MemVT.getScalarType() == MVT::f16);
-
-    EVT IVT = toIntegerVT(VT);
-    EVT MemIVT = toIntegerVT(MemVT);
-    SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD,
-                               IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT,
-                               false, // isVolatile
-                               true, // isNonTemporal
-                               true, // isInvariant
-                               Align); // Alignment
-    return DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load);
-  }
+  unsigned Align = DL.getABITypeAlignment(Ty);
  
    ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+  if (MemVT.isFloatingPoint())
+    ExtTy = ISD::EXTLOAD;
+
    return DAG.getLoad(ISD::UNINDEXED, ExtTy,
                       VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
                       false, // isVolatile
@@ -474,7 +510,7 @@ SDValue SITargetLowering::LowerFormalArguments(
    FunctionType *FType = MF.getFunction()->getFunctionType();
    SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  
-  assert(CallConv == CallingConv::C);
+  // FIXME: We currently assume all calling conventions are kernels.
  
    SmallVector<ISD::InputArg, 16> Splits;
    BitVector Skipped(Ins.size());
@@ -489,7 +525,7 @@ SDValue SITargetLowering::LowerFormalArguments(
        assert((PSInputNum <= 15) && "Too many PS inputs!");
  
        if (!Arg.Used) {
-        // We can savely skip PS inputs
+        // We can safely skip PS inputs
          Skipped.set(i);
          ++PSInputNum;
          continue;
@@ -506,7 +542,7 @@ SDValue SITargetLowering::LowerFormalArguments(
  
        // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
        // three or five element vertex only needs three or five registers,
-      // NOT four or eigth.
+      // NOT four or eight.
        Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
        unsigned NumElements = ParamType->getVectorNumElements();
  
@@ -533,7 +569,7 @@ SDValue SITargetLowering::LowerFormalArguments(
    }
  
    // The pointer to the list of arguments is stored in SGPR0, SGPR1
-       // The pointer to the scratch buffer is stored in SGPR2, SGPR3
+  // The pointer to the scratch buffer is stored in SGPR2, SGPR3
    if (Info->getShaderType() == ShaderType::COMPUTE) {
      if (Subtarget->isAmdHsaOS())
        Info->NumUserSGPRs = 2;  // FIXME: Need to support scratch buffers.
@@ -569,6 +605,8 @@ SDValue SITargetLowering::LowerFormalArguments(
  
    AnalyzeFormalArguments(CCInfo, Splits);
  
+  SmallVector<SDValue, 16> Chains;
+
    for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
  
      const ISD::InputArg &Arg = Ins[i];
@@ -587,10 +625,11 @@ SDValue SITargetLowering::LowerFormalArguments(
                                VA.getLocMemOffset();
        // The first 36 bytes of the input buffer contains information about
        // thread group and global sizes.
-      SDValue Arg = LowerParameter(DAG, VT, MemVT,  DL, DAG.getRoot(),
+      SDValue Arg = LowerParameter(DAG, VT, MemVT,  DL, Chain,
                                     Offset, Ins[i].Flags.isSExt());
+      Chains.push_back(Arg.getValue(1));
  
-      const PointerType *ParamTy =
+      auto *ParamTy =
          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
        if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
            ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
@@ -614,7 +653,8 @@ SDValue SITargetLowering::LowerFormalArguments(
        Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
                                       &AMDGPU::SReg_64RegClass);
        Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
-      InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+      SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+      InVals.push_back(Copy);
        continue;
      }
  
@@ -634,7 +674,9 @@ SDValue SITargetLowering::LowerFormalArguments(
        for (unsigned j = 1; j != NumElements; ++j) {
          Reg = ArgLocs[ArgIdx++].getLocReg();
          Reg = MF.addLiveIn(Reg, RC);
-        Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+
+        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+        Regs.push_back(Copy);
        }
  
        // Fill up the missing vector elements
@@ -649,11 +691,15 @@ SDValue SITargetLowering::LowerFormalArguments(
    }
  
    if (Info->getShaderType() != ShaderType::COMPUTE) {
-    unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>(
+    unsigned ScratchIdx = CCInfo.getFirstUnallocated(makeArrayRef(
          AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
      Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
    }
-  return Chain;
+
+  if (Chains.empty())
+    return Chain;
+
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  }
  
  MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
@@ -695,14 +741,15 @@ bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
    return true;
  }
  
-EVT SITargetLowering::getSetCCResultType(LLVMContext &Ctx, EVT VT) const {
+EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
+                                         EVT VT) const {
    if (!VT.isVector()) {
      return MVT::i1;
    }
    return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
  }
  
-MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
+MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const {
    return MVT::i32;
  }
  
@@ -795,10 +842,29 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
  
  SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
  
+  SDLoc SL(Op);
    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
    unsigned FrameIndex = FINode->getIndex();
  
-  return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+  // A FrameIndex node represents a 32-bit offset into scratch memory.  If
+  // the high bit of a frame index offset were to be set, this would mean
+  // that it represented an offset of ~2GB * 64 = ~128GB from the start of the
+  // scratch buffer, with 64 being the number of threads per wave.
+  //
+  // If we know the machine uses less than 128GB of scratch, then we can
+  // amrk the high bit of the FrameIndex node as known zero,
+  // which is important, because it means in most situations we can
+  // prove that values derived from FrameIndex nodes are non-negative.
+  // This enables us to take advantage of more addressing modes when
+  // accessing scratch buffers, since for scratch reads/writes, the register
+  // offset must always be positive.
+
+  SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
+  if (Subtarget->enableHugeScratchBuffer())
+    return TFI;
+
+  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
+                    DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
  }
  
  /// This transforms the control flow intrinsics to get the branch destination as
@@ -888,7 +954,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
  
    SDLoc DL(GSD);
    const GlobalValue *GV = GSD->getGlobal();
-  MVT PtrVT = getPointerTy(GSD->getAddressSpace());
+  MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace());
  
    SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
@@ -926,6 +992,7 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
  SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                    SelectionDAG &DAG) const {
    MachineFunction &MF = DAG.getMachineFunction();
+  auto MFI = MF.getInfo<SIMachineFunctionInfo>();
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  
@@ -933,6 +1000,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
    SDLoc DL(Op);
    unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  
+  // TODO: Should this propagate fast-math-flags?
+
    switch (IntrinsicID) {
    case Intrinsic::r600_read_ngroups_x:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
@@ -964,8 +1033,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
  
    case Intrinsic::AMDGPU_read_workdim:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          MF.getInfo<SIMachineFunctionInfo>()->ABIArgOffset,
-                          false);
+                          getImplicitParameterOffset(MFI, GRID_DIM), false);
  
    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
@@ -1182,8 +1250,10 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
    if (Unsafe) {
      // Turn into multiply by the reciprocal.
      // x / y -> x * (1.0 / y)
+    SDNodeFlags Flags;
+    Flags.setUnsafeAlgebra(true);
      SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
-    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip);
+    return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags);
    }
  
    return SDValue();
@@ -1213,12 +1283,15 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  
    const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
  
-  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
  
    SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
  
    SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
  
+  // TODO: Should this propagate fast-math-flags?
+
    r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
  
    SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
@@ -1338,6 +1411,7 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue Arg = Op.getOperand(0);
+  // TODO: Should this propagate fast-math-flags?
    SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
                                    DAG.getNode(ISD::FMUL, DL, VT, Arg,
                                                DAG.getConstantFP(0.5/M_PI, DL,
@@ -1411,7 +1485,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
      unsigned AS = Load->getAddressSpace();
      unsigned Align = Load->getAlignment();
      Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
-    unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
+    unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
  
      // Don't try to replace the load if we have to expand it due to alignment
      // problems. Otherwise we will end up scalarizing the load, and trying to
@@ -2016,6 +2090,13 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
    }
  }
  
+static bool isFrameIndexOp(SDValue Op) {
+  if (Op.getOpcode() == ISD::AssertZext)
+    Op = Op.getOperand(0);
+
+  return isa<FrameIndexSDNode>(Op);
+}
+
  /// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
  /// with frame index operands.
  /// LLVM assumes that inputs are to these instructions are registers.
@@ -2024,7 +2105,7 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
  
    SmallVector<SDValue, 8> Ops;
    for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
-    if (!isa<FrameIndexSDNode>(Node->getOperand(i))) {
+    if (!isFrameIndexOp(Node->getOperand(i))) {
        Ops.push_back(Node->getOperand(i));
        continue;
      }
@@ -2065,7 +2146,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
    MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
    TII->legalizeOperands(MI);
  
-  if (TII->isMIMG(MI->getOpcode())) {
+  if (TII->isMIMG(*MI)) {
      unsigned VReg = MI->getOperand(0).getReg();
      unsigned Writemask = MI->getOperand(1).getImm();
      unsigned BitsSet = 0;
@@ -2107,53 +2188,38 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                  SDLoc DL,
                                                  SDValue Ptr) const {
    const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
-#if 1
-    // XXX - Workaround for moveToVALU not handling different register class
-    // inserts for REG_SEQUENCE.
-
-    // Build the half of the subregister with the constants.
-    const SDValue Ops0[] = {
-      DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
-      buildSMovImm32(DAG, DL, 0),
-      DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
-      buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
-      DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
-    };
-
-    SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
-                                                  MVT::v2i32, Ops0), 0);
-
-    // Combine the constants and the pointer.
-    const SDValue Ops1[] = {
-      DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
-      Ptr,
-      DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
-      SubRegHi,
-      DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
-    };
+    static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+
+  // Build the half of the subregister with the constants before building the
+  // full 128-bit register. If we are building multiple resource descriptors,
+  // this will allow CSEing of the 2-component register.
+  const SDValue Ops0[] = {
+    DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
+    buildSMovImm32(DAG, DL, 0),
+    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
+    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
+  };
  
-    return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
-#else
-    const SDValue Ops[] = {
-      DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
-      Ptr,
-      DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
-      buildSMovImm32(DAG, DL, 0),
-      DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
-      buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32),
-      DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
-    };
+  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                                MVT::v2i32, Ops0), 0);
  
-    return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
+  // Combine the constants and the pointer.
+  const SDValue Ops1[] = {
+    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+    Ptr,
+    DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
+    SubRegHi,
+    DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
+  };
  
-#endif
+  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
  }
  
  /// \brief Return a resource descriptor with the 'Add TID' bit enabled
-///        The TID (Thread ID) is multipled by the stride value (bits [61:48]
-///        of the resource descriptor) to create an offset, which is added to the
-///        resource ponter.
+///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
+///        of the resource descriptor) to create an offset, which is added to
+///        the resource pointer.
  MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
                                             SDLoc DL,
                                             SDValue Ptr,
@@ -2191,10 +2257,8 @@ MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG,
                                                    SDValue Ptr) const {
    const SIInstrInfo *TII =
        static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
-  uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE |
-                  0xffffffff; // Size
  
-  return buildRSRC(DAG, DL, Ptr, 0, Rsrc);
+  return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23());
  }
  
  SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,