From e3d4cbc7d25061441adafa47450a31571c87bf85 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Fri, 28 Jun 2013 15:47:08 +0000
Subject: [PATCH] R600: Add local memory support via LDS

Reviewed-by: Vincent Lejeune<vljn at ovi.com>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185162 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/R600/AMDGPUAsmPrinter.cpp      |  6 ++
 lib/Target/R600/AMDGPUISelLowering.cpp    | 23 +++++++
 lib/Target/R600/AMDGPUISelLowering.h      |  3 +
 lib/Target/R600/AMDGPUInstructions.td     |  9 +++
 lib/Target/R600/AMDGPUMachineFunction.cpp |  1 +
 lib/Target/R600/AMDGPUMachineFunction.h   |  2 +
 lib/Target/R600/AMDILISelDAGToDAG.cpp     | 15 +++--
 lib/Target/R600/R600Defines.h             |  6 +-
 lib/Target/R600/R600ISelLowering.cpp      | 18 ++++-
 lib/Target/R600/R600InstrFormats.td       | 43 ++++++++++--
 lib/Target/R600/R600InstrInfo.cpp         | 46 ++++++++++---
 lib/Target/R600/R600InstrInfo.h           | 11 +++
 lib/Target/R600/R600Instructions.td       | 75 +++++++++++++++++++++
 lib/Target/R600/R600MachineScheduler.cpp  | 12 +++-
 lib/Target/R600/R600Packetizer.cpp        |  3 +
 lib/Target/R600/R600RegisterInfo.td       |  3 +-
 lib/Target/R600/R600Schedule.td           |  2 +
 test/CodeGen/R600/local-memory.ll         | 82 +++++++++++++++++++++++
 18 files changed, 336 insertions(+), 24 deletions(-)
 create mode 100644 test/CodeGen/R600/local-memory.ll
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index f720c7ecb63..996d2a61221 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -29,6 +29,7 @@
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/ELF.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 
@@ -130,6 +131,11 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) {
                            S_STACK_SIZE(MFI->StackSize), 4);
   OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
   OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
+
+  if (MFI->ShaderType == ShaderType::COMPUTE) {
+    OutStreamer.EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
+    OutStreamer.EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
+  }
 }
 
 void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 6d7359053dc..4019a1f02ef 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DataLayout.h"
 
 using namespace llvm;
 
@@ -71,6 +72,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
   setOperationAction(ISD::UREM, MVT::i32, Expand);
 
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+
   int types[] = {
     (int)MVT::v2i32,
     (int)MVT::v4i32
@@ -138,6 +141,26 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
   return Op;
 }
 
+SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
+                                                 SDValue Op,
+                                                 SelectionDAG &DAG) const {
+
+  const DataLayout *TD = getTargetMachine().getDataLayout();
+  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
+  // XXX: What does the value of G->getOffset() mean?
+  assert(G->getOffset() == 0 &&
+         "Do not know what to do with an non-zero offset");
+
+  unsigned Offset = MFI->LDSSize;
+  const GlobalValue *GV = G->getGlobal();
+  uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+
+  // XXX: Account for alignment?
+  MFI->LDSSize += Size;
+
+  return DAG.getConstant(Offset, MVT::i32);
+}
+
 SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     SelectionDAG &DAG) const {
   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 69a0ac95b29..d739a013242 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -20,6 +20,7 @@
 
 namespace llvm {
 
+class AMDGPUMachineFunction;
 class MachineRegisterInfo;
 
 class AMDGPUTargetLowering : public TargetLowering {
@@ -36,6 +37,8 @@ protected:
   virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
                                        const TargetRegisterClass *RC,
                                        unsigned Reg, EVT VT) const;
+  SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
+                             SelectionDAG &DAG) const;
 
   bool isHWTrueValue(SDValue Op) const;
   bool isHWFalseValue(SDValue Op) const;
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index 29df37499a2..234bb994f37 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -94,6 +94,15 @@ def zextloadi8_constant : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{
     return isGlobalLoad(dyn_cast<LoadSDNode>(N));
 }]>;
 
+def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def local_store : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+    return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
 class Constants {
 int TWO_PI = 0x40c90fdb;
 int PI = 0x40490fdb;
diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp
index 04610254029..9a1e3447e7e 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.cpp
+++ b/lib/Target/R600/AMDGPUMachineFunction.cpp
@@ -10,6 +10,7 @@ const char *AMDGPUMachineFunction::ShaderTypeAttribute = "ShaderType";
 AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
     MachineFunctionInfo() {
   ShaderType = ShaderType::COMPUTE;
+  LDSSize = 0;
   AttributeSet Set = MF.getFunction()->getAttributes();
   Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
                                  ShaderTypeAttribute);
diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h
index 21c8c51dae4..5d5df1249b4 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.h
+++ b/lib/Target/R600/AMDGPUMachineFunction.h
@@ -23,6 +23,8 @@ private:
 public:
   AMDGPUMachineFunction(const MachineFunction &MF);
   unsigned ShaderType;
+  /// Number of bytes in the LDS that are being used.
+  unsigned LDSSize;
 };
 
 }
diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp
index 9f077b9db58..e79ab3c0db9 100644
--- a/lib/Target/R600/AMDILISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp
@@ -282,11 +282,16 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
 
         int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(),
                                         AMDGPU::OpName::literal);
-        assert(ImmIdx != -1);
+        if (ImmIdx == -1) {
+          continue;
+        }
 
-        // subtract one from ImmIdx, because the DST operand is usually index
-        // 0 for MachineInstrs, but we have no DST in the Ops vector.
-        ImmIdx--;
+        if (TII->getOperandIdx(Use->getMachineOpcode(),
+                               AMDGPU::OpName::dst) != -1) {
+          // subtract one from ImmIdx, because the DST operand is usually index
+          // 0 for MachineInstrs, but we have no DST in the Ops vector.
+          ImmIdx--;
+        }
 
         // Check that we aren't already using an immediate.
         // XXX: It's possible for an instruction to have more than one
@@ -336,7 +341,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     }
     if (Result && Result->isMachineOpcode() &&
         !(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR)
-        && TII->isALUInstr(Result->getMachineOpcode())) {
+        && TII->hasInstrModifiers(Result->getMachineOpcode())) {
       // Fold FNEG/FABS/CONST_ADDRESS
       // TODO: Isel can generate multiple MachineInst, we need to recursively
       // parse Result
diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h
index 6bcf8aefa74..90fc29ce14d 100644
--- a/lib/Target/R600/R600Defines.h
+++ b/lib/Target/R600/R600Defines.h
@@ -42,7 +42,9 @@ namespace R600_InstFlag {
     OP2 = (1 << 11),
     VTX_INST  = (1 << 12),
     TEX_INST = (1 << 13),
-    ALU_INST = (1 << 14)
+    ALU_INST = (1 << 14),
+    LDS_1A = (1 << 15),
+    LDS_1A1D = (1 << 16)
   };
 }
 
@@ -162,4 +164,6 @@ namespace OpName {
 #define R_028878_SQ_PGM_RESOURCES_GS                 0x028878
 #define R_0288D4_SQ_PGM_RESOURCES_LS                 0x0288d4
 
+#define R_0288E8_SQ_LDS_ALLOC                        0x0288E8
+
 #endif // R600DEFINES_H_
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index b898af13b7a..ce2aa9208f8 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -138,6 +138,19 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
     break;
   }
 
+  case AMDGPU::LDS_READ_RET: {
+    MachineInstrBuilder NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
+                                        TII->get(MI->getOpcode()),
+                                        AMDGPU::OQAP);
+    for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
+      NewMI.addOperand(MI->getOperand(i));
+    }
+    TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
+                                 MI->getOperand(0).getReg(),
+                                 AMDGPU::OQAP);
+    break;
+  }
+
   case AMDGPU::MOV_IMM_F32:
     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                      MI->getOperand(1).getFPImm()->getValueAPF()
@@ -456,6 +469,8 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 //===----------------------------------------------------------------------===//
 
 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
@@ -463,14 +478,13 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
   case ISD::STORE: return LowerSTORE(Op, DAG);
   case ISD::LOAD: return LowerLOAD(Op, DAG);
   case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
+  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
   case ISD::INTRINSIC_VOID: {
     SDValue Chain = Op.getOperand(0);
     unsigned IntrinsicID =
                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
     switch (IntrinsicID) {
     case AMDGPUIntrinsic::AMDGPU_store_output: {
-      MachineFunction &MF = DAG.getMachineFunction();
-      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
       MFI->LiveOuts.push_back(Reg);
diff --git a/lib/Target/R600/R600InstrFormats.td b/lib/Target/R600/R600InstrFormats.td
index 2c98fb90db5..2d72404702f 100644
--- a/lib/Target/R600/R600InstrFormats.td
+++ b/lib/Target/R600/R600InstrFormats.td
@@ -23,6 +23,8 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
   bits<2> FlagOperandIdx = 0;
   bit Op1 = 0;
   bit Op2 = 0;
+  bit LDS_1A = 0;
+  bit LDS_1A1D = 0;
   bit HasNativeOperands = 0;
   bit VTXInst = 0;
   bit TEXInst = 0;
@@ -49,21 +51,21 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
   let TSFlags{12} = VTXInst;
   let TSFlags{13} = TEXInst;
   let TSFlags{14} = ALUInst;
+  let TSFlags{15} = LDS_1A;
+  let TSFlags{16} = LDS_1A1D;
 }
 
 //===----------------------------------------------------------------------===//
 // ALU instructions
 //===----------------------------------------------------------------------===//
 
-class R600ALU_Word0 {
+class R600_ALU_LDS_Word0 {
   field bits<32> Word0;
 
   bits<11> src0;
-  bits<1>  src0_neg;
   bits<1>  src0_rel;
   bits<11> src1;
   bits<1>  src1_rel;
-  bits<1>  src1_neg;
   bits<3>  index_mode = 0;
   bits<2>  pred_sel;
   bits<1>  last;
@@ -76,16 +78,23 @@ class R600ALU_Word0 {
   let Word0{8-0}   = src0_sel;
   let Word0{9}     = src0_rel;
   let Word0{11-10} = src0_chan;
-  let Word0{12}    = src0_neg;
   let Word0{21-13} = src1_sel;
   let Word0{22}    = src1_rel;
   let Word0{24-23} = src1_chan;
-  let Word0{25}    = src1_neg;
   let Word0{28-26} = index_mode;
   let Word0{30-29} = pred_sel;
   let Word0{31}    = last;
 }
 
+class R600ALU_Word0 : R600_ALU_LDS_Word0 {
+
+  bits<1>  src0_neg;
+  bits<1>  src1_neg;
+
+  let Word0{12}    = src0_neg;
+  let Word0{25}    = src1_neg;
+}
+
 class R600ALU_Word1 {
   field bits<32> Word1;
 
@@ -138,6 +147,30 @@ class R600ALU_Word1_OP3 <bits<5> alu_inst> : R600ALU_Word1{
   let Word1{17-13} = alu_inst;
 }
 
+class R600LDS_Word1 {
+  field bits<32> Word1;
+
+  bits<11> src2;
+  bits<9>  src2_sel  = src2{8-0};
+  bits<2>  src2_chan = src2{10-9};
+  bits<1>  src2_rel;
+  // offset specifies the stride offset to the second set of data to be read
+  // from.  This is a dword offset.
+  bits<5>  alu_inst = 17; // OP3_INST_LDS_IDX_OP
+  bits<3>  bank_swizzle;
+  bits<6>  lds_op;
+  bits<2>  dst_chan = 0;
+
+  let Word1{8-0}   = src2_sel;
+  let Word1{9}     = src2_rel;
+  let Word1{11-10} = src2_chan;
+  let Word1{17-13} = alu_inst;
+  let Word1{20-18} = bank_swizzle;
+  let Word1{26-21} = lds_op;
+  let Word1{30-29} = dst_chan;
+}
+
+
 /*
 XXX: R600 subtarget uses a slightly different encoding than the other
 subtargets.  We currently handle this in R600MCCodeEmitter, but we may
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 3b1a24015a8..f05390ec48c 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -136,6 +136,21 @@ bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
   return (TargetFlags & R600_InstFlag::ALU_INST);
 }
 
+bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const {
+  unsigned TargetFlags = get(Opcode).TSFlags;
+
+  return ((TargetFlags & R600_InstFlag::OP1) |
+          (TargetFlags & R600_InstFlag::OP2) |
+          (TargetFlags & R600_InstFlag::OP3));
+}
+
+bool R600InstrInfo::isLDSInstr(unsigned Opcode) const {
+  unsigned TargetFlags = get(Opcode).TSFlags;
+
+  return ((TargetFlags & R600_InstFlag::LDS_1A) |
+          (TargetFlags & R600_InstFlag::LDS_1A1D));
+}
+
 bool R600InstrInfo::isTransOnly(unsigned Opcode) const {
   return (get(Opcode).TSFlags & R600_InstFlag::TRANS_ONLY);
 }
@@ -245,6 +260,9 @@ R600InstrInfo::ExtractSrcs(MachineInstr *MI,
     unsigned Reg = Srcs[i].first->getReg();
     unsigned Index = RI.getEncodingValue(Reg) & 0xff;
     unsigned Chan = RI.getHWRegChan(Reg);
+    if (Reg == AMDGPU::OQAP) {
+      Result.push_back(std::pair<int, unsigned>(Index, 0));
+    }
     if (Index > 127) {
       Result.push_back(DummyPair);
       continue;
@@ -287,10 +305,11 @@ Swizzle(std::vector<std::pair<int, unsigned> > Src,
   return Src;
 }
 
-static bool
-isLegal(const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
-    const std::vector<R600InstrInfo::BankSwizzle> &Swz,
-    unsigned CheckedSize) {
+bool
+R600InstrInfo::isLegal(
+             const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+             const std::vector<R600InstrInfo::BankSwizzle> &Swz,
+             unsigned CheckedSize) const {
   int Vector[4][3];
   memset(Vector, -1, sizeof(Vector));
   for (unsigned i = 0; i < CheckedSize; i++) {
@@ -300,6 +319,16 @@ isLegal(const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
       const std::pair<int, unsigned> &Src = Srcs[j];
       if (Src.first < 0)
         continue;
+      if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) {
+        if (Swz[i] != R600InstrInfo::ALU_VEC_012 &&
+            Swz[i] != R600InstrInfo::ALU_VEC_021) {
+            // The value from output queue A (denoted by register OQAP) can
+            // only be fetched during the first cycle.
+            return false;
+        }
+        // OQAP does not count towards the normal read port restrictions
+        continue;
+      }
       if (Vector[Src.second][j] < 0)
         Vector[Src.second][j] = Src.first;
       if (Vector[Src.second][j] != Src.first)
@@ -309,10 +338,11 @@ isLegal(const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
   return true;
 }
 
-static bool recursiveFitsFPLimitation(
-const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
-std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
-unsigned Depth = 0) {
+bool
+R600InstrInfo::recursiveFitsFPLimitation(
+             const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+             std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+             unsigned Depth) const {
   if (!isLegal(IGSrcs, SwzCandidate, Depth))
     return false;
   if (IGSrcs.size() == Depth)
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index 3c2e50be662..a3752889369 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -63,6 +63,8 @@ namespace llvm {
 
   /// \returns true if this \p Opcode represents an ALU instruction.
   bool isALUInstr(unsigned Opcode) const;
+  bool hasInstrModifiers(unsigned Opcode) const;
+  bool isLDSInstr(unsigned Opcode) const;
 
   bool isTransOnly(unsigned Opcode) const;
   bool isTransOnly(const MachineInstr *MI) const;
@@ -82,6 +84,15 @@ namespace llvm {
   SmallVector<std::pair<MachineOperand *, int64_t>, 3>
       getSrcs(MachineInstr *MI) const;
 
+  bool isLegal(
+             const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+             const std::vector<R600InstrInfo::BankSwizzle> &Swz,
+             unsigned CheckedSize) const;
+  bool recursiveFitsFPLimitation(
+             const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+             std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+             unsigned Depth = 0) const;
+
   /// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210
   /// returns true and the first (in lexical order) BankSwizzle affectation
   /// starting from the one already provided in the Instruction Group MIs that
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index f42501a2fd0..fd585f8b7e9 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1529,6 +1529,81 @@ def GROUP_BARRIER : InstR600 <
   let ALUInst = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// LDS Instructions
+//===----------------------------------------------------------------------===//
+class R600_LDS  <bits<6> op, dag outs, dag ins, string asm,
+                 list<dag> pattern = []> :
+
+    InstR600 <outs, ins, asm, pattern, XALU>,
+    R600_ALU_LDS_Word0,
+    R600LDS_Word1 {
+
+  bits<6>  offset = 0;
+  let lds_op = op;
+
+  let Word1{27} = offset{0};
+  let Word1{12} = offset{1};
+  let Word1{28} = offset{2};
+  let Word1{31} = offset{3};
+  let Word0{12} = offset{4};
+  let Word0{25} = offset{5};
+
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+
+  let ALUInst = 1;
+  let HasNativeOperands = 1;
+  let UseNamedOperandTable = 1;
+}
+
+class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS <
+  lds_op,
+  (outs R600_Reg32:$dst),
+  (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+       LAST:$last, R600_Pred:$pred_sel,
+       BANK_SWIZZLE:$bank_swizzle),
+  "  "#name#" $last OQAP, $src0$src0_rel $pred_sel",
+  pattern
+  > {
+
+  let src1 = 0;
+  let src1_rel = 0;
+  let src2 = 0;
+  let src2_rel = 0;
+
+  let Defs = [OQAP];
+  let usesCustomInserter = 1;
+  let LDS_1A = 1;
+  let DisableEncoding = "$dst";
+}
+
+class R600_LDS_1A1D <bits<6> lds_op, string name, list<dag> pattern> :
+    R600_LDS <
+  lds_op,
+  (outs),
+  (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+       R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
+       LAST:$last, R600_Pred:$pred_sel,
+       BANK_SWIZZLE:$bank_swizzle),
+  "  "#name#" $last $src0$src0_rel, $src1$src1_rel, $pred_sel",
+  pattern
+  > {
+
+  let src2 = 0;
+  let src2_rel = 0;
+  let LDS_1A1D = 1;
+}
+
+def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
+  [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))]
+>;
+
+def LDS_WRITE : R600_LDS_1A1D <0xD, "LDS_WRITE",
+  [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
+>;
+
   // TRUNC is used for the FLT_TO_INT instructions to work around a
   // perceived problem where the rounding modes are applied differently
   // depending on the instruction and the slot they are in.
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index acc1b4d6ee3..7e28f9dde47 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -278,6 +278,10 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
       return AluT_XYZW;
     }
 
+    if (TII->isLDSInstr(MI->getOpcode())) {
+      return AluT_X;
+    }
+
     // Is the result already assigned to a channel ?
     unsigned DestSubReg = MI->getOperand(0).getSubReg();
     switch (DestSubReg) {
@@ -371,14 +375,18 @@ void R600SchedStrategy::PrepareNextSlot() {
 }
 
 void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
-  unsigned DestReg = MI->getOperand(0).getReg();
+  int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+  if (DstIndex == -1) {
+    return;
+  }
+  unsigned DestReg = MI->getOperand(DstIndex).getReg();
   // PressureRegister crashes if an operand is def and used in the same inst
   // and we try to constraint its regclass
   for (MachineInstr::mop_iterator It = MI->operands_begin(),
       E = MI->operands_end(); It != E; ++It) {
     MachineOperand &MO = *It;
     if (MO.isReg() && !MO.isDef() &&
-        MO.getReg() == MI->getOperand(0).getReg())
+        MO.getReg() == DestReg)
       return;
   }
   // Constrains the regclass of DestReg to assign it to Slot
diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
index 4c72d229675..6fc15deb5c9 100644
--- a/lib/Target/R600/R600Packetizer.cpp
+++ b/lib/Target/R600/R600Packetizer.cpp
@@ -92,6 +92,9 @@ private:
         Result[Dst] = AMDGPU::PV_X;
         continue;
       }
+      if (Dst == AMDGPU::OQAP) {
+        continue;
+      }
       unsigned PVReg = 0;
       switch (TRI.getHWRegChan(Dst)) {
       case 0:
diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
index a8b9b70322c..60a93e3f6c2 100644
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@@ -101,6 +101,7 @@ def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
 def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
 def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>;
 def AR_X : R600Reg<"AR.x", 0>;
+def OQAP : R600Reg<"OQAP", 221>;
 
 def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
                           (add (sequence "ArrayBase%u", 448, 480))>;
@@ -170,7 +171,7 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
     R600_ArrayBase,
     R600_Addr,
     ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
-    ALU_CONST, ALU_PARAM
+    ALU_CONST, ALU_PARAM, OQAP
     )>;
 
 def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add
diff --git a/lib/Target/R600/R600Schedule.td b/lib/Target/R600/R600Schedule.td
index 207233d7e76..df62bf85c0a 100644
--- a/lib/Target/R600/R600Schedule.td
+++ b/lib/Target/R600/R600Schedule.td
@@ -23,6 +23,7 @@ def TRANS : FuncUnit;
 def AnyALU : InstrItinClass;
 def VecALU : InstrItinClass;
 def TransALU : InstrItinClass;
+def XALU : InstrItinClass;
 
 def R600_VLIW5_Itin : ProcessorItineraries <
   [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL],
@@ -31,6 +32,7 @@ def R600_VLIW5_Itin : ProcessorItineraries <
     InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>,
     InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
     InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>,
+    InstrItinData<XALU, [InstrStage<1, [ALU_X]>]>,
     InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]>
   ]
 >;
diff --git a/test/CodeGen/R600/local-memory.ll b/test/CodeGen/R600/local-memory.ll
new file mode 100644
index 00000000000..0ff38483087
--- /dev/null
+++ b/test/CodeGen/R600/local-memory.ll
@@ -0,0 +1,82 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+
+@local_memory.local_mem = internal addrspace(3) unnamed_addr global [16 x i32] zeroinitializer, align 4
+
+; CHECK: @local_memory
+
+; Check that the LDS size emitted correctly
+; CHECK: .long 166120
+; CHECK-NEXT: .long 16
+
+; CHECK: LDS_WRITE
+
+; GROUP_BARRIER must be the last instruction in a clause
+; CHECK: GROUP_BARRIER
+; CHECK-NEXT: ALU clause
+
+; CHECK: LDS_READ_RET
+
+define void @local_memory(i32 addrspace(1)* %out) {
+entry:
+  %y.i = call i32 @llvm.r600.read.tidig.x() #0
+  %arrayidx = getelementptr inbounds [16 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
+  store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
+  %add = add nsw i32 %y.i, 1
+  %cmp = icmp eq i32 %add, 16
+  %.add = select i1 %cmp, i32 0, i32 %add
+  call void @llvm.AMDGPU.barrier.local()
+  %arrayidx1 = getelementptr inbounds [16 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
+  %0 = load i32 addrspace(3)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %out, i32 %y.i
+  store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+@local_memory_two_objects.local_mem0 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4
+@local_memory_two_objects.local_mem1 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4
+
+; CHECK: @local_memory_two_objects
+
+; Check that the LDS size emitted correctly
+; CHECK: .long 166120
+; CHECK-NEXT: .long 8
+
+; Make sure the lds writes are using different addresses.
+; CHECK: LDS_WRITE {{[*]*}} {{PV|T}}[[ADDRW:[0-9]*\.[XYZW]]]
+; CHECK-NOT: LDS_WRITE {{[*]*}} T[[ADDRW]]
+
+; GROUP_BARRIER must be the last instruction in a clause
+; CHECK: GROUP_BARRIER
+; CHECK-NEXT: ALU clause
+
+; Make sure the lds reads are using different addresses.
+; CHECK: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
+; CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
+
+define void @local_memory_two_objects(i32 addrspace(1)* %out) {
+entry:
+  %x.i = call i32 @llvm.r600.read.tidig.x() #0
+  %arrayidx = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
+  store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
+  %mul = shl nsw i32 %x.i, 1
+  %arrayidx1 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
+  store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
+  %sub = sub nsw i32 3, %x.i
+  call void @llvm.AMDGPU.barrier.local()
+  %arrayidx2 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
+  %0 = load i32 addrspace(3)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32 addrspace(1)* %out, i32 %x.i
+  store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
+  %1 = load i32 addrspace(3)* %arrayidx4, align 4
+  %add = add nsw i32 %x.i, 4
+  %arrayidx5 = getelementptr inbounds i32 addrspace(1)* %out, i32 %add
+  store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare void @llvm.AMDGPU.barrier.local()
+
+attributes #0 = { readnone }
-- 
2.34.1