From a1d28f6dd713d31269b612ac4aba0d408a7e9990 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Fri, 21 Mar 2014 15:51:57 +0000
Subject: [PATCH] R600/SI: Handle MUBUF instructions in
 SIInstrInfo::moveToVALU()

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@204476 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/R600/AMDGPUTargetMachine.cpp |   3 +
 lib/Target/R600/SIISelLowering.cpp      |   4 +-
 lib/Target/R600/SIInstrFormats.td       |   1 +
 lib/Target/R600/SIInstrInfo.cpp         | 136 +++++++++++++++++++++++-
 lib/Target/R600/SIInstrInfo.h           |  10 ++
 test/CodeGen/R600/salu-to-valu.ll       |   8 +-
 6 files changed, 157 insertions(+), 5 deletions(-)
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index 7f5042876e7..b11fce34025 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -165,6 +165,9 @@ bool AMDGPUPassConfig::addPreRegAlloc() {
     addPass(createR600VectorRegMerger(*TM));
   } else {
     addPass(createSIFixSGPRCopiesPass(*TM));
+    // SIFixSGPRCopies can generate a lot of duplicate instructions,
+    // so we need to run MachineCSE afterwards.
+    addPass(&MachineCSEID);
   }
   return false;
 }
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 52e5a16759d..fd1e3a69bd1 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -25,8 +25,6 @@
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/Function.h"
 
-const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
-
 using namespace llvm;
 
 SITargetLowering::SITargetLowering(TargetMachine &TM) :
@@ -407,7 +405,7 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo)
             .addImm(0);
     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi)
-            .addImm(RSRC_DATA_FORMAT >> 32);
+            .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi)
             .addReg(SubRegHiLo)
             .addImm(AMDGPU::sub0)
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 53ebaaf15a7..aa2c22c51eb 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -369,6 +369,7 @@ class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let EXP_CNT = 1;
 
   let neverHasSideEffects = 1;
+  let UseNamedOperandTable = 1;
 }
 
 class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index c8a52971efa..6c2185eafe8 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -558,6 +558,32 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
   MO.ChangeToRegister(Reg, false);
 }
 
+unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
+                                         MachineRegisterInfo &MRI,
+                                         MachineOperand &SuperReg,
+                                         const TargetRegisterClass *SuperRC,
+                                         unsigned SubIdx,
+                                         const TargetRegisterClass *SubRC)
+                                         const {
+  assert(SuperReg.isReg());
+
+  unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
+  unsigned SubReg = MRI.createVirtualRegister(SubRC);
+
+  // Just in case the super register is itself a sub-register, copy it to a new
+  // value so we don't need to worry about merging its subreg index with the
+  // SubIdx passed to this function.  The register coalescer should be able to
+  // eliminate this extra copy.
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY),
+          NewSuperReg)
+          .addOperand(SuperReg);
+  // SubReg = NewSuperReg:SubIdx
+  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY),
+          SubReg)
+          .addReg(NewSuperReg, 0, SubIdx);
+  return SubReg;
+}
+
 void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
   MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
   int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
@@ -675,6 +701,110 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
       MI->getOperand(i).setReg(DstReg);
     }
   }
+
+  // Legalize MUBUF* instructions
+  // FIXME: If we start using the non-addr64 instructions for compute, we
+  // may need to legalize them here.
+
+  int SRsrcIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                            AMDGPU::OpName::srsrc);
+  int VAddrIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                             AMDGPU::OpName::vaddr);
+  if (SRsrcIdx != -1 && VAddrIdx != -1) {
+    const TargetRegisterClass *VAddrRC =
+        RI.getRegClass(get(MI->getOpcode()).OpInfo[VAddrIdx].RegClass);
+
+    if(VAddrRC->getSize() == 8 &&
+       MRI.getRegClass(MI->getOperand(SRsrcIdx).getReg()) != VAddrRC) {
+      // We have a MUBUF instruction that uses a 64-bit vaddr register and
+      // srsrc has the incorrect register class.  In order to fix this, we
+      // need to extract the pointer from the resource descriptor (srsrc),
+      // add it to the value of vaddr, then store the result in the vaddr
+      // operand.  Then, we need to set the pointer field of the resource
+      // descriptor to zero.
+
+      MachineBasicBlock &MBB = *MI->getParent();
+      MachineOperand &SRsrcOp = MI->getOperand(SRsrcIdx);
+      MachineOperand &VAddrOp = MI->getOperand(VAddrIdx);
+      unsigned SRsrcPtrLo, SRsrcPtrHi, VAddrLo, VAddrHi;
+      unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+      unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+      unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+      unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+      unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+
+      // SRsrcPtrLo = srsrc:sub0
+      SRsrcPtrLo = buildExtractSubReg(MI, MRI, SRsrcOp,
+          &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
+
+      // SRsrcPtrHi = srsrc:sub1
+      SRsrcPtrHi = buildExtractSubReg(MI, MRI, SRsrcOp,
+          &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
+
+      // VAddrLo = vaddr:sub0
+      VAddrLo = buildExtractSubReg(MI, MRI, VAddrOp,
+          &AMDGPU::VReg_64RegClass, AMDGPU::sub0, &AMDGPU::VReg_32RegClass);
+
+      // VAddrHi = vaddr:sub1
+      VAddrHi = buildExtractSubReg(MI, MRI, VAddrOp,
+          &AMDGPU::VReg_64RegClass, AMDGPU::sub1, &AMDGPU::VReg_32RegClass);
+
+      // NewVaddrLo = SRsrcPtrLo + VAddrLo
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32),
+              NewVAddrLo)
+              .addReg(SRsrcPtrLo)
+              .addReg(VAddrLo)
+              .addReg(AMDGPU::VCC, RegState::Define | RegState::Implicit);
+
+      // NewVaddrHi = SRsrcPtrHi + VAddrHi
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32),
+              NewVAddrHi)
+              .addReg(SRsrcPtrHi)
+              .addReg(VAddrHi)
+              .addReg(AMDGPU::VCC, RegState::ImplicitDefine)
+              .addReg(AMDGPU::VCC, RegState::Implicit);
+
+      // NewVaddr = {NewVaddrHi, NewVaddrLo}
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
+              NewVAddr)
+              .addReg(NewVAddrLo)
+              .addImm(AMDGPU::sub0)
+              .addReg(NewVAddrHi)
+              .addImm(AMDGPU::sub1);
+
+      // Zero64 = 0
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
+              Zero64)
+              .addImm(0);
+
+      // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+              SRsrcFormatLo)
+              .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
+
+      // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+              SRsrcFormatHi)
+              .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
+
+      // NewSRsrc = {Zero64, SRsrcFormat}
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
+              NewSRsrc)
+              .addReg(Zero64)
+              .addImm(AMDGPU::sub0_sub1)
+              .addReg(SRsrcFormatLo)
+              .addImm(AMDGPU::sub2)
+              .addReg(SRsrcFormatHi)
+              .addImm(AMDGPU::sub3);
+
+      // Update the instruction to use NewVaddr
+      MI->getOperand(VAddrIdx).setReg(NewVAddr);
+      // Update the instruction to use NewSRsrc
+      MI->getOperand(SRsrcIdx).setReg(NewSRsrc);
+    }
+  }
 }
 
 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
@@ -731,8 +861,12 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
     }
 
     unsigned NewOpcode = getVALUOp(*Inst);
-    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
+    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
+      // We cannot move this instruction to the VALU, so we should try to
+      // legalize its operands instead.
+      legalizeOperands(Inst);
       continue;
+    }
 
     // Use the new VALU Opcode.
     const MCInstrDesc &NewDesc = get(NewOpcode);
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index bb8bc72ff86..11dbfb8e489 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -25,6 +25,13 @@ class SIInstrInfo : public AMDGPUInstrInfo {
 private:
   const SIRegisterInfo RI;
 
+  unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
+                              MachineRegisterInfo &MRI,
+                              MachineOperand &SuperReg,
+                              const TargetRegisterClass *SuperRC,
+                              unsigned SubIdx,
+                              const TargetRegisterClass *SubRC) const;
+
 public:
   explicit SIInstrInfo(AMDGPUTargetMachine &tm);
 
@@ -142,6 +149,9 @@ namespace AMDGPU {
   int getCommuteRev(uint16_t Opcode);
   int getCommuteOrig(uint16_t Opcode);
 
+  const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
+
+
 } // End namespace AMDGPU
 
 } // End namespace llvm
diff --git a/test/CodeGen/R600/salu-to-valu.ll b/test/CodeGen/R600/salu-to-valu.ll
index c989c9d6722..e461bf9acec 100644
--- a/test/CodeGen/R600/salu-to-valu.ll
+++ b/test/CodeGen/R600/salu-to-valu.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mcpu=SI  | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
 
 ; In this test both the pointer and the offset operands to the
 ; BUFFER_LOAD instructions end up being stored in vgprs.  This
@@ -8,8 +8,14 @@
 ; (low 64-bits of srsrc).
 
 ; CHECK-LABEL: @mubuf
+
 ; Make sure we aren't using VGPRs for the source operand of S_MOV_B64
 ; CHECK-NOT: S_MOV_B64 s[{{[0-9]+:[0-9]+}}], v
+
+; Make sure we aren't using VGPRs for the srsrc operand of BUFFER_LOAD_*
+; instructions
+; CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]
+; CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]
 define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() #1
-- 
2.34.1