R600/SI: Try to use scalar BFE.

author Matt Arsenault <Matthew.Arsenault@amd.com>

Fri, 18 Apr 2014 05:19:26 +0000 (05:19 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Fri, 18 Apr 2014 05:19:26 +0000 (05:19 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Fri, 18 Apr 2014 05:19:26 +0000 (05:19 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Fri, 18 Apr 2014 05:19:26 +0000 (05:19 +0000)
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp

index 3c41638b7c9aa272f1fe45cf9aac3c481c947d13..5132d453781bd49c65068eaaa6966396757b87ac 100644 (file)
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -203,13 +203,14 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
      N->setNodeId(-1);
      return NULL;   // Already selected.
    }
+
+  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    switch (Opc) {
    default: break;
    // We are selecting i64 ADD here instead of custom lower it during
    // DAG legalization, so we can fold some i64 ADDs used for address
    // calculation into the LOAD and STORE instructions.
    case ISD::ADD: {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
      if (N->getValueType(0) != MVT::i64 ||
          ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
        break;
@@ -255,7 +256,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
    }
    case ISD::BUILD_VECTOR: {
      unsigned RegClassID;
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
      const AMDGPURegisterInfo *TRI =
                     static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
      const SIRegisterInfo *SIRI =
@@ -342,7 +342,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
    }
    case ISD::BUILD_PAIR: {
      SDValue RC, SubReg0, SubReg1;
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
      if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
        break;
      }
@@ -393,7 +392,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
    }
  
    case AMDGPUISD::REGISTER_LOAD: {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
      if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
        break;
      SDValue Addr, Offset;
@@ -410,7 +408,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
                                    Ops);
    }
    case AMDGPUISD::REGISTER_STORE: {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
      if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
        break;
      SDValue Addr, Offset;
@@ -426,6 +423,47 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
                                          CurDAG->getVTList(MVT::Other),
                                          Ops);
    }
+
+  case AMDGPUISD::BFE_I32:
+  case AMDGPUISD::BFE_U32: {
+    if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+      break;
+
+    // There is a scalar version available, but unlike the vector version which
+    // has a separate operand for the offset and width, the scalar version packs
+    // the width and offset into a single operand. Try to move to the scalar
+    // version if the offsets are constant, so that we can try to keep extended
+    // loads of kernel arguments in SGPRs.
+
+    // TODO: Technically we could try to pattern match scalar bitshifts of
+    // dynamic values, but it's probably not useful.
+    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (!Offset)
+      break;
+
+    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
+    if (!Width)
+      break;
+
+    bool Signed = Opc == AMDGPUISD::BFE_I32;
+
+    // Transformation function, pack the offset and width of a BFE into
+    // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
+    // source, bits [5:0] contain the offset and bits [22:16] the width.
+
+    uint32_t OffsetVal = Offset->getZExtValue();
+    uint32_t WidthVal = Width->getZExtValue();
+
+    uint32_t PackedVal = OffsetVal | WidthVal << 16;
+
+    SDValue PackedOffsetWidth = CurDAG->getTargetConstant(PackedVal, MVT::i32);
+    return CurDAG->getMachineNode(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
+                                  SDLoc(N),
+                                  MVT::i32,
+                                  N->getOperand(0),
+                                  PackedOffsetWidth);
+
+  }
    }
    return SelectCode(N);
  }
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp

index 21f7a81700a640c60b70b18f49574f36f8a4b59a..96eeea562794de6421b2a2e2e2eb00fd3bb1ca3e 100644 (file)
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -539,6 +539,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
    case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
    case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
    case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
+  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
+  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
    case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
    case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
    case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
@@ -1009,6 +1011,27 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
  
      addDescImplicitUseDef(NewDesc, Inst);
  
+    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
+      const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
+      // If we need to move this to VGPRs, we need to unpack the second operand
+      // back into the 2 separate ones for bit offset and width.
+      assert(OffsetWidthOp.isImm() &&
+             "Scalar BFE is only implemented for constant width and offset");
+      uint32_t Imm = OffsetWidthOp.getImm();
+
+      uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+
+      Inst->RemoveOperand(2); // Remove old immediate.
+      Inst->addOperand(MachineOperand::CreateImm(Offset));
+      Inst->addOperand(MachineOperand::CreateImm(BitWidth));
+
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+    }
+
      // Update the destination register class.
  
      const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0);
diff --git a/test/CodeGen/R600/sext-in-reg.ll b/test/CodeGen/R600/sext-in-reg.ll

index a47da2b25acae2e7d415edf3ebc533cc5bd88f71..b722959aad5711ee1cab0ec162897c2d6540ddcb 100644 (file)
--- a/test/CodeGen/R600/sext-in-reg.ll
+++ b/test/CodeGen/R600/sext-in-reg.ll
@@ -1,12 +1,13 @@
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=cypress | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
  
  declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
  
  
  ; FUNC-LABEL: @sext_in_reg_i1_i32
  ; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[ARG]], 0, 1
+; SI: S_BFE_I32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000
+; SI: V_MOV_B32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]]
  ; SI: BUFFER_STORE_DWORD [[EXTRACT]],
  
  ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
@@ -148,8 +149,8 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) noun
  
  ; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments.
  ; XFUNC-LABEL: @sext_in_reg_i8_to_v1i64
-; XSI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
-; XSI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31,
+; XSI: S_BFE_I32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288
+; XSI: S_ASHR_I32 {{v[0-9]+}}, [[EXTRACT]], 31
  ; XSI: BUFFER_STORE_DWORD
  ; XEG: BFE_INT
  ; XEG: ASHR
@@ -204,8 +205,8 @@ define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out
  
  
  ; FUNC-LABEL: @sext_in_reg_v2i1_to_v2i32
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
  ; SI: BUFFER_STORE_DWORDX2
  
  ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
@@ -221,10 +222,10 @@ define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
  }
  
  ; FUNC-LABEL: @sext_in_reg_v4i1_to_v4i32
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
  ; SI: BUFFER_STORE_DWORDX4
  
  ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
@@ -320,6 +321,34 @@ define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind {
    ret void
  }
  
+; FUNC-LABEL: @vgpr_sext_in_reg_v4i8_to_v4i32
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+  %loada = load <4 x i32> addrspace(1)* %a, align 16
+  %loadb = load <4 x i32> addrspace(1)* %b, align 16
+  %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+  %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
+  %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
+  store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @vgpr_sext_in_reg_v4i16_to_v4i32
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+  %loada = load <4 x i32> addrspace(1)* %a, align 16
+  %loadb = load <4 x i32> addrspace(1)* %b, align 16
+  %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+  %shl = shl <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
+  %ashr = ashr <4 x i32> %shl, <i32 16, i32 16, i32 16, i32 16>
+  store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
  ; FIXME: The BFE should really be eliminated. I think it should happen
  ; when computeMaskedBitsForTargetNode is implemented for imax.
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Fri, 18 Apr 2014 05:19:26 +0000 (05:19 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Fri, 18 Apr 2014 05:19:26 +0000 (05:19 +0000)
lib/Target/R600/AMDGPUISelDAGToDAG.cpp		patch \| blob \| history
lib/Target/R600/SIInstrInfo.cpp		patch \| blob \| history
test/CodeGen/R600/sext-in-reg.ll		patch \| blob \| history