R600/SI: Teach SIFoldOperands to split 64-bit constants when folding

author Tom Stellard <thomas.stellard@amd.com>

Wed, 7 Jan 2015 19:56:17 +0000 (19:56 +0000)

committer Tom Stellard <thomas.stellard@amd.com>

Wed, 7 Jan 2015 19:56:17 +0000 (19:56 +0000)
author Tom Stellard <thomas.stellard@amd.com>
Wed, 7 Jan 2015 19:56:17 +0000 (19:56 +0000)
committer Tom Stellard <thomas.stellard@amd.com>
Wed, 7 Jan 2015 19:56:17 +0000 (19:56 +0000)
diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp

index 1079b88f6697b902f81b17189c70044b2765eaaa..23d4a4dbe17e5ff44d01315a1b01c7574c0966f1 100644 (file)
--- a/lib/Target/R600/SIFoldOperands.cpp
+++ b/lib/Target/R600/SIFoldOperands.cpp
@@ -153,27 +153,44 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
          const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo());
  
          // FIXME: Fold operands with subregs.
-        if (UseOp.isReg() && UseOp.getSubReg()) {
+        if (UseOp.isReg() && UseOp.getSubReg() && OpToFold.isReg()) {
            continue;
          }
  
          bool FoldingImm = OpToFold.isImm() || OpToFold.isFPImm();
+        APInt Imm;
  
-        // In order to fold immediates into copies, we need to change the
-        // copy to a MOV.
-        if (FoldingImm && UseMI->getOpcode() == AMDGPU::COPY) {
-          const TargetRegisterClass *TRC =
-              MRI.getRegClass(UseMI->getOperand(0).getReg());
-
-          if (TRC->getSize() == 4) {
-            if (TRI.isSGPRClass(TRC))
-              UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
-            else
-              UseMI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
-          } else if (TRC->getSize() == 8 && TRI.isSGPRClass(TRC)) {
-            UseMI->setDesc(TII->get(AMDGPU::S_MOV_B64));
+        if (FoldingImm) {
+          const TargetRegisterClass *UseRC = MRI.getRegClass(UseOp.getReg());
+
+          if (OpToFold.isFPImm()) {
+            Imm = OpToFold.getFPImm()->getValueAPF().bitcastToAPInt();
            } else {
-            continue;
+            Imm = APInt(64, OpToFold.getImm());
+          }
+
+          // Split 64-bit constants into 32-bits for folding.
+          if (UseOp.getSubReg()) {
+            if (UseRC->getSize() != 8)
+              continue;
+
+            if (UseOp.getSubReg() == AMDGPU::sub0) {
+              Imm = Imm.getLoBits(32);
+            } else {
+              assert(UseOp.getSubReg() == AMDGPU::sub1);
+              Imm = Imm.getHiBits(32);
+            }
+          }
+
+          // In order to fold immediates into copies, we need to change the
+          // copy to a MOV.
+          if (UseMI->getOpcode() == AMDGPU::COPY) {
+            unsigned MovOp = TII->getMovOpcode(
+                MRI.getRegClass(UseMI->getOperand(0).getReg()));
+            if (MovOp == AMDGPU::COPY)
+              continue;
+
+            UseMI->setDesc(TII->get(MovOp));
            }
          }
  
@@ -185,19 +202,14 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
              UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
            continue;
  
-        if (FoldingImm) {
-          uint64_t Imm;
-          if (OpToFold.isFPImm()) {
-            Imm = OpToFold.getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue();
-          } else {
-            Imm = OpToFold.getImm();
-          }
  
-          const MachineOperand ImmOp = MachineOperand::CreateImm(Imm);
+        if (FoldingImm) {
+          const MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
            if (TII->isOperandLegal(UseMI, Use.getOperandNo(), &ImmOp)) {
-            FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(), Imm));
-            continue;
+            FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(),
+                               Imm.getSExtValue()));
            }
+          continue;
          }
  
          // Normal substitution with registers
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp

index a58a46e862ad6f4bb8abd639cd2b3b6620215f93..08dd425ecbeebe9efb83f2c01ec76fe1df9c8200 100644 (file)
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -418,6 +418,16 @@ unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const {
    return Opcode;
  }
  
+unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
+  // Choose by getSize() and SGPR-ness; every other class yields COPY.
+  if (DstRC->getSize() == 4) {
+    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+  } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
+    return AMDGPU::S_MOV_B64;
+  }
+  return AMDGPU::COPY; // No MOV selected; callers check for COPY.
+}
+
  static bool shouldTryToSpillVGPRs(MachineFunction *MF) {
  
    SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h

index 6d63816bbb4f7899abea44b8fef33be563f53916..f766dc85e86abd535444a0cbf811304d5d5c8b46 100644 (file)
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -110,6 +110,10 @@ public:
  
    bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
  
+  /// \brief Returns an opcode that can be used to move a value to a \p DstRC
+  /// register.  If there is no hardware instruction that can store to \p
+  /// DstRC, then AMDGPU::COPY is returned.
+  unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
    unsigned commuteOpcode(unsigned Opcode) const;
  
    MachineInstr *commuteInstruction(MachineInstr *MI,
diff --git a/test/CodeGen/R600/operand-folding.ll b/test/CodeGen/R600/operand-folding.ll

index 77dea0a61205708be996f00ae2ee65c94965a86a..4693ff990612ab02f672e264226a62eb77c7a7ee 100644 (file)
--- a/test/CodeGen/R600/operand-folding.ll
+++ b/test/CodeGen/R600/operand-folding.ll
@@ -36,5 +36,22 @@ endif:
    ret void
  }
  
+; CHECK-LABEL: {{^}}fold_64bit_constant_add:
+; CHECK-NOT: s_mov_b64
+; FIXME: It would be better if we could use v_add here and drop the extra
+; v_mov_b32 instructions.
+; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1
+; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0
+; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}},
+
+define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) {
+entry:
+  %tmp0 = add i64 %val, 1
+  store i64 %tmp0, i64 addrspace(1)* %out
+  ret void
+}
+
  declare i32 @llvm.r600.read.tidig.x() #0
  attributes #0 = { readnone }
diff --git a/test/CodeGen/R600/sint_to_fp.f64.ll b/test/CodeGen/R600/sint_to_fp.f64.ll

index 4d866eb86fdaefc1ac1d73a5f645fe5f3f03a4b2..abce1777aad30e2942c332e9f9c696725f76cd86 100644 (file)
--- a/test/CodeGen/R600/sint_to_fp.f64.ll
+++ b/test/CodeGen/R600/sint_to_fp.f64.ll
@@ -12,10 +12,10 @@ define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
  
  ; SI-LABEL: {{^}}sint_to_fp_i1_f64:
  ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
-; we should be able to fold the SGPRs into the V_CNDMASK instructions.
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
+; uses an SGPR for [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
  ; SI: buffer_store_dwordx2
  ; SI: s_endpgm
  define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
diff --git a/test/CodeGen/R600/uint_to_fp.f64.ll b/test/CodeGen/R600/uint_to_fp.f64.ll

index b13e7ada39663f9226e3bbb83387982c93623cc2..f34683395d5c32b2163d25fdcd1d6a84a30c2da1 100644 (file)
--- a/test/CodeGen/R600/uint_to_fp.f64.ll
+++ b/test/CodeGen/R600/uint_to_fp.f64.ll
@@ -72,10 +72,10 @@ define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i
  
  ; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
  ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
-; we should be able to fold the SGPRs into the V_CNDMASK instructions.
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
+; uses an SGPR for [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
  ; SI: buffer_store_dwordx2
  ; SI: s_endpgm
  define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
author	Tom Stellard <thomas.stellard@amd.com>
	Wed, 7 Jan 2015 19:56:17 +0000 (19:56 +0000)
committer	Tom Stellard <thomas.stellard@amd.com>
	Wed, 7 Jan 2015 19:56:17 +0000 (19:56 +0000)
lib/Target/R600/SIFoldOperands.cpp		patch \| blob \| history
lib/Target/R600/SIInstrInfo.cpp		patch \| blob \| history
lib/Target/R600/SIInstrInfo.h		patch \| blob \| history
test/CodeGen/R600/operand-folding.ll		patch \| blob \| history
test/CodeGen/R600/sint_to_fp.f64.ll		patch \| blob \| history
test/CodeGen/R600/uint_to_fp.f64.ll		patch \| blob \| history