From 546520a72757cf8a7f2f0728170fe16c0b1efd5d Mon Sep 17 00:00:00 2001
From: Tom Stellard
Date: Wed, 7 Jan 2015 19:56:17 +0000
Subject: [PATCH] R600/SI: Teach SIFoldOperands to split 64-bit constants when
 folding

This allows folding of sequences like:

s[0:1] = s_mov_b64 4
v_add_i32 v0, s0, v0
v_addc_u32 v1, s1, v1

into

v_add_i32 v0, 4, v0
v_addc_u32 v1, 0, v1

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225369 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/R600/SIFoldOperands.cpp   | 62 +++++++++++++++++-----------
 lib/Target/R600/SIInstrInfo.cpp      | 10 +++++
 lib/Target/R600/SIInstrInfo.h        |  4 ++
 test/CodeGen/R600/operand-folding.ll | 17 ++++++++
 test/CodeGen/R600/sint_to_fp.f64.ll  |  8 ++--
 test/CodeGen/R600/uint_to_fp.f64.ll  |  8 ++--
 6 files changed, 76 insertions(+), 33 deletions(-)

diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp
index 1079b88f669..23d4a4dbe17 100644
--- a/lib/Target/R600/SIFoldOperands.cpp
+++ b/lib/Target/R600/SIFoldOperands.cpp
@@ -153,27 +153,44 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
         const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo());

         // FIXME: Fold operands with subregs.
-        if (UseOp.isReg() && UseOp.getSubReg()) {
+        if (UseOp.isReg() && UseOp.getSubReg() && OpToFold.isReg()) {
           continue;
         }

         bool FoldingImm = OpToFold.isImm() || OpToFold.isFPImm();
+        APInt Imm;

-        // In order to fold immediates into copies, we need to change the
-        // copy to a MOV.
-        if (FoldingImm && UseMI->getOpcode() == AMDGPU::COPY) {
-          const TargetRegisterClass *TRC =
-              MRI.getRegClass(UseMI->getOperand(0).getReg());
-
-          if (TRC->getSize() == 4) {
-            if (TRI.isSGPRClass(TRC))
-              UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
-            else
-              UseMI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
-          } else if (TRC->getSize() == 8 && TRI.isSGPRClass(TRC)) {
-            UseMI->setDesc(TII->get(AMDGPU::S_MOV_B64));
+        if (FoldingImm) {
+          const TargetRegisterClass *UseRC = MRI.getRegClass(UseOp.getReg());
+
+          if (OpToFold.isFPImm()) {
+            Imm = OpToFold.getFPImm()->getValueAPF().bitcastToAPInt();
           } else {
-            continue;
+            Imm = APInt(64, OpToFold.getImm());
+          }
+
+          // Split 64-bit constants into 32-bits for folding.
+          if (UseOp.getSubReg()) {
+            if (UseRC->getSize() != 8)
+              continue;
+
+            if (UseOp.getSubReg() == AMDGPU::sub0) {
+              Imm = Imm.getLoBits(32);
+            } else {
+              assert(UseOp.getSubReg() == AMDGPU::sub1);
+              Imm = Imm.getHiBits(32);
+            }
+          }
+
+          // In order to fold immediates into copies, we need to change the
+          // copy to a MOV.
+          if (UseMI->getOpcode() == AMDGPU::COPY) {
+            unsigned MovOp = TII->getMovOpcode(
+                MRI.getRegClass(UseMI->getOperand(0).getReg()));
+            if (MovOp == AMDGPU::COPY)
+              continue;
+
+            UseMI->setDesc(TII->get(MovOp));
           }
         }

@@ -185,19 +202,14 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
             UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
           continue;

-        if (FoldingImm) {
-          uint64_t Imm;
-          if (OpToFold.isFPImm()) {
-            Imm = OpToFold.getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue();
-          } else {
-            Imm = OpToFold.getImm();
-          }
-
-          const MachineOperand ImmOp = MachineOperand::CreateImm(Imm);
+
+        if (FoldingImm) {
+          const MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
           if (TII->isOperandLegal(UseMI, Use.getOperandNo(), &ImmOp)) {
-            FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(), Imm));
-            continue;
+            FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(),
+                                             Imm.getSExtValue()));
           }
+          continue;
         }

         // Normal substitution with registers
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index a58a46e862a..08dd425ecbe 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -418,6 +418,16 @@ unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const {
   return Opcode;
 }

+unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
+
+  if (DstRC->getSize() == 4) {
+    return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+  } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
+    return AMDGPU::S_MOV_B64;
+  }
+  return AMDGPU::COPY;
+}
+
 static bool shouldTryToSpillVGPRs(MachineFunction *MF) {

   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index 6d63816bbb4..f766dc85e86 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -110,6 +110,10 @@ public:

   bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;

+  // \brief Returns an opcode that can be used to move a value to a \p DstRC
+  // register. If there is no hardware instruction that can store to \p
+  // DstRC, then AMDGPU::COPY is returned.
+  unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
   unsigned commuteOpcode(unsigned Opcode) const;

   MachineInstr *commuteInstruction(MachineInstr *MI,
diff --git a/test/CodeGen/R600/operand-folding.ll b/test/CodeGen/R600/operand-folding.ll
index 77dea0a6120..4693ff99061 100644
--- a/test/CodeGen/R600/operand-folding.ll
+++ b/test/CodeGen/R600/operand-folding.ll
@@ -36,5 +36,22 @@ endif:
   ret void
 }

+; CHECK-LABEL: {{^}}fold_64bit_constant_add:
+; CHECK-NOT: s_mov_b64
+; FIXME: It would be better if we could use v_add here and drop the extra
+; v_mov_b32 instructions.
+; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1
+; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0
+; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}},
+
+define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) {
+entry:
+  %tmp0 = add i64 %val, 1
+  store i64 %tmp0, i64 addrspace(1)* %out
+  ret void
+}
+
 declare i32 @llvm.r600.read.tidig.x() #0
 attributes #0 = { readnone }
diff --git a/test/CodeGen/R600/sint_to_fp.f64.ll b/test/CodeGen/R600/sint_to_fp.f64.ll
index 4d866eb86fd..abce1777aad 100644
--- a/test/CodeGen/R600/sint_to_fp.f64.ll
+++ b/test/CodeGen/R600/sint_to_fp.f64.ll
@@ -12,10 +12,10 @@ define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {

 ; SI-LABEL: {{^}}sint_to_fp_i1_f64:
 ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
-; we should be able to fold the SGPRs into the V_CNDMASK instructions.
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
+; uses an SGPR for [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
diff --git a/test/CodeGen/R600/uint_to_fp.f64.ll b/test/CodeGen/R600/uint_to_fp.f64.ll
index b13e7ada396..f34683395d5 100644
--- a/test/CodeGen/R600/uint_to_fp.f64.ll
+++ b/test/CodeGen/R600/uint_to_fp.f64.ll
@@ -72,10 +72,10 @@ define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i

 ; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
 ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
-; we should be able to fold the SGPRs into the V_CNDMASK instructions.
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
+; uses an SGPR for [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
-- 
2.34.1
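
For readers skimming the diff, the rule the pass applies when a use goes through a
subregister of a 64-bit constant is simple: a sub0 use folds the low 32 bits and a
sub1 use folds the high 32 bits, which is how "s_mov_b64 4" becomes the immediates
4 and 0 in the commit message example. The short C++ sketch below restates that
rule outside of LLVM so it can be compiled on its own; it is a minimal
approximation, not the pass's code (plain uint64_t stands in for llvm::APInt, and
SubReg and splitImmForUse are names invented only for this illustration).

// Minimal standalone sketch of the splitting rule used by SIFoldOperands in
// this patch: a use through sub0 folds the low 32 bits of a 64-bit constant,
// a use through sub1 folds the high 32 bits. Plain uint64_t stands in for
// llvm::APInt; SubReg and splitImmForUse are illustrative names only.
#include <cstdint>
#include <cstdio>
#include <optional>

enum class SubReg { None, Sub0, Sub1 }; // stand-in for AMDGPU::sub0 / AMDGPU::sub1

// Returns the 32-bit value to fold for a use of the given subregister of a
// 64-bit constant, or std::nullopt when the use reads the full 64-bit register.
static std::optional<uint32_t> splitImmForUse(uint64_t Imm, SubReg Sub) {
  switch (Sub) {
  case SubReg::Sub0: // low half, analogous to Imm.getLoBits(32) in the pass
    return static_cast<uint32_t>(Imm & 0xffffffffu);
  case SubReg::Sub1: // high half, analogous to Imm.getHiBits(32) in the pass
    return static_cast<uint32_t>(Imm >> 32);
  case SubReg::None:
    return std::nullopt;
  }
  return std::nullopt;
}

int main() {
  // s[0:1] = s_mov_b64 4: the sub0 use folds 4 and the sub1 use folds 0,
  // matching the v_add_i32 / v_addc_u32 example in the commit message.
  const uint64_t Imm = 4;
  std::printf("sub0 -> %u\n", static_cast<unsigned>(*splitImmForUse(Imm, SubReg::Sub0)));
  std::printf("sub1 -> %u\n", static_cast<unsigned>(*splitImmForUse(Imm, SubReg::Sub1)));
  return 0;
}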