const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo());
// FIXME: Fold operands with subregs.
- if (UseOp.isReg() && UseOp.getSubReg()) {
+ if (UseOp.isReg() && UseOp.getSubReg() && OpToFold.isReg()) {
continue;
}
bool FoldingImm = OpToFold.isImm() || OpToFold.isFPImm();
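+ // Immediate to fold; for subreg uses of a 64-bit constant it is narrowed
+ // below to the relevant 32-bit half.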
+ APInt Imm;
- // In order to fold immediates into copies, we need to change the
- // copy to a MOV.
- if (FoldingImm && UseMI->getOpcode() == AMDGPU::COPY) {
- const TargetRegisterClass *TRC =
- MRI.getRegClass(UseMI->getOperand(0).getReg());
-
- if (TRC->getSize() == 4) {
- if (TRI.isSGPRClass(TRC))
- UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
- else
- UseMI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
- } else if (TRC->getSize() == 8 && TRI.isSGPRClass(TRC)) {
- UseMI->setDesc(TII->get(AMDGPU::S_MOV_B64));
+ if (FoldingImm) {
+ const TargetRegisterClass *UseRC = MRI.getRegClass(UseOp.getReg());
+
+ if (OpToFold.isFPImm()) {
+ Imm = OpToFold.getFPImm()->getValueAPF().bitcastToAPInt();
} else {
- continue;
+ Imm = APInt(64, OpToFold.getImm());
+ }
+
+ // Split 64-bit constants into 32-bits for folding.
+ if (UseOp.getSubReg()) {
+ if (UseRC->getSize() != 8)
+ continue;
+
+ if (UseOp.getSubReg() == AMDGPU::sub0) {
+ Imm = Imm.getLoBits(32);
+ } else {
+ assert(UseOp.getSubReg() == AMDGPU::sub1);
+ Imm = Imm.getHiBits(32);
+ }
+ }
+
+ // In order to fold immediates into copies, we need to change the
+ // copy to a MOV.
+ if (UseMI->getOpcode() == AMDGPU::COPY) {
+ unsigned MovOp = TII->getMovOpcode(
+ MRI.getRegClass(UseMI->getOperand(0).getReg()));
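+ // getMovOpcode returns AMDGPU::COPY when no mov instruction exists for
+ // the destination class; in that case we can't rewrite the COPY, so skip.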
+ if (MovOp == AMDGPU::COPY)
+ continue;
+
+ UseMI->setDesc(TII->get(MovOp));
}
}
UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
continue;
- if (FoldingImm) {
- uint64_t Imm;
- if (OpToFold.isFPImm()) {
- Imm = OpToFold.getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue();
- } else {
- Imm = OpToFold.getImm();
- }
- const MachineOperand ImmOp = MachineOperand::CreateImm(Imm);
+ if (FoldingImm) {
+ const MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
if (TII->isOperandLegal(UseMI, Use.getOperandNo(), &ImmOp)) {
- FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(), Imm));
- continue;
+ FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(),
+ Imm.getSExtValue()));
}
+ continue;
}
// Normal substitution with registers
return Opcode;
}
+unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
+ if (DstRC->getSize() == 4) {
+ return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+ } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
+ return AMDGPU::S_MOV_B64;
+ }
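+ // No single mov instruction exists for this register class (e.g. 64-bit
+ // VGPRs), so fall back to a generic COPY.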
+ return AMDGPU::COPY;
+}
+
static bool shouldTryToSpillVGPRs(MachineFunction *MF) {
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
+ // \brief Returns an opcode that can be used to move a value to a \p DstRC
+ // register. If there is no hardware instruction that can move a value into
+ // \p DstRC, then AMDGPU::COPY is returned.
+ unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
unsigned commuteOpcode(unsigned Opcode) const;
MachineInstr *commuteInstruction(MachineInstr *MI,
ret void
}
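+; Check that a 64-bit constant gets split into two 32-bit halves and folded
+; directly, instead of first being materialized with s_mov_b64.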
+; CHECK-LABEL: {{^}}fold_64bit_constant_add:
+; CHECK-NOT: s_mov_b64
+; FIXME: It would be better if we could use v_add here and drop the extra
+; v_mov_b32 instructions.
+; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1
+; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0
+; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]]
+; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]
+; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}},
+
+define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) {
+entry:
+ %tmp0 = add i64 %val, 1
+ store i64 %tmp0, i64 addrspace(1)* %out
+ ret void
+}
+
declare i32 @llvm.r600.read.tidig.x() #0
attributes #0 = { readnone }
; SI-LABEL: {{^}}sint_to_fp_i1_f64:
; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
-; we should be able to fold the SGPRs into the V_CNDMASK instructions.
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
+; uses an SGPR for [[CMP]].
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
; SI: buffer_store_dwordx2
; SI: s_endpgm
define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
-; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
-; we should be able to fold the SGPRs into the V_CNDMASK instructions.
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
+; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
+; uses an SGPR for [[CMP]].
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
; SI: buffer_store_dwordx2
; SI: s_endpgm
define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {