X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FR600%2FSIInstrInfo.cpp;h=f5b82d53ba7a9d020fa3fdc4abdfdb3bfac7578c;hb=2b6e6fc1a8c188a9ccbe028b42697d32edaa2a1c;hp=d6f5517719823f9066b8719db249cb5d5e203975;hpb=c2bab04829e32a078069a32152aed7885071eb5c;p=oota-llvm.git diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index d6f55177198..f5b82d53ba7 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -19,13 +19,14 @@ #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" #include "llvm/MC/MCInstrDesc.h" using namespace llvm; -SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm) - : AMDGPUInstrInfo(tm), - RI(tm) { } +SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) + : AMDGPUInstrInfo(st), + RI(st) { } //===----------------------------------------------------------------------===// // TargetInstrInfo callbacks @@ -187,27 +188,51 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo(); + MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo(); + MachineRegisterInfo &MRI = MF->getRegInfo(); DebugLoc DL = MBB.findDebugLoc(MI); unsigned KillFlag = isKill ? RegState::Kill : 0; - if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) { - unsigned Lane = MFI->SpillTracker.getNextLane(MRI); - BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), - MFI->SpillTracker.LaneVGPR) + if (RI.hasVGPRs(RC)) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Can't spill VGPR!"); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0) + .addReg(SrcReg); + } else if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) { + unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MF); + unsigned TgtReg = MFI->SpillTracker.LaneVGPR; + + BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), TgtReg) .addReg(SrcReg, KillFlag) .addImm(Lane); + MFI->SpillTracker.addSpilledReg(FrameIndex, TgtReg, Lane); + } else if (RI.isSGPRClass(RC)) { + // We are only allowed to create one new instruction when spilling + // registers, so we need to use pseudo instruction for vector + // registers. + // + // Reserve a spot in the spill tracker for each sub-register of + // the vector register. + unsigned NumSubRegs = RC->getSize() / 4; + unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MF, NumSubRegs); MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, - Lane); - } else { - for (unsigned i = 0, e = RC->getSize() / 4; i != e; ++i) { - unsigned SubReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(MBB, MI, MBB.findDebugLoc(MI), get(AMDGPU::COPY), SubReg) - .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); - storeRegToStackSlot(MBB, MI, SubReg, isKill, FrameIndex + i, - &AMDGPU::SReg_32RegClass, TRI); + FirstLane); + + unsigned Opcode; + switch (RC->getSize() * 8) { + case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; + default: llvm_unreachable("Cannot spill register class"); } + + BuildMI(MBB, MI, DL, get(Opcode), MFI->SpillTracker.LaneVGPR) + .addReg(SrcReg) + .addImm(FrameIndex); + } else { + llvm_unreachable("VGPR spilling not supported"); } } @@ -216,30 +241,128 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo(); + MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - if (TRI->getCommonSubClass(RC, &AMDGPU::SReg_32RegClass)) { - SIMachineFunctionInfo::SpilledReg Spill = + + if (RI.hasVGPRs(RC)) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::loadRegToStackSlot - Can't retrieve spilled VGPR!"); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + .addImm(0); + } else if (RI.isSGPRClass(RC)){ + unsigned Opcode; + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break; + case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; + default: llvm_unreachable("Cannot spill register class"); + } + + SIMachineFunctionInfo::SpilledReg Spill = MFI->SpillTracker.getSpilledReg(FrameIndex); - assert(Spill.VGPR); - BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), DestReg) + + BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addReg(Spill.VGPR) - .addImm(Spill.Lane); + .addImm(FrameIndex); } else { - for (unsigned i = 0, e = RC->getSize() / 4; i != e; ++i) { - unsigned Flags = RegState::Define; - if (i == 0) { - Flags |= RegState::Undef; - } - unsigned SubReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - loadRegFromStackSlot(MBB, MI, SubReg, FrameIndex + i, - &AMDGPU::SReg_32RegClass, TRI); - BuildMI(MBB, MI, DL, get(AMDGPU::COPY)) - .addReg(DestReg, Flags, RI.getSubRegFromChannel(i)) - .addReg(SubReg); + llvm_unreachable("VGPR spilling not supported"); + } +} + +static unsigned getNumSubRegsForSpillOp(unsigned Op) { + + switch (Op) { + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S512_RESTORE: + return 16; + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S256_RESTORE: + return 8; + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S128_RESTORE: + return 4; + case AMDGPU::SI_SPILL_S64_SAVE: + case AMDGPU::SI_SPILL_S64_RESTORE: + return 2; + case AMDGPU::SI_SPILL_S32_RESTORE: + return 1; + default: llvm_unreachable("Invalid spill opcode"); + } +} + +void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, + int Count) const { + while (Count > 0) { + int Arg; + if (Count >= 8) + Arg = 7; + else + Arg = Count - 1; + Count -= 8; + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP)) + .addImm(Arg); + } +} + +bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + SIMachineFunctionInfo *MFI = + MI->getParent()->getParent()->getInfo(); + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MBB.findDebugLoc(MI); + switch (MI->getOpcode()) { + default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + + // SGPR register spill + case AMDGPU::SI_SPILL_S512_SAVE: + case AMDGPU::SI_SPILL_S256_SAVE: + case AMDGPU::SI_SPILL_S128_SAVE: + case AMDGPU::SI_SPILL_S64_SAVE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned FrameIndex = MI->getOperand(2).getImm(); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + SIMachineFunctionInfo::SpilledReg Spill; + unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(1).getReg(), + &AMDGPU::SGPR_32RegClass, i); + Spill = MFI->SpillTracker.getSpilledReg(FrameIndex); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), + MI->getOperand(0).getReg()) + .addReg(SubReg) + .addImm(Spill.Lane + i); + } + MI->eraseFromParent(); + break; + } + + // SGPR register restore + case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_S32_RESTORE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + + for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { + SIMachineFunctionInfo::SpilledReg Spill; + unsigned FrameIndex = MI->getOperand(2).getImm(); + unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(0).getReg(), + &AMDGPU::SGPR_32RegClass, i); + Spill = MFI->SpillTracker.getSpilledReg(FrameIndex); + + BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), SubReg) + .addReg(MI->getOperand(1).getReg()) + .addImm(Spill.Lane + i); } + insertNOPs(MI, 3); + MI->eraseFromParent(); + break; + } } + return true; } MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, @@ -247,18 +370,18 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg()) - return 0; + return nullptr; // Cannot commute VOP2 if src0 is SGPR. if (isVOP2(MI->getOpcode()) && MI->getOperand(1).isReg() && RI.isSGPRClass(MRI.getRegClass(MI->getOperand(1).getReg()))) - return 0; + return nullptr; if (!MI->getOperand(2).isReg()) { // XXX: Commute instructions with FPImm operands if (NewMI || MI->getOperand(2).isFPImm() || (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { - return 0; + return nullptr; } // XXX: Commute VOP3 instructions with abs and neg set. @@ -267,7 +390,7 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, AMDGPU::OpName::abs)).getImm() || MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::neg)).getImm())) - return 0; + return nullptr; unsigned Reg = MI->getOperand(1).getReg(); unsigned SubReg = MI->getOperand(1).getSubReg(); @@ -537,13 +660,28 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64; case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32; case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64; + case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32; + case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32; + case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32; + case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32; + case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32; case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32; + case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32; case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32; case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32; case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32; case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; + case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; + case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; + case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e32; + case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; + case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; } } @@ -568,6 +706,7 @@ bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { case AMDGPU::COPY: case AMDGPU::REG_SEQUENCE: case AMDGPU::PHI: + case AMDGPU::INSERT_SUBREG: return RI.hasVGPRs(getOpRegClass(MI, 0)); default: return RI.hasVGPRs(getOpRegClass(MI, OpNo)); @@ -607,8 +746,8 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, unsigned SubReg = MRI.createVirtualRegister(SubRC); // Just in case the super register is itself a sub-register, copy it to a new - // value so we don't need to wory about merging its subreg index with the - // SubIdx passed to this function. The register coalescer should be able to + // value so we don't need to worry about merging its subreg index with the + // SubIdx passed to this function. The register coalescer should be able to // eliminate this extra copy. BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY), NewSuperReg) @@ -751,7 +890,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // class of the output. if (MI->getOpcode() == AMDGPU::REG_SEQUENCE || MI->getOpcode() == AMDGPU::PHI) { - const TargetRegisterClass *RC = NULL, *SRC = NULL, *VRC = NULL; + const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { if (!MI->getOperand(i).isReg() || !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) @@ -801,6 +940,23 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } } + // Legalize INSERT_SUBREG + // src0 must have the same register class as dst + if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) { + unsigned Dst = MI->getOperand(0).getReg(); + unsigned Src0 = MI->getOperand(1).getReg(); + const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); + const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); + if (DstRC != Src0RC) { + MachineBasicBlock &MBB = *MI->getParent(); + unsigned NewSrc0 = MRI.createVirtualRegister(DstRC); + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0) + .addReg(Src0); + MI->getOperand(1).setReg(NewSrc0); + } + return; + } + // Legalize MUBUF* instructions // FIXME: If we start using the non-addr64 instructions for compute, we // may need to legalize them here. @@ -906,6 +1062,72 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } } +void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const { + MachineBasicBlock *MBB = MI->getParent(); + switch (MI->getOpcode()) { + case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORD_SGPR: + case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX2_SGPR: + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_SGPR: + unsigned NewOpcode = getVALUOp(*MI); + unsigned RegOffset; + unsigned ImmOffset; + + if (MI->getOperand(2).isReg()) { + RegOffset = MI->getOperand(2).getReg(); + ImmOffset = 0; + } else { + assert(MI->getOperand(2).isImm()); + // SMRD instructions take a dword offsets and MUBUF instructions + // take a byte offset. + ImmOffset = MI->getOperand(2).getImm() << 2; + RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + if (isUInt<12>(ImmOffset)) { + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + RegOffset) + .addImm(0); + } else { + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), + RegOffset) + .addImm(ImmOffset); + ImmOffset = 0; + } + } + + unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + unsigned DWord0 = RegOffset; + unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) + .addImm(0); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) + .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) + .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) + .addReg(DWord0) + .addImm(AMDGPU::sub0) + .addReg(DWord1) + .addImm(AMDGPU::sub1) + .addReg(DWord2) + .addImm(AMDGPU::sub2) + .addReg(DWord3) + .addImm(AMDGPU::sub3); + MI->setDesc(get(NewOpcode)); + if (MI->getOperand(2).isReg()) { + MI->getOperand(2).setReg(MI->getOperand(1).getReg()); + } else { + MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false); + } + MI->getOperand(1).setReg(SRsrc); + MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); + } +} + void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { SmallVector Worklist; Worklist.push_back(&TopInst); @@ -915,8 +1137,16 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { MachineBasicBlock *MBB = Inst->getParent(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned Opcode = Inst->getOpcode(); + unsigned NewOpcode = getVALUOp(*Inst); + // Handle some special cases - switch(Inst->getOpcode()) { + switch (Opcode) { + default: + if (isSMRD(Inst->getOpcode())) { + moveSMRDToVALU(Inst, MRI); + } + break; case AMDGPU::S_MOV_B64: { DebugLoc DL = Inst->getDebugLoc(); @@ -942,22 +1172,27 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { continue; } case AMDGPU::S_AND_B64: - splitScalar64BitOp(Worklist, Inst, AMDGPU::S_AND_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32); Inst->eraseFromParent(); continue; case AMDGPU::S_OR_B64: - splitScalar64BitOp(Worklist, Inst, AMDGPU::S_OR_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32); Inst->eraseFromParent(); continue; case AMDGPU::S_XOR_B64: - splitScalar64BitOp(Worklist, Inst, AMDGPU::S_XOR_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32); Inst->eraseFromParent(); continue; case AMDGPU::S_NOT_B64: - splitScalar64BitOp(Worklist, Inst, AMDGPU::S_NOT_B32); + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); + Inst->eraseFromParent(); + continue; + + case AMDGPU::S_BCNT1_I32_B64: + splitScalar64BitBCNT(Worklist, Inst); Inst->eraseFromParent(); continue; @@ -967,7 +1202,6 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { llvm_unreachable("Moving this op to VALU not implemented"); } - unsigned NewOpcode = getVALUOp(*Inst); if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { // We cannot move this instruction to the VALU, so we should try to // legalize its operands instead. @@ -988,26 +1222,56 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { Inst->RemoveOperand(i); } - // Add the implict and explicit register definitions. - if (NewDesc.ImplicitUses) { - for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { - unsigned Reg = NewDesc.ImplicitUses[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); - } + if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { + // We are converting these to a BFE, so we need to add the missing + // operands for the size and offset. + unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; + Inst->addOperand(Inst->getOperand(1)); + Inst->getOperand(1).ChangeToImmediate(0); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(Size)); + + // XXX - Other pointless operands. There are 4, but it seems you only need + // 3 to not hit an assertion later in MCInstLower. + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(0)); + } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { + // The VALU version adds the second operand to the result, so insert an + // extra 0 operand. + Inst->addOperand(MachineOperand::CreateImm(0)); } - if (NewDesc.ImplicitDefs) { - for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { - unsigned Reg = NewDesc.ImplicitDefs[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); - } + addDescImplicitUseDef(NewDesc, Inst); + + if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { + const MachineOperand &OffsetWidthOp = Inst->getOperand(2); + // If we need to move this to VGPRs, we need to unpack the second operand + // back into the 2 separate ones for bit offset and width. + assert(OffsetWidthOp.isImm() && + "Scalar BFE is only implemented for constant width and offset"); + uint32_t Imm = OffsetWidthOp.getImm(); + + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. + + Inst->RemoveOperand(2); // Remove old immediate. + Inst->addOperand(Inst->getOperand(1)); + Inst->getOperand(1).ChangeToImmediate(0); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(Offset)); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(BitWidth)); + Inst->addOperand(MachineOperand::CreateImm(0)); + Inst->addOperand(MachineOperand::CreateImm(0)); } // Update the destination register class. const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); - switch (Inst->getOpcode()) { + switch (Opcode) { // For target instructions, getOpRegClass just returns the virtual // register class associated with the operand, so we need to find an // equivalent VGPR register class in order to move the instruction to the @@ -1057,9 +1321,62 @@ const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { return &AMDGPU::VReg_32RegClass; } -void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl &Worklist, - MachineInstr *Inst, - unsigned Opcode) const { +void SIInstrInfo::splitScalar64BitUnaryOp( + SmallVectorImpl &Worklist, + MachineInstr *Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src0 = Inst->getOperand(1); + DebugLoc DL = Inst->getDebugLoc(); + + MachineBasicBlock::iterator MII = Inst; + + const MCInstrDesc &InstDesc = get(Opcode); + const TargetRegisterClass *Src0RC = Src0.isReg() ? + MRI.getRegClass(Src0.getReg()) : + &AMDGPU::SGPR_32RegClass; + + const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); + + MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub0, Src0SubRC); + + const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); + const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); + + unsigned DestSub0 = MRI.createVirtualRegister(DestRC); + MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) + .addOperand(SrcReg0Sub0); + + MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub1, Src0SubRC); + + unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); + MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) + .addOperand(SrcReg0Sub1); + + unsigned FullDestReg = MRI.createVirtualRegister(DestRC); + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), FullDestReg); + + // Try to legalize the operands in case we need to swap the order to keep it + // valid. + Worklist.push_back(LoHalf); + Worklist.push_back(HiHalf); +} + +void SIInstrInfo::splitScalar64BitBinaryOp( + SmallVectorImpl &Worklist, + MachineInstr *Inst, + unsigned Opcode) const { MachineBasicBlock &MBB = *Inst->getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -1120,6 +1437,64 @@ void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl &Worklist, Worklist.push_back(HiHalf); } +void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src = Inst->getOperand(1); + + const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e32); + const TargetRegisterClass *SrcRC = Src.isReg() ? + MRI.getRegClass(Src.getReg()) : + &AMDGPU::SGPR_32RegClass; + + unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0); + + MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, + AMDGPU::sub0, SrcSubRC); + MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, + AMDGPU::sub1, SrcSubRC); + + MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg) + .addOperand(SrcRegSub0) + .addImm(0); + + MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg) + .addOperand(SrcRegSub1) + .addReg(MidReg); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + + Worklist.push_back(First); + Worklist.push_back(Second); +} + +void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, + MachineInstr *Inst) const { + // Add the implict and explicit register definitions. + if (NewDesc.ImplicitUses) { + for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { + unsigned Reg = NewDesc.ImplicitUses[i]; + Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); + } + } + + if (NewDesc.ImplicitDefs) { + for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { + unsigned Reg = NewDesc.ImplicitDefs[i]; + Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); + } + } +} + MachineInstrBuilder SIInstrInfo::buildIndirectWrite( MachineBasicBlock *MBB, MachineBasicBlock::iterator I,