From 92fb79b7a611ab4c1043f04e8acd08f963d073ad Mon Sep 17 00:00:00 2001
From: Jakob Stoklund Olesen
Date: Thu, 29 Sep 2011 05:10:54 +0000
Subject: [PATCH] Expand the x86 V_SET0* pseudos right after register allocation.

This also makes it possible to reduce the number of pseudo instructions and
get rid of the encoding information.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@140776 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetInstrInfo.h |  2 +-
 lib/Target/X86/X86InstrInfo.cpp       | 49 +++++++++++++------
 lib/Target/X86/X86InstrInfo.h         |  3 ++
 lib/Target/X86/X86InstrSSE.td         | 68 ++++++++++++---------------
 lib/Target/X86/X86MCInstLower.cpp     |  6 ---
 5 files changed, 67 insertions(+), 61 deletions(-)

diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index 616e89ac506..07f614d61d9 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -392,7 +392,7 @@ public:
   /// into real instructions. The target can edit MI in place, or it can insert
   /// new instructions and erase MI. The function should return true if
   /// anything was changed.
-  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
     return false;
   }
 
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 4eb6c3076f2..bad978d8747 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2391,6 +2391,37 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
   NewMIs.push_back(MIB);
 }
 
+/// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr
+/// instruction with two undef reads of the register being defined. This is
+/// used for mapping:
+///   %xmm4 = V_SET0
+/// to:
+///   %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef>
+///
+static bool Expand2AddrUndef(MachineInstr *MI, const MCInstrDesc &Desc) {
+  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
+  unsigned Reg = MI->getOperand(0).getReg();
+  MI->setDesc(Desc);
+
+  // MachineInstr::addOperand() will insert explicit operands before any
+  // implicit operands.
+  MachineInstrBuilder(MI).addReg(Reg, RegState::Undef)
+                         .addReg(Reg, RegState::Undef);
+  // But we don't trust that.
+  assert(MI->getOperand(1).getReg() == Reg &&
+         MI->getOperand(2).getReg() == Reg && "Misplaced operand");
+  return true;
+}
+
+bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+  bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+  switch (MI->getOpcode()) {
+  case X86::V_SET0:
+    return Expand2AddrUndef(MI, get(HasAVX ? X86::VPXORrr : X86::PXORrr));
+  }
+  return false;
+}
+
 MachineInstr*
 X86InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
                                        int FrameIx, uint64_t Offset,
@@ -2679,13 +2710,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   case X86::AVX_SET0PDY:
     Alignment = 32;
     break;
-  case X86::V_SET0PS:
-  case X86::V_SET0PD:
-  case X86::V_SET0PI:
+  case X86::V_SET0:
   case X86::V_SETALLONES:
-  case X86::AVX_SET0PS:
-  case X86::AVX_SET0PD:
-  case X86::AVX_SET0PI:
   case X86::AVX_SETALLONES:
     Alignment = 16;
     break;
@@ -2722,13 +2748,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
   switch (LoadMI->getOpcode()) {
-  case X86::V_SET0PS:
-  case X86::V_SET0PD:
-  case X86::V_SET0PI:
+  case X86::V_SET0:
   case X86::V_SETALLONES:
-  case X86::AVX_SET0PS:
-  case X86::AVX_SET0PD:
-  case X86::AVX_SET0PI:
   case X86::AVX_SET0PSY:
   case X86::AVX_SET0PDY:
   case X86::AVX_SETALLONES:
@@ -2736,7 +2757,7 @@
   case X86::FsFLD0SS:
   case X86::VFsFLD0SD:
   case X86::VFsFLD0SS: {
-    // Folding a V_SET0P? or V_SETALLONES as a load, to ease register pressure.
+    // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
     // Create a constant-pool entry and operands to load from it.
 
     // Medium and large mode can't fold loads this way.
@@ -3316,7 +3337,6 @@ static const unsigned ReplaceableInstrs[][3] = {
   { X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr },
   { X86::ORPSrm,  X86::ORPDrm,  X86::PORrm  },
   { X86::ORPSrr,  X86::ORPDrr,  X86::PORrr  },
-  { X86::V_SET0PS, X86::V_SET0PD, X86::V_SET0PI },
   { X86::XORPSrm, X86::XORPDrm, X86::PXORrm },
   { X86::XORPSrr, X86::XORPDrr, X86::PXORrr },
   // AVX 128-bit support
@@ -3332,7 +3352,6 @@ static const unsigned ReplaceableInstrs[][3] = {
   { X86::VANDPSrr, X86::VANDPDrr, X86::VPANDrr },
   { X86::VORPSrm,  X86::VORPDrm,  X86::VPORrm  },
   { X86::VORPSrr,  X86::VORPDrr,  X86::VPORrr  },
-  { X86::AVX_SET0PS, X86::AVX_SET0PD, X86::AVX_SET0PI },
   { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm },
   { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr },
   // AVX 256-bit support
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 0f4022207eb..97009dbdbe5 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -247,6 +247,9 @@ public:
                            MachineInstr::mmo_iterator MMOBegin,
                            MachineInstr::mmo_iterator MMOEnd,
                            SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
   virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
                                                  int FrameIx, uint64_t Offset,
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 075be73f1a9..4a51edfc5ad 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -260,26 +260,26 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1,
 // AVX & SSE - Zero/One Vectors
 //===----------------------------------------------------------------------===//
 
-// Alias instructions that map zero vector to pxor / xorp* for sse.
+// Alias instruction that maps zero vector to pxor / xorp* for sse.
+// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
+// swizzled by ExecutionDepsFix to pxor.
 // We set canFoldAsLoad because this can be converted to a constant-pool
 // load of an all-zeros value if folding it would be beneficial.
-// FIXME: Change encoding to pseudo! This is blocked right now by the x86
-// JIT implementation, it does not expand the instructions below like
-// X86MCInstLower does.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
-    isCodeGenOnly = 1 in {
-def V_SET0PS : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
-                   [(set VR128:$dst, (v4f32 immAllZerosV))]>;
-def V_SET0PD : PDI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
-                   [(set VR128:$dst, (v2f64 immAllZerosV))]>;
-let ExeDomain = SSEPackedInt in
-def V_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "",
-                   [(set VR128:$dst, (v4i32 immAllZerosV))]>;
+    isPseudo = 1 in {
+def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", []>;
 }
 
-// The same as done above but for AVX. The 128-bit versions are the
-// same, but re-encoded. The 256-bit does not support PI version, and
-// doesn't need it because on sandy bridge the register is set to zero
+def : Pat<(v4f32 immAllZerosV), (V_SET0)>;
+def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
+def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
+def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
+def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
+def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
+
+
+// The same as done above but for AVX. The 256-bit ISA does not support PI,
+// and doesn't need it because on sandy bridge the register is set to zero
 // at the rename stage without using any execution unit, so SET0PSY
 // and SET0PDY can be used for vector int instructions without penalty
 // FIXME: Change encoding to pseudo! This is blocked right now by the x86
@@ -287,32 +287,22 @@ def V_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "",
 // X86MCInstLower does.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
     isCodeGenOnly = 1, Predicates = [HasAVX] in {
-def AVX_SET0PS : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
-                   [(set VR128:$dst, (v4f32 immAllZerosV))]>, VEX_4V;
-def AVX_SET0PD : PDI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
-                   [(set VR128:$dst, (v2f64 immAllZerosV))]>, VEX_4V;
 def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
                    [(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V;
 def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
                    [(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V;
-let ExeDomain = SSEPackedInt in
-def AVX_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "",
-                   [(set VR128:$dst, (v4i32 immAllZerosV))]>;
 }
 
-def : Pat<(v2i64 immAllZerosV), (V_SET0PI)>;
-def : Pat<(v8i16 immAllZerosV), (V_SET0PI)>;
-def : Pat<(v16i8 immAllZerosV), (V_SET0PI)>;
 
 // AVX has no support for 256-bit integer instructions, but since the 128-bit
 // VPXOR instruction writes zero to its upper part, it's safe build zeros.
-def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (AVX_SET0PI), sub_xmm)>;
+def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
 def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
-          (SUBREG_TO_REG (i32 0), (AVX_SET0PI), sub_xmm)>;
+          (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
 
-def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (AVX_SET0PI), sub_xmm)>;
+def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
 def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
-          (SUBREG_TO_REG (i64 0), (AVX_SET0PI), sub_xmm)>;
+          (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
 
 // We set canFoldAsLoad because this can be converted to a constant-pool
 // load of an all-ones value if folding it would be beneficial.
@@ -427,12 +417,12 @@ let Predicates = [HasSSE1] in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVSS to the lower bits.
   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
-            (MOVSSrr (v4f32 (V_SET0PS)), FR32:$src)>;
+            (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
-            (MOVSSrr (v4f32 (V_SET0PS)),
+            (MOVSSrr (v4f32 (V_SET0)),
                      (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-            (MOVSSrr (v4i32 (V_SET0PI)),
+            (MOVSSrr (v4i32 (V_SET0)),
                      (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
 }
 
@@ -483,7 +473,7 @@ let Predicates = [HasSSE2] in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVSD to the lower bits.
   def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
-            (MOVSDrr (v2f64 (V_SET0PS)), FR64:$src)>;
+            (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
 }
 
 let AddedComplexity = 20 in {
@@ -558,15 +548,15 @@ let Predicates = [HasAVX] in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVS{S,D} to the lower bits.
   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
-            (VMOVSSrr (v4f32 (AVX_SET0PS)), FR32:$src)>;
+            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
-            (VMOVSSrr (v4f32 (AVX_SET0PS)),
+            (VMOVSSrr (v4f32 (V_SET0)),
                       (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-            (VMOVSSrr (v4i32 (AVX_SET0PI)),
+            (VMOVSSrr (v4i32 (V_SET0)),
                       (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
   def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
-            (VMOVSDrr (v2f64 (AVX_SET0PS)), FR64:$src)>;
+            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
 }
 
 let AddedComplexity = 20 in {
@@ -604,12 +594,12 @@ let Predicates = [HasAVX] in {
   def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                    (v4f32 (scalar_to_vector FR32:$src)), (i32 0)))),
             (SUBREG_TO_REG (i32 0),
-              (v4f32 (VMOVSSrr (v4f32 (AVX_SET0PS)), FR32:$src)),
+              (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
               sub_xmm)>;
   def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                    (v2f64 (scalar_to_vector FR64:$src)), (i32 0)))),
             (SUBREG_TO_REG (i64 0),
-              (v2f64 (VMOVSDrr (v2f64 (AVX_SET0PS)), FR64:$src)),
+              (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
               sub_xmm)>;
 
   // Extract and store.
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 6cb5831f551..1d603cf6e1b 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -372,15 +372,9 @@ ReSimplify:
   case X86::FsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
   case X86::VFsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break;
   case X86::VFsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break;
-  case X86::V_SET0PS: LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break;
-  case X86::V_SET0PD: LowerUnaryToTwoAddr(OutMI, X86::XORPDrr); break;
-  case X86::V_SET0PI: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
   case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break;
-  case X86::AVX_SET0PS: LowerUnaryToTwoAddr(OutMI, X86::VXORPSrr); break;
   case X86::AVX_SET0PSY: LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break;
-  case X86::AVX_SET0PD: LowerUnaryToTwoAddr(OutMI, X86::VXORPDrr); break;
   case X86::AVX_SET0PDY: LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break;
-  case X86::AVX_SET0PI: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break;
   case X86::AVX_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break;
 
   case X86::MOV16r0:
-- 
2.34.1
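
As a rough illustration of how the new hook fits into the pipeline (this sketch is not part of the commit): after register allocation, the ExpandPostRAPseudos pass named in the X86InstrSSE.td comment above visits the remaining instructions and lets TargetInstrInfo::expandPostRAPseudo() rewrite pseudos such as V_SET0 in place. The driver below is a hedged approximation written against the 2011-era API used in this patch; the function name expandPseudosAfterRA and the loop structure are assumptions for illustration, not the pass's actual source.

// Illustrative sketch only: a driver loop that exercises the
// TargetInstrInfo::expandPostRAPseudo() hook added by this patch. The real
// ExpandPostRAPseudos pass does more (copy and subreg lowering) and differs
// in bookkeeping details.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetInstrInfo.h"

using namespace llvm;

static bool expandPseudosAfterRA(MachineFunction &MF,
                                 const TargetInstrInfo *TII) {
  bool Changed = false;
  for (MachineFunction::iterator MBB = MF.begin(), MBBE = MF.end();
       MBB != MBBE; ++MBB) {
    for (MachineBasicBlock::iterator MI = MBB->begin(); MI != MBB->end();) {
      // Remember the following instruction first; the target is allowed to
      // rewrite MI in place or replace it entirely.
      MachineBasicBlock::iterator Next = MI;
      ++Next;
      // Only pseudo instructions (such as V_SET0 after this patch) need to
      // be expanded into real, encodable instructions here.
      if (MI->getDesc().isPseudo())
        Changed |= TII->expandPostRAPseudo(MI);
      MI = Next;
    }
  }
  return Changed;
}

With this arrangement V_SET0 stays a pseudo through register allocation, is turned into a self-xor of its destination register by X86InstrInfo::expandPostRAPseudo() rather than by X86MCInstLower, and ExecutionDepsFix can still move that xor between the integer and floating-point domains via the ReplaceableInstrs table shown in the X86InstrInfo.cpp hunks above.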