From: Bruno Cardoso Lopes
Date: Mon, 25 Jul 2011 23:05:32 +0000 (+0000)
Subject: Codegen all-ones vectors better when using AVX: vpcmpeqd + vinsertf128
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=863bd9d5cf86e57752975d1ab6779f3116a23b90;p=oota-llvm.git

Codegen all-ones vectors better when using AVX: vpcmpeqd + vinsertf128

This also fixes PR10452

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@136004 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 2f74c0fdd46..793770aa50a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3831,21 +3831,25 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
 }
 
 /// getOnesVector - Returns a vector of specified type with all bits set.
-/// Always build ones vectors as <4 x i32> or <8 x i32> bitcasted to
-/// their original type, ensuring they get CSE'd.
+/// Always build ones vectors as <4 x i32>. For 256-bit types, use two
+/// <4 x i32> values inserted into an <8 x i32> appropriately. Then bitcast
+/// to the original type, ensuring they get CSE'd.
 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
   assert(VT.isVector() && "Expected a vector type");
   assert((VT.is128BitVector() || VT.is256BitVector())
          && "Expected a 128-bit or 256-bit vector type");
 
   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+  SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                            Cst, Cst, Cst, Cst);
 
-  SDValue Vec;
   if (VT.is256BitVector()) {
-    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
-  } else
-    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+    SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
+                                      Vec, DAG.getConstant(0, MVT::i32),
+                                      DAG, dl);
+    Vec = Insert128BitVector(InsV, Vec,
+                             DAG.getConstant(4 /* NumElems/2 */, MVT::i32),
+                             DAG, dl);
+  }
+
   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
 }
 
@@ -12023,6 +12027,35 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// CanFoldXORWithAllOnes - Test whether the XOR operand is an all-ones vector
+/// so it can be folded inside ANDNP.
+static bool CanFoldXORWithAllOnes(const SDNode *N) {
+  EVT VT = N->getValueType(0);
+
+  // Match a direct all-ones build_vector for 128- and 256-bit vectors.
+  if (ISD::isBuildVectorAllOnes(N))
+    return true;
+
+  // Look through a bit convert.
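+  // (getOnesVector builds the 256-bit all-ones value as a v8i32 node and
+  // then bitcasts it to the requested type, so the XOR operand is usually
+  // seen here behind a bitcast.)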
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0).getNode();
+
+  // Sometimes the operand may come from an insert_subvector building a
+  // 256-bit all-ones vector.
+  if (VT.getSizeInBits() == 256 &&
+      N->getOpcode() == ISD::INSERT_SUBVECTOR) {
+    SDValue V1 = N->getOperand(0);
+    SDValue V2 = N->getOperand(1);
+    if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
+        ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
+        ISD::isBuildVectorAllOnes(V2.getNode()))
+      return true;
+  }
+
+  return false;
+}
+
 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const X86Subtarget *Subtarget) {
@@ -12047,12 +12080,14 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
   // Check LHS for vnot
   if (N0.getOpcode() == ISD::XOR &&
-      ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
 
   // Check RHS for vnot
   if (N1.getOpcode() == ISD::XOR &&
-      ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
 
   return SDValue();
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 1ab02780cdf..fda04749c01 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2450,6 +2450,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     case X86::AVX_SET0PS:
     case X86::AVX_SET0PD:
     case X86::AVX_SET0PI:
+    case X86::AVX_SETALLONES:
       Alignment = 16;
       break;
     case X86::FsFLD0SD:
@@ -2494,6 +2495,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   case X86::AVX_SET0PI:
   case X86::AVX_SET0PSY:
   case X86::AVX_SET0PDY:
+  case X86::AVX_SETALLONES:
   case X86::FsFLD0SD:
   case X86::FsFLD0SS:
   case X86::VFsFLD0SD:
@@ -2531,9 +2533,10 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8);
     else
       Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
-    const Constant *C = LoadMI->getOpcode() == X86::V_SETALLONES ?
-                        Constant::getAllOnesValue(Ty) :
-                        Constant::getNullValue(Ty);
+
+    bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES);
+    const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty)
+                                  : Constant::getNullValue(Ty);
     unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
 
     // Create operands to load from the constant pool entry.
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index d574a7b8db3..b5ac5feb8c7 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3143,11 +3143,17 @@ def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP;
 
-// Alias instructions that map zero vector to pxor / xorp* for sse.
+// Alias instructions that map an all-ones vector to pcmpeqd for sse.
 // We set canFoldAsLoad because this can be converted to a constant-pool
 // load of an all-ones value if folding it would be beneficial.
+// FIXME: Change encoding to pseudo! This is blocked right now by the x86
+// JIT implementation, which does not expand the instructions below the way
+// X86MCInstLower does.
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
     isCodeGenOnly = 1, ExeDomain = SSEPackedInt in
-  // FIXME: Change encoding to pseudo.
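+  // Note: 0x76 is the pcmpeqd opcode; pcmpeqd of a register against itself
+  // produces all ones, which is how X86MCInstLower expands these defs.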
   def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
                          [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isCodeGenOnly = 1, ExeDomain = SSEPackedInt, Predicates = [HasAVX] in
+  def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
+                           [(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V;
 
 //===---------------------------------------------------------------------===//
 // SSE3 - Conversion Instructions
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index e3853355553..2ed596af15e 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -381,6 +381,7 @@ ReSimplify:
   case X86::AVX_SET0PD:      LowerUnaryToTwoAddr(OutMI, X86::VXORPDrr); break;
   case X86::AVX_SET0PDY:     LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break;
   case X86::AVX_SET0PI:      LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break;
+  case X86::AVX_SETALLONES:  LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break;
 
   case X86::MOV16r0:
     LowerSubReg32_Op0(OutMI, X86::MOV32r0);   // MOV16r0 -> MOV32r0
diff --git a/test/CodeGen/X86/avx-256.ll b/test/CodeGen/X86/avx-256.ll
index a6d1450c9c1..244bf98ce68 100644
--- a/test/CodeGen/X86/avx-256.ll
+++ b/test/CodeGen/X86/avx-256.ll
@@ -12,3 +12,15 @@ entry:
   store <4 x double> zeroinitializer, <4 x double>* @y, align 32
   ret void
 }
+
+; CHECK: vpcmpeqd
+; CHECK: vinsertf128 $1
+define void @ones([0 x float]* nocapture %RET, [0 x float]* nocapture %aFOO) nounwind {
+allocas:
+  %ptr2vec615 = bitcast [0 x float]* %RET to <8 x float>*
+  store <8 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000,
+                     float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000,
+                     float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000,
+                     float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000>,
+                    <8 x float>* %ptr2vec615, align 32
+  ret void
+}
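+
+; A minimal sketch of the AVX code expected for @ones after this change
+; (register choices are illustrative, not pinned down by the CHECK lines):
+;   vpcmpeqd    %xmm0, %xmm0, %xmm0      - 128-bit all-ones
+;   vinsertf128 $1, %xmm0, %ymm0, %ymm0  - widen to 256 bits
+;   vmovaps     %ymm0, (%rdi)            - store to %RET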