From 745a86bac9684f9617aeb0e1566194ca797a64d4 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 19 Nov 2011 22:34:59 +0000 Subject: [PATCH] Use 256-bit vpcmpeqd for creating an all ones vector when AVX2 is enabled. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@145004 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 39 ++++++++++++++++++------------ lib/Target/X86/X86InstrInfo.cpp | 5 +++- lib/Target/X86/X86InstrSSE.td | 4 +++ lib/Target/X86/X86MCInstLower.cpp | 1 + test/CodeGen/X86/avx2-logic.ll | 12 +++++++++ 5 files changed, 45 insertions(+), 16 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 04ec5a1438d..61d9246b259 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -4387,23 +4387,30 @@ static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG, } /// getOnesVector - Returns a vector of specified type with all bits set. -/// Always build ones vectors as <4 x i32>. For 256-bit types, use two -/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their -/// original type, ensuring they get CSE'd. -static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { +/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with +/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately. +/// Then bitcast to their original type, ensuring they get CSE'd. 
+static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG, + DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); assert((VT.is128BitVector() || VT.is256BitVector()) && "Expected a 128-bit or 256-bit vector type"); SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); - SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, - Cst, Cst, Cst, Cst); - - if (VT.is256BitVector()) { - SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), - Vec, DAG.getConstant(0, MVT::i32), DAG, dl); - Vec = Insert128BitVector(InsV, Vec, - DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); + SDValue Vec; + if (VT.getSizeInBits() == 256) { + if (HasAVX2) { // AVX2 + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); + } else { // AVX + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), + Vec, DAG.getConstant(0, MVT::i32), DAG, dl); + Vec = Insert128BitVector(InsV, Vec, + DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); + } + } else { + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } return DAG.getNode(ISD::BITCAST, dl, VT, Vec); @@ -5221,12 +5228,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } // Vectors containing all ones can be matched by pcmpeqd on 128-bit width - // vectors or broken into v4i32 operations on 256-bit vectors. + // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use + // vpcmpeqd on 256-bit vectors. 
if (ISD::isBuildVectorAllOnes(Op.getNode())) { - if (Op.getValueType() == MVT::v4i32) + if (Op.getValueType() == MVT::v4i32 || + (Op.getValueType() == MVT::v8i32 && Subtarget->hasAVX2())) return Op; - return getOnesVector(Op.getValueType(), DAG, dl); + return getOnesVector(Op.getValueType(), Subtarget->hasAVX2(), DAG, dl); } SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2()); diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 692c234422b..4f5b757acd2 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -2902,6 +2902,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, switch (LoadMI->getOpcode()) { case X86::AVX_SET0PSY: case X86::AVX_SET0PDY: + case X86::AVX2_SETALLONES: Alignment = 32; break; case X86::V_SET0: @@ -2947,6 +2948,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, case X86::AVX_SET0PSY: case X86::AVX_SET0PDY: case X86::AVX_SETALLONES: + case X86::AVX2_SETALLONES: case X86::FsFLD0SD: case X86::FsFLD0SS: case X86::VFsFLD0SD: @@ -2985,7 +2987,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, else Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4); - bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES); + bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES || + Opc == X86::AVX2_SETALLONES); const Constant *C = IsAllOnes ? 
Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty); unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 0eba87ab97c..1e74da1b95e 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -318,6 +318,10 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isCodeGenOnly = 1, ExeDomain = SSEPackedInt, Predicates = [HasAVX] in def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V; +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isCodeGenOnly = 1, ExeDomain = SSEPackedInt, Predicates = [HasAVX2] in + def AVX2_SETALLONES : PDI<0x76, MRMInitReg, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8i32 immAllOnesV))]>, VEX_4V; //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 328cf674f5b..81ee665ea78 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -376,6 +376,7 @@ ReSimplify: case X86::AVX_SET0PSY: LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break; case X86::AVX_SET0PDY: LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break; case X86::AVX_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break; + case X86::AVX2_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDYrr);break; case X86::MOV16r0: LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0 diff --git a/test/CodeGen/X86/avx2-logic.ll b/test/CodeGen/X86/avx2-logic.ll index aa3d37d5533..13ebaa6f879 100644 --- a/test/CodeGen/X86/avx2-logic.ll +++ b/test/CodeGen/X86/avx2-logic.ll @@ -82,3 +82,15 @@ entry: %cond = or <8 x i32> %1, %2 ret <8 x i32> %cond } + +define <8 x i32> @allOnes() nounwind { +; CHECK: vpcmpeqd +; CHECK-NOT: vinsert + ret <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> +} + +define <16 x i16> @allOnes2() nounwind { +; CHECK: vpcmpeqd +; CHECK-NOT: 
vinsert + ret <16 x i16> +} -- 2.34.1