From c1c5dcf06913839943d536cf7ed7e2b2b87d0158 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Fri, 5 Sep 2014 10:36:31 +0000 Subject: [PATCH] [x86] Factor out the zero vector insertion logic in the new vector shuffle lowering for integer vectors and share it from v4i32, v8i16, and v16i8 code paths. Ironically, the SSE2 v16i8 code for this is now better than the SSSE3! =] Will have to fix the SSSE3 code next to just use a single pshufb. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217240 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 140 ++++++++++++++------- test/CodeGen/X86/vector-shuffle-128-v16.ll | 47 +++++++ test/CodeGen/X86/vector-shuffle-128-v8.ll | 59 +++++++++ 3 files changed, 201 insertions(+), 45 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index f85b00a84f8..5b0d315a6d0 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7488,6 +7488,81 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(NewMask, DAG)); } +static SDValue lowerIntegerElementInsertionVectorShuffle( + MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + int V2Index = std::find_if(Mask.begin(), Mask.end(), + [&Mask](int M) { return M >= (int)Mask.size(); }) - + Mask.begin(); + + // Check for a single input from a SCALAR_TO_VECTOR node. + // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and + // all the smarts here sunk into that routine. However, the current + // lowering of BUILD_VECTOR makes that nearly impossible until the old + // vector shuffle lowering is dead.
+ if ((Mask[V2Index] == (int)Mask.size() && + V2.getOpcode() == ISD::SCALAR_TO_VECTOR) || + V2.getOpcode() == ISD::BUILD_VECTOR) { + SDValue V2S = V2.getOperand(Mask[V2Index] - Mask.size()); + + bool V1IsAllZero = false; + if (ISD::isBuildVectorAllZeros(V1.getNode())) { + V1IsAllZero = true; + } else if (V1.getOpcode() == ISD::BUILD_VECTOR) { + V1IsAllZero = true; + for (int M : Mask) { + if (M < 0 || M >= (int)Mask.size()) + continue; + SDValue Input = V1.getOperand(M); + if (Input.getOpcode() != ISD::UNDEF && !X86::isZeroNode(Input)) { + // A non-zero input! + V1IsAllZero = false; + break; + } + } + } + if (V1IsAllZero) { + // First, we need to zext the scalar if it is smaller than an i32. + MVT EltVT = VT.getVectorElementType(); + assert(EltVT == V2S.getSimpleValueType() && + "Different scalar and element types!"); + MVT ExtVT = VT; + if (EltVT == MVT::i8 || EltVT == MVT::i16) { + // Zero-extend directly to i32. + ExtVT = MVT::v4i32; + V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); + } + + V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S)); + if (ExtVT != VT) + V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); + + if (V2Index != 0) { + // If we have 4 or fewer lanes we can cheaply shuffle the element into + // the desired position. Otherwise it is more efficient to do a vector + // shift left. We know that we can do a vector shift left because all + // the inputs are zero. 
+ if (VT.getVectorNumElements() <= 4) { + SmallVector V2Shuffle(Mask.size(), 1); + V2Shuffle[V2Index] = 0; + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); + } else { + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2); + V2 = DAG.getNode( + X86ISD::VSHLDQ, DL, MVT::v2i64, V2, + DAG.getConstant( + V2Index * EltVT.getSizeInBits(), + DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64))); + V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); + } + } + return V2; + } + } + return SDValue(); +} + /// \brief Lower 4-lane i32 vector shuffles. /// /// We try to handle these with integer-domain shuffles where we can, but for @@ -7519,50 +7594,10 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); // There are special ways we can lower some single-element blends. - if (NumV2Elements == 1) { - int V2Index = - std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - - Mask.begin(); - - // Check for a single input from a SCALAR_TO_VECTOR node. - // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and - // all the smarts here sunk into that routine. However, the current - // lowering of BUILD_VECTOR makes that nearly impossible until the old - // vector shuffle lowering is dead. - if ((Mask[V2Index] == 4 && V2.getOpcode() == ISD::SCALAR_TO_VECTOR) || - V2.getOpcode() == ISD::BUILD_VECTOR) { - SDValue V2S = V2.getOperand(Mask[V2Index] - 4); - - bool V1IsAllZero = false; - if (ISD::isBuildVectorAllZeros(V1.getNode())) { - V1IsAllZero = true; - } else if (V1.getOpcode() == ISD::BUILD_VECTOR) { - V1IsAllZero = true; - for (int M : Mask) { - if (M < 0 || M >= 4) - continue; - SDValue Input = V1.getOperand(M); - if (Input.getOpcode() != ISD::UNDEF && !X86::isZeroNode(Input)) { - // A non-zero input! 
- V1IsAllZero = false; - break; - } - } - } - if (V1IsAllZero) { - V2 = DAG.getNode( - X86ISD::VZEXT_MOVL, DL, MVT::v4i32, - DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V2S)); - if (V2Index != 0) { - int V2Shuffle[] = {1, 1, 1, 1}; - V2Shuffle[V2Index] = 0; - V2 = DAG.getVectorShuffle(MVT::v4i32, DL, V2, - DAG.getUNDEF(MVT::v4i32), V2Shuffle); - } - return V2; - } - } - } + if (NumV2Elements == 1) + if (SDValue V = lowerIntegerElementInsertionVectorShuffle( + MVT::v4i32, DL, V1, V2, Mask, Subtarget, DAG)) + return V; // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build @@ -8210,6 +8245,12 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized " "to be V1-input shuffles."); + // There are special ways we can lower some single-element blends. + if (NumV2Inputs == 1) + if (SDValue V = lowerIntegerElementInsertionVectorShuffle( + MVT::v8i16, DL, V1, V2, Mask, Subtarget, DAG)) + return V; + if (NumV1Inputs + NumV2Inputs <= 4) return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG); @@ -8347,8 +8388,11 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, MutableArrayRef LoMask = Mask.slice(0, 8); MutableArrayRef HiMask = Mask.slice(8, 8); + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; }); + // For single-input shuffles, there are some nicer lowering tricks we can use. - if (isSingleInputShuffleMask(Mask)) { + if (NumV2Elements == 0) { // Check whether we can widen this to an i16 shuffle by duplicating bytes. // Notably, this handles splat and partial-splat shuffles more efficiently. 
// However, it only makes sense if the pre-duplication shuffle simplifies @@ -8495,6 +8539,12 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); } + // There are special ways we can lower some single-element blends. + if (NumV2Elements == 1) + if (SDValue V = lowerIntegerElementInsertionVectorShuffle( + MVT::v16i8, DL, V1, V2, Mask, Subtarget, DAG)) + return V; + // Check whether a compaction lowering can be done. This handles shuffles // which take every Nth element for some even N. See the helper function for // details. diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index 6f49a03cb8b..38734eda941 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -325,3 +325,50 @@ define <16 x i8> @PR20540(<8 x i8> %a) { %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %shuffle } + +define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { +; SSE2-LABEL: @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz +; SSE2: # BB#0: +; SSE2-NEXT: movzbl {{.*}}, %[[R:.*]] +; SSE2-NEXT: movd %[[R]], %xmm0 +; SSE2-NEXT: retq + %a = insertelement <16 x i8> undef, i8 %i, i32 0 + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { +; SSE2-LABEL: @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz +; SSE2: # BB#0: +; SSE2-NEXT: movzbl {{.*}}, %[[R:.*]] +; SSE2-NEXT: movd %[[R]], %xmm0 +; SSE2-NEXT: pslldq $5, %xmm0 +; SSE2-NEXT: retq + %a = insertelement <16 x i8> undef, i8 %i, i32 0 + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 
%i) { +; SSE2-LABEL: @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16 +; SSE2: # BB#0: +; SSE2-NEXT: movzbl {{.*}}, %[[R:.*]] +; SSE2-NEXT: movd %[[R]], %xmm0 +; SSE2-NEXT: pslldq $15, %xmm0 +; SSE2-NEXT: retq + %a = insertelement <16 x i8> undef, i8 %i, i32 0 + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { +; SSE2-LABEL: @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz +; SSE2: # BB#0: +; SSE2-NEXT: movzbl {{.*}}, %[[R:.*]] +; SSE2-NEXT: movd %[[R]], %xmm0 +; SSE2-NEXT: pslldq $2, %xmm0 +; SSE2-NEXT: retq + %a = insertelement <16 x i8> undef, i8 %i, i32 3 + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle +} diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index f1e17377c13..33993aae682 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -771,3 +771,62 @@ define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) { %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle } + +define <8 x i16> @shuffle_v8i16_8zzzzzzz(i16 %i) { +; ALL-LABEL: @shuffle_v8i16_8zzzzzzz +; ALL: # BB#0: +; ALL-NEXT: movzwl {{.*}}, %[[R:.*]] +; ALL-NEXT: movd %[[R]], %xmm0 +; ALL-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 0 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) { +; ALL-LABEL: @shuffle_v8i16_z8zzzzzz +; ALL: # BB#0: +; ALL-NEXT: movzwl {{.*}}, %[[R:.*]] +; ALL-NEXT: movd %[[R]], %xmm0 +; ALL-NEXT: pslldq $2, %xmm0 +; ALL-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 0 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle +} 
+ +define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) { +; ALL-LABEL: @shuffle_v8i16_zzzzz8zz +; ALL: # BB#0: +; ALL-NEXT: movzwl {{.*}}, %[[R:.*]] +; ALL-NEXT: movd %[[R]], %xmm0 +; ALL-NEXT: pslldq $10, %xmm0 +; ALL-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 0 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) { +; ALL-LABEL: @shuffle_v8i16_zuuzuuz8 +; ALL: # BB#0: +; ALL-NEXT: movzwl {{.*}}, %[[R:.*]] +; ALL-NEXT: movd %[[R]], %xmm0 +; ALL-NEXT: pslldq $14, %xmm0 +; ALL-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 0 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) { +; ALL-LABEL: @shuffle_v8i16_zzBzzzzz +; ALL: # BB#0: +; ALL-NEXT: movzwl {{.*}}, %[[R:.*]] +; ALL-NEXT: movd %[[R]], %xmm0 +; ALL-NEXT: pslldq $4, %xmm0 +; ALL-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 3 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> + ret <8 x i16> %shuffle +} -- 2.34.1