From 023935c3d9c184928758078a4b510d672981c98c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 23 Dec 2015 13:10:07 +0000
Subject: [PATCH] [X86][AVX] Only shuffle the lower half of vectors if the
 upper half is undefined

First step towards making better use of AVX's implicit zeroing of the upper
half of a 256-bit vector by instructions that only act on the lower 128-bit
vector - discussed on D14151.

As well as the fact that 128-bit shuffle instructions are generally more
capable, this can be more performant on older CPUs with 128-bit ALUs
(e.g. Jaguar, Sandy Bridge) that must treat 256-bit vectors as multiple
micro-ops.

Moved the similar subvector extraction shuffle combines from
PerformShuffleCombine256 to lowerVectorShuffle as well.

Note: I've avoided combining shuffles that reference elements from the upper
halves of the input vectors - this may be reviewed in future work (AVX1 would
probably always gain, but AVX2 does have some cross-lane shuffle
instructions).

Differential Revision: http://reviews.llvm.org/D15477

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@256332 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp         | 162 ++++++++++++++-------
 test/CodeGen/X86/avx-splat.ll              |   2 +-
 test/CodeGen/X86/vector-shuffle-256-v16.ll |  42 +++++-
 test/CodeGen/X86/vector-shuffle-256-v32.ll |  30 +++-
 test/CodeGen/X86/vector-shuffle-256-v4.ll  |  72 +++++----
 test/CodeGen/X86/vector-shuffle-256-v8.ll  |  94 +++++++++---
 test/CodeGen/X86/vector-zext.ll            |  10 +-
 7 files changed, 289 insertions(+), 123 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index e165f1adcaf..542dd97996b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -10359,6 +10359,108 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
 }
 
+/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
+/// This allows for fast cases such as subvector extraction/insertion
+/// or shuffling smaller vector types which can lower more efficiently.
+static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1,
+                                               SDValue V2, ArrayRef<int> Mask,
+                                               const X86Subtarget *Subtarget,
+                                               SelectionDAG &DAG) {
+  assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector");
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfNumElts = NumElts / 2;
+  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
+
+  bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
+  bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
+  if (!UndefLower && !UndefUpper)
+    return SDValue();
+
+  // Upper half is undef and lower half is whole upper subvector.
+  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+  if (UndefUpper &&
+      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+                             DAG.getIntPtrConstant(HalfNumElts, DL));
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+                       DAG.getIntPtrConstant(0, DL));
+  }
+
+  // Lower half is undef and upper half is whole lower subvector.
+  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+  if (UndefLower &&
+      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+                             DAG.getIntPtrConstant(0, DL));
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
+                       DAG.getIntPtrConstant(HalfNumElts, DL));
+  }
+
+  // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
+  if (UndefLower && Subtarget->hasAVX2() &&
+      (VT == MVT::v4f64 || VT == MVT::v4i64))
+    return SDValue();
+
+  // If the shuffle only uses the lower halves of the input operands,
+  // then extract them and perform the 'half' shuffle at half width.
+  // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
+  int HalfIdx1 = -1, HalfIdx2 = -1;
+  SmallVector<int, 8> HalfMask;
+  unsigned Offset = UndefLower ? HalfNumElts : 0;
+  for (unsigned i = 0; i != HalfNumElts; ++i) {
+    int M = Mask[i + Offset];
+    if (M < 0) {
+      HalfMask.push_back(M);
+      continue;
+    }
+
+    // Determine which of the 4 half vectors this element is from.
+    // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
+    int HalfIdx = M / HalfNumElts;
+
+    // Only shuffle using the lower halves of the inputs.
+    // TODO: Investigate usefulness of shuffling with upper halves.
+    if (HalfIdx != 0 && HalfIdx != 2)
+      return SDValue();
+
+    // Determine the element index into its half vector source.
+    int HalfElt = M % HalfNumElts;
+
+    // We can shuffle with up to 2 half vectors, set the new 'half'
+    // shuffle mask accordingly.
+    if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) {
+      HalfMask.push_back(HalfElt);
+      HalfIdx1 = HalfIdx;
+      continue;
+    }
+    if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) {
+      HalfMask.push_back(HalfElt + HalfNumElts);
+      HalfIdx2 = HalfIdx;
+      continue;
+    }
+
+    // Too many half vectors referenced.
+    return SDValue();
+  }
+  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
+
+  auto GetHalfVector = [&](int HalfIdx) {
+    if (HalfIdx < 0)
+      return DAG.getUNDEF(HalfVT);
+    SDValue V = (HalfIdx < 2 ? V1 : V2);
+    HalfIdx = (HalfIdx % 2) * HalfNumElts;
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
+                       DAG.getIntPtrConstant(HalfIdx, DL));
+  };
+
+  SDValue Half1 = GetHalfVector(HalfIdx1);
+  SDValue Half2 = GetHalfVector(HalfIdx2);
+  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
+                     DAG.getIntPtrConstant(Offset, DL));
+}
+
 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
 /// given mask.
 ///
@@ -10928,6 +11030,11 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
             DL, VT, V1, V2, Mask, Subtarget, DAG))
       return Insertion;
 
+  // Handle special cases where the lower or upper half is UNDEF.
+  if (SDValue V =
+          lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+    return V;
+
   // There is a really nice hard cut-over between AVX1 and AVX2 that means we
   // can check for those subtargets here and avoid much of the subtarget
   // querying in the per-vector-type lowering routines. With AVX1 we have
@@ -19866,7 +19973,7 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget,
     assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
     MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
     // Use the original mask here, do not modify the mask twice
-    Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true); 
+    Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
 
     // The value that should be stored
     MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
@@ -22609,39 +22716,8 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
   return TargetLowering::isGAPlusOffset(N, GA, Offset);
 }
 
-/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
-/// same as extracting the high 128-bit part of 256-bit vector and then
-/// inserting the result into the low part of a new 256-bit vector
-static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
-  EVT VT = SVOp->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-
-  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
-  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
-    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
-        SVOp->getMaskElt(j) >= 0)
-      return false;
-
-  return true;
-}
-
-/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
-/// same as extracting the low 128-bit part of 256-bit vector and then
-/// inserting the result into the high part of a new 256-bit vector
-static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
-  EVT VT = SVOp->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-
-  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
-  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
-    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
-        SVOp->getMaskElt(j) >= 0)
-      return false;
-
-  return true;
-}
-
 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+/// FIXME: This could be expanded to support 512 bit vectors as well.
 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const X86Subtarget* Subtarget) {
@@ -22715,24 +22791,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
     return DCI.CombineTo(N, InsV);
   }
 
-  //===--------------------------------------------------------------------===//
-  // Combine some shuffles into subvector extracts and inserts:
-  //
-
-  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
-  if (isShuffleHigh128VectorInsertLow(SVOp)) {
-    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
-    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
-    return DCI.CombineTo(N, InsV);
-  }
-
-  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
-  if (isShuffleLow128VectorInsertHigh(SVOp)) {
-    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
-    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
-    return DCI.CombineTo(N, InsV);
-  }
-
   return SDValue();
 }
 
@@ -26710,7 +26768,7 @@ static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG,
 
   // If we're negating a FMUL node on a target with FMA, then we can avoid the
   // use of a constant by performing (-0 - A*B) instead.
-  // FIXME: Check rounding control flags as well once it becomes available. 
+  // FIXME: Check rounding control flags as well once it becomes available.
   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
       Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index 37eacc87c3f..ebaaf0e8d00 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -99,7 +99,7 @@ define <8 x float> @funcF(i32 %val) nounwind {
 ; CHECK-LABEL: funcF:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vmovd %edi, %xmm0
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,0]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; CHECK-NEXT: retq
   %ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index d3e5ea0a8bd..7e3dc6e294f 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -159,11 +159,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
 ;
 ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <16 x i16> %shuffle
@@ -3250,6 +3250,22 @@ define <16 x i16> @shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_u
   ret <16 x i16> %shuffle
 }
 
+define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19>
+  ret <16 x i16> %shuffle
+}
+
 define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
 ; AVX1: # BB#0:
@@ -3284,6 +3300,24 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a,
   ret <16 x i16> %shuffle
 }
 
+define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
+; ALL-LABEL: shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u:
+; ALL: # BB#0:
+; ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
+; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i16> %shuffle
+}
+
 define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
 ; AVX1: # BB#0:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index d7ebebb6082..161a21cef03 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -325,7 +325,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_
 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
@@ -2013,6 +2013,34 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_
   ret <32 x i8> %shuffle
 }
 
+define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
+; ALL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <32 x i8> %shuffle
+}
+
 define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
 ; AVX1: # BB#0:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 2dcda842920..7e33f5f3aa8 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -236,23 +236,11 @@ define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) {
 }
 
 define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_0423:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
-; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v4f64_0423:
-; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
-; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4f64_0423:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vbroadcastsd %xmm1, %ymm1
-; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v4f64_0423:
+; ALL: # BB#0:
+; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; ALL-NEXT: retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
   ret <4 x double> %shuffle
 }
@@ -488,10 +476,19 @@ define <4 x double> @shuffle_v4f64_u062(<4 x double> %a, <4 x double> %b) {
   ret <4 x double> %shuffle
 }
 
+define <4 x double> @shuffle_v4f64_15uu(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_15uu:
+; ALL: # BB#0:
+; ALL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
+  ret <4 x double> %shuffle
+}
+
 define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) {
 ; ALL-LABEL: shuffle_v4f64_11uu:
 ; ALL: # BB#0:
-; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
+; ALL-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; ALL-NEXT: retq
   %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
   ret <4 x double> %shuffle
 }
@@ -750,21 +747,21 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
 
 define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0142:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_0142:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT: retq
 ;
 ; AVX512VL-LABEL: shuffle_v4i64_0142:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm1
 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX512VL-NEXT: retq
@@ -778,21 +775,21 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_0412:
 ; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
-; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT: retq
 ;
 ; AVX512VL-LABEL: shuffle_v4i64_0412:
 ; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
   ret <4 x i64> %shuffle
@@ -1134,21 +1131,20 @@ define <4 x i64> @shuffle_v4i64_40u2(<4 x i64> %a, <4 x i64> %b) {
   ret <4 x i64> %shuffle
 }
 
+define <4 x i64> @shuffle_v4i64_15uu(<4 x i64> %a, <4 x i64> %b) {
+; ALL-LABEL: shuffle_v4i64_15uu:
+; ALL: # BB#0:
+; ALL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
+  ret <4 x i64> %shuffle
+}
+
 define <4 x i64> @shuffle_v4i64_11uu(<4 x i64> %a, <4 x i64> %b) {
-; AVX1-LABEL: shuffle_v4i64_11uu:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v4i64_11uu:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4i64_11uu:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v4i64_11uu:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; ALL-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
   ret <4 x i64> %shuffle
 }
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 18c9e7061e3..e8b886afd1a 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -73,10 +73,10 @@ define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_00040000:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v8f32_00040000:
@@ -832,17 +832,11 @@ define <8 x float> @shuffle_v8f32_a2u3e6f7(<8 x float> %a, <8 x float> %b) {
 }
 
 define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_uuuu1111:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v8f32_uuuu1111:
-; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_uuuu1111:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>
   ret <8 x float> %shuffle
 }
@@ -864,6 +858,44 @@ define <8 x float> @shuffle_v8f32_44444444(<8 x float> %a, <8 x float> %b) {
   ret <8 x float> %shuffle
 }
 
+define <8 x float> @shuffle_v8f32_1188uuuu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_1188uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_uuuu3210(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_uuuu3210:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_uuuu1188(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_uuuu1188:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 8, i32 8>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_1111uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x float> %shuffle
+}
+
 define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
 ; AVX1-LABEL: shuffle_v8f32_5555uuuu:
 ; AVX1: # BB#0:
@@ -949,10 +981,10 @@ define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_00040000:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_00040000:
@@ -1948,19 +1980,37 @@ define <8 x i32> @shuffle_v8i32_80u1b4uu(<8 x i32> %a, <8 x i32> %b) {
 define <8 x i32> @shuffle_v8i32_uuuu1111(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_uuuu1111:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_uuuu1111:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX2-NEXT: retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>
   ret <8 x i32> %shuffle
 }
 
+define <8 x i32> @shuffle_v8i32_2222uuuu(<8 x i32> %a, <8 x i32> %b) {
+; ALL-LABEL: shuffle_v8i32_2222uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_2A3Buuuu(<8 x i32> %a, <8 x i32> %b) {
+; ALL-LABEL: shuffle_v8i32_2A3Buuuu:
+; ALL: # BB#0:
+; ALL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
 define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_44444444:
 ; AVX1: # BB#0:
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index e8e60ee9ac9..b8024203ab2 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -1167,7 +1167,7 @@ define <4 x i64> @shuf_zext_16i8_to_4i64_offset11(<16 x i8> %A) nounwind uwtable
 ;
 ; AVX2-LABEL: shuf_zext_16i8_to_4i64_offset11:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: retq
 entry:
@@ -1251,7 +1251,7 @@ define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable
 ;
 ; AVX2-LABEL: shuf_zext_8i16_to_4i64_offset2:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,2,3,5,6,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
 ; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX2-NEXT: retq
 entry:
@@ -1319,7 +1319,7 @@ define <8 x i32> @shuf_zext_8i16_to_8i32_offset3(<8 x i16> %A) nounwind uwtable
 ;
 ; AVX2-LABEL: shuf_zext_8i16_to_8i32_offset3:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT: retq
 entry:
@@ -1366,7 +1366,7 @@ define <8 x i32> @shuf_zext_16i16_to_8i32_offset8(<16 x i16> %A) nounwind uwtabl
 ;
 ; AVX2-LABEL: shuf_zext_16i16_to_8i32_offset8:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT: retq
 entry:
@@ -1428,7 +1428,7 @@ define <4 x i64> @shuf_zext_4i32_to_4i64_offset1(<4 x i32> %A) nounwind uwtable
 ;
 ; AVX2-LABEL: shuf_zext_4i32_to_4i64_offset1:
 ; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX2-NEXT: retq
entry:
-- 
2.34.1
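
(Note below the version trailer, so it is ignored by git am: a minimal IR sketch of the
class of shuffle this patch targets. The function name is an illustrative assumption,
not taken from the patch; the mask is what matters.)

; On an AVX target, the upper four result elements are undef, so
; lowerVectorShuffleWithUndefHalf can perform the whole shuffle on the lower
; 128-bit subvector, e.g. a single in-lane vpermilps on xmm0, instead of a
; cross-lane 256-bit shuffle, with the upper ymm half left undefined.
define <8 x float> @lower_half_only(<8 x float> %a) {
  %s = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x float> %s
}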