From 828f5b807cc2cec7dc3e25730e3c6bac965d17e2 Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Sat, 30 May 2015 03:20:59 +0000
Subject: [PATCH] [x86] Implement a faster vector population count based on
 the PSHUFB in-register LUT technique.

Summary:
A description of this technique can be found here:
http://wm.ite.pl/articles/sse-popcount.html

The core of the idea is to use an in-register lookup table and the PSHUFB
instruction to compute the population count for the low and high nibbles of
each byte, and then to use horizontal sums to aggregate these into vector
population counts with wider element types.

On x86 there is an instruction that will directly compute the horizontal sum
for the low 8 and high 8 bytes, giving vNi64 popcount very easily. Various
tricks are used to get vNi32 and vNi16 from the vNi8 that the LUT computes.

The base implementation of this, and most of the work, was done by Bruno in a
follow-up to D6531. See Bruno's detailed post there for lots of timing
information about these changes.

I have extended Bruno's patch in the following ways:

0) I committed the new tests with baseline sequences so this shows a diff,
and regenerated the tests using the update scripts.

1) Bruno had noticed and mentioned in IRC a redundant mask that I removed.

2) I introduced a particular optimization for the i32 vector cases where we
use PSHL + PSADBW to compute the low i32 popcounts, and PSHUFD + PSADBW to
compute doubled high i32 popcounts. This takes advantage of the fact that to
line up the high i32 popcounts we have to shift them anyway, and we can shift
them by one fewer bit to effectively divide the count by two. While the
PSHUFD-based horizontal add is no faster, it doesn't require registers or
load traffic the way a mask would, and provides more ILP as it happens on
different ports with high throughput.

3) I did some code cleanups throughout to simplify the implementation logic.

4) I refactored it to continue to use the parallel bitmath lowering when
SSSE3 is not available, to preserve the performance of that version on SSE2
targets where it is still much better than scalarizing, as we'll still do a
bitmath implementation of popcount even in scalar code there.

With #1 and #2 above, I analyzed the result in IACA for sandybridge,
ivybridge, and haswell. In every case I measured, the throughput is the same
or better using the LUT lowering, even for v2i64 and v4i64, and even compared
with using the native popcnt instruction! The latency of the LUT lowering is
often higher than the latency of the scalarized popcnt instruction sequence,
but I think those latency measurements are deeply misleading. Keeping the
operation fully in the vector unit and having many chances for increased
throughput seems much more likely to win.

With this, we can lower every integer vector popcount using the LUT strategy
if we have SSSE3 or better (and thus have PSHUFB). I've updated the operation
lowering to reflect this. This also fixes an issue where we were horribly
scalarizing some AVX lowerings.

Finally, there are some remaining cleanups. There is duplication between the
two techniques in how they perform the horizontal sum once the byte
population count is computed. I'm going to factor and merge those two in a
separate follow-up commit.
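For readers unfamiliar with the technique, here is a minimal sketch of the
byte and i64 paths written with SSE intrinsics. This is illustrative only and
not part of the patch: the lowering below builds the equivalent DAG nodes
rather than calling intrinsics, and the function names here are invented for
the example.

    #include <emmintrin.h> // SSE2: _mm_sad_epu8 (PSADBW)
    #include <tmmintrin.h> // SSSE3: _mm_shuffle_epi8 (PSHUFB)

    // Per-byte population count via the in-register nibble LUT.
    static __m128i popcnt_epi8(__m128i v) {
      const __m128i LUT = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3,
                                        1, 2, 2, 3, 2, 3, 3, 4);
      const __m128i M0F = _mm_set1_epi8(0x0F);
      // There is no byte-granularity shift, so shift the i16 lanes and mask
      // to isolate the low and high nibbles of every byte.
      __m128i Lo = _mm_and_si128(v, M0F);
      __m128i Hi = _mm_and_si128(_mm_srli_epi16(v, 4), M0F);
      return _mm_add_epi8(_mm_shuffle_epi8(LUT, Lo),  // LUT[low nibble]
                          _mm_shuffle_epi8(LUT, Hi)); // LUT[high nibble]
    }

    // v2i64 popcount: PSADBW against zero horizontally sums the low and high
    // 8 byte counts directly into the two i64 lanes.
    static __m128i popcnt_epi64(__m128i v) {
      return _mm_sad_epu8(popcnt_epi8(v), _mm_setzero_si128());
    }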
Differential Revision: http://reviews.llvm.org/D10084

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@238636 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp      |  204 ++-
 lib/Target/X86/X86ISelLowering.h        |    3 +
 lib/Target/X86/X86InstrFragmentsSIMD.td |    3 +
 lib/Target/X86/X86InstrSSE.td           |   14 +
 test/CodeGen/X86/vector-popcnt-128.ll   |  555 +++++-
 test/CodeGen/X86/vector-popcnt-256.ll   | 1973 ++---------------------
 6 files changed, 698 insertions(+), 2054 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c834be3c1a7..49be23a2204 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -842,15 +842,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 
-    // Only provide customized ctpop vector bit twiddling for vector types we
-    // know to perform better than using the popcnt instructions on each vector
-    // element. If popcnt isn't supported, always provide the custom version.
-    if (!Subtarget->hasPOPCNT()) {
-      setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
-      setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
-      setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
-      setOperationAction(ISD::CTPOP, MVT::v16i8, Custom);
-    }
+    setOperationAction(ISD::CTPOP, MVT::v16i8, Custom);
+    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
+    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
+    setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
 
     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
@@ -1115,6 +1110,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
     setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
 
+    setOperationAction(ISD::CTPOP, MVT::v32i8, Custom);
+    setOperationAction(ISD::CTPOP, MVT::v16i16, Custom);
+    setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
+    setOperationAction(ISD::CTPOP, MVT::v4i64, Custom);
+
    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
      setOperationAction(ISD::FMA, MVT::v8f32, Legal);
      setOperationAction(ISD::FMA, MVT::v4f64, Legal);
@@ -1149,16 +1149,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       // when we have a 256bit-wide blend with immediate.
       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
 
-      // Only provide customized ctpop vector bit twiddling for vector types we
-      // know to perform better than using the popcnt instructions on each
-      // vector element. If popcnt isn't supported, always provide the custom
-      // version.
-      if (!Subtarget->hasPOPCNT())
-        setOperationAction(ISD::CTPOP, MVT::v4i64, Custom);
-
-      // Custom CTPOP always performs better on natively supported v8i32
-      setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
-
       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
@@ -17329,12 +17319,164 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
   return SDValue();
 }
 
+static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
+                                        const X86Subtarget *Subtarget,
+                                        SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  MVT EltVT = VT.getVectorElementType().getSimpleVT();
+  unsigned VecSize = VT.getSizeInBits();
+
+  // Implement a lookup table in register by using an algorithm based on:
+  // http://wm.ite.pl/articles/sse-popcount.html
+  //
+  // The general idea is that every lower byte nibble in the input vector is an
+  // index into an in-register pre-computed pop count table. We then split up
+  // the input vector into two new ones: (1) a vector with only the
+  // shifted-right higher nibbles for each byte and (2) a vector with the lower
+  // nibbles (and masked out higher ones) for each byte. PSHUFB is used
+  // separately with both to index the in-register table. Next, both are added
+  // and the result is an i8 vector where each element contains the pop count
+  // for the input byte.
+  //
+  // To obtain the pop count for elements != i8, we follow up with the same
+  // approach and use additional tricks as described below.
+  //
+  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
+                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
+                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
+                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
+
+  int NumByteElts = VecSize / 8;
+  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
+  SDValue In = DAG.getNode(ISD::BITCAST, DL, ByteVecVT, Op);
+  SmallVector<SDValue, 64> LUTVec;
+  for (int i = 0; i < NumByteElts; ++i)
+    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
+  SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec);
+  SmallVector<SDValue, 64> Mask0F(NumByteElts,
+                                  DAG.getConstant(0x0F, DL, MVT::i8));
+  SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F);
+
+  // High nibbles
+  SmallVector<SDValue, 64> Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8));
+  SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four);
+  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
+
+  // Low nibbles
+  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
+
+  // The input vector is used as the shuffle mask that indexes elements into
+  // the LUT. After counting low and high nibbles, add the vector to obtain the
+  // final pop count per i8 element.
+  SDValue HighPopCnt =
+      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
+  SDValue LowPopCnt =
+      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
+  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
+
+  if (EltVT == MVT::i8)
+    return PopCnt;
+
+  // The PSADBW instruction horizontally adds all bytes and leaves the result
+  // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
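+  // For example, for v2i64, PSADBW of the byte pop counts against a zero
+  // vector sums bytes 0-7 into the low i64 lane and bytes 8-15 into the high
+  // i64 lane; the two lanes are then exactly the two i64 population counts.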
+  if (EltVT == MVT::i64) {
+    SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+    PopCnt = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, PopCnt, Zeros);
+    return DAG.getNode(ISD::BITCAST, DL, VT, PopCnt);
+  }
+
+  int NumI64Elts = VecSize / 64;
+  MVT VecI64VT = MVT::getVectorVT(MVT::i64, NumI64Elts);
+
+  if (EltVT == MVT::i32) {
+    // We unpack the low half and high half into i32s interleaved with zeros so
+    // that we can use PSADBW to horizontally sum them. The most useful part of
+    // this is that it lines up the results of two PSADBW instructions to be
+    // two v2i64 vectors which concatenated are the 4 population counts. We can
+    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
+    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
+    SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, PopCnt, Zeros);
+    SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, PopCnt, Zeros);
+
+    // Do the horizontal sums into two v2i64s.
+    Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
+    Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
+                      DAG.getNode(ISD::BITCAST, DL, ByteVecVT, Low), Zeros);
+    High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
+                       DAG.getNode(ISD::BITCAST, DL, ByteVecVT, High), Zeros);
+
+    // Merge them together.
+    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
+    PopCnt = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
+                         DAG.getNode(ISD::BITCAST, DL, ShortVecVT, Low),
+                         DAG.getNode(ISD::BITCAST, DL, ShortVecVT, High));
+
+    return DAG.getNode(ISD::BITCAST, DL, VT, PopCnt);
+  }
+
+  // To obtain the pop count for each i16 element, shuffle the byte pop count
+  // to get even and odd elements into distinct vectors, add them and
+  // zero-extend each i8 element into i16, i.e.:
+  //
+  // B -> pop count per i8
+  // W -> pop count per i16
+  //
+  // Y = shuffle B, undef <0, 2, ...>
+  // Z = shuffle B, undef <1, 3, ...>
+  // W = zext <... x i8> to <... x i16> (Y + Z)
+  //
+  // Use a byte shuffle mask that matches PSHUFB.
+  //
+  assert(EltVT == MVT::i16 && "Unknown how to handle type");
+  SDValue Undef = DAG.getUNDEF(ByteVecVT);
+  SmallVector<int, 64> MaskA, MaskB;
+
+  // We can't use PSHUFB across lanes, so do the shuffle and sum inside each
+  // 128-bit lane, and then collapse the result.
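+  // For a single 128-bit lane this builds:
+  //   MaskA = <0, 2, 4, 6, 8, 10, 12, 14, -1, ..., -1>
+  //   MaskB = <1, 3, 5, 7, 9, 11, 13, 15, -1, ..., -1>
+  // so adding the two shuffles leaves the eight i16 pop counts as bytes in
+  // the low half of each lane.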
+  int NumLanes = NumByteElts / 16;
+  assert(NumByteElts % 16 == 0 && "Must have 16-byte multiple vectors!");
+  for (int i = 0; i < NumLanes; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      MaskA.push_back(i * 16 + j * 2);
+      MaskB.push_back(i * 16 + (j * 2) + 1);
+    }
+    MaskA.append((size_t)8, -1);
+    MaskB.append((size_t)8, -1);
+  }
+
+  SDValue ShuffA = DAG.getVectorShuffle(ByteVecVT, DL, PopCnt, Undef, MaskA);
+  SDValue ShuffB = DAG.getVectorShuffle(ByteVecVT, DL, PopCnt, Undef, MaskB);
+  PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, ShuffA, ShuffB);
+
+  SmallVector<int, 4> Mask;
+  for (int i = 0; i < NumLanes; ++i)
+    Mask.push_back(2 * i);
+  Mask.append((size_t)NumLanes, -1);
+
+  PopCnt = DAG.getNode(ISD::BITCAST, DL, VecI64VT, PopCnt);
+  PopCnt =
+      DAG.getVectorShuffle(VecI64VT, DL, PopCnt, DAG.getUNDEF(VecI64VT), Mask);
+  PopCnt = DAG.getNode(ISD::BITCAST, DL, ByteVecVT, PopCnt);
+
+  // Zero extend i8s into i16 elts
+  SmallVector<int, 64> ZExtInRegMask;
+  for (int i = 0; i < NumByteElts / 2; ++i) {
+    ZExtInRegMask.push_back(i);
+    ZExtInRegMask.push_back(NumByteElts);
+  }
+
+  return DAG.getNode(
+      ISD::BITCAST, DL, VT,
+      DAG.getVectorShuffle(ByteVecVT, DL, PopCnt,
+                           getZeroVector(ByteVecVT, Subtarget, DAG, DL),
+                           ZExtInRegMask));
+}
+
 static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
                                        const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
-         "CTPOP lowering only implemented for 128/256-bit wide vector types");
+  assert(VT.is128BitVector() &&
+         "Only 128-bit vector bitmath lowering supported.");
 
   int VecSize = VT.getSizeInBits();
   int NumElts = VT.getVectorNumElements();
@@ -17344,9 +17486,9 @@ static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
   // This is the vectorized version of the "best" algorithm from
   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
   // with a minor tweak to use a series of adds + shifts instead of vector
-  // multiplications. Implemented for all integer vector types.
-  //
-  // FIXME: Use strategies from http://wm.ite.pl/articles/sse-popcount.html
+  // multiplications. Implemented for all integer vector types. We only use
+  // this when we don't have SSSE3, which allows a LUT-based lowering that is
+  // much faster, even faster than using native popcnt instructions.
 
   SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), DL,
                                   EltVT);
@@ -17424,7 +17566,6 @@ static SDValue LowerVectorCTPOPBitmath(SDValue Op, SDLoc DL,
 
   return V;
 }
-
 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget,
                                 SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
@@ -17434,6 +17575,12 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget,
   SDLoc DL(Op.getNode());
   SDValue Op0 = Op.getOperand(0);
 
+  if (!Subtarget->hasSSSE3()) {
+    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
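+    // For reference, per 32-bit element the bitmath lowering computes:
+    //   v = v - ((v >> 1) & 0x55555555);
+    //   v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
+    //   v = (v + (v >> 4)) & 0x0F0F0F0F;
+    // followed by a byte-wise horizontal sum done with shifts and adds in
+    // place of the multiply from the bithacks reference.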
+ assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!"); + return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG); + } + if (VT.is256BitVector() && !Subtarget->hasInt256()) { unsigned NumElems = VT.getVectorNumElements(); @@ -17442,11 +17589,11 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget, SDValue RHS = Extract128BitVector(Op0, NumElems/2, DAG, DL); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, - LowerVectorCTPOPBitmath(LHS, DL, Subtarget, DAG), - LowerVectorCTPOPBitmath(RHS, DL, Subtarget, DAG)); + LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG), + LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG)); } - return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG); + return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG); } static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, @@ -18149,6 +18296,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; + case X86ISD::PSADBW: return "X86ISD::PSADBW"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index b589ca42e56..8f4ceb1bd06 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -184,6 +184,9 @@ namespace llvm { /// Shuffle 16 8-bit values within a vector. PSHUFB, + /// Compute Sum of Absolute Differences. + PSADBW, + /// Bitwise Logical AND NOT of Packed FP values. ANDNP, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 79d213c6e1a..05dbf477bb3 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -78,6 +78,9 @@ def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; def X86pshufb : SDNode<"X86ISD::PSHUFB", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; +def X86psadbw : SDNode<"X86ISD::PSADBW", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; def X86andnp : SDNode<"X86ISD::ANDNP", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index d3b401e8cfc..bce99783dab 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -4053,6 +4053,20 @@ defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w, defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw, int_x86_avx2_psad_bw, SSE_PMADD, 1>; +let Predicates = [HasAVX2] in + def : Pat<(v32i8 (X86psadbw (v32i8 VR256:$src1), + (v32i8 VR256:$src2))), + (VPSADBWYrr VR256:$src2, VR256:$src1)>; + +let Predicates = [HasAVX] in + def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1), + (v16i8 VR128:$src2))), + (VPSADBWrr VR128:$src2, VR128:$src1)>; + +def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1), + (v16i8 VR128:$src2))), + (PSADBWrr VR128:$src2, VR128:$src1)>; + let Predicates = [HasAVX] in defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, diff --git a/test/CodeGen/X86/vector-popcnt-128.ll b/test/CodeGen/X86/vector-popcnt-128.ll index 14dd4b0bbdb..dc99fec3d47 100644 --- a/test/CodeGen/X86/vector-popcnt-128.ll +++ b/test/CodeGen/X86/vector-popcnt-128.ll @@ -8,215 
+8,418 @@ target triple = "x86_64-unknown-unknown" define <2 x i64> @testv2i64(<2 x i64> %in) { -; SSE-LABEL: testv2i64: -; SSE: # BB#0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrlq $1, %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: psubq %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: psrlq $2, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrlq $4, %xmm1 -; SSE-NEXT: paddq %xmm0, %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: paddb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psllq $16, %xmm1 -; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psllq $8, %xmm0 -; SSE-NEXT: paddb %xmm1, %xmm0 -; SSE-NEXT: psrlq $56, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: testv2i64: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlq $1, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlq $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlq $4, %xmm1 +; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psllq $32, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psllq $16, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psllq $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: psrlq $56, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv2i64: +; SSE3: # BB#0: +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlq $1, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: psubq %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrlq $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: paddq %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlq $4, %xmm1 +; SSE3-NEXT: paddq %xmm0, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psllq $32, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psllq $16, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psllq $8, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: psrlq $56, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv2i64: +; SSSE3: # BB#0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pshufb %xmm2, %xmm4 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm3 +; SSSE3-NEXT: paddb %xmm4, %xmm3 +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: psadbw %xmm3, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv2i64: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: 
movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pshufb %xmm2, %xmm4 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm3 +; SSE41-NEXT: paddb %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: psadbw %xmm3, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: testv2i64: ; AVX: # BB#0: -; AVX-NEXT: vpsrlq $1, %xmm0, %xmm1 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpsrlq $2, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpsrlq $4, %xmm0, %xmm1 -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpsllq $32, %xmm0, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsllq $16, %xmm0, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsllq $8, %xmm0, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlq $56, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in) ret <2 x i64> %out } define <4 x i32> @testv4i32(<4 x i32> %in) { -; SSE-LABEL: testv4i32: -; SSE: # BB#0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $1, %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: psubd %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: psrld $2, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: paddd %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $4, %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psllq $16, %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: psllq $8, %xmm0 -; SSE-NEXT: paddb %xmm2, %xmm0 -; SSE-NEXT: psrld $24, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: testv4i32: -; AVX1: # BB#0: -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsrld $4, %xmm0, %xmm1 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsllq $16, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsllq $8, %xmm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: testv4i32: -; AVX2: # BB#0: -; AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2 -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpsrld $2, %xmm0, %xmm0 -; AVX2-NEXT: 
vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpsrld $4, %xmm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsllq $16, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsllq $8, %xmm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $24, %xmm0, %xmm0 -; AVX2-NEXT: retq +; SSE2-LABEL: testv4i32: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrld $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $4, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: psllq $16, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psllq $8, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: psrld $24, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv4i32: +; SSE3: # BB#0: +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrld $1, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: psubd %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrld $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: paddd %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrld $4, %xmm1 +; SSE3-NEXT: paddd %xmm0, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE3-NEXT: psllq $16, %xmm2 +; SSE3-NEXT: paddb %xmm1, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 +; SSE3-NEXT: psllq $8, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 +; SSE3-NEXT: psrld $24, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv4i32: +; SSSE3: # BB#0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pshufb %xmm3, %xmm4 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm1 +; SSSE3-NEXT: paddb %xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: psadbw %xmm0, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: psadbw %xmm0, %xmm1 +; SSSE3-NEXT: packuswb %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv4i32: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pshufb %xmm3, %xmm4 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm1 +; SSE41-NEXT: paddb %xmm4, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE41-NEXT: psadbw 
%xmm0, %xmm2 +; SSE41-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE41-NEXT: psadbw %xmm0, %xmm1 +; SSE41-NEXT: packuswb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testv4i32: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpsadbw %xmm2, %xmm1, %xmm2 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in) ret <4 x i32> %out } define <8 x i16> @testv8i16(<8 x i16> %in) { -; SSE-LABEL: testv8i16: -; SSE: # BB#0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrlw $1, %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: psubw %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: psrlw $2, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: paddw %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrlw $4, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psllq $8, %xmm0 -; SSE-NEXT: paddb %xmm1, %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: testv8i16: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psubw %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: paddw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psllq $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv8i16: +; SSE3: # BB#0: +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: psubw %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: paddw %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddw %xmm0, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psllq $8, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: psrlw $8, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv8i16: +; SSSE3: # BB#0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pshufb %xmm3, %xmm4 
+; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm1 +; SSSE3-NEXT: paddb %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: paddb %xmm0, %xmm1 +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv8i16: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pshufb %xmm2, %xmm4 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm3 +; SSE41-NEXT: paddb %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; SSE41-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSE41-NEXT: paddb %xmm0, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: retq ; ; AVX-LABEL: testv8i16: ; AVX: # BB#0: -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm1 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1 -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpsllq $8, %xmm0, %xmm1 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX-NEXT: retq %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in) ret <8 x i16> %out } define <16 x i8> @testv16i8(<16 x i8> %in) { -; SSE-LABEL: testv16i8: -; SSE: # BB#0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrlw $1, %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: psubb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: psrlw $2, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: paddb %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrlw $4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE-NEXT: pand %xmm2, %xmm1 -; 
SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: testv16i8: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv16i8: +; SSE3: # BB#0: +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm2, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 +; SSE3-NEXT: pand %xmm2, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv16i8: +; SSSE3: # BB#0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pshufb %xmm3, %xmm4 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm1 +; SSSE3-NEXT: paddb %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv16i8: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pshufb %xmm3, %xmm4 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm1 +; SSE41-NEXT: paddb %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: testv16i8: ; AVX: # BB#0: -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm1 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpsrlw $4, 
%xmm0, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in) ret <16 x i8> %out diff --git a/test/CodeGen/X86/vector-popcnt-256.ll b/test/CodeGen/X86/vector-popcnt-256.ll index a8cc1e08930..391f14ba807 100644 --- a/test/CodeGen/X86/vector-popcnt-256.ll +++ b/test/CodeGen/X86/vector-popcnt-256.ll @@ -7,105 +7,38 @@ define <4 x i64> @testv4i64(<4 x i64> %in) { ; AVX1-LABEL: testv4i64: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rdx -; AVX1-NEXT: movq %rdx, %rax -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555 -; AVX1-NEXT: andq %r8, %rax -; AVX1-NEXT: subq %rax, %rdx -; AVX1-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 -; AVX1-NEXT: movq %rdx, %rsi -; AVX1-NEXT: andq %rax, %rsi -; AVX1-NEXT: shrq $2, %rdx -; AVX1-NEXT: andq %rax, %rdx -; AVX1-NEXT: addq %rsi, %rdx -; AVX1-NEXT: movq %rdx, %rdi -; AVX1-NEXT: shrq $4, %rdi -; AVX1-NEXT: addq %rdx, %rdi -; AVX1-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F -; AVX1-NEXT: andq %rdx, %rdi -; AVX1-NEXT: movabsq $72340172838076673, %rsi # imm = 0x101010101010101 -; AVX1-NEXT: imulq %rsi, %rdi -; AVX1-NEXT: shrq $56, %rdi -; AVX1-NEXT: vmovq %rdi, %xmm2 -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: movq %rcx, %rdi -; AVX1-NEXT: shrq %rdi -; AVX1-NEXT: andq %r8, %rdi -; AVX1-NEXT: subq %rdi, %rcx -; AVX1-NEXT: movq %rcx, %rdi -; AVX1-NEXT: andq %rax, %rdi -; AVX1-NEXT: shrq $2, %rcx -; AVX1-NEXT: andq %rax, %rcx -; AVX1-NEXT: addq %rdi, %rcx -; AVX1-NEXT: movq %rcx, %rdi -; AVX1-NEXT: shrq $4, %rdi -; AVX1-NEXT: addq %rcx, %rdi -; AVX1-NEXT: andq %rdx, %rdi -; AVX1-NEXT: imulq %rsi, %rdi -; AVX1-NEXT: shrq $56, %rdi -; AVX1-NEXT: vmovq %rdi, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vpextrq $1, %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rdi -; AVX1-NEXT: shrq %rdi -; AVX1-NEXT: andq %r8, %rdi -; AVX1-NEXT: subq %rdi, %rcx -; AVX1-NEXT: movq %rcx, %rdi -; AVX1-NEXT: andq %rax, %rdi -; AVX1-NEXT: shrq $2, %rcx -; AVX1-NEXT: andq %rax, %rcx -; AVX1-NEXT: addq %rdi, %rcx -; AVX1-NEXT: movq %rcx, %rdi -; AVX1-NEXT: shrq $4, %rdi -; AVX1-NEXT: addq %rcx, %rdi -; AVX1-NEXT: andq %rdx, %rdi -; AVX1-NEXT: imulq %rsi, %rdi -; AVX1-NEXT: shrq $56, %rdi -; AVX1-NEXT: vmovq %rdi, %xmm2 -; AVX1-NEXT: vmovq %xmm0, %rcx -; AVX1-NEXT: movq %rcx, %rdi -; AVX1-NEXT: shrq %rdi -; AVX1-NEXT: andq %r8, %rdi -; AVX1-NEXT: subq %rdi, %rcx -; AVX1-NEXT: movq %rcx, %rdi -; AVX1-NEXT: andq %rax, %rdi -; AVX1-NEXT: shrq $2, %rcx -; AVX1-NEXT: andq %rax, %rcx -; AVX1-NEXT: addq %rdi, %rcx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: shrq $4, %rax -; AVX1-NEXT: addq %rcx, %rax -; AVX1-NEXT: andq %rdx, %rax -; AVX1-NEXT: imulq %rsi, %rax -; AVX1-NEXT: shrq $56, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, 
%xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv4i64: ; AVX2: # BB#0: -; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlq $2, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpsrlq $4, %ymm0, %ymm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $32, %ymm0, %ymm1 -; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $16, %ymm0, %ymm1 -; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $8, %ymm0, %ymm1 -; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlq $56, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %in) ret <4 x i64> %out @@ -115,165 +48,50 @@ define <8 x i32> @testv8i32(<8 x i32> %in) { ; AVX1-LABEL: testv8i32: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrd $1, %xmm1, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $858993459, %eax # imm = 0x33333333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; AVX1-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; AVX1-NEXT: shrl $24, %eax -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: andl $1431655765, %edx # imm = 0x55555555 -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: andl $858993459, %edx # imm = 0x33333333 -; AVX1-NEXT: shrl $2, %ecx -; AVX1-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $4, %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F -; AVX1-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101 -; AVX1-NEXT: shrl $24, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $2, %xmm1, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, 
%ecx -; AVX1-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $858993459, %eax # imm = 0x33333333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; AVX1-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; AVX1-NEXT: shrl $24, %eax -; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $3, %xmm1, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $858993459, %eax # imm = 0x33333333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; AVX1-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; AVX1-NEXT: shrl $24, %eax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $858993459, %eax # imm = 0x33333333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; AVX1-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; AVX1-NEXT: shrl $24, %eax -; AVX1-NEXT: vmovd %xmm0, %ecx -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: andl $1431655765, %edx # imm = 0x55555555 -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: andl $858993459, %edx # imm = 0x33333333 -; AVX1-NEXT: shrl $2, %ecx -; AVX1-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl $4, %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F -; AVX1-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101 -; AVX1-NEXT: shrl $24, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $858993459, %eax # imm = 0x33333333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; AVX1-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; AVX1-NEXT: shrl $24, %eax -; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $858993459, %eax # imm = 0x33333333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $252645135, %ecx # imm = 
0xF0F0F0F -; AVX1-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; AVX1-NEXT: shrl $24, %eax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX1-NEXT: vpsadbw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm1 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpsrld $2, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpsrld $4, %ymm0, %ymm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $16, %ymm0, %ymm1 -; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsllq $8, %ymm0, %ymm1 -; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in) ret <8 x i32> %out @@ -282,590 +100,49 @@ define <8 x i32> @testv8i32(<8 x i32> %in) { define <16 x i16> @testv16i16(<16 x i16> %in) { ; AVX1-LABEL: testv16i16: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrw $1, %xmm1, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $13107, %ecx # imm = 0x3333 -; 
AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX1-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX1-NEXT: movzbl %ah, %eax # NOREX -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: andl $21845, %edx # imm = 0x5555 -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: andl $13107, %edx # imm = 0x3333 -; AVX1-NEXT: shrl $2, %ecx -; AVX1-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: andl $65520, %edx # imm = 0xFFF0 -; AVX1-NEXT: shrl $4, %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: andl $3855, %edx # imm = 0xF0F -; AVX1-NEXT: imull $257, %edx, %ecx # imm = 0x101 -; AVX1-NEXT: movzbl %ch, %ecx # NOREX -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $2, %xmm1, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX1-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX1-NEXT: movzbl %ah, %eax # NOREX -; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $3, %xmm1, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX1-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX1-NEXT: movzbl %ah, %eax # NOREX -; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $4, %xmm1, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX1-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX1-NEXT: movzbl %ah, %eax # NOREX -; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $5, %xmm1, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; 
AVX1-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX1-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX1-NEXT: movzbl %ah, %eax # NOREX -; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $6, %xmm1, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX1-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX1-NEXT: movzbl %ah, %eax # NOREX -; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $7, %xmm1, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX1-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX1-NEXT: movzbl %ah, %eax # NOREX -; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX1-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX1-NEXT: movzbl %ah, %eax # NOREX -; AVX1-NEXT: vmovd %xmm0, %ecx -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: shrl %edx -; AVX1-NEXT: andl $21845, %edx # imm = 0x5555 -; AVX1-NEXT: subl %edx, %ecx -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: andl $13107, %edx # imm = 0x3333 -; AVX1-NEXT: shrl $2, %ecx -; AVX1-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX1-NEXT: addl %edx, %ecx -; AVX1-NEXT: movl %ecx, %edx -; AVX1-NEXT: andl $65520, %edx # imm = 0xFFF0 -; AVX1-NEXT: shrl $4, %edx -; AVX1-NEXT: addl %ecx, %edx -; AVX1-NEXT: andl $3855, %edx # imm = 0xF0F -; AVX1-NEXT: imull $257, %edx, %ecx # imm = 0x101 -; AVX1-NEXT: movzbl %ch, %ecx # NOREX -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $2, %xmm0, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX1-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX1-NEXT: movzbl %ah, %eax # NOREX -; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $3, %xmm0, %eax -; AVX1-NEXT: movl %eax, 
%ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX1-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX1-NEXT: movzbl %ah, %eax # NOREX -; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $4, %xmm0, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX1-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX1-NEXT: movzbl %ah, %eax # NOREX -; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $5, %xmm0, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX1-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX1-NEXT: movzbl %ah, %eax # NOREX -; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $6, %xmm0, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX1-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX1-NEXT: movzbl %ah, %eax # NOREX -; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrw $7, %xmm0, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl %ecx -; AVX1-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX1-NEXT: shrl $4, %ecx -; AVX1-NEXT: addl %eax, %ecx -; AVX1-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX1-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX1-NEXT: movzbl %ah, %eax # NOREX -; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, 
%xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv16i16: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrw $1, %xmm1, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl %ecx -; AVX2-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NEXT: subl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NEXT: shrl $2, %eax -; AVX2-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NEXT: shrl $4, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NEXT: vmovd %xmm1, %ecx -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: shrl %edx -; AVX2-NEXT: andl $21845, %edx # imm = 0x5555 -; AVX2-NEXT: subl %edx, %ecx -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: andl $13107, %edx # imm = 0x3333 -; AVX2-NEXT: shrl $2, %ecx -; AVX2-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: andl $65520, %edx # imm = 0xFFF0 -; AVX2-NEXT: shrl $4, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: andl $3855, %edx # imm = 0xF0F -; AVX2-NEXT: imull $257, %edx, %ecx # imm = 0x101 -; AVX2-NEXT: movzbl %ch, %ecx # NOREX -; AVX2-NEXT: vmovd %ecx, %xmm2 -; AVX2-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $2, %xmm1, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl %ecx -; AVX2-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NEXT: subl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NEXT: shrl $2, %eax -; AVX2-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NEXT: shrl $4, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $3, %xmm1, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl %ecx -; AVX2-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NEXT: subl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NEXT: shrl $2, %eax -; 
AVX2-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NEXT: shrl $4, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $4, %xmm1, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl %ecx -; AVX2-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NEXT: subl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NEXT: shrl $2, %eax -; AVX2-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NEXT: shrl $4, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $5, %xmm1, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl %ecx -; AVX2-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NEXT: subl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NEXT: shrl $2, %eax -; AVX2-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NEXT: shrl $4, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $6, %xmm1, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl %ecx -; AVX2-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NEXT: subl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NEXT: shrl $2, %eax -; AVX2-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NEXT: shrl $4, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $7, %xmm1, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl %ecx -; AVX2-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NEXT: subl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NEXT: shrl $2, %eax -; AVX2-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NEXT: shrl $4, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl %ecx -; AVX2-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NEXT: subl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NEXT: shrl $2, %eax -; AVX2-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NEXT: shrl $4, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: 
andl $3855, %ecx # imm = 0xF0F -; AVX2-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NEXT: vmovd %xmm0, %ecx -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: shrl %edx -; AVX2-NEXT: andl $21845, %edx # imm = 0x5555 -; AVX2-NEXT: subl %edx, %ecx -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: andl $13107, %edx # imm = 0x3333 -; AVX2-NEXT: shrl $2, %ecx -; AVX2-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NEXT: addl %edx, %ecx -; AVX2-NEXT: movl %ecx, %edx -; AVX2-NEXT: andl $65520, %edx # imm = 0xFFF0 -; AVX2-NEXT: shrl $4, %edx -; AVX2-NEXT: addl %ecx, %edx -; AVX2-NEXT: andl $3855, %edx # imm = 0xF0F -; AVX2-NEXT: imull $257, %edx, %ecx # imm = 0x101 -; AVX2-NEXT: movzbl %ch, %ecx # NOREX -; AVX2-NEXT: vmovd %ecx, %xmm2 -; AVX2-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl %ecx -; AVX2-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NEXT: subl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NEXT: shrl $2, %eax -; AVX2-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NEXT: shrl $4, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl %ecx -; AVX2-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NEXT: subl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NEXT: shrl $2, %eax -; AVX2-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NEXT: shrl $4, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl %ecx -; AVX2-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NEXT: subl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NEXT: shrl $2, %eax -; AVX2-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NEXT: shrl $4, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl %ecx -; AVX2-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NEXT: subl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NEXT: shrl $2, %eax -; AVX2-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NEXT: shrl $4, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NEXT: movl %eax, %ecx -; 
AVX2-NEXT: shrl %ecx -; AVX2-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NEXT: subl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NEXT: shrl $2, %eax -; AVX2-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NEXT: shrl $4, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl %ecx -; AVX2-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NEXT: subl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NEXT: shrl $2, %eax -; AVX2-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NEXT: shrl $4, %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: retq %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in) ret <16 x i16> %out @@ -875,1037 +152,33 @@ define <32 x i8> @testv32i8(<32 x i8> %in) { ; AVX1-LABEL: testv32i8: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrb $1, %xmm1, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpextrb $0, %xmm1, %ecx -; AVX1-NEXT: movb %cl, %dl -; AVX1-NEXT: shrb %dl -; AVX1-NEXT: andb $85, %dl -; AVX1-NEXT: subb %dl, %cl -; AVX1-NEXT: movb %cl, %dl -; AVX1-NEXT: andb $51, %dl -; AVX1-NEXT: shrb $2, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: addb %dl, %cl -; AVX1-NEXT: movb %cl, %dl -; AVX1-NEXT: shrb $4, %dl -; AVX1-NEXT: addb %cl, %dl -; AVX1-NEXT: andb $15, %dl -; AVX1-NEXT: movzbl %dl, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, 
%xmm2 -; AVX1-NEXT: vpextrb $2, %xmm1, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $3, %xmm1, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm1, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $5, %xmm1, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm1, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $7, %xmm1, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $9, %xmm1, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, 
%al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $10, %xmm1, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $11, %xmm1, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $13, %xmm1, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $14, %xmm1, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 -; AVX1-NEXT: vpextrb $1, %xmm0, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpextrb $0, %xmm0, %ecx -; AVX1-NEXT: movb %cl, %dl -; 
AVX1-NEXT: shrb %dl -; AVX1-NEXT: andb $85, %dl -; AVX1-NEXT: subb %dl, %cl -; AVX1-NEXT: movb %cl, %dl -; AVX1-NEXT: andb $51, %dl -; AVX1-NEXT: shrb $2, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: addb %dl, %cl -; AVX1-NEXT: movb %cl, %dl -; AVX1-NEXT: shrb $4, %dl -; AVX1-NEXT: addb %cl, %dl -; AVX1-NEXT: andb $15, %dl -; AVX1-NEXT: movzbl %dl, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $2, %xmm0, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $3, %xmm0, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $4, %xmm0, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $5, %xmm0, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $6, %xmm0, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $7, %xmm0, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $8, %xmm0, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; 
AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $9, %xmm0, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $10, %xmm0, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $11, %xmm0, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $12, %xmm0, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $13, %xmm0, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $14, %xmm0, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX1-NEXT: vpextrb $15, %xmm0, %eax -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb %cl -; AVX1-NEXT: andb $85, %cl -; AVX1-NEXT: subb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: andb $51, %cl -; AVX1-NEXT: shrb $2, %al -; AVX1-NEXT: andb $51, %al -; AVX1-NEXT: addb %cl, %al -; AVX1-NEXT: movb %al, %cl -; AVX1-NEXT: shrb $4, %cl -; AVX1-NEXT: addb %al, %cl -; AVX1-NEXT: andb $15, %cl -; AVX1-NEXT: movzbl %cl, %eax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv32i8: ; AVX2: # BB#0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpextrb $0, %xmm1, %ecx -; AVX2-NEXT: movb %cl, %dl -; AVX2-NEXT: shrb %dl -; AVX2-NEXT: andb $85, %dl -; AVX2-NEXT: subb %dl, %cl -; AVX2-NEXT: movb %cl, %dl -; AVX2-NEXT: andb $51, %dl -; AVX2-NEXT: shrb $2, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: addb %dl, %cl -; AVX2-NEXT: movb %cl, %dl -; AVX2-NEXT: shrb $4, %dl -; AVX2-NEXT: addb %cl, %dl -; AVX2-NEXT: andb $15, %dl -; AVX2-NEXT: movzbl %dl, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm2 -; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $2, %xmm1, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $5, %xmm1, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb 
$5, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm1, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $10, %xmm1, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $13, %xmm1, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al 
-; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 -; AVX2-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpextrb $0, %xmm0, %ecx -; AVX2-NEXT: movb %cl, %dl -; AVX2-NEXT: shrb %dl -; AVX2-NEXT: andb $85, %dl -; AVX2-NEXT: subb %dl, %cl -; AVX2-NEXT: movb %cl, %dl -; AVX2-NEXT: andb $51, %dl -; AVX2-NEXT: shrb $2, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: addb %dl, %cl -; AVX2-NEXT: movb %cl, %dl -; AVX2-NEXT: shrb $4, %dl -; AVX2-NEXT: addb %cl, %dl -; AVX2-NEXT: andb $15, %dl -; AVX2-NEXT: movzbl %dl, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm2 -; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb 
$5, %xmm0, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $12, %xmm0, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb 
%cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb %cl -; AVX2-NEXT: andb $85, %cl -; AVX2-NEXT: subb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: andb $51, %cl -; AVX2-NEXT: shrb $2, %al -; AVX2-NEXT: andb $51, %al -; AVX2-NEXT: addb %cl, %al -; AVX2-NEXT: movb %al, %cl -; AVX2-NEXT: shrb $4, %cl -; AVX2-NEXT: addb %al, %cl -; AVX2-NEXT: andb $15, %cl -; AVX2-NEXT: movzbl %cl, %eax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in) ret <32 x i8> %out -- 2.34.1
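
For reference, the baseline sequences deleted above are the classic parallel bit-math popcount, visible in the removed CHECK lines as the 0x5555/0x3333/0xF0F masks followed by the imull $257 + movzbl %ah horizontal sum, repeated once per vector element. A minimal C sketch of one 16-bit lane; popcnt16_bitmath is an illustrative name, not anything in the tree:

#include <stdint.h>
#include <stdio.h>

/* Parallel bit-math popcount for one 16-bit lane, mirroring the removed
 * per-element scalar sequence: the shrl/andl $0x5555 step, the paired
 * andl $0x3333 masks, the $0xF0F fold, and the imull $0x101 multiply
 * whose second byte (movzbl %ah) holds the final count. */
static unsigned popcnt16_bitmath(uint16_t x) {
  unsigned v = x;
  v = v - ((v >> 1) & 0x5555);            /* 2-bit partial sums */
  v = (v & 0x3333) + ((v >> 2) & 0x3333); /* 4-bit partial sums */
  v = (v + (v >> 4)) & 0x0F0F;            /* 8-bit partial sums */
  return ((v * 0x0101) >> 8) & 0xFF;      /* add the two byte sums */
}

int main(void) {
  printf("%u\n", popcnt16_bitmath(0xFFFF)); /* prints 16 */
  return 0;
}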
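
The replacement sequences all share one SSSE3 building block: mask out each nibble, look both nibbles up in an in-register table of nibble popcounts with PSHUFB, and add the two lookups, giving a count per byte. A minimal intrinsics sketch of one 128-bit half, assuming SSSE3 is available; popcnt_epi8 is an illustrative name, not an LLVM API:

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h> /* SSSE3: _mm_shuffle_epi8 (PSHUFB) */

/* Per-byte popcount via the in-register nibble LUT, the same
 * vpand / vpsrlw $4 / vpshufb / vpaddb shape as the new testv32i8 code. */
static __m128i popcnt_epi8(__m128i v) {
  const __m128i mask = _mm_set1_epi8(0x0F);
  const __m128i lut = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3,
                                    1, 2, 2, 3, 2, 3, 3, 4);
  __m128i lo = _mm_and_si128(v, mask);                    /* low nibbles  */
  __m128i hi = _mm_and_si128(_mm_srli_epi16(v, 4), mask); /* high nibbles */
  return _mm_add_epi8(_mm_shuffle_epi8(lut, lo),          /* LUT lookups  */
                      _mm_shuffle_epi8(lut, hi));
}

int main(void) {
  uint8_t out[16];
  _mm_storeu_si128((__m128i *)out, popcnt_epi8(_mm_set1_epi8((char)0xAA)));
  printf("%u\n", (unsigned)out[0]); /* prints 4: 0xAA has four set bits */
  return 0;
}

In the updated tests, the AVX1 lowering runs this twice, once per 128-bit half around vextractf128/vinsertf128, while AVX2 performs it once on the full ymm register with the table splatted into both 128-bit lanes.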
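
The widening steps are where the tests differ. testv16i16 gathers the odd and even count bytes with two more PSHUFBs, adds them, and zero-extends the eight byte sums; testv8i32 interleaves the counts with zero so PSADBW can sum the four bytes of each i32 lane, then repacks with PACKUSWB. A sketch of both horizontal sums under the same assumptions as above, taking the per-byte counts as input; the helper names are illustrative:

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h> /* SSSE3 shuffles; SSE2 sad/unpack/pack come along */

/* i16 lanes: add each count byte to its neighbour, then widen, mirroring
 * the two vpshufb + vpaddb steps in testv16i16. The tests zero-extend
 * with vpmovzxbw; unpacking against zero is the SSE2 equivalent. */
static __m128i widen_to_epi16(__m128i byte_cnt) {
  const __m128i odd = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15,
                                    -1, -1, -1, -1, -1, -1, -1, -1);
  const __m128i even = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14,
                                     -1, -1, -1, -1, -1, -1, -1, -1);
  __m128i sums = _mm_add_epi8(_mm_shuffle_epi8(byte_cnt, odd),
                              _mm_shuffle_epi8(byte_cnt, even));
  return _mm_unpacklo_epi8(sums, _mm_setzero_si128()); /* zero-extend */
}

/* i32 lanes: interleave with zero so each 64-bit half holds one lane's
 * four count bytes, let PSADBW sum them against zero, and pack back,
 * mirroring vpunpck{l,h}dq + vpsadbw + vpackuswb in testv8i32. */
static __m128i widen_to_epi32(__m128i byte_cnt) {
  __m128i zero = _mm_setzero_si128();
  __m128i lo = _mm_sad_epu8(_mm_unpacklo_epi32(byte_cnt, zero), zero);
  __m128i hi = _mm_sad_epu8(_mm_unpackhi_epi32(byte_cnt, zero), zero);
  return _mm_packus_epi16(lo, hi);
}

int main(void) {
  /* Per-byte counts of an all-ones input: 8 in every byte. */
  __m128i byte_cnt = _mm_set1_epi8(8);
  uint16_t w[8];
  uint32_t d[4];
  _mm_storeu_si128((__m128i *)w, widen_to_epi16(byte_cnt));
  _mm_storeu_si128((__m128i *)d, widen_to_epi32(byte_cnt));
  printf("%u %u\n", (unsigned)w[0], (unsigned)d[0]); /* prints 16 32 */
  return 0;
}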