+static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, SDLoc DL,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ MVT EltVT = VT.getVectorElementType();
+ unsigned VecSize = VT.getSizeInBits();
+
+ // Implement a lookup table in register by using an algorithm based on:
+ // http://wm.ite.pl/articles/sse-popcount.html
+ //
+ // The general idea is that every lower byte nibble in the input vector is an
+ // index into a in-register pre-computed pop count table. We then split up the
+ // input vector in two new ones: (1) a vector with only the shifted-right
+ // higher nibbles for each byte and (2) a vector with the lower nibbles (and
+ // masked out higher ones) for each byte. PSHUB is used separately with both
+ // to index the in-register table. Next, both are added and the result is a
+ // i8 vector where each element contains the pop count for input byte.
+ //
+ // To obtain the pop count for elements != i8, we follow up with the same
+ // approach and use additional tricks as described below.
+ //
+ const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
+ /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
+ /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
+ /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
+
+ int NumByteElts = VecSize / 8;
+ MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
+ SDValue In = DAG.getBitcast(ByteVecVT, Op);
+ SmallVector<SDValue, 16> LUTVec;
+ for (int i = 0; i < NumByteElts; ++i)
+ LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
+ SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec);
+ SmallVector<SDValue, 16> Mask0F(NumByteElts,
+ DAG.getConstant(0x0F, DL, MVT::i8));
+ SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F);
+
+ // High nibbles
+ SmallVector<SDValue, 16> Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8));
+ SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four);
+ SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
+
+ // Low nibbles
+ SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
+
+ // The input vector is used as the shuffle mask that index elements into the
+ // LUT. After counting low and high nibbles, add the vector to obtain the
+ // final pop count per i8 element.
+ SDValue HighPopCnt =
+ DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
+ SDValue LowPopCnt =
+ DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
+ SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
+
+ if (EltVT == MVT::i8)
+ return PopCnt;
+
+ return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
+}
+