- // The PSADBW instruction horizontally adds all bytes and leaves the result in
- // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
- if (EltVT == MVT::i64) {
- SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
- PopCnt = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, PopCnt, Zeros);
- return DAG.getBitcast(VT, PopCnt);
- }
-
- int NumI64Elts = VecSize / 64;
- MVT VecI64VT = MVT::getVectorVT(MVT::i64, NumI64Elts);
-
- if (EltVT == MVT::i32) {
- // We unpack the low half and high half into i32s interleaved with zeros so
- // that we can use PSADBW to horizontally sum them. The most useful part of
- // this is that it lines up the results of two PSADBW instructions to be
- // two v2i64 vectors which concatenated are the 4 population counts. We can
- // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
- SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
- SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, PopCnt, Zeros);
- SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, PopCnt, Zeros);
-
- // Do the horizontal sums into two v2i64s.
- Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
- Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
- DAG.getBitcast(ByteVecVT, Low), Zeros);
- High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT,
- DAG.getBitcast(ByteVecVT, High), Zeros);
-
- // Merge them together.
- MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
- PopCnt = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
- DAG.getBitcast(ShortVecVT, Low),
- DAG.getBitcast(ShortVecVT, High));
-
- return DAG.getBitcast(VT, PopCnt);
- }
-
- // To obtain pop count for each i16 element, shuffle the byte pop count to get
- // even and odd elements into distinct vectors, add them and zero-extend each
- // i8 element into i16, i.e.:
- //
- // B -> pop count per i8
- // W -> pop count per i16
- //
- // Y = shuffle B, undef <0, 2, ...>
- // Z = shuffle B, undef <1, 3, ...>
- // W = zext <... x i8> to <... x i16> (Y + Z)
- //
- // Use a byte shuffle mask that matches PSHUFB.
- //
- assert(EltVT == MVT::i16 && "Unknown how to handle type");
- SDValue Undef = DAG.getUNDEF(ByteVecVT);
- SmallVector<int, 32> MaskA, MaskB;
-
- // We can't use PSHUFB across lanes, so do the shuffle and sum inside each
- // 128-bit lane, and then collapse the result.
- int NumLanes = NumByteElts / 16;
- assert(NumByteElts % 16 == 0 && "Must have 16-byte multiple vectors!");
- for (int i = 0; i < NumLanes; ++i) {
- for (int j = 0; j < 8; ++j) {
- MaskA.push_back(i * 16 + j * 2);
- MaskB.push_back(i * 16 + (j * 2) + 1);
- }
- MaskA.append((size_t)8, -1);
- MaskB.append((size_t)8, -1);
- }
-
- SDValue ShuffA = DAG.getVectorShuffle(ByteVecVT, DL, PopCnt, Undef, MaskA);
- SDValue ShuffB = DAG.getVectorShuffle(ByteVecVT, DL, PopCnt, Undef, MaskB);
- PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, ShuffA, ShuffB);
-
- SmallVector<int, 4> Mask;
- for (int i = 0; i < NumLanes; ++i)
- Mask.push_back(2 * i);
- Mask.append((size_t)NumLanes, -1);
-
- PopCnt = DAG.getBitcast(VecI64VT, PopCnt);
- PopCnt =
- DAG.getVectorShuffle(VecI64VT, DL, PopCnt, DAG.getUNDEF(VecI64VT), Mask);
- PopCnt = DAG.getBitcast(ByteVecVT, PopCnt);
-
- // Zero extend i8s into i16 elts
- SmallVector<int, 16> ZExtInRegMask;
- for (int i = 0; i < NumByteElts / 2; ++i) {
- ZExtInRegMask.push_back(i);
- ZExtInRegMask.push_back(NumByteElts);
- }
-
- return DAG.getBitcast(
- VT, DAG.getVectorShuffle(ByteVecVT, DL, PopCnt,
- getZeroVector(ByteVecVT, Subtarget, DAG, DL),
- ZExtInRegMask));