+ // Lower v16i8/v32i8 mul as promotion to v8i16/v16i16 vector
+ // pairs, multiply and truncate.
+ if (VT == MVT::v16i8 || VT == MVT::v32i8) {
+ if (Subtarget->hasInt256()) {
+ if (VT == MVT::v32i8) {
+ MVT SubVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() / 2);
+ SDValue Lo = DAG.getIntPtrConstant(0, dl);
+ SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
+ SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Lo);
+ SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Lo);
+ SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, A, Hi);
+ SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, B, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(ISD::MUL, dl, SubVT, ALo, BLo),
+ DAG.getNode(ISD::MUL, dl, SubVT, AHi, BHi));
+ }
+
+ MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
+ return DAG.getNode(
+ ISD::TRUNCATE, dl, VT,
+ DAG.getNode(ISD::MUL, dl, ExVT,
+ DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
+ DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
+ }
+
+ assert(VT == MVT::v16i8 &&
+ "Pre-AVX2 support only supports v16i8 multiplication");
+ MVT ExVT = MVT::v8i16;
+
+ // Extract the lo parts and sign extend to i16
+ SDValue ALo, BLo;
+ if (Subtarget->hasSSE41()) {
+ ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
+ BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
+ } else {
+ const int ShufMask[] = {0, -1, 1, -1, 2, -1, 3, -1,
+ 4, -1, 5, -1, 6, -1, 7, -1};
+ ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ ALo = DAG.getNode(ISD::BITCAST, dl, ExVT, ALo);
+ BLo = DAG.getNode(ISD::BITCAST, dl, ExVT, BLo);
+ ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
+ BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
+ }
+
+ // Extract the hi parts and sign extend to i16
+ SDValue AHi, BHi;
+ if (Subtarget->hasSSE41()) {
+ const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
+ -1, -1, -1, -1, -1, -1, -1, -1};
+ AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
+ BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
+ } else {
+ const int ShufMask[] = {8, -1, 9, -1, 10, -1, 11, -1,
+ 12, -1, 13, -1, 14, -1, 15, -1};
+ AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
+ BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
+ AHi = DAG.getNode(ISD::BITCAST, dl, ExVT, AHi);
+ BHi = DAG.getNode(ISD::BITCAST, dl, ExVT, BHi);
+ AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
+ BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
+ }
+
+ // Multiply, mask the lower 8bits of the lo/hi results and pack
+ SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
+ SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
+ RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
+ RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
+ return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
+ }
+