+ int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
+ bool ByteShift = ShiftEltBits > 64;
+ unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
+ : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
+ int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
+
+ // Normalize the scale for byte shifts to still produce an i64 element
+ // type.
+ Scale = ByteShift ? Scale / 2 : Scale;
+
+ // We need to round trip through the appropriate type for the shift.
+ MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
+ MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
+ "Illegal integer vector type");
+ V = DAG.getBitcast(ShiftVT, V);
+
+ V = DAG.getNode(OpCode, DL, ShiftVT, V,
+ DAG.getConstant(ShiftAmt, DL, MVT::i8));
+ return DAG.getBitcast(VT, V);
+ };
+
+ // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
+ // keep doubling the size of the integer elements up to that. We can
+ // then shift the elements of the integer vector by whole multiples of
+ // their width within the elements of the larger integer vector. Test each
+ // multiple to see if we can find a match with the moved element indices
+ // and that the shifted in elements are all zeroable.
+ for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2)
+ for (int Shift = 1; Shift != Scale; ++Shift)
+ for (bool Left : {true, false})
+ if (CheckZeros(Shift, Scale, Left))
+ for (SDValue V : {V1, V2})
+ if (SDValue Match = MatchShift(Shift, Scale, Left, V))
+ return Match;
+
+ // no match
+ return SDValue();
+}
+
+/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
+static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ assert(!Zeroable.all() && "Fully zeroable shuffle mask");
+
+ int Size = Mask.size();
+ int HalfSize = Size / 2;
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+ // Upper half must be undefined.
+ if (!isUndefInRange(Mask, HalfSize, HalfSize))
+ return SDValue();
+
+ // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
+ // Remainder of lower half result is zero and upper half is all undef.
+ auto LowerAsEXTRQ = [&]() {
+ // Determine the extraction length from the part of the
+ // lower half that isn't zeroable.
+ int Len = HalfSize;
+ for (; Len >= 0; --Len)
+ if (!Zeroable[Len - 1])
+ break;
+ assert(Len > 0 && "Zeroable shuffle mask");
+
+ // Attempt to match first Len sequential elements from the lower half.
+ SDValue Src;
+ int Idx = -1;
+ for (int i = 0; i != Len; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+ SDValue &V = (M < Size ? V1 : V2);
+ M = M % Size;
+
+ // All mask elements must be in the lower half.
+ if (M > HalfSize)
+ return SDValue();
+
+ if (Idx < 0 || (Src == V && Idx == (M - i))) {
+ Src = V;
+ Idx = M - i;
+ continue;
+ }
+ return SDValue();
+ }
+
+ if (Idx < 0)
+ return SDValue();
+
+ assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
+ int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+ int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+ };
+
+ if (SDValue ExtrQ = LowerAsEXTRQ())
+ return ExtrQ;
+
+ // INSERTQ: Extract lowest Len elements from lower half of second source and
+ // insert over first source, starting at Idx.
+ // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
+ auto LowerAsInsertQ = [&]() {
+ for (int Idx = 0; Idx != HalfSize; ++Idx) {
+ SDValue Base;
+
+ // Attempt to match first source from mask before insertion point.
+ if (isUndefInRange(Mask, 0, Idx)) {
+ /* EMPTY */
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+ Base = V1;
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+ Base = V2;
+ } else {
+ continue;
+ }