- int QuarterSize = NumElems/4;
- int HalfSize = QuarterSize*2;
- for (int i = 0; i < QuarterSize; ++i)
- if (!isUndefOrInRange(Mask[i], 0, HalfSize))
- return false;
- for (int i = QuarterSize; i < QuarterSize*2; ++i)
- if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
- return false;
-
- // For VSHUFPSY, the mask of the second half must be the same as the first
- // but with the appropriate offsets. This works in the same way as
- // VPERMILPS works with masks.
- for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
- if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
- return false;
- if (NumElems == 4)
- continue;
- // VSHUFPSY handling
- int FstHalfIdx = i-HalfSize;
- if (Mask[FstHalfIdx] < 0)
- continue;
- if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
- return false;
- }
- for (int i = QuarterSize*3; i < NumElems; ++i) {
- if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
- return false;
- int FstHalfIdx = i-HalfSize;
- if (NumElems == 4)
- continue;
- // VSHUFPSY handling
- if (Mask[FstHalfIdx] < 0)
- continue;
- if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
- return false;
- }
-
- return true;
-}
-
-/// isCommutedVSHUFP() - Returns true if the shuffle mask is exactly
-/// the reverse of what x86 shuffles want. x86 shuffles requires the lower
-/// half elements to come from vector 1 (which would equal the dest.) and
-/// the upper half to come from vector 2.
-static bool isCommutedVSHUFPY(ShuffleVectorSDNode *N, bool HasAVX) {
- EVT VT = N->getValueType(0);
- int NumElems = VT.getVectorNumElements();
- SmallVector<int, 8> Mask;
- N->getMask(Mask);
-
- if (!HasAVX || VT.getSizeInBits() != 256)
- return false;
-
- if (NumElems != 4 && NumElems != 8)
- return false;
-
- // VSHUFPSY divides the resulting vector into 4 chunks.
- // The sources are also splitted into 4 chunks, and each destination
- // chunk must come from a different source chunk.
- //
- // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
- // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9
- //
- // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
- // Y3..Y0, Y3..Y0, X3..X0, X3..X0
- //
- // VSHUFPDY divides the resulting vector into 4 chunks.
- // The sources are also splitted into 4 chunks, and each destination
- // chunk must come from a different source chunk.
- //
- // SRC1 => X3 X2 X1 X0
- // SRC2 => Y3 Y2 Y1 Y0
- //
- // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
- //
- int QuarterSize = NumElems/4;
- int HalfSize = QuarterSize*2;
- for (int i = 0; i < QuarterSize; ++i)
- if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
- return false;
- for (int i = QuarterSize; i < QuarterSize*2; ++i)
- if (!isUndefOrInRange(Mask[i], 0, HalfSize))
- return false;
-
- // For VSHUFPSY, the mask of the second half must be the same as the first
- // but with the appropriate offsets. This works in the same way as
- // VPERMILPS works with masks.
- for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
- if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
- return false;
- if (NumElems == 4)
- continue;
- // VSHUFPSY handling
- int FstHalfIdx = i-HalfSize;
- if (Mask[FstHalfIdx] < 0)
- continue;
- if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
- return false;
- }
- for (int i = QuarterSize*3; i < NumElems; ++i) {
- if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
- return false;
- if (NumElems == 4)
- continue;
- // VSHUFPSY handling
- int FstHalfIdx = i-HalfSize;
- if (Mask[FstHalfIdx] < 0)
- continue;
- if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
- return false;
+ unsigned QuarterSize = NumElems/4;
+ unsigned HalfSize = QuarterSize*2;
+ for (unsigned l = 0; l != 2; ++l) {
+ unsigned LaneStart = l*HalfSize;
+ for (unsigned s = 0; s != 2; ++s) {
+ unsigned QuarterStart = s*QuarterSize;
+ unsigned Src = (Commuted) ? (1-s) : s;
+ unsigned SrcStart = Src*NumElems + LaneStart;
+ for (unsigned i = 0; i != QuarterSize; ++i) {
+ int Idx = Mask[i+QuarterStart+LaneStart];
+ if (!isUndefOrInRange(Idx, SrcStart, SrcStart+HalfSize))
+ return false;
+ // For VSHUFPSY, the mask of the second half must be the same as the
+ // first but with the appropriate offsets. This works in the same way as
+ // VPERMILPS works with masks.
+ if (NumElems == 4 || l == 0 || Mask[i+QuarterStart] < 0)
+ continue;
+ if (!isUndefOrEqual(Idx, Mask[i+QuarterStart]+LaneStart))
+ return false;
+ }
+ }