From: Chandler Carruth
Date: Sat, 27 Sep 2014 04:42:44 +0000 (+0000)
Subject: [x86] Fix terrible bugs everywhere in the new vector shuffle lowering
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=72c3b07dfd6ece452716ae317a7c4ab14769e4f8;p=oota-llvm.git

[x86] Fix terrible bugs everywhere in the new vector shuffle lowering
and in the target shuffle combining when trying to widen vector
elements.

Previously only one of these was correct, and we didn't correctly
propagate zeroing target shuffle masks (which now have a different
sentinel value from undef in non-target shuffle masks). This isn't just
a missed optimization; it caused us to drop zeroing shuffles on the
floor and miscompile code. The added test case is one example of that.

There are other fixes to the test suite as a consequence of this, as
well as the restoration of the undef elements in some of the masks that
were lost when I brought sanity to the actual *value* of the undef and
zero sentinels.

I've also cleaned up some of the PSHUFD, PSHUFLW, and PSHUFHW combining
code, but that code really needs to go. It was a nice initial attempt,
but it isn't very principled and the recursive shuffle combiner is much
more powerful.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218562 91177308-0d34-0410-b5e6-96231b3b80d8
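To make the sentinel convention concrete before the diff: below is a minimal,
standalone sketch of the widening rules the patch implements in
canWidenShuffleElements. The names and constants here are invented for
illustration (LLVM has its own named sentinels and uses SmallVector rather
than std::vector); the essential property is that a zeroed lane must never
collapse to undef when lanes are paired up, since that silently drops the
zeroing, which is exactly the miscompile described above.

#include <cassert>
#include <cstdio>
#include <vector>

// Illustrative sentinels; LLVM uses named constants with these meanings.
constexpr int SentinelUndef = -1; // lane value is unspecified
constexpr int SentinelZero = -2;  // lane must be zero (target shuffles only)

// Widen pairs of narrow mask elements into one wide element, or fail.
static bool widenShuffleMask(const std::vector<int> &Mask,
                             std::vector<int> &WidenedMask) {
  assert(Mask.size() % 2 == 0 && "Mask must pair up evenly");
  for (size_t i = 0; i < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    // Matching sentinels (undef/undef or zero/zero) widen to that sentinel.
    // Crucially, a zero/undef pair does NOT collapse to undef: that would
    // drop the zeroing, which is the miscompile the patch fixes.
    if (Lo < 0 && Lo == Hi) {
      WidenedMask.push_back(Lo);
      continue;
    }
    // An undef paired with a properly aligned element takes that element.
    if (Lo == SentinelUndef && Hi >= 0 && Hi % 2 == 1) {
      WidenedMask.push_back(Hi / 2);
      continue;
    }
    if (Hi == SentinelUndef && Lo >= 0 && Lo % 2 == 0) {
      WidenedMask.push_back(Lo / 2);
      continue;
    }
    // Two adjacent, pair-aligned elements widen to the pair index.
    if (Lo >= 0 && Lo % 2 == 0 && Lo + 1 == Hi) {
      WidenedMask.push_back(Lo / 2);
      continue;
    }
    return false; // This pair cannot be expressed with wider elements.
  }
  return true;
}

int main() {
  std::vector<int> Wide;
  // {0, 1, -2, -2} -> {0, -2}: the zeroing of the high half is preserved.
  bool OK = widenShuffleMask({0, 1, SentinelZero, SentinelZero}, Wide);
  std::printf("ok=%d wide={%d,%d}\n", OK, Wide[0], Wide[1]);
  // {0, -2, 2, 3} fails: a real lane and a forced-zero lane cannot share
  // one wide element.
  Wide.clear();
  std::printf("ok=%d\n", widenShuffleMask({0, SentinelZero, 2, 3}, Wide));
}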
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 4d090529784..17f925bd3e8 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -9863,14 +9863,48 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   }
 }
 
-/// \brief Tiny helper function to test whether a shuffle mask could be
+/// \brief Helper function to test whether a shuffle mask could be
 /// simplified by widening the elements being shuffled.
-static bool canWidenShuffleElements(ArrayRef<int> Mask) {
-  for (int i = 0, Size = Mask.size(); i < Size; i += 2)
-    if ((Mask[i] != -1 && Mask[i] % 2 != 0) ||
-        (Mask[i + 1] != -1 && (Mask[i + 1] % 2 != 1 ||
-                               (Mask[i] != -1 && Mask[i] + 1 != Mask[i + 1]))))
-      return false;
+///
+/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
+/// leaves it in an unspecified state.
+///
+/// NOTE: This must handle normal vector shuffle masks and *target* vector
+/// shuffle masks. The latter have the special property of a '-2' representing
+/// a zero-ed lane of a vector.
+static bool canWidenShuffleElements(ArrayRef<int> Mask,
+                                    SmallVectorImpl<int> &WidenedMask) {
+  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
+    // Check for any of the sentinel values (negative) and if they are the
+    // same, we can widen to that.
+    if (Mask[i] < 0 && Mask[i] == Mask[i + 1]) {
+      WidenedMask.push_back(Mask[i]);
+      continue;
+    }
+
+    // Check for an undef mask and a mask value properly aligned to fit with
+    // a pair of values. If we find such a case, use the non-undef mask's value.
+    if (Mask[i] == -1 && Mask[i + 1] % 2 == 1) {
+      WidenedMask.push_back(Mask[i + 1] / 2);
+      continue;
+    }
+    if (Mask[i + 1] == -1 && Mask[i] % 2 == 0) {
+      WidenedMask.push_back(Mask[i] / 2);
+      continue;
+    }
+
+    // Finally check if the two mask values are adjacent and aligned with
+    // a pair.
+    if (Mask[i] != -1 && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
+      WidenedMask.push_back(Mask[i] / 2);
+      continue;
+    }
+
+    // Otherwise we can't safely widen the elements used in this shuffle.
+    return false;
+  }
+  assert(WidenedMask.size() == Mask.size() / 2 &&
+         "Incorrect size of mask after widening the elements!");
 
   return true;
 }
@@ -9922,20 +9956,16 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
   // lanes but wider integers. We cap this to not form integers larger than i64
   // but it might be interesting to form i128 integers to handle flipping the
   // low and high halves of AVX 256-bit vectors.
+  SmallVector<int, 16> WidenedMask;
   if (VT.isInteger() && VT.getScalarSizeInBits() < 64 &&
-      canWidenShuffleElements(Mask)) {
-    SmallVector<int, 8> NewMask;
-    for (int i = 0, Size = Mask.size(); i < Size; i += 2)
-      NewMask.push_back(Mask[i] != -1
-                            ? Mask[i] / 2
-                            : (Mask[i + 1] != -1 ? Mask[i + 1] / 2 : -1));
+      canWidenShuffleElements(Mask, WidenedMask)) {
     MVT NewVT =
         MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2),
                          VT.getVectorNumElements() / 2);
     V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
     V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
     return DAG.getNode(ISD::BITCAST, dl, VT,
-                       DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask));
+                       DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
   }
 
   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
@@ -20697,10 +20727,10 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
   // elements, and shrink them to the half-width mask. It does this in a loop
   // so it will reduce the size of the mask to the minimal width mask which
   // performs an equivalent shuffle.
-  while (Mask.size() > 1 && canWidenShuffleElements(Mask)) {
-    for (int i = 0, e = Mask.size() / 2; i < e; ++i)
-      Mask[i] = Mask[2 * i] / 2;
-    Mask.resize(Mask.size() / 2);
+  SmallVector<int, 16> WidenedMask;
+  while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
+    Mask = std::move(WidenedMask);
+    WidenedMask.clear();
   }
 
   return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
@@ -20971,12 +21001,13 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
       return SDValue(); // We combined away this shuffle, so we're done.
 
     // See if this reduces to a PSHUFD which is no more expensive and can
-    // combine with more operations.
-    if (canWidenShuffleElements(Mask)) {
-      int DMask[] = {-1, -1, -1, -1};
+    // combine with more operations. Note that it has to at least flip the
+    // dwords as otherwise it would have been removed as a no-op.
+    if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
+      int DMask[] = {0, 1, 2, 3};
       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
-      DMask[DOffset + 0] = DOffset + Mask[0] / 2;
-      DMask[DOffset + 1] = DOffset + Mask[2] / 2;
+      DMask[DOffset + 0] = DOffset + 1;
+      DMask[DOffset + 1] = DOffset + 0;
       V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
       DCI.AddToWorklist(V.getNode());
       V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
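To see why the last hunk matches exactly the word mask {2,3,0,1}: swapping the
two word pairs within one half of the vector is the same permutation as
swapping the two dwords that cover that half, so the PSHUFLW/PSHUFHW can be
rebuilt as a PSHUFD ({1,0,2,3} for the low half, {0,1,3,2} for the high half)
that can combine with more operations. A standalone sketch checking the
low-half case on plain arrays (the helper names are mine, not LLVM's):

#include <array>
#include <cassert>
#include <cstdint>

using V8 = std::array<uint16_t, 8>;

// Permute the low four 16-bit words (PSHUFLW semantics; high words pass
// through unchanged).
static V8 pshuflw(const V8 &V, const int M[4]) {
  V8 R = V;
  for (int i = 0; i < 4; ++i)
    R[i] = V[M[i]];
  return R;
}

// Permute 32-bit dwords (PSHUFD semantics), modeled here on pairs of words.
static V8 pshufd(const V8 &V, const int M[4]) {
  V8 R;
  for (int j = 0; j < 4; ++j) {
    R[2 * j + 0] = V[2 * M[j] + 0];
    R[2 * j + 1] = V[2 * M[j] + 1];
  }
  return R;
}

int main() {
  V8 V = {10, 11, 12, 13, 14, 15, 16, 17};
  const int WordMask[4] = {2, 3, 0, 1};  // the one case the combine matches
  const int DwordMask[4] = {1, 0, 2, 3}; // DOffset = 0 for the PSHUFLW case
  assert(pshuflw(V, WordMask) == pshufd(V, DwordMask));
  return 0;
}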
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 0fcd0cca7a2..afe04589f15 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1262,17 +1262,14 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*}} # xmm3 = [0,1,2,3,4,5,6,7,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpmovzxbw %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*}} # xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpmovzxbw %xmm4, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*}} # xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpunpcklbw {{.*}} # xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpunpcklbw {{.*}} # xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
 ; AVX1-NEXT:    vpmovzxbw %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpunpcklbw {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -1289,7 +1286,7 @@ define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*}} # xmm3 = [8,9,10,11,12,13,14,15,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vmovdqa {{.*}} # xmm3 = <8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
@@ -1312,12 +1309,11 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
 ; AVX1-LABEL: @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*}} # xmm3 = [8,9,10,11,12,13,14,15,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vmovdqa {{.*}} # xmm3 = <8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpunpcklbw {{.*}} # xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX1-NEXT:    vpshufb {{.*}} # xmm1 = xmm1[0,1,2,3,4,5,6,7,0,1,0,1,0,1,0,1]
 ; AVX1-NEXT:    vpmovzxbw %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufb {{.*}} # xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; AVX1-NEXT:    vpunpcklbw {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -1338,12 +1334,11 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
 define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovdqa {{.*}} # xmm2 = [8,9,10,11,12,13,14,15,0,1,0,1,0,1,0,1]
+; AVX1-NEXT:    vmovdqa {{.*}} # xmm2 = <8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT:    vpunpcklbw {{.*}} # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpshufb {{.*}} # xmm1 = xmm1[0,1,2,3,4,5,6,7,0,1,0,1,0,1,0,1]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpmovzxbw %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufb {{.*}} # xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
@@ -1567,3 +1562,47 @@ define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
   ret <32 x i8> %shuffle
 }
+
+define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpshufb {{.*}} # xmm2 = zero,zero,xmm0[u],zero,xmm0[u,u,u,u,u,u,u,7,u,u,u,u]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpshufb {{.*}} # xmm4 = xmm3[4,3,u,3,u,u,u,u,u,u,u],zero,xmm3[u,u,u,u]
+; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpshufb {{.*}} # xmm2 = xmm2[0,1],zero,xmm2[3],zero,zero,zero,zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpshufb {{.*}} # xmm5 = xmm4[u,u,4,u,1,6],zero,zero,xmm4[0],zero,xmm4[11,u],zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*}} # xmm6 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7]
+; AVX1-NEXT:    vpor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpshufb {{.*}} # xmm5 = zero,zero,xmm5[2],zero,xmm5[4,5,6,7,8,9,10],zero,xmm5[12,13,14,15]
+; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpshufb {{.*}} # xmm3 = xmm3[u,u],zero,zero,xmm3[u,u,u,u,1,6,13,u,u],zero,xmm3[u,u]
+; AVX1-NEXT:    vpshufb {{.*}} # xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u]
+; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb {{.*}} # xmm0 = zero,zero,xmm0[2,3],zero,zero,zero,zero,xmm0[8,9,10],zero,zero,xmm0[13],zero,zero
+; AVX1-NEXT:    vpshufb {{.*}} # xmm3 = zero,zero,xmm4[u,u],zero,zero,xmm4[12],zero,xmm4[u,u,u],zero,zero,xmm4[u,0,3]
+; AVX1-NEXT:    vpshufb {{.*}} # xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero
+; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb {{.*}} # xmm1 = xmm1[0,1],zero,zero,xmm1[4,5,6,7],zero,zero,zero,xmm1[11,12],zero,xmm1[14,15]
+; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*}} # ymm2 = ymm1[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*}} # ymm2 = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23]
+; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm3 = <0,0,u,u,0,0,128,0,u,u,u,0,0,u,128,128,u,u,0,u,0,0,128,128,0,128,0,u,128,128,128,128>
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vperm2i128 {{.*}} # ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpshufb {{.*}} # ymm2 = ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*}} # ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7]
+; AVX2-NEXT:    vmovdqa {{.*}} # ymm2 = [0,0,128,128,0,0,0,0,128,128,128,0,0,128,0,0,128,128,0,128,0,0,0,0,0,0,0,128,0,0,0,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 42, i32 45, i32 12, i32 13, i32 35, i32 35, i32 60, i32 40, i32 17, i32 22, i32 29, i32 44, i32 33, i32 12, i32 48, i32 51, i32 20, i32 19, i32 52, i32 19, i32 49, i32 54, i32 37, i32 32, i32 48, i32 42, i32 59, i32 7, i32 36, i32 34, i32 36, i32 39>
+  ret <32 x i8> %shuffle
+}
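As background for the zero lanes in the CHECK lines above: PSHUFB forces a
destination byte to zero whenever its control byte has the high bit set, which
is where the "zero" entries in the vpshufb comment lines come from, and LLVM's
decoded target shuffle masks represent such lanes with the zero sentinel
rather than undef. A small illustrative decoder, not LLVM's actual API:

#include <cstdint>

// Illustrative decoder: maps one PSHUFB control byte to the shuffle-mask
// conventions used above. A control byte with bit 7 set forces the
// destination byte to zero; the decoded target shuffle masks represent this
// with the -2 zero sentinel, which is why dropping it (e.g. when widening)
// is a miscompile rather than just a missed optimization.
int decodePSHUFBControlByte(uint8_t Ctl) {
  if (Ctl & 0x80)
    return -2;       // zeroed lane (zero sentinel)
  return Ctl & 0x0F; // source byte index within the 128-bit lane
}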