From 46c5a97adcc677c6baec90477104c488d91e55a5 Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Fri, 21 Nov 2014 14:53:03 +0000
Subject: [PATCH] [x86] Restructure the checking patterns for v16 and v32 avx2
 vector shuffle lowering to allow much better blend matching.

Specifically, with the new structure the code seems clearer to me, and we
can correctly hit the cases where merging two 128-bit lanes is a clear
win and the merged lanes can be shuffled cheaply afterward.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@222539 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp         | 52 ++++++++++------------
 test/CodeGen/X86/vector-shuffle-256-v16.ll | 15 +++----
 test/CodeGen/X86/vector-shuffle-256-v32.ll | 15 +++----
 test/CodeGen/X86/vector-shuffle-512-v8.ll  | 29 +++++-------
 4 files changed, 47 insertions(+), 64 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 76f35070c63..4ec7fa15198 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -10433,20 +10433,6 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                         Mask, Subtarget, DAG))
     return Broadcast;
 
-  // There are no generalized cross-lane shuffle operations available on i16
-  // element types.
-  if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
-    // Try to simplify this by merging 128-bit lanes to enable a lane-based
-    // shuffle.
-    if (!isSingleInputShuffleMask(Mask))
-      if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
-              DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
-        return Result;
-
-    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
-                                                   Mask, DAG);
-  }
-
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
                                                 Subtarget, DAG))
     return Blend;
@@ -10466,6 +10452,12 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
 
   if (isSingleInputShuffleMask(Mask)) {
+    // There are no generalized cross-lane shuffle operations available on i16
+    // element types.
+    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
+      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
+                                                     Mask, DAG);
+
     SDValue PSHUFBMask[32];
     for (int i = 0; i < 16; ++i) {
       if (Mask[i] == -1) {
@@ -10486,6 +10478,12 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                    DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
   }
 
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle.
+  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
+    return Result;
+
   // Otherwise fall back on generic lowering.
   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
 }
@@ -10510,20 +10508,6 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                         Mask, Subtarget, DAG))
     return Broadcast;
 
-  // There are no generalized cross-lane shuffle operations available on i8
-  // element types.
-  if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
-    // Try to simplify this by merging 128-bit lanes to enable a lane-based
-    // shuffle.
-    if (!isSingleInputShuffleMask(Mask))
-      if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
-              DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
-        return Result;
-
-    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
-                                                   DAG);
-  }
-
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
                                                 Subtarget, DAG))
     return Blend;
@@ -10547,6 +10531,12 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
 
   if (isSingleInputShuffleMask(Mask)) {
+    // There are no generalized cross-lane shuffle operations available on i8
+    // element types.
+    if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
+      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
+                                                     Mask, DAG);
+
     SDValue PSHUFBMask[32];
     for (int i = 0; i < 32; ++i)
       PSHUFBMask[i] =
@@ -10559,6 +10549,12 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
   }
 
+  // Try to simplify this by merging 128-bit lanes to enable a lane-based
+  // shuffle.
+  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
+          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
+    return Result;
+
   // Otherwise fall back on generic lowering.
   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
 }
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 2a6efda0dfa..a83a9122bee 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -732,10 +732,9 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_1
 ;
 ; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
-; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw %xmm1, %ymm1
+; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16>
   ret <16 x i16> %shuffle
@@ -1339,12 +1338,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_2
 ;
 ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
   ret <16 x i16> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index d8390fdfb35..81fbce61cb5 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1007,10 +1007,10 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32>
   ret <32 x i8> %shuffle
@@ -1630,11 +1630,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_
 ;
 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
   ret <32 x i8> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 6671735248a..050fb34b661 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -191,15 +191,13 @@ define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_9832dc76:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm2[0,0,3,2]
-; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm3
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[1,0,2,2]
-; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm2[1,0,3,2]
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm1
 ; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
   ret <8 x double> %shuffle
@@ -911,15 +909,13 @@ define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-LABEL: shuffle_v8i64_9832dc76:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; ALL-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
-; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; ALL-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
-; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
-; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
   ret <8 x i64> %shuffle
@@ -1414,9 +1410,8 @@ define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) {
 ; ALL-NEXT:    vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7]
 ; ALL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
 ; ALL-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
 ; ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
-; ALL-NEXT:    vpbroadcastq %xmm3, %ymm3
-; ALL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
 ; ALL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; ALL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; ALL-NEXT:    retq
-- 
2.34.1
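
Note for reviewers: to see the new lowering strategy end to end, the sketch below
reassembles the post-patch form of one of the v16i16 tests touched above. The CHECK
lines and the shuffle mask come straight from the vector-shuffle-256-v16.ll hunk; the
RUN line is an assumed, typical llc invocation for an AVX2 check prefix and is not part
of the diff (the test file's real RUN line, including any experimental shuffle-lowering
flags of that era, is not shown here).

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; (Assumed RUN line; not taken from the diff above.)

define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28(<16 x i16> %a, <16 x i16> %b) {
; With the restructured checks, the two inputs are first merged at 128-bit-lane
; granularity with a single blend, and the merged value is then shuffled
; entirely in-lane with one PSHUFB:
; AVX2:       vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT:  vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
; AVX2-NEXT:  retq
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
  ret <16 x i16> %shuffle
}

Before this patch the same shuffle needed six xmm instructions (per the old CHECK lines
above) because the cross-lane check bailed out before blend matching; moving that check
into the single-input path lets the 128-bit-lane merge fire for two-input masks.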