From e6329cf3032aef9e7dfe75cd0aa03add19a7e120 Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Tue, 5 Aug 2014 08:19:21 +0000
Subject: [PATCH] [x86] Fix a crash and a wrong-code bug in the new vector
 lowering, both found by a single test case reduced out of a failure on
 llvm-stress.

The start of the problem (and the crash) came when we used the first
unused slot we could find in the move-to half of the move-mask as the
target for two bad-half inputs. While, if we are lucky, this will be the
first of a pair of slots into which we can place the bad-half inputs,
that isn't actually guaranteed. This really isn't surprising; I'm not
sure what I was thinking. The correct way to find the two unused slots
is to look for one of the *used* slots: we know the free pair isn't that
one, and we can find the other pair with some modular arithmetic, by
masking off the odd bit and adding 2 modulo 4. With this, we reliably
find a viable pair of slots for the bad-half inputs.

Sadly, that wasn't enough. We also had a wrong-code bug, surfaced while
reducing the test case, where we would use the same slot twice for the
two bad inputs: both of the bad inputs could originally be in odd slots,
in which case the mod-2 mapping is the same for both. The whole point of
the weird indexing into the pair of empty slots was to notice when the
end result needed the two bad-half inputs paired in a dword and to
pre-pair them in the correct orientation. That is less important with
the powerful combining we now do, and it is easier and more reliable to
note that we add the bad-half inputs in order: if they end up in a dword
pair, the low part of that pair is the first input in the sequence.
Always putting the first input in the low element therefore does the
right thing in addition to computing the correct result.

Test case added. =]

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214849 91177308-0d34-0410-b5e6-96231b3b80d8
---
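
A standalone sketch of the slot-finding arithmetic described above may
help; it is not part of the applied diff. The MoveMask / MoveOffset /
BadInputs names mirror the patch, but the harness, the pinned sample
values, and the simplified bad-input handling are hypothetical, for
illustration only.

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <iterator>

int main() {
  // An 8-slot move mask split into two 4-slot halves; -1 marks an unused
  // slot. Suppose two good inputs were pinned into slots 2 and 3 of the
  // low half (MoveOffset == 0).
  int MoveMask[8] = {-1, -1, 2, 3, -1, -1, -1, -1};
  int MoveOffset = 0;

  // The old code searched for the first *unused* slot. Here that would be
  // slot 0 and happen to work, but with good inputs pinned at slots 0 and
  // 2 it would return slot 1, which does not begin a free dword pair.
  // Instead, find a *used* slot...
  int GoodMaskIdx =
      std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
                   [](int M) { return M >= 0; }) -
      std::begin(MoveMask);

  // ...then mask off the odd bit and add 2 modulo 4 to jump to the start
  // of the *other* dword pair in this half, which must be empty.
  int MoveMaskIdx = ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
  assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
  assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");

  // Add the two bad-half inputs in order: the first always takes the low
  // element of the pair, so two inputs that both came from odd source
  // slots can no longer be mapped onto the same destination slot.
  int BadInputs[2] = {5, 7}; // both odd: the old % 2 indexing would collide
  MoveMask[MoveMaskIdx] = BadInputs[0];
  MoveMask[MoveMaskIdx + 1] = BadInputs[1];

  for (int M : MoveMask)
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}

Running the sketch prints "5 7 2 3 -1 -1 -1 -1": the arithmetic lands on
the start of the other dword pair regardless of which slots the good
inputs occupy, and because the bad inputs are added in order, two inputs
from odd source slots no longer collide on one destination slot.
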
 lib/Target/X86/X86ISelLowering.cpp        | 39 ++++++++++++-----------
 test/CodeGen/X86/vector-shuffle-128-v8.ll | 23 +++++++++++++
 2 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 88972a9a893..d1540562298 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7628,34 +7628,37 @@ static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
     if (GoodInputs.size() == 2) {
       // If the low inputs are spread across two dwords, pack them into
       // a single dword.
-      MoveMask[Mask[GoodInputs[0]] % 2 + MoveOffset] =
-          Mask[GoodInputs[0]] - MaskOffset;
-      MoveMask[Mask[GoodInputs[1]] % 2 + MoveOffset] =
-          Mask[GoodInputs[1]] - MaskOffset;
-      Mask[GoodInputs[0]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset;
-      Mask[GoodInputs[1]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset;
+      MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
+      MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
+      Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
+      Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
     } else {
-      // Otherwise pin the low inputs.
+      // Otherwise pin the good inputs.
       for (int GoodInput : GoodInputs)
         MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
     }
 
-    int MoveMaskIdx =
-        std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) -
-        std::begin(MoveMask);
-    assert(MoveMaskIdx >= MoveOffset && "Established above");
-
     if (BadInputs.size() == 2) {
+      // If we have two bad inputs then there may be either one or two good
+      // inputs fixed in place. Find a fixed input, and then find the *other*
+      // two adjacent indices by using modular arithmetic.
+      int GoodMaskIdx =
+          std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
+                       [](int M) { return M >= 0; }) -
+          std::begin(MoveMask);
+      int MoveMaskIdx =
+          ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
       assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
       assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
-      MoveMask[MoveMaskIdx + Mask[BadInputs[0]] % 2] =
-          Mask[BadInputs[0]] - MaskOffset;
-      MoveMask[MoveMaskIdx + Mask[BadInputs[1]] % 2] =
-          Mask[BadInputs[1]] - MaskOffset;
-      Mask[BadInputs[0]] = MoveMaskIdx + Mask[BadInputs[0]] % 2 + MaskOffset;
-      Mask[BadInputs[1]] = MoveMaskIdx + Mask[BadInputs[1]] % 2 + MaskOffset;
+      MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
+      MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
+      Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
+      Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
     } else {
       assert(BadInputs.size() == 1 && "All sizes handled");
+      int MoveMaskIdx =
+          std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) -
+          std::begin(MoveMask);
       MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
       Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
     }
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 9a719c8c7ee..1dc744af47d 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -648,3 +648,26 @@ define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 12, i32 13, i32 14, i32 3>
   ret <8 x i16> %shuffle
 }
+
+define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: @shuffle_v8i16_XXX1X579
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,7,6,7]
+; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
+; SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[0,1,2,2,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,5,7]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: @shuffle_v8i16_XXX1X579
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,7,6,7]
+; SSSE3-NEXT:    pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
+; SSSE3-NEXT:    punpcklwd {{.*}} # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    pshufb {{.*}} # xmm0 = xmm0[{{[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+,[0-9]+}},4,5,{{[0-9]+,[0-9]+}},8,9,12,13,6,7]
+; SSSE3-NEXT:    retq
+  %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9>
+  ret <8 x i16> %shuffle
+}
-- 
2.34.1