From b1769420b8b081bfa3fe41555b78a9d57264b641 Mon Sep 17 00:00:00 2001
From: Ahmed Bougacha
Date: Fri, 6 Nov 2015 23:16:38 +0000
Subject: [PATCH] [X86] Don't fold non-LSB extracts into truncating broadcasts.

We used to incorrectly assume that the offset we're extracting from
was a multiple of the element size. So, we'd fold:
  (v8i16 (shufflevector
            (v8i16 (bitcast (v4i32 (build_vector X, Y, ...)))),
            <1,1,...,1>))
into:
  (v8i16 (vbroadcast (i16 (trunc Y))))
whereas we should have extracted the higher bits from X.
Instead, bail out if the assumption doesn't hold.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@252361 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp         |  64 +++++++--
 test/CodeGen/X86/vector-shuffle-128-v16.ll | 146 +++++++++++++++++++++
 test/CodeGen/X86/vector-shuffle-128-v8.ll  | 142 ++++++++++++++++++++
 test/CodeGen/X86/vector-shuffle-256-v16.ll |  42 ++++++
 test/CodeGen/X86/vector-shuffle-256-v32.ll |  66 ++++++++++
 5 files changed, 448 insertions(+), 12 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c9218a17a0b..0929b95e279 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7861,6 +7861,54 @@ static SDValue lowerVectorShuffleAsElementInsertion(
   return V2;
 }
 
+/// \brief Try to lower broadcast of a single - truncated - integer element,
+/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
+///
+/// This assumes we have AVX2.
+static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0,
+                                                  int BroadcastIdx,
+                                                  const X86Subtarget *Subtarget,
+                                                  SelectionDAG &DAG) {
+  assert(Subtarget->hasAVX2() &&
+         "We can only lower integer broadcasts with AVX2!");
+
+  EVT EltVT = VT.getVectorElementType();
+  EVT V0VT = V0.getValueType();
+
+  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
+  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
+
+  EVT V0EltVT = V0VT.getVectorElementType();
+  if (!V0EltVT.isInteger())
+    return SDValue();
+
+  const unsigned EltSize = EltVT.getSizeInBits();
+  const unsigned V0EltSize = V0EltVT.getSizeInBits();
+
+  // This is only a truncation if the original element type is larger.
+  if (V0EltSize <= EltSize)
+    return SDValue();
+
+  assert(((V0EltSize % EltSize) == 0) &&
+         "Scalar type sizes must all be powers of 2 on x86!");
+
+  const unsigned V0Opc = V0.getOpcode();
+  const unsigned Scale = V0EltSize / EltSize;
+  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
+
+  // If we're extracting non-least-significant bits, this isn't a truncation.
+  if (BroadcastIdx % Scale)
+    return SDValue();
+
+  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
+      V0Opc != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  SDValue Scalar = V0.getOperand(V0BroadcastIdx);
+  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
+                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
+}
+
 /// \brief Try to lower broadcast of a single element.
 ///
 /// For convenience, this code also bundles all of the subtarget feature set
@@ -7924,18 +7972,10 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
   // First, look through bitcast: if the original value has a larger element
   // type than the shuffle, the broadcast element is in essence truncated.
   // Make that explicit to ease folding.
- if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) { - MVT EltVT = VT.getVectorElementType(); - SDValue V0 = V.getOperand(0); - MVT V0VT = V0.getSimpleValueType(); - - if (V0VT.isInteger() && V0VT.getVectorElementType().bitsGT(EltVT) && - ((V0.getOpcode() == ISD::BUILD_VECTOR || - (V0.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)))) { - V = DAG.getNode(ISD::TRUNCATE, DL, EltVT, V0.getOperand(BroadcastIdx)); - BroadcastIdx = 0; - } - } + if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) + if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast( + DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) + return TruncBroadcast; // Also check the simpler case, where we can directly reuse the scalar. if (V.getOpcode() == ISD::BUILD_VECTOR || diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index a8ada6fbfa7..e55ba0f7129 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -1459,3 +1459,149 @@ define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) { %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> zeroinitializer ret <16 x i8> %tmp4 } + +define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(i32* %ptr) { +; SSE2-LABEL: insert_dup_elt1_mem_v16i8_i32: +; SSE2: # BB#0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_dup_elt1_mem_v16i8_i32: +; SSE41: # BB#0: +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: retq +; +; AVX-LABEL: insert_dup_elt1_mem_v16i8_i32: +; AVX: # BB#0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX-NEXT: retq + %tmp = load i32, i32* %ptr, align 4 + %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 + %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8> + %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> + ret <16 x i8> %tmp3 +} + +define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(i32* %ptr) { +; SSE2-LABEL: insert_dup_elt2_mem_v16i8_i32: +; SSE2: # BB#0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_dup_elt2_mem_v16i8_i32: +; SSE41: # BB#0: +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; SSE41-NEXT: retq +; +; AVX-LABEL: insert_dup_elt2_mem_v16i8_i32: +; AVX: # BB#0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX-NEXT: retq + %tmp = load i32, i32* %ptr, align 4 + %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 + %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8> + %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> + ret <16 x i8> %tmp3 +} + +define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) { +; SSE2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: +; SSE2: # BB#0: +; SSE2-NEXT: movsbl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsbl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: +; SSE41: # BB#0: +; SSE41-NEXT: movsbl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; SSE41-NEXT: retq +; +; AVX-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: +; AVX: # BB#0: +; AVX-NEXT: movsbl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX-NEXT: retq + %tmp = load i8, i8* %ptr, align 1 + %tmp1 = sext i8 %tmp to i32 + %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 + %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8> + %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> + ret <16 x i8> %tmp4 +} + +define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) { +; SSE2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: +; SSE2: # BB#0: +; SSE2-NEXT: movsbl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsbl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: +; SSE41: # BB#0: +; SSE41-NEXT: movsbl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; SSE41-NEXT: retq +; +; AVX-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: +; AVX: # BB#0: +; AVX-NEXT: movsbl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX-NEXT: retq + %tmp = load i8, i8* %ptr, align 1 + %tmp1 = sext i8 %tmp to i32 + %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 + %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8> + %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> + ret <16 x i8> %tmp4 +} diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index f040a691938..eb69c6e9a33 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -2228,3 +2228,145 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(i16* %ptr) { 
%tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <8 x i32> zeroinitializer ret <8 x i16> %tmp4 } + +define <8 x i16> @insert_dup_elt1_mem_v8i16_i32(i32* %ptr) { +; SSE2-LABEL: insert_dup_elt1_mem_v8i16_i32: +; SSE2: # BB#0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_i32: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_dup_elt1_mem_v8i16_i32: +; SSE41: # BB#0: +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: insert_dup_elt1_mem_v8i16_i32: +; AVX: # BB#0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX-NEXT: retq + %tmp = load i32, i32* %ptr, align 4 + %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 + %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16> + %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <8 x i32> + ret <8 x i16> %tmp3 +} + +define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(i32* %ptr) { +; SSE2-LABEL: insert_dup_elt3_mem_v8i16_i32: +; SSE2: # BB#0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_i32: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_dup_elt3_mem_v8i16_i32: +; SSE41: # BB#0: +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: insert_dup_elt3_mem_v8i16_i32: +; AVX: # BB#0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX-NEXT: retq + %tmp = load i32, i32* %ptr, align 4 + %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1 + %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16> + %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <8 x i32> + ret <8 x i16> %tmp3 +} + +define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(i16* %ptr) { +; SSE2-LABEL: insert_dup_elt1_mem_v8i16_sext_i16: +; SSE2: # BB#0: +; SSE2-NEXT: movswl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_sext_i16: +; SSSE3: # BB#0: +; SSSE3-NEXT: movswl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_dup_elt1_mem_v8i16_sext_i16: +; SSE41: # BB#0: +; SSE41-NEXT: movswl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: 
insert_dup_elt1_mem_v8i16_sext_i16: +; AVX: # BB#0: +; AVX-NEXT: movswl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX-NEXT: retq + %tmp = load i16, i16* %ptr, align 2 + %tmp1 = sext i16 %tmp to i32 + %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 + %tmp3 = bitcast <4 x i32> %tmp2 to <8 x i16> + %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <8 x i32> + ret <8 x i16> %tmp4 +} + +define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(i16* %ptr) { +; SSE2-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: +; SSE2: # BB#0: +; SSE2-NEXT: movswl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: +; SSSE3: # BB#0: +; SSSE3-NEXT: movswl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: +; SSE41: # BB#0: +; SSE41-NEXT: movswl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: +; AVX: # BB#0: +; AVX-NEXT: movswl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX-NEXT: retq + %tmp = load i16, i16* %ptr, align 2 + %tmp1 = sext i16 %tmp to i32 + %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 1 + %tmp3 = bitcast <4 x i32> %tmp2 to <8 x i16> + %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <8 x i32> + ret <8 x i16> %tmp4 +} diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll index 4ba4b391ceb..53821fc71ba 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -3331,3 +3331,45 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16(i16* %ptr) { %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <16 x i32> zeroinitializer ret <16 x i16> %tmp4 } + +define <16 x i16> @insert_dup_elt1_mem_v16i16_i32(i32* %ptr) #0 { +; AVX1-LABEL: insert_dup_elt1_mem_v16i16_i32: +; AVX1: # BB#0: +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_dup_elt1_mem_v16i16_i32: +; AVX2: # BB#0: +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %tmp = load i32, i32* %ptr, align 4 + %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 + %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16> + %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> + ret <16 x i16> %tmp3 +} + +define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(i32* %ptr) #0 { +; AVX1-LABEL: insert_dup_elt3_mem_v16i16_i32: +; AVX1: # BB#0: +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_dup_elt3_mem_v16i16_i32: +; AVX2: # BB#0: +; AVX2-NEXT: 
vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %tmp = load i32, i32* %ptr, align 4 + %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1 + %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16> + %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> + ret <16 x i16> %tmp3 +} diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll index ae4407916fa..269d9da8e1d 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -2018,3 +2018,69 @@ define <32 x i8> @insert_dup_mem_v32i8_sext_i8(i8* %ptr) { %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <32 x i32> zeroinitializer ret <32 x i8> %tmp4 } + +define <32 x i8> @insert_dup_elt1_mem_v32i8_i32(i32* %ptr) { +; AVX1-LABEL: insert_dup_elt1_mem_v32i8_i32: +; AVX1: # BB#0: +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_dup_elt1_mem_v32i8_i32: +; AVX2: # BB#0: +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %tmp = load i32, i32* %ptr, align 4 + %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 + %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8> + %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <32 x i32> + ret <32 x i8> %tmp3 +} + +define <32 x i8> @insert_dup_elt3_mem_v32i8_i32(i32* %ptr) { +; AVX1-LABEL: insert_dup_elt3_mem_v32i8_i32: +; AVX1: # BB#0: +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_dup_elt3_mem_v32i8_i32: +; AVX2: # BB#0: +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %tmp = load i32, i32* %ptr, align 4 + %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 + %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8> + %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <32 x i32> + ret <32 x i8> %tmp3 +} + +define <32 x i8> @insert_dup_elt1_mem_v32i8_sext_i8(i8* %ptr) { +; AVX1-LABEL: insert_dup_elt1_mem_v32i8_sext_i8: +; AVX1: # BB#0: +; AVX1-NEXT: movsbl (%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_dup_elt1_mem_v32i8_sext_i8: +; AVX2: # BB#0: +; AVX2-NEXT: movsbl (%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %tmp = load i8, i8* %ptr, align 1 + %tmp1 = sext i8 %tmp to i32 + %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 + %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8> + %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <32 x i32> + ret <32 x i8> %tmp4 +} -- 2.34.1
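
Note (editorial addendum, not part of the patch): the fix hinges on one piece of index
arithmetic. After a bitcast from wide integer elements to narrow ones, broadcast lane
BroadcastIdx reads wide element BroadcastIdx / Scale at bit offset
(BroadcastIdx % Scale) * EltSize, where Scale = V0EltSize / EltSize. Only a zero bit
offset is a plain truncate of that wide element; anything else would need a shift
first, so the lowering now bails. Below is a minimal standalone sketch of that mapping,
using hypothetical names (mapLane, TruncSource) rather than any LLVM API:

// Sketch only: models the index math the patch relies on, not LLVM code.
#include <cassert>
#include <optional>

struct TruncSource {
  unsigned SrcElt;    // index into the wide build_vector
  unsigned BitOffset; // bit offset within that wide element (little-endian)
};

// Map a narrow broadcast lane back to the wide element it reads from.
std::optional<TruncSource> mapLane(unsigned WideBits, unsigned NarrowBits,
                                   unsigned BroadcastIdx) {
  assert(WideBits % NarrowBits == 0 && "element sizes are powers of two on x86");
  unsigned Scale = WideBits / NarrowBits;
  TruncSource S{BroadcastIdx / Scale, (BroadcastIdx % Scale) * NarrowBits};
  // A truncating broadcast only reads the least-significant bits, so bail
  // (mirroring the patch) whenever the lane starts at a non-zero offset.
  if (S.BitOffset != 0)
    return std::nullopt;
  return S;
}

int main() {
  // v4i32 -> v8i16, lane 1: bits [16,32) of element 0, so not a plain trunc.
  assert(!mapLane(32, 16, 1));
  // Lane 2 is the low half of element 1, so trunc of that element is correct.
  assert(mapLane(32, 16, 2) && mapLane(32, 16, 2)->SrcElt == 1);
}

With the v8i16-from-v4i32 example in the commit message, lane 1 lands at bit offset 16
of X, which is exactly the case the old code mis-folded to a broadcast of trunc(Y).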