From: Chandler Carruth
Date: Fri, 3 Oct 2014 13:11:13 +0000 (+0000)
Subject: [x86] Teach the new vector shuffle lowering to aggressively form MOVSS
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=dce98e67391a5d721b96fa2be85219c2f81bdce8;p=oota-llvm.git

[x86] Teach the new vector shuffle lowering to aggressively form MOVSS
and MOVSD nodes for single element vector inserts.

This is particularly important because a number of patterns in the
backend detect these nodes and leverage them to simplify things. It
also fixes quite a few of the bad code generation examples for
insertions.

However, it regresses one specific area: when available, blendps and
blendpd are *dramatically* faster than movss and movsd respectively.
But it doesn't really work to form the blend logic first, because the
blends *aren't* nearly as efficient when the data is coming from memory
anyway, and thus we will have a movss or movsd regardless. Also, doing
that would block a bunch of the patterns that this change is designed
to hit.

So my plan is to go into the patterns for lowering MOVSS and MOVSD and
lower them via blends when available. However, that's a pretty invasive
restructuring, so it will need to be a follow-up patch.

I have already gone into the patterns to lower MOVSS and MOVSD from
memory using MOVLPD, etc. Without that, several of the test cases I
already have would regress.
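As a concrete illustration (distilled from the updated test cases
below), a shuffle that inserts the low element of %b into %a:

  define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
    %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
    ret <2 x double> %shuffle
  }

used to lower to a shufpd plus a register copy (or a blendpd on SSE4.1
and AVX), and now lowers to a single instruction:

  movsd %xmm1, %xmm0

Similarly, the new X86InstrSSE.td patterns below fold a MOVSD whose
inserted scalar comes from a 64-bit load into a single movlpd.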
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218985 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 702e163b46a..b6d134ff0fb 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7779,16 +7779,18 @@ static SDValue lowerVectorShuffleAsElementInsertion(
     MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  MVT ExtVT = VT;
+  MVT EltVT = VT.getVectorElementType();
 
   int V2Index = std::find_if(Mask.begin(), Mask.end(), [&Mask](int M) {
                   return M >= (int)Mask.size();
                 }) - Mask.begin();
+  bool IsV1Zeroable = true;
   for (int i = 0, Size = Mask.size(); i < Size; ++i)
-    if (i != V2Index && !Zeroable[i])
-      return SDValue(); // Not inserting into a zero vector.
-
-  MVT ExtVT = VT;
-  MVT EltVT = VT.getVectorElementType();
+    if (i != V2Index && !Zeroable[i]) {
+      IsV1Zeroable = false;
+      break;
+    }
 
   // Check for a single input from a SCALAR_TO_VECTOR node.
   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
@@ -7800,6 +7802,11 @@ static SDValue lowerVectorShuffleAsElementInsertion(
     // We need to zext the scalar if it is smaller than an i32.
     V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
+      // Using zext to expand a narrow element won't work for non-zero
+      // insertions.
+      if (!IsV1Zeroable)
+        return SDValue();
+
       // Zero-extend directly to i32.
       ExtVT = MVT::v4i32;
       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
@@ -7812,6 +7819,25 @@ static SDValue lowerVectorShuffleAsElementInsertion(
       return SDValue();
   }
 
+  if (!IsV1Zeroable) {
+    // If V1 can't be treated as a zero vector we have fewer options to lower
+    // this. We can't support integer vectors or non-zero targets cheaply, and
+    // the V1 elements can't be permuted in any way.
+    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
+    if (!VT.isFloatingPoint() || V2Index != 0)
+      return SDValue();
+    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
+    V1Mask[V2Index] = -1;
+    if (!isNoopShuffleMask(V1Mask))
+      return SDValue();
+
+    // Otherwise, use MOVSD or MOVSS.
+    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
+           "Only two types of floating point element types to handle!");
+    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
+                       ExtVT, V1, V2);
+  }
+
   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
   if (ExtVT != VT)
     V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index f833d043027..bd00bdd02b5 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1269,6 +1269,9 @@ let Predicates = [HasAVX] in {
             (VMOVLPDrm VR128:$src1, addr:$src2)>;
   def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
             (VMOVLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2f64 (X86Movsd VR128:$src1,
+                    (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+            (VMOVLPDrm VR128:$src1, addr:$src2)>;
 
   // Store patterns
   def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
@@ -1316,6 +1319,9 @@ let Predicates = [UseSSE2] in {
             (MOVLPDrm VR128:$src1, addr:$src2)>;
   def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
             (MOVLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2f64 (X86Movsd VR128:$src1,
+                    (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+            (MOVLPDrm VR128:$src1, addr:$src2)>;
 
   // Store patterns
   def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index d7373602431..e8613be0272 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -336,13 +336,14 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
-; SSE2-NEXT:    shufpd {{.*#+}} xmm4 = xmm4[0],xmm3[1]
+; SSE2-NEXT:    movsd %xmm4, %xmm3
 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; SSE2-NEXT:    packuswb %xmm4, %xmm0
+; SSE2-NEXT:    movsd %xmm0, %xmm1
+; SSE2-NEXT:    packuswb %xmm3, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index e15773b067a..aa837f15e57 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -211,61 +211,28 @@ define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
   ret <2 x double> %shuffle
 }
 
 define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
-; SSE2-LABEL: shuffle_v2f64_03:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; SSE2-NEXT:    retq
-;
-; SSE3-LABEL: shuffle_v2f64_03:
-; SSE3:       # BB#0:
-; SSE3-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; SSE3-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_v2f64_03:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v2f64_03:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_v2f64_03:
+; SSE:       # BB#0:
+; SSE-NEXT:    movsd %xmm0, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2f64_03:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
   ret <2 x double> %shuffle
 }
 
 define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
-; SSE2-LABEL: shuffle_v2f64_21:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE3-LABEL: shuffle_v2f64_21:
-; SSE3:       # BB#0:
-; SSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
-; SSE3-NEXT:    movapd %xmm1, %xmm0
-; SSE3-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_v2f64_21:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
-; SSSE3-NEXT:    movapd %xmm1, %xmm0
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v2f64_21:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
-; SSE41-NEXT:    movapd %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_v2f64_21:
+; SSE:       # BB#0:
+; SSE-NEXT:    movsd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2f64_21:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
   ret <2 x double> %shuffle
@@ -302,17 +269,20 @@ define <2 x i64> @shuffle_v2i64_02_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
 define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) {
 ; SSE2-LABEL: shuffle_v2i64_03:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE2-NEXT:    movsd %xmm0, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2i64_03:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE3-NEXT:    movsd %xmm0, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2i64_03:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSSE3-NEXT:    movsd %xmm0, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2i64_03:
@@ -335,20 +305,20 @@ define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) {
 define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
 ; SSE2-LABEL: shuffle_v2i64_03_copy:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
-; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    movsd %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2i64_03_copy:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
-; SSE3-NEXT:    movapd %xmm1, %xmm0
+; SSE3-NEXT:    movsd %xmm1, %xmm2
+; SSE3-NEXT:    movaps %xmm2, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2i64_03_copy:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
-; SSSE3-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-NEXT:    movsd %xmm1, %xmm2
+; SSSE3-NEXT:    movaps %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2i64_03_copy:
@@ -489,20 +459,17 @@ define <2 x i64> @shuffle_v2i64_20_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
 define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
 ; SSE2-LABEL: shuffle_v2i64_21:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    movsd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2i64_21:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
-; SSE3-NEXT:    movapd %xmm1, %xmm0
+; SSE3-NEXT:    movsd %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2i64_21:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
-; SSSE3-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2i64_21:
@@ -526,20 +493,20 @@ define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
 define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
 ; SSE2-LABEL: shuffle_v2i64_21_copy:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
-; SSE2-NEXT:    movapd %xmm2, %xmm0
+; SSE2-NEXT:    movsd %xmm2, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2i64_21_copy:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
-; SSE3-NEXT:    movapd %xmm2, %xmm0
+; SSE3-NEXT:    movsd %xmm2, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2i64_21_copy:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
-; SSSE3-NEXT:    movapd %xmm2, %xmm0
+; SSSE3-NEXT:    movsd %xmm2, %xmm1
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2i64_21_copy:
@@ -700,23 +667,20 @@ define <2 x i64> @shuffle_v2i64_z0(<2 x i64> %a) {
 define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) {
 ; SSE2-LABEL: shuffle_v2i64_z1:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    xorpd %xmm1, %xmm1
-; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    movsd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2i64_z1:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    xorpd %xmm1, %xmm1
-; SSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
-; SSE3-NEXT:    movapd %xmm1, %xmm0
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    movsd %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2i64_z1:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    xorpd %xmm1, %xmm1
-; SSSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
-; SSSE3-NEXT:    movapd %xmm1, %xmm0
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    movsd %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2i64_z1:
@@ -789,38 +753,16 @@ define <2 x double> @shuffle_v2f64_z0(<2 x double> %a) {
 }
 
 define <2 x double> @shuffle_v2f64_z1(<2 x double> %a) {
-; SSE2-LABEL: shuffle_v2f64_z1:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    xorpd %xmm1, %xmm1
-; SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE3-LABEL: shuffle_v2f64_z1:
-; SSE3:       # BB#0:
-; SSE3-NEXT:    xorpd %xmm1, %xmm1
-; SSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
-; SSE3-NEXT:    movapd %xmm1, %xmm0
-; SSE3-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_v2f64_z1:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    xorpd %xmm1, %xmm1
-; SSSE3-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
-; SSSE3-NEXT:    movapd %xmm1, %xmm0
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v2f64_z1:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    xorpd %xmm1, %xmm1
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
-; SSE41-NEXT:    movapd %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_v2f64_z1:
+; SSE:       # BB#0:
+; SSE-NEXT:    xorps %xmm1, %xmm1
+; SSE-NEXT:    movsd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2f64_z1:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
   ret <2 x double> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 10a27f44320..595447775b5 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -55,7 +55,7 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4f64_0300:
@@ -382,7 +382,7 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_0300:
@@ -518,7 +518,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX1-NEXT:    vmovsd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_4012:
@@ -654,7 +654,7 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: stress_test1:
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 2f02f2fc08f..662b9832611 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -91,7 +91,7 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
 ; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
 ; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
 ; ALL-NEXT:    vbroadcastsd %xmm0, %ymm0
-; ALL-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3]
+; ALL-NEXT:    vmovsd %xmm1, %xmm0, %xmm1
 ; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -275,12 +275,12 @@ define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_08991abb:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm2 = ymm0[1,0,2,2]
-; ALL-NEXT:    vpermpd {{.*#+}} ymm3 = ymm1[0,2,3,3]
-; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3]
-; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
-; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    vpermpd {{.*#+}} ymm2 = ymm1[0,0,1,1]
+; ALL-NEXT:    vmovsd %xmm0, %xmm2, %xmm2
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
+; ALL-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
   ret <8 x double> %shuffle
@@ -303,11 +303,11 @@ define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) {
 define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) {
 ; ALL-LABEL: shuffle_v8f64_09ab1def:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm2
-; ALL-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm0[1,0,2,2]
-; ALL-NEXT:    vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
-; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
-; ALL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT:    vmovsd %xmm0, %xmm1, %xmm2
+; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm1
+; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT:    vmovsd %xmm0, %xmm1, %xmm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
   ret <8 x double> %shuffle
@@ -721,7 +721,7 @@ define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) {
 ; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3]
 ; ALL-NEXT:    vextractf64x4 $1, %zmm1, %ymm1
 ; ALL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
-; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; ALL-NEXT:    vmovsd %xmm1, %xmm0, %xmm0
 ; ALL-NEXT:    vinsertf64x4 $1, %ymm3, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32>