From 03a77831cc5019b287e7f5be109f44b302e63ffd Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Sat, 4 Oct 2014 03:52:55 +0000
Subject: [PATCH] [x86] Enable the new vector shuffle lowering by default.

Update the entire regression test suite for the new shuffles. Remove
most of the old testing, which was devoted to the old shuffle lowering
path and is no longer relevant. Also remove a few other tests that only
incidentally exercised shuffles, without any interesting aspects to
them.

Benchmarking that I have done shows a few small regressions with this
on LNT, zero measurable regressions on real, large applications, and
for several benchmarks where the loop vectorizer fires in the hot path
it shows 5% to 40% improvements for SSE2 and SSE3 code running on
Sandy Bridge machines. Running on AMD machines shows even more dramatic
improvements. When using newer ISA vector extensions the gains are much
more modest, but the code is still better on the whole. There are a few
regressions being tracked (PR21137, PR21138, PR21139) but by and large
this is expected to be a win for x86 generated code performance.

It is also more correct than the code it replaces. I have fuzz tested
this extensively with ISA extensions up through AVX2 and found no
crashes or miscompiles (yet...). The old lowering had a few miscompiles
and crashers after a somewhat smaller amount of fuzz testing.

There is one significant area where the new code path lags behind and
that is in AVX-512 support. However, there was *extremely little*
support for that already, so this isn't a significant step backwards,
and the new framework will probably make it easier to implement
lowering that uses the full power of AVX-512's table-based
shuffle+blend (IMO).

Many thanks to Quentin, Andrea, Robert, and others for benchmarking
assistance. Thanks to Adam and others for help with AVX-512. Thanks to
Hal, Eric, and *many* others for answering my incessant questions about
how the backend actually works. =]

I will leave the old code path in the tree until the 3 PRs above are at
least resolved to folks' satisfaction. Then I will rip it (and 1000s of
lines of code) out. =] I don't expect this flag to stay around for very
long. It may not survive next week.
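For anyone who wants to compare codegen against the old path while the
flag still exists, the lowering can be toggled from the llc command
line. A minimal sketch, assuming some input file foo.ll (the file name
is illustrative, not part of this patch):

  llc < foo.ll -march=x86-64 -x86-experimental-vector-shuffle-lowering=false

With this patch applied the flag defaults to true, so the invocation
above is only needed to reproduce the old behavior.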
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@219046 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            |   2 +-
 test/CodeGen/X86/2008-06-18-BadShuffle.ll     |  10 -
 test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll |  30 -
 test/CodeGen/X86/2012-04-26-sdglue.ll         |   2 +-
 test/CodeGen/X86/avx-basic.ll                 |  40 -
 test/CodeGen/X86/avx-intrinsics-x86.ll        |   2 +-
 test/CodeGen/X86/avx-movdup.ll                |  53 --
 test/CodeGen/X86/avx-shuffle.ll               | 336 --------
 test/CodeGen/X86/avx-splat.ll                 |  11 +-
 test/CodeGen/X86/avx-vmovddup.ll              |  14 -
 test/CodeGen/X86/avx-vperm2x128.ll            | 177 +++--
 test/CodeGen/X86/avx-vpermil.ll               |  54 --
 test/CodeGen/X86/avx-vshufp.ll                | 157 ----
 test/CodeGen/X86/avx2-palignr.ll              |  57 --
 test/CodeGen/X86/avx2-shuffle.ll              | 127 ---
 test/CodeGen/X86/avx2-unpack.ll               |  86 --
 test/CodeGen/X86/avx2-vbroadcast.ll           |   2 +-
 test/CodeGen/X86/avx512-arith.ll              |   5 +-
 test/CodeGen/X86/avx512-build-vector.ll       |  11 +-
 test/CodeGen/X86/avx512-shuffle.ll            | 362 ---------
 test/CodeGen/X86/avx512-vbroadcast.ll         |  22 +-
 test/CodeGen/X86/avx512-vec-cmp.ll            |  24 +-
 test/CodeGen/X86/combine-or.ll                |  68 +-
 test/CodeGen/X86/exedepsfix-broadcast.ll      |   5 +-
 test/CodeGen/X86/extractelement-load.ll       |   6 +-
 test/CodeGen/X86/fp-load-trunc.ll             |   6 +-
 test/CodeGen/X86/fp-trunc.ll                  |   6 +-
 test/CodeGen/X86/palignr.ll                   | 113 +--
 test/CodeGen/X86/pmul.ll                      |  24 +-
 test/CodeGen/X86/pr11334.ll                   |   8 +-
 test/CodeGen/X86/pr12359.ll                   |  10 -
 test/CodeGen/X86/sincos-opt.ll                |   2 +-
 test/CodeGen/X86/splat-scalar-load.ll         |  17 -
 test/CodeGen/X86/sse-align-12.ll              |   4 +-
 test/CodeGen/X86/sse-scalar-fp-arith.ll       |   3 -
 test/CodeGen/X86/sse1.ll                      |   4 +-
 test/CodeGen/X86/sse2-mul.ll                  |  14 -
 test/CodeGen/X86/sse2.ll                      |  50 +-
 test/CodeGen/X86/sse3.ll                      | 109 +--
 test/CodeGen/X86/sse41.ll                     | 110 +--
 test/CodeGen/X86/swizzle-2.ll                 |  54 +-
 test/CodeGen/X86/swizzle.ll                   |  19 -
 test/CodeGen/X86/trunc-ext-ld-st.ll           |   2 +-
 test/CodeGen/X86/uint_to_fp-2.ll              |   4 +-
 test/CodeGen/X86/v2f32.ll                     |   4 +-
 test/CodeGen/X86/vec_cast2.ll                 |  55 +-
 test/CodeGen/X86/vec_extract-sse4.ll          |  41 +-
 test/CodeGen/X86/vec_extract.ll               |  43 +-
 test/CodeGen/X86/vec_insert-5.ll              |  26 +-
 test/CodeGen/X86/vec_set-3.ll                 |  21 +-
 test/CodeGen/X86/vec_set-5.ll                 |  28 -
 test/CodeGen/X86/vec_set-9.ll                 |  14 -
 test/CodeGen/X86/vec_set-E.ll                 |   9 -
 test/CodeGen/X86/vec_set-I.ll                 |  13 -
 test/CodeGen/X86/vec_set-J.ll                 |  10 -
 test/CodeGen/X86/vec_shuffle-11.ll            |  11 -
 test/CodeGen/X86/vec_shuffle-14.ll            |  70 --
 test/CodeGen/X86/vec_shuffle-15.ll            |  81 --
 test/CodeGen/X86/vec_shuffle-16.ll            |  43 -
 test/CodeGen/X86/vec_shuffle-17.ll            |  16 -
 test/CodeGen/X86/vec_shuffle-18.ll            |  25 -
 test/CodeGen/X86/vec_shuffle-19.ll            |   9 -
 test/CodeGen/X86/vec_shuffle-20.ll            |   8 -
 test/CodeGen/X86/vec_shuffle-22.ll            |  15 -
 test/CodeGen/X86/vec_shuffle-23.ll            |  18 -
 test/CodeGen/X86/vec_shuffle-24.ll            |  18 -
 test/CodeGen/X86/vec_shuffle-25.ll            |  34 -
 test/CodeGen/X86/vec_shuffle-26.ll            |  68 --
 test/CodeGen/X86/vec_shuffle-27.ll            |  38 -
 test/CodeGen/X86/vec_shuffle-28.ll            |  14 -
 test/CodeGen/X86/vec_shuffle-30.ll            |  26 -
 test/CodeGen/X86/vec_shuffle-31.ll            |   8 -
 test/CodeGen/X86/vec_shuffle-34.ll            |   7 -
 test/CodeGen/X86/vec_shuffle-35.ll            |  20 -
 test/CodeGen/X86/vec_shuffle-36.ll            |  16 -
 test/CodeGen/X86/vec_shuffle-37.ll            |  47 --
 test/CodeGen/X86/vec_shuffle-38.ll            |  77 --
 test/CodeGen/X86/vec_shuffle-39.ll            |  86 --
 test/CodeGen/X86/vec_shuffle-40.ll            |  22 -
 test/CodeGen/X86/vec_shuffle-41.ll            |  21 -
 test/CodeGen/X86/vec_shuffle.ll               |  50 --
 test/CodeGen/X86/vec_splat-2.ll               |  33 -
 test/CodeGen/X86/vec_splat-3.ll               | 201 -----
 test/CodeGen/X86/vec_splat.ll                 |  68 --
 test/CodeGen/X86/vector-blend.ll              | 152 ++--
 test/CodeGen/X86/vector-idiv.ll               | 328 ++++----
 test/CodeGen/X86/vector-sext.ll               | 187 ++---
 test/CodeGen/X86/vector-shuffle-combining.ll  | 746 +++++++++---------
 test/CodeGen/X86/vector-zext.ll               |  44 +-
 test/CodeGen/X86/vselect.ll                   |   4 +-
 test/CodeGen/X86/widen_shuffle-1.ll           |  13 +-
 91 files changed, 1334 insertions(+), 3776 deletions(-)
 delete mode 100644 test/CodeGen/X86/2008-06-18-BadShuffle.ll
 delete mode 100644 test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
 delete mode 100644 test/CodeGen/X86/avx-movdup.ll
 delete mode 100644 test/CodeGen/X86/avx-shuffle.ll
 delete mode 100644 test/CodeGen/X86/avx-vmovddup.ll
 delete mode 100644 test/CodeGen/X86/avx-vpermil.ll
 delete mode 100644 test/CodeGen/X86/avx-vshufp.ll
 delete mode 100644 test/CodeGen/X86/avx2-palignr.ll
 delete mode 100644 test/CodeGen/X86/avx2-shuffle.ll
 delete mode 100644 test/CodeGen/X86/avx2-unpack.ll
 delete mode 100644 test/CodeGen/X86/avx512-shuffle.ll
 delete mode 100644 test/CodeGen/X86/pr12359.ll
 delete mode 100644 test/CodeGen/X86/splat-scalar-load.ll
 delete mode 100644 test/CodeGen/X86/sse2-mul.ll
 delete mode 100644 test/CodeGen/X86/swizzle.ll
 delete mode 100644 test/CodeGen/X86/vec_set-5.ll
 delete mode 100644 test/CodeGen/X86/vec_set-9.ll
 delete mode 100644 test/CodeGen/X86/vec_set-E.ll
 delete mode 100644 test/CodeGen/X86/vec_set-I.ll
 delete mode 100644 test/CodeGen/X86/vec_set-J.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-11.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-14.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-15.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-16.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-17.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-18.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-19.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-20.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-22.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-23.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-24.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-25.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-26.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-27.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-28.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-30.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-31.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-34.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-35.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-36.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-37.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-38.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-39.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-40.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle-41.ll
 delete mode 100644 test/CodeGen/X86/vec_shuffle.ll
 delete mode 100644 test/CodeGen/X86/vec_splat-2.ll
 delete mode 100644 test/CodeGen/X86/vec_splat-3.ll
 delete mode 100644 test/CodeGen/X86/vec_splat.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ef5592fe033..c09c38a31a1 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -67,7 +67,7 @@ static cl::opt ExperimentalVectorWideningLegalization(
     cl::Hidden);
 
 static cl::opt ExperimentalVectorShuffleLowering(
-    "x86-experimental-vector-shuffle-lowering", cl::init(false),
+    "x86-experimental-vector-shuffle-lowering", cl::init(true),
     cl::desc("Enable an experimental vector shuffle lowering code path."),
     cl::Hidden);
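A note for readers skimming the test churn below: the regenerated tests
use the exact-sequence FileCheck style, with a CHECK-LABEL per function
and CHECK-NEXT lines spelling out the expected assembly. A minimal
sketch of the shape, assuming an SSE2 target (the function name and
triple are illustrative, not taken from this patch):

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s
define <4 x i32> @reverse(<4 x i32> %a) {
; CHECK-LABEL: reverse:
; CHECK:       ## BB#0:
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT:    retq
  ; Reversing the four elements is a single pshufd with immediate 27.
  %r = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i32> %r
}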
diff --git a/test/CodeGen/X86/2008-06-18-BadShuffle.ll b/test/CodeGen/X86/2008-06-18-BadShuffle.ll
deleted file mode 100644
index 66f9065799e..00000000000
--- a/test/CodeGen/X86/2008-06-18-BadShuffle.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=i386 -mattr=+sse2 | grep pinsrw
-
-; Test to make sure we actually insert the bottom element of the vector
-define <8 x i16> @a(<8 x i16> %a) nounwind {
-entry:
-  %0 = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> < i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8 >
-  %add = add <8 x i16> %0, %a
-  ret <8 x i16> %add
-}
-
diff --git a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll b/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
deleted file mode 100644
index e1930e012dd..00000000000
--- a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc -mtriple=i386-apple-darwin10.0 -relocation-model=pic -asm-verbose=false \
-; RUN:   -mcpu=generic -disable-fp-elim -mattr=-sse4.1,-sse3,+sse2 -post-RA-scheduler=false -regalloc=basic < %s | \
-; RUN:   FileCheck %s
-; rdar://6808032
-
-; CHECK: pextrw $14
-; CHECK-NEXT: shrl $8
-; CHECK-NEXT: pinsrw
-
-define void @update(i8** %args_list) nounwind {
-entry:
-  %cmp.i = icmp eq i32 0, 0 ; [#uses=1]
-  br i1 %cmp.i, label %if.then.i, label %test_cl.exit
-
-if.then.i: ; preds = %entry
-  %val = load <16 x i8> addrspace(1)* null ; <<16 x i8>> [#uses=8]
-  %tmp10.i = shufflevector <16 x i8> , <16 x i8> %val, <16 x i32> ; <<16 x i8>> [#uses=1]
-  %tmp17.i = shufflevector <16 x i8> %tmp10.i, <16 x i8> %val, <16 x i32> ; <<16 x i8>> [#uses=1]
-  %tmp24.i = shufflevector <16 x i8> %tmp17.i, <16 x i8> %val, <16 x i32> ; <<16 x i8>> [#uses=1]
-  %tmp31.i = shufflevector <16 x i8> %tmp24.i, <16 x i8> %val, <16 x i32> ; <<16 x i8>> [#uses=1]
-  %tmp38.i = shufflevector <16 x i8> %tmp31.i, <16 x i8> %val, <16 x i32> ; <<16 x i8>> [#uses=1]
-  %tmp45.i = shufflevector <16 x i8> %tmp38.i, <16 x i8> %val, <16 x i32> ; <<16 x i8>> [#uses=1]
-  %tmp52.i = shufflevector <16 x i8> %tmp45.i, <16 x i8> %val, <16 x i32> ; <<16 x i8>> [#uses=1]
-  %tmp59.i = shufflevector <16 x i8> %tmp52.i, <16 x i8> %val, <16 x i32> ; <<16 x i8>> [#uses=1]
-  store <16 x i8> %tmp59.i, <16 x i8> addrspace(1)* null
-  ret void
-
-test_cl.exit: ; preds = %entry
-  ret void
-}
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 16706ae957f..6651af70555 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -8,7 +8,7 @@
 ;CHECK: vpxor
 ;CHECK: vinserti128
 ;CHECK: vpshufd
-;CHECK: vpshufd
+;CHECK: vpbroadcastd
 ;CHECK: vmulps
 ;CHECK: vmulps
 ;CHECK: ret
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index a8dae82a8be..02ea173c803 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -51,46 +51,6 @@ entry:
   ret <4 x i64> %shuffle
 }
 
-;;;
-;;; Check that some 256-bit vectors are xformed into 128 ops
-; CHECK: _A
-; CHECK: vshufpd $1
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vshufpd $1
-; CHECK-NEXT: vinsertf128 $1
-define <4 x i64> @A(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> 
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: _B
-; CHECK: vshufpd $1, %ymm
-define <4 x i64> @B(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> 
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: vmovlhps
-; CHECK-NEXT: vextractf128 $1
-; CHECK-NEXT: vmovlhps
-; CHECK-NEXT: vinsertf128 $1
-define <4 x i64> @C(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> 
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: vpshufd $-96
-; CHECK: vpshufd $-6
-; CHECK: vinsertf128 $1
-define <8 x i32> @D(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> 
-  ret <8 x i32> %shuffle
-}
-
 ;;; Don't crash on movd
 ; CHECK: _VMOVZQI2PQI
 ; CHECK: vmovd (%
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index 850d6f2a4a3..a957d10cc2b 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -2324,7 +2324,7 @@ declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8) nounwind rea
 
 
 define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) {
-  ; CHECK: vpshufd
+  ; CHECK: vpermilps
   %res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
diff --git a/test/CodeGen/X86/avx-movdup.ll b/test/CodeGen/X86/avx-movdup.ll
deleted file mode 100644
index 19a03259378..00000000000
--- a/test/CodeGen/X86/avx-movdup.ll
+++ /dev/null
@@ -1,53 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vmovsldup
-define <8 x float> @movdupA(<8 x float> %src) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <8 x float> %src, <8 x float> undef, <8 x i32> 
-  ret <8 x float> %shuffle.i
-}
-
-; CHECK: vmovshdup
-define <8 x float> @movdupB(<8 x float> %src) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <8 x float> %src, <8 x float> undef, <8 x i32> 
-  ret <8 x float> %shuffle.i
-}
-
-; CHECK: vmovsldup
-define <4 x i64> @movdupC(<4 x i64> %src) nounwind uwtable readnone ssp {
-entry:
-  %0 = bitcast <4 x i64> %src to <8 x float>
-  %shuffle.i = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> 
-  %1 = bitcast <8 x float> %shuffle.i to <4 x i64>
-  ret <4 x i64> %1
-}
-
-; CHECK: vmovshdup
-define <4 x i64> @movdupD(<4 x i64> %src) nounwind uwtable readnone ssp {
-entry:
-  %0 = bitcast <4 x i64> %src to <8 x float>
-  %shuffle.i = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> 
-  %1 = bitcast <8 x float> %shuffle.i to <4 x i64>
-  ret <4 x i64> %1
-}
-
-; Check that there is a pattern (v2f64 (X86VBroadcast f64:$src)).
-
-; CHECK-LABEL: _vbroadcast_v128_f64
-; CHECK: vmovsd LCPI{{[0-9]+}}_0(%rip), %xmm[[R0:[0-9]+]]
-; CHECK: vmovddup %xmm[[R0]], %xmm{{[0-9]+}}
-
-@E1 = external global [5 x double], align 16
-@.str3 = external unnamed_addr constant [44 x i8], align 1
-
-define void @vbroadcast_v128_f64() #0 {
-entry:
-  store <2 x double> , <2 x double>* bitcast (double* getelementptr inbounds ([5 x double]* @E1, i64 0, i64 2) to <2 x double>*), align 16
-  tail call void @foo1(double -1.000000e+00)
-  ret void
-}
-
-declare void @foo1(double)
-
-attributes #0 = { optsize }
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
deleted file mode 100644
index 4a996d79815..00000000000
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ /dev/null
@@ -1,336 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; PR11102
-define <4 x float> @test1(<4 x float> %a) nounwind {
-  %b = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> 
-  ret <4 x float> %b
-; CHECK-LABEL: test1:
-;; TODO: This test could be improved by removing the xor instruction and
-;; having vinsertps zero out the needed elements.
-; CHECK: vxorps
-; CHECK: vinsertps
-}
-
-; rdar://10538417
-define <3 x i64> @test2(<2 x i64> %v) nounwind readnone {
-; CHECK-LABEL: test2:
-; CHECK: vinsertf128
-  %1 = shufflevector <2 x i64> %v, <2 x i64> %v, <3 x i32> 
-  %2 = shufflevector <3 x i64> zeroinitializer, <3 x i64> %1, <3 x i32> 
-  ret <3 x i64> %2
-; CHECK: ret
-}
-
-define <4 x i64> @test3(<4 x i64> %a, <4 x i64> %b) nounwind {
-  %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> 
-  ret <4 x i64> %c
-; CHECK-LABEL: test3:
-; CHECK: vblendpd
-; CHECK: ret
-}
-
-define <8 x float> @test4(float %a) nounwind {
-  %b = insertelement <8 x float> zeroinitializer, float %a, i32 0
-  ret <8 x float> %b
-; CHECK-LABEL: test4:
-; CHECK: vinsertf128
-}
-
-; rdar://10594409
-define <8 x float> @test5(float* nocapture %f) nounwind uwtable readonly ssp {
-entry:
-  %0 = bitcast float* %f to <4 x float>*
-  %1 = load <4 x float>* %0, align 16
-; CHECK: test5
-; CHECK: vmovaps
-; CHECK-NOT: vxorps
-; CHECK-NOT: vinsertf128
-  %shuffle.i = shufflevector <4 x float> %1, <4 x float> , <8 x i32> 
-  ret <8 x float> %shuffle.i
-}
-
-define <4 x double> @test6(double* nocapture %d) nounwind uwtable readonly ssp {
-entry:
-  %0 = bitcast double* %d to <2 x double>*
-  %1 = load <2 x double>* %0, align 16
-; CHECK: test6
-; CHECK: vmovaps
-; CHECK-NOT: vxorps
-; CHECK-NOT: vinsertf128
-  %shuffle.i = shufflevector <2 x double> %1, <2 x double> , <4 x i32> 
-  ret <4 x double> %shuffle.i
-}
-
-define <16 x i16> @test7(<4 x i16> %a) nounwind {
-; CHECK: test7
-  %b = shufflevector <4 x i16> %a, <4 x i16> undef, <16 x i32> 
-; CHECK: ret
-  ret <16 x i16> %b
-}
-
-; CHECK: test8
-define void @test8() {
-entry:
-  %0 = load <16 x i64> addrspace(1)* null, align 128
-  %1 = shufflevector <16 x i64> , <16 x i64> %0, <16 x i32> 
-  %2 = shufflevector <16 x i64> %1, <16 x i64> %0, <16 x i32> 
-  store <16 x i64> %2, <16 x i64> addrspace(1)* undef, align 128
-; CHECK: ret
-  ret void
-}
-
-; Extract a value from a shufflevector..
-define i32 @test9(<4 x i32> %a) nounwind {
-; CHECK: test9
-; CHECK: vpextrd
-  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> 
-  %r = extractelement <8 x i32> %b, i32 2
-; CHECK: ret
-  ret i32 %r
-}
-
-; Extract a value which is the result of an undef mask.
-define i32 @test10(<4 x i32> %a) nounwind {
-; CHECK: @test10
-; CHECK-NOT: {{^[^#]*[a-z]}}
-; CHECK: ret
-  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> 
-  %r = extractelement <8 x i32> %b, i32 2
-  ret i32 %r
-}
-
-define <4 x float> @test11(<4 x float> %a) nounwind {
-; CHECK: test11
-; CHECK: vpshufd $27
-  %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> 
-  ret <4 x float> %tmp1
-}
-
-define <4 x float> @test12(<4 x float>* %a) nounwind {
-; CHECK: test12
-; CHECK: vpshufd
-  %tmp0 = load <4 x float>* %a
-  %tmp1 = shufflevector <4 x float> %tmp0, <4 x float> undef, <4 x i32> 
-  ret <4 x float> %tmp1
-}
-
-define <4 x i32> @test13(<4 x i32> %a) nounwind {
-; CHECK: test13
-; CHECK: vpshufd $27
-  %tmp1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> 
-  ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test14(<4 x i32>* %a) nounwind {
-; CHECK: test14
-; CHECK: vpshufd $27, (
-  %tmp0 = load <4 x i32>* %a
-  %tmp1 = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> 
-  ret <4 x i32> %tmp1
-}
-
-; CHECK: test15
-; CHECK: vpshufd $8
-; CHECK: ret
-define <4 x i32> @test15(<2 x i32>%x) nounwind readnone {
-  %x1 = shufflevector <2 x i32> %x, <2 x i32> undef, <4 x i32> 
-  ret <4 x i32>%x1
-}
-
-; rdar://10974078
-define <8 x float> @test16(float* nocapture %f) nounwind uwtable readonly ssp {
-entry:
-  %0 = bitcast float* %f to <4 x float>*
-  %1 = load <4 x float>* %0, align 8
-; CHECK: test16
-; CHECK: vmovups
-; CHECK-NOT: vxorps
-; CHECK-NOT: vinsertf128
-  %shuffle.i = shufflevector <4 x float> %1, <4 x float> , <8 x i32> 
-  ret <8 x float> %shuffle.i
-}
-
-; PR12413
-; CHECK: shuf1
-; CHECK: vpshufb
-; CHECK: vpshufb
-; CHECK: vpshufb
-; CHECK: vpshufb
-define <32 x i8> @shuf1(<32 x i8> %inval1, <32 x i8> %inval2) {
-entry:
-  %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> 
-  ret <32 x i8> %0
-}
-
-; handle the case where only half of the 256-bits is splittable
-; CHECK: shuf2
-; CHECK: vpshufb
-; CHECK: vpshufb
-; CHECK: vpextrb
-; CHECK: vpextrb
-define <32 x i8> @shuf2(<32 x i8> %inval1, <32 x i8> %inval2) {
-entry:
-  %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> 
-  ret <32 x i8> %0
-}
-
-; CHECK: blend1
-; CHECK: vblendps
-; CHECK: ret
-define <4 x i32> @blend1(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> 
-  ret <4 x i32> %t
-}
-
-; CHECK: blend2
-; CHECK: vblendps
-; CHECK: ret
-define <4 x i32> @blend2(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> 
-  ret <4 x i32> %t
-}
-
-; CHECK: blend2a
-; CHECK: vblendps
-; CHECK: ret
-define <4 x float> @blend2a(<4 x float> %a, <4 x float> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> 
-  ret <4 x float> %t
-}
-
-; CHECK: blend3
-; CHECK-NOT: vblendps
-; CHECK: ret
-define <4 x i32> @blend3(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> 
-  ret <4 x i32> %t
-}
-
-; CHECK: blend4
-; CHECK: vblendpd
-; CHECK: ret
-define <4 x i64> @blend4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> 
-  ret <4 x i64> %t
-}
-
-; CHECK: narrow
-; CHECK: vpermilps
-; CHECK: ret
-define <16 x i16> @narrow(<16 x i16> %a) nounwind alwaysinline {
-  %t = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> 
-  ret <16 x i16> %t
-}
-
-;CHECK-LABEL: test17:
-;CHECK-NOT: vinsertf128
-;CHECK: ret
-define <8 x float> @test17(<4 x float> %y) {
-  %x = shufflevector <4 x float> %y, <4 x float> undef, <8 x i32> 
-  ret <8 x float> %x
-}
-
-; CHECK: test18
-; CHECK: vmovshdup
-; CHECK: vblendps
-; CHECK: ret
-define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
-  %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> 
-  ret <8 x float>%S
-}
-
-; CHECK: test19
-; CHECK: vmovsldup
-; CHECK: vblendps
-; CHECK: ret
-define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
-  %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> 
-  ret <8 x float>%S
-}
-
-; rdar://12684358
-; Make sure loads happen before stores.
-; CHECK: swap8doubles
-; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
-; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
-; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}}
-; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}}
-; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
-; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
-; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi)
-; CHECK: vextractf128
-; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi)
-; CHECK: vextractf128
-; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi)
-; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi)
-define void @swap8doubles(double* nocapture %A, double* nocapture %C) nounwind uwtable ssp {
-entry:
-  %add.ptr = getelementptr inbounds double* %A, i64 2
-  %v.i = bitcast double* %A to <2 x double>*
-  %0 = load <2 x double>* %v.i, align 1
-  %shuffle.i.i = shufflevector <2 x double> %0, <2 x double> , <4 x i32> 
-  %v1.i = bitcast double* %add.ptr to <2 x double>*
-  %1 = load <2 x double>* %v1.i, align 1
-  %2 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i, <2 x double> %1, i8 1) nounwind
-  %add.ptr1 = getelementptr inbounds double* %A, i64 6
-  %add.ptr2 = getelementptr inbounds double* %A, i64 4
-  %v.i27 = bitcast double* %add.ptr2 to <2 x double>*
-  %3 = load <2 x double>* %v.i27, align 1
-  %shuffle.i.i28 = shufflevector <2 x double> %3, <2 x double> , <4 x i32> 
-  %v1.i29 = bitcast double* %add.ptr1 to <2 x double>*
-  %4 = load <2 x double>* %v1.i29, align 1
-  %5 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i28, <2 x double> %4, i8 1) nounwind
-  %6 = bitcast double* %C to <4 x double>*
-  %7 = load <4 x double>* %6, align 32
-  %add.ptr5 = getelementptr inbounds double* %C, i64 4
-  %8 = bitcast double* %add.ptr5 to <4 x double>*
-  %9 = load <4 x double>* %8, align 32
-  %shuffle.i26 = shufflevector <4 x double> %7, <4 x double> undef, <2 x i32> 
-  %10 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %7, i8 1)
-  %shuffle.i = shufflevector <4 x double> %9, <4 x double> undef, <2 x i32> 
-  %11 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %9, i8 1)
-  store <2 x double> %shuffle.i26, <2 x double>* %v.i, align 16
-  store <2 x double> %10, <2 x double>* %v1.i, align 16
-  store <2 x double> %shuffle.i, <2 x double>* %v.i27, align 16
-  store <2 x double> %11, <2 x double>* %v1.i29, align 16
-  store <4 x double> %2, <4 x double>* %6, align 32
-  store <4 x double> %5, <4 x double>* %8, align 32
-  ret void
-}
-declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
-declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
-
-; this test case just should not fail
-define void @test20() {
-  %a0 = insertelement <3 x double> , double 0.000000e+00, i32 2
-  store <3 x double> %a0, <3 x double>* undef, align 1
-  %a1 = insertelement <3 x double> , double undef, i32 2
-  store <3 x double> %a1, <3 x double>* undef, align 1
-  ret void
-}
-
-define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
-; CHECK-LABEL: test_insert_64_zext
-; CHECK-NOT: xor
-; CHECK: vmovq
-  %1 = shufflevector <2 x i64> %i, <2 x i64> , <2 x i32> 
-  ret <2 x i64> %1
-}
-
-;; Ensure we don't use insertps from non v4x32 vectors.
-;; On SSE4.1 it works because bigger vectors use more than 1 register.
-;; On AVX they get passed in a single register.
-;; FIXME: We could probably optimize this case, if we're only using the
-;; first 4 indices.
-define <4 x i32> @insert_from_diff_size(<8 x i32> %x) {
-; CHECK-LABEL: insert_from_diff_size:
-; CHECK-NOT: insertps
-; CHECK: ret
-  %vecext = extractelement <8 x i32> %x, i32 0
-  %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
-  %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
-  %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
-  %a.0 = extractelement <8 x i32> %x, i32 0
-  %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a.0, i32 3
-  ret <4 x i32> %vecinit3
-}
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index 058db314d28..98c1645b908 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -9,8 +9,7 @@ entry:
   ret <32 x i8> %shuffle
 }
 
-; CHECK: vpunpckhwd %xmm
-; CHECK-NEXT: vpshufd $85
+; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11]
 ; CHECK-NEXT: vinsertf128 $1
 define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
 entry:
@@ -19,7 +18,7 @@ entry:
 }
 
 ; CHECK: vmovq
-; CHECK-NEXT: vmovlhps %xmm
+; CHECK-NEXT: vunpcklpd %xmm
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
 entry:
@@ -70,7 +69,7 @@ __load_and_broadcast_32.exit1249: ; preds = %load.i1247, %for_ex
   ret <8 x float> %load_broadcast12281250
 }
 
-; CHECK: vpshufd $0
+; CHECK: vpermilps $4
 ; CHECK-NEXT: vinsertf128 $1
 define <8 x float> @funcF(i32 %val) nounwind {
   %ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
@@ -79,7 +78,7 @@ define <8 x float> @funcF(i32 %val) nounwind {
   ret <8 x float> %tmp
 }
 
-; CHECK: vpshufd $0
+; CHECK: vpermilps $0
 ; CHECK-NEXT: vinsertf128 $1
 define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {
 entry:
@@ -88,7 +87,7 @@ entry:
 }
 
 ; CHECK: vextractf128 $1
-; CHECK-NEXT: vpshufd
+; CHECK-NEXT: vpermilps $85
 ; CHECK-NEXT: vinsertf128 $1
 define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
 entry:
diff --git a/test/CodeGen/X86/avx-vmovddup.ll b/test/CodeGen/X86/avx-vmovddup.ll
deleted file mode 100644
index 1c56fe2b1a0..00000000000
--- a/test/CodeGen/X86/avx-vmovddup.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vmovddup %ymm
-define <4 x i64> @A(<4 x i64> %a) {
-  %c = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> 
-  ret <4 x i64> %c
-}
-
-; CHECK: vmovddup (%
-define <4 x i64> @B(<4 x i64>* %ptr) {
-  %a = load <4 x i64>* %ptr
-  %c = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> 
-  ret <4 x i64> %c
-}
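The avx-vperm2x128.ll changes that follow all revolve around shuffles
that move whole 128-bit lanes. As a reference point, here is a minimal
sketch of the core pattern, assuming an AVX1 target; the function name
is illustrative and not part of this patch:

; Swapping the two 128-bit halves of a 256-bit vector is a single
; vperm2f128, which is what the updated AVX1 checks below expect.
define <4 x double> @swap_halves(<4 x double> %a) {
; CHECK-LABEL: swap_halves:
; CHECK: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
  %s = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  ret <4 x double> %s
}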
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
index 9d5be4ae28d..a8e5d885443 100644
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -2,10 +2,15 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
 
 define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: A:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; ALL-NEXT: retq
+; AVX1-LABEL: A:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: A:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: retq
 entry:
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> 
   ret <8 x float> %shuffle
@@ -14,7 +19,7 @@ entry:
 define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
 ; ALL-LABEL: B:
 ; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; ALL-NEXT: retq
 entry:
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> 
@@ -22,40 +27,66 @@ entry:
 }
 
 define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: C:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; ALL-NEXT: retq
+; AVX1-LABEL: C:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: C:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: retq
 entry:
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> 
   ret <8 x float> %shuffle
 }
 
 define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: D:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: D:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: D:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT: retq
 entry:
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> 
   ret <8 x float> %shuffle
 }
 
 define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: E:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: E:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: E:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT: retq
 entry:
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> 
   ret <32 x i8> %shuffle
 }
 
 define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: E2:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
-; ALL-NEXT: retq
+; AVX1-LABEL: E2:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: E2:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: retq
 entry:
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> 
   ret <4 x i64> %shuffle
@@ -64,18 +95,15 @@ entry:
 define <32 x i8> @Ei(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
 ; AVX1-LABEL: Ei:
 ; AVX1: ## BB#0: ## %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: Ei:
 ; AVX2: ## BB#0: ## %entry
-; AVX2-NEXT: vpaddb {{.*}}, %ymm0, %ymm0
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX2-NEXT: retq
 entry:
   ; add forces execution domain
@@ -87,19 +115,19 @@ entry:
 define <4 x i64> @E2i(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
 ; AVX1-LABEL: E2i:
 ; AVX1: ## BB#0: ## %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: E2i:
 ; AVX2: ## BB#0: ## %entry
-; AVX2-NEXT: vpbroadcastq {{.*}}, %ymm2
+; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT: retq
 entry:
   ; add forces execution domain
@@ -111,19 +139,17 @@ entry:
 define <8 x i32> @E3i(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
 ; AVX1-LABEL: E3i:
 ; AVX1: ## BB#0: ## %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
-; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: E3i:
 ; AVX2: ## BB#0: ## %entry
-; AVX2-NEXT: vpbroadcastd {{.*}}, %ymm2
+; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT: retq
 entry:
   ; add forces execution domain
@@ -135,18 +161,16 @@ entry:
 define <16 x i16> @E4i(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
 ; AVX1-LABEL: E4i:
 ; AVX1: ## BB#0: ## %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
+; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: E4i:
 ; AVX2: ## BB#0: ## %entry
-; AVX2-NEXT: vpaddw {{.*}}, %ymm0, %ymm0
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm0[0,1]
+; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT: retq
 entry:
   ; add forces execution domain
@@ -158,20 +182,20 @@ entry:
 define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
 ; AVX1-LABEL: E5i:
 ; AVX1: ## BB#0: ## %entry
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovapd (%rsi), %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: E5i:
 ; AVX2: ## BB#0: ## %entry
 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vpaddw {{.*}}, %ymm0, %ymm0
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
+; AVX2-NEXT: vmovdqa (%rsi), %ymm1
+; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX2-NEXT: retq
 entry:
   %c = load <16 x i16>* %a
@@ -184,10 +208,19 @@ entry:
 ;;;; Cases with undef indicies mixed in the mask
 
 define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: F:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; ALL-NEXT: retq
+; AVX1-LABEL: F:
+; AVX1: ## BB#0: ## %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: F:
+; AVX2: ## BB#0: ## %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,1]
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
+; AVX2-NEXT: retq
 entry:
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> 
   ret <8 x float> %shuffle
@@ -199,17 +232,15 @@ define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone
 ; AVX1-LABEL: G:
 ; AVX1: ## BB#0: ## %entry
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: G:
 ; AVX2: ## BB#0: ## %entry
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7]
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
 ; AVX2-NEXT: retq
 entry:
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> 
diff --git a/test/CodeGen/X86/avx-vpermil.ll b/test/CodeGen/X86/avx-vpermil.ll
deleted file mode 100644
index b7f8d72e58c..00000000000
--- a/test/CodeGen/X86/avx-vpermil.ll
+++ /dev/null
@@ -1,54 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vpermilps
-define <8 x float> @funcA(<8 x float> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> 
-  ret <8 x float> %shuffle
-}
-
-; CHECK: vpermilpd
-define <4 x double> @funcB(<4 x double> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> 
-  ret <4 x double> %shuffle
-}
-
-; CHECK: vpermilps
-define <8 x i32> @funcC(<8 x i32> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> 
-  ret <8 x i32> %shuffle
-}
-
-; CHECK: vpermilpd
-define <4 x i64> @funcD(<4 x i64> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> 
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: vpermilpd
-define <4 x i64> @funcQ(<4 x i64>* %a) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <4 x i64>* %a
-  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> undef, <4 x i32> 
-  ret <4 x i64> %shuffle
-}
-
-; vpermil should match masks like this: . Check that the
-; target specific mask was correctly generated.
-; CHECK: vpermilps $-100
-define <8 x float> @funcE(<8 x float> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> 
-  ret <8 x float> %shuffle
-}
-
-; CHECK: palignr $8
-; CHECK: palignr $8
-define <8 x float> @funcF(<8 x float> %a) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> 
-  ret <8 x float> %shuffle
-}
diff --git a/test/CodeGen/X86/avx-vshufp.ll b/test/CodeGen/X86/avx-vshufp.ll
deleted file mode 100644
index ad3dbc1ed89..00000000000
--- a/test/CodeGen/X86/avx-vshufp.ll
+++ /dev/null
@@ -1,157 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-
-; CHECK: vshufps $-53, %ymm
-define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> 
-  ret <8 x float> %shuffle
-}
-
-; CHECK: vshufps $-53, (%{{.*}}), %ymm
-define <8 x float> @A2(<8 x float>* %a, <8 x float>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <8 x float>* %a
-  %b2 = load <8 x float>* %b
-  %shuffle = shufflevector <8 x float> %a2, <8 x float> %b2, <8 x i32> 
-  ret <8 x float> %shuffle
-}
-
-; CHECK: vshufps $-53, %ymm
-define <8 x i32> @A3(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> 
-  ret <8 x i32> %shuffle
-}
-
-; CHECK: vshufps $-53, (%{{.*}}), %ymm
-define <8 x i32> @A4(<8 x i32>* %a, <8 x i32>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <8 x i32>* %a
-  %b2 = load <8 x i32>* %b
-  %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b2, <8 x i32> 
-  ret <8 x i32> %shuffle
-}
-
-; CHECK: vblendpd $10, %ymm
-define <4 x double> @B(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> 
-  ret <4 x double> %shuffle
-}
-
-; CHECK: vblendpd $10, (%{{.*}}), %ymm
-define <4 x double> @B2(<4 x double>* %a, <4 x double>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <4 x double>* %a
-  %b2 = load <4 x double>* %b
-  %shuffle = shufflevector <4 x double> %a2, <4 x double> %b2, <4 x i32> 
-  ret <4 x double> %shuffle
-}
-
-; CHECK: vblendpd $10, %ymm
-define <4 x i64> @B3(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> 
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: vblendpd $10, (%{{.*}}), %ymm
-define <4 x i64> @B4(<4 x i64>* %a, <4 x i64>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <4 x i64>* %a
-  %b2 = load <4 x i64>* %b
-  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b2, <4 x i32> 
-  ret <4 x i64> %shuffle
-}
-
-; CHECK: vshufps $-53, %ymm
-define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> 
-  ret <8 x float> %shuffle
-}
-
-; CHECK: vblendpd $2, %ymm
-define <4 x double> @D(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> 
-  ret <4 x double> %shuffle
-}
-
-; CHECK: vshufps $-55, %ymm
-define <8 x float> @E(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> 
-  ret <8 x float> %shuffle
-}
-
-; CHECK: vshufpd $8, %ymm
-define <4 x double> @F(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> 
-  ret <4 x double> %shuffle
-}
-
-; CHECK: vshufps $-53, %xmm
-define <4 x float> @A128(<4 x float> %a, <4 x float> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> 
-  ret <4 x float> %shuffle
-}
-
-; CHECK: vshufps $-53, (%{{.*}}), %xmm
-define <4 x float> @A2128(<4 x float>* %a, <4 x float>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <4 x float>* %a
-  %b2 = load <4 x float>* %b
-  %shuffle = shufflevector <4 x float> %a2, <4 x float> %b2, <4 x i32> 
-  ret <4 x float> %shuffle
-}
-
-; CHECK: vshufps $-53, %xmm
-define <4 x i32> @A3128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> 
-  ret <4 x i32> %shuffle
-}
-
-; CHECK: vshufps $-53, (%{{.*}}), %xmm
-define <4 x i32> @A4128(<4 x i32>* %a, <4 x i32>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <4 x i32>* %a
-  %b2 = load <4 x i32>* %b
-  %shuffle = shufflevector <4 x i32> %a2, <4 x i32> %b2, <4 x i32> 
-  ret <4 x i32> %shuffle
-}
-
-; CHECK: vshufpd $1, %xmm
-define <2 x double> @B128(<2 x double> %a, <2 x double> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> 
-  ret <2 x double> %shuffle
-}
-
-; CHECK: vshufpd $1, (%{{.*}}), %xmm
-define <2 x double> @B2128(<2 x double>* %a, <2 x double>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <2 x double>* %a
-  %b2 = load <2 x double>* %b
-  %shuffle = shufflevector <2 x double> %a2, <2 x double> %b2, <2 x i32> 
-  ret <2 x double> %shuffle
-}
-
-; CHECK: vshufpd $1, %xmm
-define <2 x i64> @B3128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
-entry:
-  %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> 
-  ret <2 x i64> %shuffle
-}
-
-; CHECK: vshufpd $1, (%{{.*}}), %xmm
-define <2 x i64> @B4128(<2 x i64>* %a, <2 x i64>* %b) nounwind uwtable readnone ssp {
-entry:
-  %a2 = load <2 x i64>* %a
-  %b2 = load <2 x i64>* %b
-  %shuffle = shufflevector <2 x i64> %a2, <2 x i64> %b2, <2 x i32> 
-  ret <2 x i64> %shuffle
-}
diff --git a/test/CodeGen/X86/avx2-palignr.ll b/test/CodeGen/X86/avx2-palignr.ll
deleted file mode 100644
index 83573dc7b26..00000000000
--- a/test/CodeGen/X86/avx2-palignr.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
-
-define <8 x i32> @test1(<8 x i32> %A, <8 x i32> %B) nounwind {
-; CHECK-LABEL: test1:
-; CHECK: vpalignr $4
-  %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> 
-  ret <8 x i32> %C
-}
-
-define <8 x i32> @test2(<8 x i32> %A, <8 x i32> %B) nounwind {
-; CHECK-LABEL: test2:
-; CHECK: vpalignr $4
-  %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> 
-  ret <8 x i32> %C
-}
-
-define <8 x i32> @test3(<8 x i32> %A, <8 x i32> %B) nounwind {
-; CHECK-LABEL: test3:
-; CHECK: vpalignr $4
-  %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> 
-  ret <8 x i32> %C
-}
-;
-define <8 x i32> @test4(<8 x i32> %A, <8 x i32> %B) nounwind {
-; CHECK-LABEL: test4:
-; CHECK: vpalignr $8
-  %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> 
-  ret <8 x i32> %C
-}
-
-define <16 x i16> @test5(<16 x i16> %A, <16 x i16> %B) nounwind {
-; CHECK-LABEL: test5:
-; CHECK: vpalignr $6
-  %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> 
-  ret <16 x i16> %C
-}
-
-define <16 x i16> @test6(<16 x i16> %A, <16 x i16> %B) nounwind {
-; CHECK-LABEL: test6:
-; CHECK: vpalignr $6
-  %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> 
-  ret <16 x i16> %C
-}
-
-define <16 x i16> @test7(<16 x i16> %A, <16 x i16> %B) nounwind {
-; CHECK-LABEL: test7:
-; CHECK: vpalignr $6
-  %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> 
-  ret <16 x i16> %C
-}
-
-define <32 x i8> @test8(<32 x i8> %A, <32 x i8> %B) nounwind {
-; CHECK-LABEL: test8:
-; CHECK: vpalignr $5
-  %C = shufflevector <32 x i8> %A, <32 x i8> %B, <32 x i32> 
-  ret <32 x i8> %C
-}
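The next deleted file opens with a comment about vpblendw's immediate,
so a quick worked example of that encoding: bit i of the 8-bit
immediate selects word i from the second source, and the same immediate
is reused for both 128-bit halves of a YMM. Choosing words {1,2,4,7} in
each half gives 10010110b = 150, so a single vpblendw $150 suffices;
halves that need different selections (e.g. 00010110b = 22 and
10000000b = 128) force a split into two XMM blends. A hypothetical IR
shuffle with the uniform mask (illustrative only, not from this patch):

define <16 x i16> @blend_150(<16 x i16> %a, <16 x i16> %b) {
  ; Words {1,2,4,7} of each 8-word half come from %b (indices 16+i),
  ; the rest from %a, so one vpblendw $150 covers the whole YMM.
  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 23, i32 8, i32 25, i32 26, i32 11, i32 28, i32 13, i32 14, i32 31>
  ret <16 x i16> %t
}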
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
deleted file mode 100644
index 185b989458a..00000000000
--- a/test/CodeGen/X86/avx2-shuffle.ll
+++ /dev/null
@@ -1,127 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
-
-; Make sure that we don't match this shuffle using the vpblendw YMM instruction.
-; The mask for the vpblendw instruction needs to be identical for both halves
-; of the YMM. Need to use two vpblendw instructions.
-
-; CHECK: vpblendw_test1
-; mask = 10010110,b = 150,d
-; CHECK: vpblendw $150, %ymm
-; CHECK: ret
-define <16 x i16> @vpblendw_test1(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
-  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> 
-  ret <16 x i16> %t
-}
-
-; CHECK: vpblendw_test2
-; mask1 = 00010110 = 22
-; mask2 = 10000000 = 128
-; CHECK: vpblendw $128, %xmm
-; CHECK: vpblendw $22, %xmm
-; CHECK: vinserti128
-; CHECK: ret
-define <16 x i16> @vpblendw_test2(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline {
-  %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> 
-  ret <16 x i16> %t
-}
-
-; CHECK: blend_test1
-; CHECK: vpblendd
-; CHECK: ret
-define <8 x i32> @blend_test1(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline {
-  %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> 
-  ret <8 x i32> %t
-}
-
-; CHECK: blend_test2
-; CHECK: vpblendd
-; CHECK: ret
-define <8 x i32> @blend_test2(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline {
-  %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> 
-  ret <8 x i32> %t
-}
-
-
-; CHECK: blend_test3
-; CHECK: vblendps
-; CHECK: ret
-define <8 x float> @blend_test3(<8 x float> %a, <8 x float> %b) nounwind alwaysinline {
-  %t = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> 
-  ret <8 x float> %t
-}
-
-; CHECK: blend_test4
-; CHECK: vblendpd
-; CHECK: ret
-define <4 x i64> @blend_test4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline {
-  %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> 
-  ret <4 x i64> %t
-}
-
-;; 2 tests for shufflevectors that optimize to blend + immediate
-; CHECK-LABEL: @blend_test5
-; CHECK: vpblendd $10, %xmm1, %xmm0, %xmm0
-; CHECK: ret
-define <4 x i32> @blend_test5(<4 x i32> %a, <4 x i32> %b) {
-  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> 
-  ret <4 x i32> %1
-}
-
-; CHECK-LABEL: @blend_test6
-; CHECK: vpblendw $134, %ymm1, %ymm0, %ymm0
-; CHECK: ret
-define <16 x i16> @blend_test6(<16 x i16> %a, <16 x i16> %b) {
-  %1 = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> 
-  ret <16 x i16> %1
-}
-
-; CHECK: vpshufhw $27, %ymm
-define <16 x i16> @vpshufhw(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> 
-  ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpshuflw $27, %ymm
-define <16 x i16> @vpshuflw(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> 
-  ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpshufb_test
-; CHECK: vpshufb {{.*\(%r.*}}, %ymm
-; CHECK: ret
-define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind {
-  %S = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> 
-  ret <32 x i8>%S
-}
-
-; CHECK: vpshufb1_test
-; CHECK: vpshufb {{.*\(%r.*}}, %ymm
-; CHECK: ret
-define <32 x i8> @vpshufb1_test(<32 x i8> %a) nounwind {
-  %S = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> 
-  ret <32 x i8>%S
-}
-
-
-; CHECK: vpshufb2_test
-; CHECK: vpshufb {{.*\(%r.*}}, %ymm
-; CHECK: ret
-define <32 x i8> @vpshufb2_test(<32 x i8> %a) nounwind {
-  %S = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> 
-  ret <32 x i8>%S
-}
diff --git a/test/CodeGen/X86/avx2-unpack.ll b/test/CodeGen/X86/avx2-unpack.ll
deleted file mode 100644
index 6d17443489a..00000000000
--- a/test/CodeGen/X86/avx2-unpack.ll
+++ /dev/null
@@ -1,86 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
-
-; CHECK: vpunpckhdq
-define <8 x i32> @unpackhidq1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> 
-  ret <8 x i32> %shuffle.i
-}
-
-; CHECK: vpunpckhqdq
-define <4 x i64> @unpackhiqdq1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> 
-  ret <4 x i64> %shuffle.i
-}
-
-; CHECK: vpunpckldq
-define <8 x i32> @unpacklodq1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> 
-  ret <8 x i32> %shuffle.i
-}
-
-; CHECK: vpunpcklqdq
-define <4 x i64> @unpacklqdq1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> 
-  ret <4 x i64> %shuffle.i
-}
-
-; CHECK: vpunpckhwd
-define <16 x i16> @unpackhwd(<16 x i16> %src1, <16 x i16> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src2, <16 x i32> 
-  ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpunpcklwd
-define <16 x i16> @unpacklwd(<16 x i16> %src1, <16 x i16> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src2, <16 x i32> 
-  ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpunpckhbw
-define <32 x i8> @unpackhbw(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src2, <32 x i32> 
-  ret <32 x i8> %shuffle.i
-}
-
-; CHECK: vpunpcklbw
-define <32 x i8> @unpacklbw(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src2, <32 x i32> 
-  ret <32 x i8> %shuffle.i
-}
-
-; CHECK: vpunpckhdq
-define <8 x i32> @unpackhidq1_undef(<8 x i32> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src1, <8 x i32> 
-  ret <8 x i32> %shuffle.i
-}
-
-; CHECK: vpunpckhqdq
-define <4 x i64> @unpackhiqdq1_undef(<4 x i64> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src1, <4 x i32> 
-  ret <4 x i64> %shuffle.i
-}
-
-; CHECK: vpunpckhwd
-define <16 x i16> @unpackhwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> 
-  ret <16 x i16> %shuffle.i
-}
-
-; CHECK: vpunpcklwd
-define <16 x i16> @unpacklwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp {
-entry:
-  %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> 
-  ret <16 x i16> %shuffle.i
-}
-
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 66f586d23d1..924c06eba76 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -317,7 +317,7 @@ define <4 x double> @_inreg4xdouble(<4 x double> %a) {
 }
 
 ;CHECK-LABEL: _inreg2xdouble:
-;CHECK: vpbroadcastq
+;CHECK: vunpcklpd
 ;CHECK: ret
 define <2 x double> @_inreg2xdouble(<2 x double> %a) {
   %b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer
diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
index c43da9c03a6..e161b5cdac0 100644
--- a/test/CodeGen/X86/avx512-arith.ll
+++ b/test/CodeGen/X86/avx512-arith.ll
@@ -453,7 +453,10 @@ entry:
 define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) {
 ; CHECK-LABEL: andqbrst:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpandq (%rdi){1to8}, %zmm0, %zmm0
+; CHECK-NEXT: vmovq (%rdi), %xmm1
+; CHECK-NEXT: vpbroadcastq %xmm1, %ymm1
+; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
+; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT: retq
 entry:
   %a = load i64* %ap, align 8
diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll
index 20f837129eb..38f0cbcc8d1 100644
--- a/test/CodeGen/X86/avx512-build-vector.ll
+++ b/test/CodeGen/X86/avx512-build-vector.ll
@@ -4,11 +4,10 @@ define <16 x i32> @test1(i32* %x) {
 ; CHECK-LABEL: test1:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vmovd (%rdi), %xmm0
-; CHECK-NEXT: vmovdqa32 {{.*}}(%rip), %zmm1
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; CHECK-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
-; CHECK-NEXT: vmovdqa32 {{.*}}(%rip), %zmm0
-; CHECK-NEXT: vpermd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
+; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7]
+; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; CHECK-NEXT: retq
   %y = load i32* %x, align 4
   %res = insertelement <16 x i32>zeroinitializer, i32 %y, i32 4
@@ -27,7 +26,7 @@ define <16 x i32> @test2(<16 x i32> %x) {
 define <16 x float> @test3(<4 x float> %a) {
 ; CHECK-LABEL: test3:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vmovhlps %xmm0, %xmm0, %xmm1
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vmovss %xmm0, %xmm2, %xmm0
 ; CHECK-NEXT: vmovss %xmm1, %xmm2, %xmm1
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
deleted file mode 100644
index dc765c81cc3..00000000000
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ /dev/null
@@ -1,362 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-; CHECK: LCP
-; CHECK: .long 2
-; CHECK: .long 5
-; CHECK: .long 0
-; CHECK: .long 0
-; CHECK: .long 7
-; CHECK: .long 0
-; CHECK: .long 10
-; CHECK: .long 1
-; CHECK: .long 0
-; CHECK: .long 5
-; CHECK: .long 0
-; CHECK: .long 4
-; CHECK: .long 7
-; CHECK: .long 0
-; CHECK: .long 10
-; CHECK: .long 1
-; CHECK-LABEL: test1:
-; CHECK: vpermps
-; CHECK: ret
-define <16 x float> @test1(<16 x float> %a) nounwind {
-  %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> 
-  ret <16 x float> %c
-}
-
-; CHECK-LABEL: test2:
-; CHECK: vpermd
-; CHECK: ret
-define <16 x i32> @test2(<16 x i32> %a) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> 
-  ret <16 x i32> %c
-}
-
-; CHECK-LABEL: test3:
-; CHECK: vpermq
-; CHECK: ret
-define <8 x i64> @test3(<8 x i64> %a) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> 
-  ret <8 x i64> %c
-}
-
-; CHECK-LABEL: test4:
-; CHECK: vpermpd
-; CHECK: ret
-define <8 x double> @test4(<8 x double> %a) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> 
-  ret <8 x double> %c
-}
-
-; CHECK-LABEL: test5:
-; CHECK: vpermt2pd
-; CHECK: ret
-define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> 
-  ret <8 x double> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test5m:
-; CHECK: vpermt2pd {{.* {%k[1-7]} {z}}}
-define <8 x double> @test5m(<8 x double> %a, <8 x double> %b, i8 %mask) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> 
-  %m = bitcast i8 %mask to <8 x i1>
-  %res = select <8 x i1> %m, <8 x double> %c, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-; CHECK-LABEL: test6:
-; CHECK: vpermq $30
-; CHECK: ret
-define <8 x i64> @test6(<8 x i64> %a) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> 
-  ret <8 x i64> %c
-}
-
-; CHECK-LABEL: test7:
-; CHECK: vpermt2q
-; CHECK: ret
-define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> 
-  ret <8 x i64> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test7m:
-; CHECK: vpermt2q {{.* {%k[1-7]} {z}}}
-define <8 x i64> @test7m(<8 x i64> %a, <8 x i64> %b, i8 %mask) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> 
-  %m = bitcast i8 %mask to <8 x i1>
-  %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-; The mem variant of vpermt2 with a writemask
-; CHECK-LABEL: test7mm:
-; CHECK: vpermt2q {{\(.*\).* {%k[1-7]} {z}}}
-define <8 x i64> @test7mm(<8 x i64> %a, <8 x i64> *%pb, i8 %mask) nounwind {
-  %b = load <8 x i64>* %pb
-  %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> 
-  %m = bitcast i8 %mask to <8 x i1>
-  %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-; CHECK-LABEL: test8:
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> 
-  ret <16 x i32> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test8m:
-; CHECK: vpermt2d {{.* {%k[1-7]} {z}}}
-define <16 x i32> @test8m(<16 x i32> %a, <16 x i32> %b, i16 %mask) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> 
-  %m = bitcast i16 %mask to <16 x i1>
-  %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-; The mem variant of vpermt2 with a writemask
-; CHECK-LABEL: test8mm:
-; CHECK: vpermt2d {{\(.*\).* {%k[1-7]} {z}}}
-define <16 x i32> @test8mm(<16 x i32> %a, <16 x i32> *%pb, i16 %mask) nounwind {
-  %b = load <16 x i32> * %pb
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> 
-  %m = bitcast i16 %mask to <16 x i1>
-  %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-; CHECK-LABEL: test9:
-; CHECK: vpermt2ps
-; CHECK: ret
-define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
-  %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> 
-  ret <16 x float> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test9m:
-; CHECK: vpermt2ps {{.*}} {%k{{.}}} {z}
-define <16 x float> @test9m(<16 x float> %a, <16 x float> %b, i16 %mask) nounwind {
-  %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> 
-  %m = bitcast i16 %mask to <16 x i1>
-  %res = select <16 x i1> %m, <16 x float> %c, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-; CHECK-LABEL: test10:
-; CHECK: vpermt2ps (
-; CHECK: ret
-define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
-  %c = load <16 x float>* %b
-  %d = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> 
-  ret <16 x float> %d
-}
-
-; CHECK-LABEL: test11:
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
-  %c = load <16 x i32>* %b
-  %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> 
-  ret <16 x i32> %d
-}
-
-; CHECK-LABEL: test12
-; CHECK: vmovlhps {{.*}}
-; CHECK: ret
-define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) nounwind {
-  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> 
-  ret <4 x i32> %c
-}
-
-; CHECK-LABEL: test13
-; CHECK: vpermilps $-79, %zmm
-; CHECK: ret
-define <16 x float> @test13(<16 x float> %a) {
-  %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> 
-  ret <16 x float> %b
-}
-
-; CHECK-LABEL: test14
-; CHECK: vpermilpd $-53, %zmm
-; CHECK: ret
-define <8 x double> @test14(<8 x double> %a) {
-  %b = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> 
-  ret <8 x double> %b
-}
-
-; CHECK-LABEL: test15
-; CHECK: vpshufd $-79, %zmm
-; CHECK: ret
-define <16 x i32> @test15(<16 x i32> %a) {
-  %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> 
-  ret <16 x i32> %b
-}
-
-; CHECK-LABEL: valign_test_v16f32
-; CHECK: valignd $2, %zmm0, %zmm0
-; CHECK: ret
-define <16 x float> @valign_test_v16f32(<16 x float> %a, <16 x float> %b) nounwind {
-  %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> 
-  ret <16 x float> %c
-}
-
-; CHECK-LABEL: valign_test_v16i32
-; CHECK: valignd $2, %zmm0, %zmm0
-; CHECK: ret
-define <16 x i32> @valign_test_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> 
-  ret <16 x i32> %c
-}
-
-
-; CHECK-LABEL: test16
-; CHECK: valignq $2, %zmm0, %zmm1
-; CHECK: ret
-define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> 
-  ret <8 x double> %c
-}
-
-; CHECK-LABEL: test16k
-; CHECK: valignq $2, %zmm0, %zmm1, %zmm2 {%k1}
-define <8 x i64> @test16k(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> 
-  %m = bitcast i8 %mask to <8 x i1>
-  %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> %src
-  ret <8 x i64> %res
-}
-
-; CHECK-LABEL: test16kz
-; CHECK: valignq $2, %zmm0, %zmm1, %zmm0 {%k1} {z}
-define <8 x i64> @test16kz(<8 x i64> %a, <8 x i64> %b, i8 %mask) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> 
-  %m = bitcast i8 %mask to <8 x i1>
-  %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-; CHECK-LABEL: test17
-; CHECK: vshufpd $19, %zmm1, %zmm0
-; CHECK: ret
-define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> 
-  ret <8 x double> %c
-}
-
-; CHECK-LABEL: test18
-; CHECK: vpunpckhdq %zmm
-; CHECK: ret
-define <16 x i32> @test18(<16 x i32> %a, <16 x i32> %c) {
-  %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> 
-  ret <16 x i32> %b
-}
-
-; CHECK-LABEL: test19
-; CHECK: vpunpckldq %zmm
-; CHECK: ret
-define <16 x i32> @test19(<16 x i32> %a, <16 x i32> %c) {
-  %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> 
-  ret <16 x i32> %b
-}
-
-; CHECK-LABEL: test20
-; CHECK: vpunpckhqdq %zmm
-; CHECK: ret
-define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) {
-  %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32> 
-  ret <8 x i64> %b
-}
-
-; CHECK-LABEL: test21
-; CHECK: vunpcklps %zmm
-; CHECK: ret
-define <16 x float> @test21(<16 x float> %a, <16 x float> %c) {
-  %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> 
-  ret <16 x float> %b
-}
-
-; CHECK-LABEL: test22
-; CHECK: vmovhlps {{.*}}
-; CHECK: ret
-define <4 x i32> @test22(<4 x i32> %a, <4 x i32> %b) nounwind {
-  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> 
-  ret <4 x i32> %c
-}
-
-; CHECK-LABEL: @test23
-; CHECK: vshufps $-112, %zmm
-; CHECK: ret
-define <16 x float> @test23(<16 x float> %a, <16 x float> %c) {
-  %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> 
-  ret <16 x float> %b
-}
-
-; CHECK-LABEL: @test24
-;
CHECK: vpermt2d -; CHECK: ret -define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> - ret <16 x i32> %c -} - -; CHECK-LABEL: @test25 -; CHECK: vshufps $52 -; CHECK: ret -define <16 x i32> @test25(<16 x i32> %a, <16 x i32> %b) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> - ret <16 x i32> %c -} - -; CHECK-LABEL: @test26 -; CHECK: vmovshdup -; CHECK: ret -define <16 x i32> @test26(<16 x i32> %a) nounwind { - %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> - ret <16 x i32> %c -} - -; CHECK-LABEL: @test27 -; CHECK: ret -define <16 x i32> @test27(<4 x i32>%a) { - %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> - ret <16 x i32> %res -} - -; CHECK-LABEL: @test28 -; CHECK: vinserti64x4 $1 -; CHECK: ret -define <16 x i32> @test28(<16 x i32>%x, <16 x i32>%y) { - %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> - ret <16 x i32> %res -} - -; CHECK-LABEL: @test29 -; CHECK: vinserti64x4 $0 -; CHECK: ret -define <16 x i32> @test29(<16 x i32>%x, <16 x i32>%y) { - %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> - ret <16 x i32> %res -} - diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll index 0b0e0fc2bc8..99def0694bf 100644 --- a/test/CodeGen/X86/avx512-vbroadcast.ll +++ b/test/CodeGen/X86/avx512-vbroadcast.ll @@ -3,7 +3,9 @@ define <16 x i32> @_inreg16xi32(i32 %a) { ; CHECK-LABEL: _inreg16xi32: ; CHECK: ## BB#0: -; CHECK-NEXT: vpbroadcastd %edi, %zmm0 +; CHECK-NEXT: vmovd %edi, %xmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; CHECK-NEXT: retq %b = insertelement <16 x i32> undef, i32 %a, i32 0 %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer @@ -13,7 +15,9 @@ define <16 x i32> @_inreg16xi32(i32 %a) { define <8 x i64> @_inreg8xi64(i64 %a) { ; CHECK-LABEL: _inreg8xi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 +; CHECK-NEXT: vmovq %rdi, %xmm0 +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; CHECK-NEXT: retq %b = insertelement <8 x i64> undef, i64 %a, i32 0 %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer @@ -23,7 +27,9 @@ define <8 x i64> @_inreg8xi64(i64 %a) { define <16 x float> @_inreg16xfloat(float %a) { ; CHECK-LABEL: _inreg16xfloat: ; CHECK: ## BB#0: -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 +; CHECK-NEXT: ## kill: XMM0 XMM0 ZMM0 +; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ; CHECK-NEXT: retq %b = insertelement <16 x float> undef, float %a, i32 0 %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer @@ -33,7 +39,9 @@ define <16 x float> @_inreg16xfloat(float %a) { define <8 x double> @_inreg8xdouble(double %a) { ; CHECK-LABEL: _inreg8xdouble: ; CHECK: ## BB#0: -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 +; CHECK-NEXT: ## kill: XMM0 XMM0 ZMM0 +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ; CHECK-NEXT: retq %b = insertelement <8 x double> undef, double %a, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer @@ -43,7 +51,8 @@ define <8 x double> @_inreg8xdouble(double %a) { define <16 x i32> @_xmm16xi32(<16 x i32> %a) { ; CHECK-LABEL: _xmm16xi32: ; CHECK: ## BB#0: -; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 +; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 +; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, 
%zmm0 ; CHECK-NEXT: retq %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer ret <16 x i32> %b @@ -52,7 +61,8 @@ define <16 x i32> @_xmm16xi32(<16 x i32> %a) { define <16 x float> @_xmm16xfloat(<16 x float> %a) { ; CHECK-LABEL: _xmm16xfloat: ; CHECK: ## BB#0: -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 +; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ; CHECK-NEXT: retq %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer ret <16 x float> %b diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll index d993a0e0f21..8ff0263a87a 100644 --- a/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/test/CodeGen/X86/avx512-vec-cmp.ll @@ -312,7 +312,10 @@ define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16 define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind { ; CHECK-LABEL: test24: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1 +; CHECK-NEXT: vmovq (%rdi), %xmm2 +; CHECK-NEXT: vpbroadcastq %xmm2, %ymm2 +; CHECK-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vmovaps %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -327,7 +330,10 @@ define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind { define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind { ; CHECK-LABEL: test25: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1 +; CHECK-NEXT: vmovd (%rdi), %xmm2 +; CHECK-NEXT: vpbroadcastd %xmm2, %ymm2 +; CHECK-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 +; CHECK-NEXT: vpcmpled %zmm2, %zmm0, %k1 ; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vmovaps %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -342,8 +348,11 @@ define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind { ; CHECK-LABEL: test26: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1 -; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1} +; CHECK-NEXT: vmovd (%rdi), %xmm3 +; CHECK-NEXT: vpbroadcastd %xmm3, %ymm3 +; CHECK-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 +; CHECK-NEXT: vpcmpgtd %zmm3, %zmm0, %k1 +; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1 {%k1} ; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vmovaps %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -360,8 +369,11 @@ define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32 define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind { ; CHECK-LABEL: test27: ; CHECK: ## BB#0: -; CHECK-NEXT: vpcmpleq %zmm1, %zmm2, %k1 -; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1} +; CHECK-NEXT: vmovq (%rdi), %xmm3 +; CHECK-NEXT: vpbroadcastq %xmm3, %ymm3 +; CHECK-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 +; CHECK-NEXT: vpcmpleq %zmm3, %zmm0, %k1 +; CHECK-NEXT: vpcmpleq %zmm1, %zmm2, %k1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vmovaps %zmm1, %zmm0 ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll index c1f6c79e81a..ba9d48d6047 100644 --- a/test/CodeGen/X86/combine-or.ll +++ b/test/CodeGen/X86/combine-or.ll @@ -7,8 +7,7 @@ define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test1: ; CHECK: # BB#0: -; CHECK-NEXT: movsd %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = 
xmm0[0,1,2,3],xmm1[4,5,6,7] ; CHECK-NEXT: retq %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32> @@ -20,7 +19,8 @@ define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) { define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test2: ; CHECK: # BB#0: -; CHECK-NEXT: movsd %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32> @@ -32,7 +32,8 @@ define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) { define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test3: ; CHECK: # BB#0: -; CHECK-NEXT: movsd %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32> @@ -44,8 +45,8 @@ define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) { define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test4: ; CHECK: # BB#0: -; CHECK-NEXT: movss %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32> @@ -57,7 +58,7 @@ define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test5: ; CHECK: # BB#0: -; CHECK-NEXT: movss %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] ; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32> @@ -69,7 +70,7 @@ define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test6: ; CHECK: # BB#0: -; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32> @@ -81,7 +82,7 @@ define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test7: ; CHECK: # BB#0: -; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; CHECK-NEXT: retq %and1 = and <4 x i32> %a, %and2 = and <4 x i32> %b, @@ -93,8 +94,7 @@ define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) { define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test8: ; CHECK: # BB#0: -; CHECK-NEXT: movsd %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; CHECK-NEXT: retq %and1 = and <2 x i64> %a, %and2 = and <2 x i64> %b, @@ -106,7 +106,8 @@ define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) { define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test9: ; CHECK: # BB#0: -; CHECK-NEXT: movsd %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; CHECK-NEXT: movdqa 
%xmm1, %xmm0 ; CHECK-NEXT: retq %and1 = and <4 x i32> %a, %and2 = and <4 x i32> %b, @@ -118,7 +119,8 @@ define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) { define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test10: ; CHECK: # BB#0: -; CHECK-NEXT: movsd %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %and1 = and <2 x i64> %a, %and2 = and <2 x i64> %b, @@ -130,8 +132,8 @@ define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) { define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test11: ; CHECK: # BB#0: -; CHECK-NEXT: movss %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %and1 = and <4 x i32> %a, %and2 = and <4 x i32> %b, @@ -143,7 +145,7 @@ define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test12: ; CHECK: # BB#0: -; CHECK-NEXT: movss %xmm1, %xmm0 +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] ; CHECK-NEXT: retq %and1 = and <4 x i32> %a, %and2 = and <4 x i32> %b, @@ -211,10 +213,11 @@ define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test17: ; CHECK: # BB#0: ; CHECK-NEXT: xorps %xmm2, %xmm2 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,0] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,0] -; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; CHECK-NEXT: orps %xmm1, %xmm2 +; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32> @@ -228,10 +231,10 @@ define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) { ; CHECK: # BB#0: ; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: xorps %xmm3, %xmm3 -; CHECK-NEXT: blendps $1, %xmm0, %xmm3 -; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[0,0] -; CHECK-NEXT: blendps $1, %xmm1, %xmm2 -; CHECK-NEXT: orps %xmm3, %xmm2 +; CHECK-NEXT: blendps {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,1,1] +; CHECK-NEXT: blendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; CHECK-NEXT: orps %xmm0, %xmm2 ; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> @@ -245,12 +248,13 @@ define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test19: ; CHECK: # BB#0: ; CHECK-NEXT: xorps %xmm2, %xmm2 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,0] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pslldq $8, %xmm2 +; CHECK-NEXT: xorps %xmm3, %xmm3 +; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[0,3] +; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0] ; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,2] -; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: orps %xmm3, %xmm2 +; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32> @@ -275,8 +279,9 @@ define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) { define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) { ; 
CHECK-LABEL: test21:
; CHECK: # BB#0:
-; CHECK-NEXT: por %xmm1, %xmm0
-; CHECK-NEXT: pslldq $8, %xmm0
+; CHECK-NEXT: orps %xmm1, %xmm0
+; CHECK-NEXT: movq %xmm0, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; CHECK-NEXT: retq
 %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32>
 %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32>
@@ -290,7 +295,8 @@ define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
 define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
 ; CHECK-LABEL: test_crash:
 ; CHECK: # BB#0:
-; CHECK-NEXT: movsd %xmm1, %xmm0
+; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; CHECK-NEXT: movdqa %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32>
 %shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32>
diff --git a/test/CodeGen/X86/exedepsfix-broadcast.ll b/test/CodeGen/X86/exedepsfix-broadcast.ll
index ab794959550..ab92fe0d1d0 100644
--- a/test/CodeGen/X86/exedepsfix-broadcast.ll
+++ b/test/CodeGen/X86/exedepsfix-broadcast.ll
@@ -95,8 +95,9 @@ define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> %
 ; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg
 ; ExeDepsFix works top down, thus it coalesces vpunpcklqdq domain with
 ; vpand and there is nothing more you can do to match vmaxpd.
-; CHECK: vmovlhps
-; CHECK: vandps
+; CHECK: vmovq
+; CHECK: vpbroadcastq
+; CHECK: vpand
 ; CHECK: vmaxpd
 ; CHECK: ret
 define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) {
diff --git a/test/CodeGen/X86/extractelement-load.ll b/test/CodeGen/X86/extractelement-load.ll
index 0d5d299ed10..b5b320d4d4b 100644
--- a/test/CodeGen/X86/extractelement-load.ll
+++ b/test/CodeGen/X86/extractelement-load.ll
@@ -36,9 +36,9 @@ define void @t3() {
 ;
 ; This movs the entire vector, shuffling the high double down. If we fixed the
 ; FIXME above it would just move the high double directly.
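;
; A minimal sketch of the domain point behind the CHECK changes below (a
; hypothetical standalone test, not one of the files touched by this patch,
; written in the same 2014-era IR and RUN-line conventions): the new lowering
; keeps a <2 x double> high-half splat in the double domain (unpckhpd/shufpd)
; where the old path used the float-domain movhlps. Staying in one execution
; domain avoids a bypass-delay penalty between the shuffle and neighboring
; double-precision arithmetic. Only the label is checked here, since the
; exact instruction llc picks can vary by subtarget.
;
; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s
; CHECK-LABEL: high_half:
define <2 x double> @high_half(<2 x double> %v) {
entry:
  %s = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %s
}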
-; CHECK: movups -; CHECK: movhlps -; CHECK: movlps +; CHECK: movupd +; CHECK: shufpd +; CHECK: movlpd bb: %tmp13 = load <2 x double>* undef, align 1 diff --git a/test/CodeGen/X86/fp-load-trunc.ll b/test/CodeGen/X86/fp-load-trunc.ll index 7e78dd046c3..e6c1e1adb59 100644 --- a/test/CodeGen/X86/fp-load-trunc.ll +++ b/test/CodeGen/X86/fp-load-trunc.ll @@ -51,7 +51,7 @@ define <4 x float> @test3(<4 x double>* %p) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: cvtpd2ps 16(%eax), %xmm1 ; CHECK-NEXT: cvtpd2ps (%eax), %xmm0 -; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retl ; ; AVX-LABEL: test3: @@ -70,10 +70,10 @@ define <8 x float> @test4(<8 x double>* %p) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: cvtpd2ps 16(%eax), %xmm1 ; CHECK-NEXT: cvtpd2ps (%eax), %xmm0 -; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: cvtpd2ps 48(%eax), %xmm2 ; CHECK-NEXT: cvtpd2ps 32(%eax), %xmm1 -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; CHECK-NEXT: retl ; ; AVX-LABEL: test4: diff --git a/test/CodeGen/X86/fp-trunc.ll b/test/CodeGen/X86/fp-trunc.ll index 4f6ce937aa8..6424bfc9c21 100644 --- a/test/CodeGen/X86/fp-trunc.ll +++ b/test/CodeGen/X86/fp-trunc.ll @@ -44,7 +44,7 @@ define <4 x float> @test3(<4 x double> %x) nounwind { ; CHECK: # BB#0: ; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1 ; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0 -; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retl ; ; AVX-LABEL: test3: @@ -61,10 +61,10 @@ define <8 x float> @test4(<8 x double> %x) nounwind { ; CHECK: # BB#0: ; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1 ; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0 -; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: cvtpd2ps %xmm3, %xmm3 ; CHECK-NEXT: cvtpd2ps %xmm2, %xmm1 -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; CHECK-NEXT: retl ; ; AVX-LABEL: test4: diff --git a/test/CodeGen/X86/palignr.ll b/test/CodeGen/X86/palignr.ll index 5c2dd05cb29..f047349970d 100644 --- a/test/CodeGen/X86/palignr.ll +++ b/test/CodeGen/X86/palignr.ll @@ -40,7 +40,7 @@ define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) nounwind { ; ; CHECK-YONAH-LABEL: test3: ; CHECK-YONAH: # BB#0: -; CHECK-YONAH-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,0] +; CHECK-YONAH-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0] ; CHECK-YONAH-NEXT: retl %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 1, i32 2, i32 undef, i32 4 > ret <4 x i32> %C @@ -54,8 +54,8 @@ define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) nounwind { ; ; CHECK-YONAH-LABEL: test4: ; CHECK-YONAH: # BB#0: -; CHECK-YONAH-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1] -; CHECK-YONAH-NEXT: movaps %xmm1, %xmm0 +; CHECK-YONAH-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; CHECK-YONAH-NEXT: movapd %xmm1, %xmm0 ; CHECK-YONAH-NEXT: retl %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 6, i32 7, i32 undef, i32 1 > ret <4 x i32> %C @@ -64,13 +64,14 @@ define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) nounwind { define <4 x float> @test5(<4 x float> %A, <4 x float> %B) nounwind { ; CHECK-LABEL: test5: ; CHECK: # BB#0: -; CHECK-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; 
CHECK-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; CHECK-NEXT: movapd %xmm1, %xmm0 ; CHECK-NEXT: retl ; ; CHECK-YONAH-LABEL: test5: ; CHECK-YONAH: # BB#0: -; CHECK-YONAH-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1] -; CHECK-YONAH-NEXT: movaps %xmm1, %xmm0 +; CHECK-YONAH-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; CHECK-YONAH-NEXT: movapd %xmm1, %xmm0 ; CHECK-YONAH-NEXT: retl %C = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 6, i32 7, i32 undef, i32 1 > ret <4 x float> %C @@ -85,15 +86,16 @@ define <8 x i16> @test6(<8 x i16> %A, <8 x i16> %B) nounwind { ; ; CHECK-YONAH-LABEL: test6: ; CHECK-YONAH: # BB#0: -; CHECK-YONAH-NEXT: movapd %xmm0, %xmm2 -; CHECK-YONAH-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] -; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,2,4,5,6,7] -; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] -; CHECK-YONAH-NEXT: pextrw $3, %xmm0, %eax -; CHECK-YONAH-NEXT: pinsrw $0, %eax, %xmm1 -; CHECK-YONAH-NEXT: pextrw $7, %xmm0, %eax -; CHECK-YONAH-NEXT: pinsrw $4, %eax, %xmm1 -; CHECK-YONAH-NEXT: movdqa %xmm1, %xmm0 +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; CHECK-YONAH-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] +; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,2,1,4,5,6,7] +; CHECK-YONAH-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-YONAH-NEXT: retl %C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 3, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10 > ret <8 x i16> %C @@ -108,13 +110,15 @@ define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) nounwind { ; ; CHECK-YONAH-LABEL: test7: ; CHECK-YONAH: # BB#0: -; CHECK-YONAH-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] -; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,0,0,4,5,6,7] -; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; CHECK-YONAH-NEXT: movd %xmm1, %eax -; CHECK-YONAH-NEXT: pinsrw $3, %eax, %xmm0 -; CHECK-YONAH-NEXT: pextrw $4, %xmm1, %eax -; CHECK-YONAH-NEXT: pinsrw $7, %eax, %xmm0 +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-YONAH-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; CHECK-YONAH-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-YONAH-NEXT: retl %C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 undef, i32 6, i32 undef, i32 8, i32 9, i32 10, i32 11, i32 12 > ret <8 x i16> %C @@ -129,35 +133,33 @@ define <16 x i8> @test8(<16 x i8> %A, <16 x i8> %B) nounwind { ; ; CHECK-YONAH-LABEL: test8: ; CHECK-YONAH: # BB#0: -; CHECK-YONAH-NEXT: pushl %esi +; CHECK-YONAH-NEXT: pxor %xmm3, %xmm3 ; CHECK-YONAH-NEXT: movdqa %xmm0, %xmm2 -; CHECK-YONAH-NEXT: pextrw $4, %xmm2, %eax -; CHECK-YONAH-NEXT: pextrw $5, %xmm2, %ecx 
-; CHECK-YONAH-NEXT: shrdw $8, %cx, %ax -; CHECK-YONAH-NEXT: pextrw $2, %xmm2, %edx -; CHECK-YONAH-NEXT: pextrw $3, %xmm2, %esi -; CHECK-YONAH-NEXT: shrdw $8, %si, %dx -; CHECK-YONAH-NEXT: # kill: XMM0 XMM2 -; CHECK-YONAH-NEXT: pinsrw $0, %edx, %xmm0 -; CHECK-YONAH-NEXT: shrl $8, %esi -; CHECK-YONAH-NEXT: pinsrw $1, %esi, %xmm0 -; CHECK-YONAH-NEXT: pinsrw $2, %eax, %xmm0 -; CHECK-YONAH-NEXT: pextrw $6, %xmm2, %eax -; CHECK-YONAH-NEXT: shrdw $8, %ax, %cx -; CHECK-YONAH-NEXT: pinsrw $3, %ecx, %xmm0 -; CHECK-YONAH-NEXT: pextrw $7, %xmm2, %ecx -; CHECK-YONAH-NEXT: shrdw $8, %cx, %ax -; CHECK-YONAH-NEXT: pinsrw $4, %eax, %xmm0 -; CHECK-YONAH-NEXT: pextrw $8, %xmm1, %eax -; CHECK-YONAH-NEXT: shrdw $8, %ax, %cx -; CHECK-YONAH-NEXT: pinsrw $5, %ecx, %xmm0 -; CHECK-YONAH-NEXT: pextrw $9, %xmm1, %ecx -; CHECK-YONAH-NEXT: shrdw $8, %cx, %ax -; CHECK-YONAH-NEXT: pinsrw $6, %eax, %xmm0 -; CHECK-YONAH-NEXT: pextrw $10, %xmm1, %eax -; CHECK-YONAH-NEXT: shldw $8, %cx, %ax -; CHECK-YONAH-NEXT: pinsrw $7, %eax, %xmm0 -; CHECK-YONAH-NEXT: popl %esi +; CHECK-YONAH-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] +; CHECK-YONAH-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,1,2,0] +; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] +; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] +; CHECK-YONAH-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7] +; CHECK-YONAH-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] +; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] +; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] +; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] +; CHECK-YONAH-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-YONAH-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-YONAH-NEXT: packuswb %xmm0, %xmm2 +; CHECK-YONAH-NEXT: movdqa %xmm2, %xmm0 ; CHECK-YONAH-NEXT: retl %C = shufflevector <16 x i8> %A, <16 x i8> %B, <16 x i32> < i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20 > ret <16 x i8> %C @@ -170,18 +172,17 @@ define <16 x i8> @test8(<16 x i8> %A, <16 x i8> %B) nounwind { define <8 x i16> @test9(<8 x i16> %A, <8 x i16> %B) nounwind { ; CHECK-LABEL: test9: ; CHECK: # BB#0: -; CHECK-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,5,6,7,8,9,10,11,12,13,14,15,0,1] +; CHECK-NEXT: palignr {{.*#+}} xmm1 = 
xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1] ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retl ; ; CHECK-YONAH-LABEL: test9: ; CHECK-YONAH: # BB#0: -; CHECK-YONAH-NEXT: pextrw $4, %xmm1, %eax -; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,3,0,4,5,6,7] +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3] +; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,2,4,5,6,7] ; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] -; CHECK-YONAH-NEXT: pinsrw $3, %eax, %xmm0 -; CHECK-YONAH-NEXT: movd %xmm1, %eax -; CHECK-YONAH-NEXT: pinsrw $7, %eax, %xmm0 ; CHECK-YONAH-NEXT: retl %C = shufflevector <8 x i16> %B, <8 x i16> %A, <8 x i32> < i32 undef, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0 > ret <8 x i16> %C diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index e129d5618aa..8937d6afa0a 100644 --- a/test/CodeGen/X86/pmul.ll +++ b/test/CodeGen/X86/pmul.ll @@ -4,11 +4,11 @@ define <4 x i32> @a(<4 x i32> %i) nounwind { ; SSE2-LABEL: a: ; SSE2: movdqa {{.*}}, %[[X1:xmm[0-9]+]] -; SSE2-NEXT: pshufd {{.*}} # [[X2:xmm[0-9]+]] = xmm0[1,0,3,0] +; SSE2-NEXT: pshufd {{.*}} # [[X2:xmm[0-9]+]] = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %[[X1]], %xmm0 ; SSE2-NEXT: pmuludq %[[X1]], %[[X2]] ; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2],[[X2]][0,2] -; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: a: @@ -31,12 +31,12 @@ entry: define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind { ; SSE2-LABEL: c: -; SSE2: pshufd {{.*}} # [[X2:xmm[0-9]+]] = xmm0[1,0,3,0] +; SSE2: pshufd {{.*}} # [[X2:xmm[0-9]+]] = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[1,0,3,0] +; SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pmuludq %[[X2]], %xmm1 ; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: c: @@ -61,14 +61,14 @@ declare void @foo() define <4 x i32> @e(<4 x i32> %i, <4 x i32> %j) nounwind { ; SSE2-LABEL: e: -; SSE2: movdqa {{[0-9]*}}(%rsp), %[[X1:xmm[0-9]+]] -; SSE2-NEXT: pshufd {{.*}} # xmm0 = [[X2]][1,0,3,0] +; SSE2: movdqa {{[0-9]*}}(%rsp), %xmm0 +; SSE2-NEXT: pshufd {{.*}} # [[X1:xmm[0-9]+]] = xmm0[1,1,3,3] ; SSE2-NEXT: movdqa {{[0-9]*}}(%rsp), %[[X2:xmm[0-9]+]] -; SSE2-NEXT: pmuludq %[[X2]], %[[X1]] -; SSE2-NEXT: pshufd {{.*}} # [[X2]] = [[X2]][1,0,3,0] -; SSE2-NEXT: pmuludq %xmm0, %[[X2]] -; SSE2-NEXT: shufps {{.*}} # [[X1]] = [[X1]][0,2],[[X2]][0,2] -; SSE2-NEXT: pshufd {{.*}} # xmm0 = [[X1]][0,2,1,3] +; SSE2-NEXT: pmuludq %[[X2]], %xmm0 +; SSE2-NEXT: pshufd {{.*}} # [[X2]] = [[X2]][1,1,3,3] +; SSE2-NEXT: pmuludq %[[X1]], %[[X2]] +; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2],[[X2]][0,2] +; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3] ; SSE2-NEXT: addq ${{[0-9]+}}, %rsp ; SSE2-NEXT: retq ; diff --git a/test/CodeGen/X86/pr11334.ll b/test/CodeGen/X86/pr11334.ll index e7e29e0d609..0bdb0ec7cf4 100644 --- a/test/CodeGen/X86/pr11334.ll +++ b/test/CodeGen/X86/pr11334.ll @@ -15,7 +15,7 @@ define <3 x double> @v3f2d_ext_vec(<3 x float> %v1) nounwind { entry: ; CHECK: v3f2d_ext_vec ; CHECK: cvtps2pd -; CHECK: movhlps +; CHECK: shufpd ; CHECK: cvtps2pd ; AVX: v3f2d_ext_vec ; AVX: vcvtps2pd @@ -28,7 +28,7 @@ define <4 x double> @v4f2d_ext_vec(<4 x float> %v1) nounwind { entry: ; CHECK: 
v4f2d_ext_vec ; CHECK: cvtps2pd -; CHECK: movhlps +; CHECK: shufpd ; CHECK: cvtps2pd ; AVX: v4f2d_ext_vec ; AVX: vcvtps2pd @@ -42,9 +42,9 @@ entry: ; CHECK: v8f2d_ext_vec ; CHECK: cvtps2pd ; CHECK: cvtps2pd -; CHECK: movhlps +; CHECK: shufpd ; CHECK: cvtps2pd -; CHECK: movhlps +; CHECK: shufpd ; CHECK: cvtps2pd ; AVX: v8f2d_ext_vec ; AVX: vcvtps2pd diff --git a/test/CodeGen/X86/pr12359.ll b/test/CodeGen/X86/pr12359.ll deleted file mode 100644 index 024b163fa71..00000000000 --- a/test/CodeGen/X86/pr12359.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: llc -asm-verbose -mtriple=x86_64-unknown-unknown -mcpu=corei7 < %s | FileCheck %s -define <16 x i8> @shuf(<16 x i8> %inval1) { -entry: - %0 = shufflevector <16 x i8> %inval1, <16 x i8> zeroinitializer, <16 x i32> - ret <16 x i8> %0 -; CHECK: shuf -; CHECK: # BB#0: # %entry -; CHECK-NEXT: pshufb -; CHECK-NEXT: ret -} diff --git a/test/CodeGen/X86/sincos-opt.ll b/test/CodeGen/X86/sincos-opt.ll index 2dc8816f840..434ee9b3d0c 100644 --- a/test/CodeGen/X86/sincos-opt.ll +++ b/test/CodeGen/X86/sincos-opt.ll @@ -15,7 +15,7 @@ entry: ; OSX_SINCOS-LABEL: test1: ; OSX_SINCOS: callq ___sincosf_stret -; OSX_SINCOS: pshufd $1, %xmm0, %xmm1 +; OSX_SINCOS: pshufd {{.*}} ## xmm1 = xmm0[1,1,2,3] ; OSX_SINCOS: addss %xmm0, %xmm1 ; OSX_NOOPT: test1 diff --git a/test/CodeGen/X86/splat-scalar-load.ll b/test/CodeGen/X86/splat-scalar-load.ll deleted file mode 100644 index 4d59b9cc2f6..00000000000 --- a/test/CodeGen/X86/splat-scalar-load.ll +++ /dev/null @@ -1,17 +0,0 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 -mcpu=nehalem | FileCheck %s -; rdar://7434544 - -define <2 x i64> @t2() nounwind { -entry: -; CHECK-LABEL: t2: -; CHECK: pshufd $85, (%esp), %xmm0 - %array = alloca [8 x float], align 4 - %arrayidx = getelementptr inbounds [8 x float]* %array, i32 0, i32 1 - %tmp2 = load float* %arrayidx - %vecinit = insertelement <4 x float> undef, float %tmp2, i32 0 - %vecinit5 = insertelement <4 x float> %vecinit, float %tmp2, i32 1 - %vecinit7 = insertelement <4 x float> %vecinit5, float %tmp2, i32 2 - %vecinit9 = insertelement <4 x float> %vecinit7, float %tmp2, i32 3 - %0 = bitcast <4 x float> %vecinit9 to <2 x i64> - ret <2 x i64> %0 -} diff --git a/test/CodeGen/X86/sse-align-12.ll b/test/CodeGen/X86/sse-align-12.ll index 4933969c09a..396da0f4895 100644 --- a/test/CodeGen/X86/sse-align-12.ll +++ b/test/CodeGen/X86/sse-align-12.ll @@ -3,8 +3,8 @@ define <4 x float> @a(<4 x float>* %y) nounwind { ; CHECK-LABEL: a: ; CHECK: # BB#0: -; CHECK-NEXT: movdqu (%rdi), %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; CHECK-NEXT: movups (%rdi), %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; CHECK-NEXT: retq %x = load <4 x float>* %y, align 4 %a = extractelement <4 x float> %x, i32 0 diff --git a/test/CodeGen/X86/sse-scalar-fp-arith.ll b/test/CodeGen/X86/sse-scalar-fp-arith.ll index 415a4f12b2c..b122ef67544 100644 --- a/test/CodeGen/X86/sse-scalar-fp-arith.ll +++ b/test/CodeGen/X86/sse-scalar-fp-arith.ll @@ -1,9 +1,6 @@ ; RUN: llc -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s -; RUN: llc -mcpu=x86-64 -mattr=+sse2 < %s -x86-experimental-vector-shuffle-lowering | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s ; RUN: llc -mcpu=x86-64 -mattr=+sse4.1 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE41 %s -; RUN: llc -mcpu=x86-64 -mattr=+sse4.1 < %s -x86-experimental-vector-shuffle-lowering | FileCheck --check-prefix=SSE --check-prefix=SSE41 %s ; RUN: llc -mcpu=x86-64 -mattr=+avx < %s | FileCheck 
--check-prefix=AVX %s -; RUN: llc -mcpu=x86-64 -mattr=+avx < %s -x86-experimental-vector-shuffle-lowering | FileCheck --check-prefix=AVX %s target triple = "x86_64-unknown-unknown" diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll index 6e099ff6f90..fd35e75d71a 100644 --- a/test/CodeGen/X86/sse1.ll +++ b/test/CodeGen/X86/sse1.ll @@ -15,9 +15,9 @@ define <2 x float> @test4(<2 x float> %A, <2 x float> %B) nounwind { ; CHECK-LABEL: test4: ; CHECK: # BB#0: # %entry ; CHECK-NEXT: movaps %xmm0, %xmm2 -; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0,0,0] +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3] ; CHECK-NEXT: addss %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,0,0] +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] ; CHECK-NEXT: subss %xmm1, %xmm2 ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-NEXT: ret diff --git a/test/CodeGen/X86/sse2-mul.ll b/test/CodeGen/X86/sse2-mul.ll deleted file mode 100644 index e066368dc73..00000000000 --- a/test/CodeGen/X86/sse2-mul.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s - -define <4 x i32> @test1(<4 x i32> %x, <4 x i32> %y) { - %m = mul <4 x i32> %x, %y - ret <4 x i32> %m -; CHECK-LABEL: test1: -; CHECK: pshufd $49 -; CHECK: pmuludq -; CHECK: pshufd $49 -; CHECK: pmuludq -; CHECK: shufps $-120 -; CHECK: pshufd $-40 -; CHECK: ret -} diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll index 853dda81f2e..b144c4f514f 100644 --- a/test/CodeGen/X86/sse2.ll +++ b/test/CodeGen/X86/sse2.ll @@ -62,8 +62,8 @@ define void @test4(<4 x float> %X, <4 x float>* %res) nounwind { ; CHECK-LABEL: test4: ; CHECK: ## BB#0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,0] -; CHECK-NEXT: movdqa %xmm0, (%eax) +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; CHECK-NEXT: movaps %xmm0, (%eax) ; CHECK-NEXT: retl %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] store <4 x float> %tmp5, <4 x float>* %res @@ -178,10 +178,11 @@ define <2 x double> @test11(double %a, double %b) nounwind { define void @test12() nounwind { ; CHECK-LABEL: test12: ; CHECK: ## BB#0: -; CHECK-NEXT: movaps 0, %xmm0 -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; CHECK-NEXT: movapd 0, %xmm0 +; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; CHECK-NEXT: movsd %xmm0, %xmm1 +; CHECK-NEXT: xorpd %xmm2, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; CHECK-NEXT: addps %xmm1, %xmm0 ; CHECK-NEXT: movaps %xmm0, 0 ; CHECK-NEXT: retl @@ -201,8 +202,8 @@ define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x fl ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK-NEXT: movaps (%edx), %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; CHECK-NEXT: movdqa %xmm0, (%eax) +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; CHECK-NEXT: movaps %xmm0, (%eax) ; CHECK-NEXT: retl %tmp3 = load <4 x float>* %B ; <<4 x float>> [#uses=1] %tmp5 = load <4 x float>* %C ; <<4 x float>> [#uses=1] @@ -221,7 +222,7 @@ define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind { ; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: addps %xmm1, %xmm0 ; CHECK-NEXT: subps %xmm1, %xmm2 -; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: 
unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; CHECK-NEXT: retl %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=2] %tmp5 = load <4 x float>* %x ; <<4 x float>> [#uses=2] @@ -236,9 +237,8 @@ define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind { ; CHECK: ## BB#0: ## %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movaps (%ecx), %xmm0 -; CHECK-NEXT: movaps (%eax), %xmm1 -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; CHECK-NEXT: movapd (%ecx), %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] ; CHECK-NEXT: retl entry: %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=1] @@ -283,7 +283,7 @@ define <4 x float> @f(<4 x double>) nounwind { ; CHECK: ## BB#0: ## %entry ; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1 ; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0 -; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retl entry: %double2float.i = fptrunc <4 x double> %0 to <4 x float> @@ -302,17 +302,17 @@ define <2 x i64> @test_insert_64_zext(<2 x i64> %i) { define <4 x i32> @PR19721(<4 x i32> %i) { ; CHECK-LABEL: PR19721: ; CHECK: ## BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,0,0] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; CHECK-NEXT: movd %xmm1, %eax -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,0,0,0] -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: movd %xmm0, %ecx -; CHECK-NEXT: movd %xmm1, %edx -; CHECK-NEXT: movd %edx, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; CHECK-NEXT: movd %xmm1, %ecx +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: pxor %xmm0, %xmm0 +; CHECK-NEXT: movss %xmm1, %xmm0 ; CHECK-NEXT: movd %ecx, %xmm1 -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1] +; CHECK-NEXT: movd %eax, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,1] ; CHECK-NEXT: retl %bc = bitcast <4 x i32> %i to i128 %insert = and i128 %bc, -4294967296 @@ -323,12 +323,12 @@ define <4 x i32> @PR19721(<4 x i32> %i) { define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: test_mul: ; CHECK: ## BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,0] +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; CHECK-NEXT: pmuludq %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,0,3,0] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; CHECK-NEXT: retl %m = mul <4 x i32> %x, %y ret <4 x i32> %m diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll index 9bfea88462d..5fdc8efc555 100644 --- a/test/CodeGen/X86/sse3.ll +++ b/test/CodeGen/X86/sse3.ll @@ -8,8 +8,8 @@ define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind { ; X64-LABEL: t0: ; X64: ## BB#0: ## %entry -; X64-NEXT: movdqa (%rsi), %xmm0 -; X64-NEXT: pslldq $2, %xmm0 +; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq entry: @@ -26,7 +26,13 @@ define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; X64-LABEL: t1: ; X64: ## BB#0: ; X64-NEXT: movdqa (%rdi), %xmm0 -; X64-NEXT: pinsrw $0, (%rsi), 
%xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7] +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B @@ -38,9 +44,11 @@ define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind { define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind { ; X64-LABEL: t2: ; X64: ## BB#0: -; X64-NEXT: pextrw $1, %xmm1, %eax -; X64-NEXT: pinsrw $0, %eax, %xmm0 -; X64-NEXT: pinsrw $3, %eax, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,0,3,4,5,6,7] +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X64-NEXT: retq %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 > ret <8 x i16> %tmp @@ -49,10 +57,11 @@ define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind { define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind { ; X64-LABEL: t3: ; X64: ## BB#0: -; X64-NEXT: pextrw $5, %xmm0, %eax -; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,0,4,5,6,7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] -; X64-NEXT: pinsrw $3, %eax, %xmm0 ; X64-NEXT: retq %tmp = shufflevector <8 x i16> %A, <8 x i16> %A, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 > ret <8 x i16> %tmp @@ -61,12 +70,10 @@ define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind { define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind { ; X64-LABEL: t4: ; X64: ## BB#0: -; X64-NEXT: pextrw $7, %xmm0, %eax -; X64-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,6,5] -; X64-NEXT: pinsrw $1, %eax, %xmm1 -; X64-NEXT: pextrw $1, %xmm0, %eax -; X64-NEXT: pinsrw $4, %eax, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,4,7] ; X64-NEXT: retq %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 > ret <8 x i16> %tmp @@ -75,8 +82,8 @@ define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind { define <8 x i16> @t5(<8 x i16> %A, <8 x i16> %B) nounwind { ; X64-LABEL: t5: ; X64: ## BB#0: -; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 > ret <8 x i16> %tmp @@ -134,9 +141,9 @@ define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind { define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind { ; X64-LABEL: t9: ; X64: ## BB#0: -; X64-NEXT: movaps (%rdi), %xmm0 -; X64-NEXT: movhps (%rsi), %xmm0 -; 
X64-NEXT: movaps %xmm0, (%rdi) +; X64-NEXT: movapd (%rdi), %xmm0 +; X64-NEXT: movhpd (%rsi), %xmm0 +; X64-NEXT: movapd %xmm0, (%rdi) ; X64-NEXT: retq %tmp = load <4 x float>* %r %tmp.upgrd.3 = bitcast <2 x i32>* %A to double* @@ -168,13 +175,9 @@ define void @t10() nounwind { ; X64-LABEL: t10: ; X64: ## BB#0: ; X64-NEXT: movq _g1@{{.*}}(%rip), %rax -; X64-NEXT: movdqa (%rax), %xmm0 -; X64-NEXT: pextrw $4, %xmm0, %eax -; X64-NEXT: pextrw $6, %xmm0, %ecx -; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] -; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,0,0,4,5,6,7] -; X64-NEXT: pinsrw $2, %eax, %xmm0 -; X64-NEXT: pinsrw $3, %ecx, %xmm0 +; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: movq _g2@{{.*}}(%rip), %rax ; X64-NEXT: movq %xmm0, (%rax) ; X64-NEXT: retq @@ -192,10 +195,8 @@ define void @t10() nounwind { define <8 x i16> @t11(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { ; X64-LABEL: t11: ; X64: ## BB#0: ## %entry -; X64-NEXT: movd %xmm1, %eax -; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] -; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,0,0,4,5,6,7] -; X64-NEXT: pinsrw $1, %eax, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; X64-NEXT: retq entry: %tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > @@ -206,10 +207,9 @@ entry: define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { ; X64-LABEL: t12: ; X64: ## BB#0: ## %entry -; X64-NEXT: pextrw $3, %xmm1, %eax -; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] -; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,4,4] -; X64-NEXT: pinsrw $5, %eax, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7] ; X64-NEXT: retq entry: %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef > @@ -220,10 +220,9 @@ entry: define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { ; X64-LABEL: t13: ; X64: ## BB#0: ## %entry -; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-NEXT: pextrw $3, %xmm1, %eax -; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,7,4,4] -; X64-NEXT: pinsrw $4, %eax, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7] ; X64-NEXT: retq entry: %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef > @@ -233,8 +232,8 @@ entry: define <8 x i16> @t14(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { ; X64-LABEL: t14: ; X64: ## BB#0: ## %entry -; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,4,4] +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] ; X64-NEXT: retq entry: %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef 
, i32 undef > @@ -245,10 +244,12 @@ entry: define <8 x i16> @t15(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { ; X64-LABEL: t15: ; X64: ## BB#0: ## %entry -; X64-NEXT: pextrw $7, %xmm0, %eax -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] -; X64-NEXT: pinsrw $2, %eax, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] ; X64-NEXT: retq entry: %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef > @@ -259,13 +260,17 @@ entry: define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone { ; X64-LABEL: t16: ; X64: ## BB#0: ## %entry -; X64-NEXT: pextrw $8, %xmm0, %eax -; X64-NEXT: pslldq $2, %xmm0 -; X64-NEXT: andl $65280, %eax ## imm = 0xFF00 -; X64-NEXT: pextrw $1, %xmm0, %ecx -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orl %eax, %ecx -; X64-NEXT: pinsrw $1, %ecx, %xmm0 +; X64-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0] +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: packuswb %xmm0, %xmm0 ; X64-NEXT: retq entry: %tmp8 = shufflevector <16 x i8> , <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll index c8e509ce69e..76fcff63501 100644 --- a/test/CodeGen/X86/sse41.ll +++ b/test/CodeGen/X86/sse41.ll @@ -136,7 +136,7 @@ define float @ext_1(<4 x float> %v) nounwind { ; X32-LABEL: ext_1: ; X32: ## BB#0: ; X32-NEXT: pushl %eax -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0] +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; X32-NEXT: addss LCPI7_0, %xmm0 ; X32-NEXT: movss %xmm0, (%esp) ; X32-NEXT: flds (%esp) @@ -145,7 +145,7 @@ define float @ext_1(<4 x float> %v) nounwind { ; ; X64-LABEL: ext_1: ; X64: ## BB#0: -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0] +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; X64-NEXT: addss {{.*}}(%rip), %xmm0 ; X64-NEXT: retq %s = extractelement <4 x float> %v, i32 3 @@ -156,7 +156,7 @@ define float @ext_2(<4 x float> %v) nounwind { ; X32-LABEL: ext_2: ; X32: ## BB#0: ; X32-NEXT: pushl %eax -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0] +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; X32-NEXT: movss %xmm0, (%esp) ; X32-NEXT: flds (%esp) ; X32-NEXT: popl %eax @@ -164,7 +164,7 @@ define float @ext_2(<4 x float> 
%v) nounwind { ; ; X64-LABEL: ext_2: ; X64: ## BB#0: -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0] +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; X64-NEXT: retq %s = extractelement <4 x float> %v, i32 3 ret float %s @@ -291,20 +291,20 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind { ; X32-LABEL: buildvector: ; X32: ## BB#0: ## %entry -; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,0,0] -; X32-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,0,0,0] +; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] ; X32-NEXT: addss %xmm1, %xmm0 -; X32-NEXT: addss %xmm2, %xmm3 -; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X32-NEXT: addss %xmm2, %xmm1 +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; X32-NEXT: retl ; ; X64-LABEL: buildvector: ; X64: ## BB#0: ## %entry -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,0,0] -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,0,0,0] +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] ; X64-NEXT: addss %xmm1, %xmm0 -; X64-NEXT: addss %xmm2, %xmm3 -; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X64-NEXT: addss %xmm2, %xmm1 +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; X64-NEXT: retq entry: %tmp7 = extractelement <2 x float> %A, i32 0 @@ -356,12 +356,12 @@ define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocaptu ; X32-LABEL: pinsrd_from_shufflevector_i32: ; X32: ## BB#0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pinsrd $3, (%eax), %xmm0 +; X32-NEXT: insertps $48, (%eax), %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: pinsrd_from_shufflevector_i32: ; X64: ## BB#0: ## %entry -; X64-NEXT: pinsrd $3, (%rdi), %xmm0 +; X64-NEXT: insertps $48, (%rdi), %xmm0 ; X64-NEXT: retq entry: %0 = load <4 x i32>* %pb, align 16 @@ -522,16 +522,16 @@ define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) { ; X32-LABEL: shuf_X00A: ; X32: ## BB#0: ; X32-NEXT: xorps %xmm2, %xmm2 -; X32-NEXT: blendps $1, %xmm0, %xmm2 -; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[0] +; X32-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],zero,zero,xmm1[0] ; X32-NEXT: movaps %xmm2, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: shuf_X00A: ; X64: ## BB#0: ; X64-NEXT: xorps %xmm2, %xmm2 -; X64-NEXT: blendps $1, %xmm0, %xmm2 -; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[0] +; X64-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],zero,zero,xmm1[0] ; X64-NEXT: movaps %xmm2, %xmm0 ; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 0 @@ -546,16 +546,16 @@ define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) { ; X32-LABEL: shuf_X00X: ; X32: ## BB#0: ; X32-NEXT: xorps %xmm1, %xmm1 -; X32-NEXT: blendps $1, %xmm0, %xmm1 -; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] +; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,zero,xmm0[0] ; X32-NEXT: movaps %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: shuf_X00X: ; X64: ## BB#0: ; X64-NEXT: xorps %xmm1, %xmm1 -; X64-NEXT: blendps $1, %xmm0, %xmm1 -; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] +; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,zero,xmm0[0] ; X64-NEXT: movaps %xmm1, %xmm0 ; X64-NEXT: retq %vecext = extractelement <4 
x float> %x, i32 0 @@ -570,8 +570,8 @@ define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) { ; X32-LABEL: shuf_X0YC: ; X32: ## BB#0: ; X32-NEXT: xorps %xmm2, %xmm2 -; X32-NEXT: blendps $1, %xmm0, %xmm2 -; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,0] +; X32-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],zero,xmm0[1],zero ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2] ; X32-NEXT: movaps %xmm2, %xmm0 ; X32-NEXT: retl @@ -579,8 +579,8 @@ define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) { ; X64-LABEL: shuf_X0YC: ; X64: ## BB#0: ; X64-NEXT: xorps %xmm2, %xmm2 -; X64-NEXT: blendps $1, %xmm0, %xmm2 -; X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,0] +; X64-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],zero,xmm0[1],zero ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2] ; X64-NEXT: movaps %xmm2, %xmm0 ; X64-NEXT: retq @@ -692,7 +692,7 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) { ; X32-LABEL: i32_shuf_X00A: ; X32: ## BB#0: ; X32-NEXT: xorps %xmm2, %xmm2 -; X32-NEXT: blendps $1, %xmm0, %xmm2 +; X32-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[0] ; X32-NEXT: movaps %xmm2, %xmm0 ; X32-NEXT: retl @@ -700,7 +700,7 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) { ; X64-LABEL: i32_shuf_X00A: ; X64: ## BB#0: ; X64-NEXT: xorps %xmm2, %xmm2 -; X64-NEXT: blendps $1, %xmm0, %xmm2 +; X64-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[0] ; X64-NEXT: movaps %xmm2, %xmm0 ; X64-NEXT: retq @@ -716,7 +716,7 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) { ; X32-LABEL: i32_shuf_X00X: ; X32: ## BB#0: ; X32-NEXT: xorps %xmm1, %xmm1 -; X32-NEXT: blendps $1, %xmm0, %xmm1 +; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] ; X32-NEXT: movaps %xmm1, %xmm0 ; X32-NEXT: retl @@ -724,7 +724,7 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) { ; X64-LABEL: i32_shuf_X00X: ; X64: ## BB#0: ; X64-NEXT: xorps %xmm1, %xmm1 -; X64-NEXT: blendps $1, %xmm0, %xmm1 +; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] ; X64-NEXT: movaps %xmm1, %xmm0 ; X64-NEXT: retq @@ -740,8 +740,8 @@ define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) { ; X32-LABEL: i32_shuf_X0YC: ; X32: ## BB#0: ; X32-NEXT: xorps %xmm2, %xmm2 -; X32-NEXT: blendps $1, %xmm0, %xmm2 -; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,0] +; X32-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1],zero ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2] ; X32-NEXT: movaps %xmm2, %xmm0 ; X32-NEXT: retl @@ -749,8 +749,8 @@ define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) { ; X64-LABEL: i32_shuf_X0YC: ; X64: ## BB#0: ; X64-NEXT: xorps %xmm2, %xmm2 -; X64-NEXT: blendps $1, %xmm0, %xmm2 -; X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,0] +; X64-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1],zero ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2] ; X64-NEXT: movaps %xmm2, %xmm0 ; X64-NEXT: retq @@ -870,12 +870,16 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 
X32-NEXT: insertps $48, (%ecx,%eax,4), %xmm0 +; X32-NEXT: movss (%ecx,%eax,4), %xmm1 +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; X32-NEXT: retl ; ; X64-LABEL: insertps_from_broadcast_loadf32: ; X64: ## BB#0: -; X64-NEXT: insertps $48, (%rdi,%rsi,4), %xmm0 +; X64-NEXT: movss (%rdi,%rsi,4), %xmm1 +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; X64-NEXT: retq %1 = getelementptr inbounds float* %fb, i64 %index %2 = load float* %1, align 4 @@ -891,12 +895,16 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float ; X32-LABEL: insertps_from_broadcast_loadv4f32: ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: insertps $48, (%eax), %xmm0 +; X32-NEXT: movups (%eax), %xmm1 +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; X32-NEXT: retl ; ; X64-LABEL: insertps_from_broadcast_loadv4f32: ; X64: ## BB#0: -; X64-NEXT: insertps $48, (%rdi), %xmm0 +; X64-NEXT: movups (%rdi), %xmm1 +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; X64-NEXT: retq %1 = load <4 x float>* %b, align 4 %2 = extractelement <4 x float> %1, i32 0 @@ -915,7 +923,7 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movss (%ecx,%eax,4), %xmm4 -; X32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; X32-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0] ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] @@ -928,7 +936,7 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X64-LABEL: insertps_from_broadcast_multiple_use: ; X64: ## BB#0: ; X64-NEXT: movss (%rdi,%rsi,4), %xmm4 -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; X64-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0] ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] @@ -958,14 +966,14 @@ define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) { ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movss (%eax), %xmm1 -; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3] +; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm0[0],xmm1[3] ; X32-NEXT: movaps %xmm1, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: insertps_with_undefs: ; X64: ## BB#0: ; X64-NEXT: movss (%rdi), %xmm1 -; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3] +; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm0[0],xmm1[3] ; X64-NEXT: movaps %xmm1, %xmm0 ; X64-NEXT: retq %1 = load float* %b, align 4 @@ -980,12 +988,12 @@ define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) { ; X32-LABEL: pr20087: ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: insertps $48, 8(%eax), %xmm0 +; X32-NEXT: insertps $-78, 8(%eax), %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: pr20087: ; X64: ## BB#0: -; X64-NEXT: insertps $48, 8(%rdi), %xmm0 +; X64-NEXT: insertps $-78, 8(%rdi), %xmm0 ; X64-NEXT: retq %load = load <4 x float> *%ptr %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> @@ -997,18 +1005,16 @@ define void @insertps_pr20411(i32* noalias nocapture %RET) #1 { ; X32-LABEL: 
insertps_pr20411: ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movaps {{.*#+}} xmm0 = [4,5,6,7] -; X32-NEXT: pshufd {{.*#+}} xmm1 = mem[3,0,0,0] -; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[3],xmm1[2,3] -; X32-NEXT: movups %xmm1, (%eax) +; X32-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3] +; X32-NEXT: insertps $-36, LCPI49_1+12, %xmm0 +; X32-NEXT: movups %xmm0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: insertps_pr20411: ; X64: ## BB#0: -; X64-NEXT: movaps {{.*#+}} xmm0 = [4,5,6,7] -; X64-NEXT: pshufd {{.*#+}} xmm1 = mem[3,0,0,0] -; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[3],xmm1[2,3] -; X64-NEXT: movups %xmm1, (%rdi) +; X64-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3] +; X64-NEXT: insertps $-36, LCPI49_1+{{.*}}(%rip), %xmm0 +; X64-NEXT: movups %xmm0, (%rdi) ; X64-NEXT: retq %gather_load = shufflevector <8 x i32> , <8 x i32> undef, <8 x i32> %shuffle109 = shufflevector <4 x i32> , <4 x i32> undef, <4 x i32> ; 4 5 6 7 diff --git a/test/CodeGen/X86/swizzle-2.ll b/test/CodeGen/X86/swizzle-2.ll index 5472193a600..697af843abb 100644 --- a/test/CodeGen/X86/swizzle-2.ll +++ b/test/CodeGen/X86/swizzle-2.ll @@ -151,7 +151,7 @@ define <4 x i32> @swizzle_14(<4 x i32> %v) { define <4 x float> @swizzle_15(<4 x float> %v) { ; CHECK-LABEL: swizzle_15: ; CHECK: # BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,3,2] ; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> @@ -161,7 +161,7 @@ define <4 x float> @swizzle_15(<4 x float> %v) { define <4 x float> @swizzle_16(<4 x float> %v) { ; CHECK-LABEL: swizzle_16: ; CHECK: # BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,0] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,0] ; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> @@ -171,7 +171,7 @@ define <4 x float> @swizzle_16(<4 x float> %v) { define <4 x float> @swizzle_17(<4 x float> %v) { ; CHECK-LABEL: swizzle_17: ; CHECK: # BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,3,2] ; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> @@ -181,7 +181,7 @@ define <4 x float> @swizzle_17(<4 x float> %v) { define <4 x float> @swizzle_18(<4 x float> %v) { ; CHECK-LABEL: swizzle_18: ; CHECK: # BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,0,2] ; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> @@ -191,7 +191,7 @@ define <4 x float> @swizzle_18(<4 x float> %v) { define <4 x float> @swizzle_19(<4 x float> %v) { ; CHECK-LABEL: swizzle_19: ; CHECK: # BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> @@ -201,7 +201,7 @@ define <4 x float> @swizzle_19(<4 x float> %v) { define <4 x float> @swizzle_20(<4 x float> %v) { ; CHECK-LABEL: swizzle_20: ; CHECK: # BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,1,3] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] ; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %2 = 
shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> @@ -211,7 +211,7 @@ define <4 x float> @swizzle_20(<4 x float> %v) { define <4 x float> @swizzle_21(<4 x float> %v) { ; CHECK-LABEL: swizzle_21: ; CHECK: # BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,1] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] ; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> @@ -221,7 +221,7 @@ define <4 x float> @swizzle_21(<4 x float> %v) { define <4 x float> @swizzle_22(<4 x float> %v) { ; CHECK-LABEL: swizzle_22: ; CHECK: # BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,0] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,0] ; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> @@ -231,7 +231,7 @@ define <4 x float> @swizzle_22(<4 x float> %v) { define <4 x float> @swizzle_23(<4 x float> %v) { ; CHECK-LABEL: swizzle_23: ; CHECK: # BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> @@ -241,7 +241,7 @@ define <4 x float> @swizzle_23(<4 x float> %v) { define <4 x float> @swizzle_24(<4 x float> %v) { ; CHECK-LABEL: swizzle_24: ; CHECK: # BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,0,3] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2,0,3] ; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> @@ -251,7 +251,7 @@ define <4 x float> @swizzle_24(<4 x float> %v) { define <4 x float> @swizzle_25(<4 x float> %v) { ; CHECK-LABEL: swizzle_25: ; CHECK: # BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> @@ -261,7 +261,7 @@ define <4 x float> @swizzle_25(<4 x float> %v) { define <4 x float> @swizzle_26(<4 x float> %v) { ; CHECK-LABEL: swizzle_26: ; CHECK: # BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,1,2] ; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> @@ -271,7 +271,7 @@ define <4 x float> @swizzle_26(<4 x float> %v) { define <4 x float> @swizzle_27(<4 x float> %v) { ; CHECK-LABEL: swizzle_27: ; CHECK: # BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> @@ -281,7 +281,7 @@ define <4 x float> @swizzle_27(<4 x float> %v) { define <4 x float> @swizzle_28(<4 x float> %v) { ; CHECK-LABEL: swizzle_28: ; CHECK: # BB#0: -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,2,1] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,2,1] ; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> @@ -291,7 +291,7 @@ define <4 x float> @swizzle_28(<4 x float> %v) { define <4 x float> @swizzle_29(<4 x float> %v) { ; CHECK-LABEL: swizzle_29: ; CHECK: # BB#0: -; CHECK-NEXT: pshufd {{.*#+}} 
xmm0 = xmm0[1,3,2,0] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,0] ; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> @@ -334,7 +334,8 @@ define <8 x i16> @swizzle_32(<8 x i16> %v) { define <8 x i16> @swizzle_33(<8 x i16> %v) { ; CHECK-LABEL: swizzle_33: ; CHECK: # BB#0: -; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,6,7,0,1,10,11,14,15,12,13,8,9] +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4] ; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> @@ -344,7 +345,8 @@ define <8 x i16> @swizzle_33(<8 x i16> %v) { define <8 x i16> @swizzle_34(<8 x i16> %v) { ; CHECK-LABEL: swizzle_34: ; CHECK: # BB#0: -; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,0,1,4,5,14,15,12,13,8,9,10,11] +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,0,2,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5] ; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> @@ -354,7 +356,8 @@ define <8 x i16> @swizzle_34(<8 x i16> %v) { define <8 x i16> @swizzle_35(<8 x i16> %v) { ; CHECK-LABEL: swizzle_35: ; CHECK: # BB#0: -; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,8,9,10,11,14,15,12,13] +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] ; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> @@ -364,7 +367,8 @@ define <8 x i16> @swizzle_35(<8 x i16> %v) { define <8 x i16> @swizzle_36(<8 x i16> %v) { ; CHECK-LABEL: swizzle_36: ; CHECK: # BB#0: -; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,8,9,12,13,10,11,14,15] +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] ; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> @@ -384,7 +388,8 @@ define <8 x i16> @swizzle_37(<8 x i16> %v) { define <8 x i16> @swizzle_38(<8 x i16> %v) { ; CHECK-LABEL: swizzle_38: ; CHECK: # BB#0: -; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,10,11,8,9,12,13,14,15] +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] ; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> @@ -394,7 +399,8 @@ define <8 x i16> @swizzle_38(<8 x i16> %v) { define <8 x i16> @swizzle_39(<8 x i16> %v) { ; CHECK-LABEL: swizzle_39: ; CHECK: # BB#0: -; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,2,3,0,1,14,15,12,13,8,9,10,11] +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,3,1,0,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5] ; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> @@ -404,7 +410,8 @@ define <8 x i16> @swizzle_39(<8 x i16> %v) { define <8 x i16> @swizzle_40(<8 x i16> %v) { ; CHECK-LABEL: swizzle_40: ; CHECK: # BB#0: -; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,0,1,8,9,12,13,10,11,14,15] +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,4,6,5,7] ; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> @@ -414,7 +421,8 @@ define <8 x i16> @swizzle_40(<8 x i16> %v) { define <8 x i16> @swizzle_41(<8 x i16> %v) { ; CHECK-LABEL: swizzle_41: ; CHECK: # BB#0: -; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,4,5,2,3,0,1,12,13,14,15,8,9,10,11] +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] ; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> diff --git a/test/CodeGen/X86/swizzle.ll b/test/CodeGen/X86/swizzle.ll deleted file mode 100644 index 23e0c2453d6..00000000000 --- a/test/CodeGen/X86/swizzle.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movlps -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movsd -; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep movups -; rdar://6523650 - - %struct.vector4_t = type { <4 x float> } - -define void @swizzle(i8* nocapture %a, %struct.vector4_t* nocapture %b, %struct.vector4_t* nocapture %c) nounwind { -entry: - %0 = getelementptr %struct.vector4_t* %b, i32 0, i32 0 ; <<4 x float>*> [#uses=2] - %1 = load <4 x float>* %0, align 4 ; <<4 x float>> [#uses=1] - %tmp.i = bitcast i8* %a to double* ; [#uses=1] - %tmp1.i = load double* %tmp.i ; [#uses=1] - %2 = insertelement <2 x double> undef, double %tmp1.i, i32 0 ; <<2 x double>> [#uses=1] - %tmp2.i = bitcast <2 x double> %2 to <4 x float> ; <<4 x float>> [#uses=1] - %3 = shufflevector <4 x float> %1, <4 x float> %tmp2.i, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x float>> [#uses=1] - store <4 x float> %3, <4 x float>* %0, align 4 - ret void -} diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll index b981871d94b..8de6297906c 100644 --- a/test/CodeGen/X86/trunc-ext-ld-st.ll +++ b/test/CodeGen/X86/trunc-ext-ld-st.ll @@ -20,7 +20,7 @@ define void @load_2_i8(<2 x i8>* %A) { ; Read 32-bits ;CHECK: pmovzxwq ;CHECK: paddq -;CHECK: pshufb +;CHECK: pshufd ;CHECK: movd ;CHECK: ret define void @load_2_i16(<2 x i16>* %A) { diff --git a/test/CodeGen/X86/uint_to_fp-2.ll b/test/CodeGen/X86/uint_to_fp-2.ll index f8c8d4b5acb..e47f15453ed 100644 --- a/test/CodeGen/X86/uint_to_fp-2.ll +++ b/test/CodeGen/X86/uint_to_fp-2.ll @@ -5,7 +5,7 @@ define float @test1(i32 %x) nounwind readnone { ; CHECK-LABEL: test1: ; CHECK: # BB#0: # %entry ; CHECK-NEXT: pushl %eax -; CHECK-NEXT: movsd {{.*}}, %xmm0 +; CHECK-NEXT: movsd .LCPI0_0, %xmm0 ; CHECK-NEXT: movd {{[0-9]+}}(%esp), %xmm1 ; CHECK-NEXT: orps %xmm0, %xmm1 ; CHECK-NEXT: subsd %xmm0, %xmm1 @@ -27,7 +27,7 @@ define float @test2(<4 x i32> %x) nounwind readnone ssp { ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: movss %xmm0, %xmm1 -; CHECK-NEXT: movsd {{.*}}, %xmm0 +; CHECK-NEXT: movsd .LCPI1_0, %xmm0 ; CHECK-NEXT: orps %xmm0, %xmm1 ; CHECK-NEXT: subsd %xmm0, %xmm1 ; CHECK-NEXT: xorps %xmm0, %xmm0 diff --git a/test/CodeGen/X86/v2f32.ll b/test/CodeGen/X86/v2f32.ll index e5ad698258f..d73b9da0f2c 100644 --- a/test/CodeGen/X86/v2f32.ll +++ b/test/CodeGen/X86/v2f32.ll @@ -5,7 +5,7 @@ define void @test1(<2 x float> %Q, float *%P2) nounwind { ; X64-LABEL: test1: ; X64: # BB#0: -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,0,0] +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X64-NEXT: addss %xmm0, %xmm1 ; X64-NEXT: movss %xmm1, (%rdi) ; X64-NEXT: retq @@ -13,7 +13,7 @@ define void @test1(<2 x 
float> %Q, float *%P2) nounwind { ; X32-LABEL: test1: ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,0,0] +; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; X32-NEXT: addss %xmm0, %xmm1 ; X32-NEXT: movss %xmm1, (%eax) ; X32-NEXT: retl diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll index 343856b290b..24830e046ea 100644 --- a/test/CodeGen/X86/vec_cast2.ll +++ b/test/CodeGen/X86/vec_cast2.ll @@ -19,7 +19,8 @@ define <8 x float> @foo1_8(<8 x i8> %src) { ; CHECK-WIDE-NEXT: vpmovzxbd %xmm0, %xmm1 ; CHECK-WIDE-NEXT: vpslld $24, %xmm1, %xmm1 ; CHECK-WIDE-NEXT: vpsrad $24, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-WIDE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0 ; CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0 ; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -51,18 +52,31 @@ define <4 x float> @foo1_4(<4 x i8> %src) { define <8 x float> @foo2_8(<8 x i8> %src) { ; CHECK-LABEL: foo2_8: ; CHECK: ## BB#0: -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7] -; CHECK-NEXT: vpmovzxwd %xmm0, %xmm0 -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vpmovzxwd %xmm0, %xmm1 +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: vandps LCPI2_0, %ymm0, %ymm0 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-NEXT: retl ; ; CHECK-WIDE-LABEL: foo2_8: ; CHECK-WIDE: ## BB#0: -; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-WIDE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-WIDE-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; CHECK-WIDE-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; CHECK-WIDE-NEXT: vmovdqa {{.*#+}} xmm5 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; CHECK-WIDE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u] +; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; CHECK-WIDE-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; CHECK-WIDE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vpmovzxbd %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-WIDE-NEXT: retl %res = uitofp <8 x i8> %src to <8 x 
float> @@ -78,10 +92,7 @@ define <4 x float> @foo2_4(<4 x i8> %src) { ; ; CHECK-WIDE-LABEL: foo2_4: ; CHECK-WIDE: ## BB#0: -; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; CHECK-WIDE-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3],zero,xmm1[5,6,7],zero,xmm1[9,10,11],zero,xmm1[13,14,15] -; CHECK-WIDE-NEXT: vpor %xmm0, %xmm1, %xmm0 +; CHECK-WIDE-NEXT: vpmovzxbd %xmm0, %xmm0 ; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-WIDE-NEXT: retl %res = uitofp <4 x i8> %src to <4 x float> @@ -93,23 +104,23 @@ define <8 x i8> @foo3_8(<8 x float> %src) { ; CHECK: ## BB#0: ; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128] +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl ; ; CHECK-WIDE-LABEL: foo3_8: ; CHECK-WIDE: ## BB#0: -; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,0,0,0] +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] ; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax ; CHECK-WIDE-NEXT: shll $8, %eax -; CHECK-WIDE-NEXT: vmovhlps {{.*#+}} xmm1 = xmm0[1,1] +; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx ; CHECK-WIDE-NEXT: movzbl %cl, %ecx ; CHECK-WIDE-NEXT: orl %eax, %ecx -; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,0,0] +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax ; CHECK-WIDE-NEXT: shll $8, %eax ; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx @@ -118,17 +129,17 @@ define <8 x i8> @foo3_8(<8 x float> %src) { ; CHECK-WIDE-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1 ; CHECK-WIDE-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 ; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,0,0,0] +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3] ; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax ; CHECK-WIDE-NEXT: shll $8, %eax ; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx ; CHECK-WIDE-NEXT: movzbl %cl, %ecx ; CHECK-WIDE-NEXT: orl %eax, %ecx ; CHECK-WIDE-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,0,0,0] +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] ; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax ; CHECK-WIDE-NEXT: shll $8, %eax -; CHECK-WIDE-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx ; CHECK-WIDE-NEXT: movzbl %cl, %ecx ; CHECK-WIDE-NEXT: orl %eax, %ecx @@ -147,14 +158,14 @@ define <4 x i8> @foo3_4(<4 x float> %src) { ; ; CHECK-WIDE-LABEL: foo3_4: ; CHECK-WIDE: ## BB#0: -; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,0,0,0] +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] ; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax ; CHECK-WIDE-NEXT: shll $8, %eax -; CHECK-WIDE-NEXT: vmovhlps {{.*#+}} xmm1 = xmm0[1,1] +; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx ; CHECK-WIDE-NEXT: movzbl %cl, %ecx ; CHECK-WIDE-NEXT: orl %eax, %ecx -; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,0,0] +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] ; 
CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax ; CHECK-WIDE-NEXT: shll $8, %eax ; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx diff --git a/test/CodeGen/X86/vec_extract-sse4.ll b/test/CodeGen/X86/vec_extract-sse4.ll index 51f7a9898fa..530911add12 100644 --- a/test/CodeGen/X86/vec_extract-sse4.ll +++ b/test/CodeGen/X86/vec_extract-sse4.ll @@ -1,11 +1,12 @@ ; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse4.1 | FileCheck %s define void @t1(float* %R, <4 x float>* %P1) nounwind { -; CHECK-LABEL: @t1 -; CHECK: movl 4(%esp), %[[R0:e[abcd]x]] -; CHECK-NEXT: movl 8(%esp), %[[R1:e[abcd]x]] -; CHECK-NEXT: movss 12(%[[R1]]), %[[R2:xmm.*]] -; CHECK-NEXT: movss %[[R2]], (%[[R0]]) +; CHECK-LABEL: t1: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movss 12(%ecx), %xmm0 +; CHECK-NEXT: movss %xmm0, (%eax) ; CHECK-NEXT: retl %X = load <4 x float>* %P1 @@ -15,9 +16,15 @@ define void @t1(float* %R, <4 x float>* %P1) nounwind { } define float @t2(<4 x float>* %P1) nounwind { -; CHECK-LABEL: @t2 -; CHECK: movl 4(%esp), %[[R0:e[abcd]x]] -; CHECK-NEXT: flds 8(%[[R0]]) +; CHECK-LABEL: t2: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movapd (%eax), %xmm0 +; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: movss %xmm0, (%esp) +; CHECK-NEXT: flds (%esp) +; CHECK-NEXT: popl %eax ; CHECK-NEXT: retl %X = load <4 x float>* %P1 @@ -26,11 +33,12 @@ define float @t2(<4 x float>* %P1) nounwind { } define void @t3(i32* %R, <4 x i32>* %P1) nounwind { -; CHECK-LABEL: @t3 -; CHECK: movl 4(%esp), %[[R0:e[abcd]x]] -; CHECK-NEXT: movl 8(%esp), %[[R1:e[abcd]x]] -; CHECK-NEXT: movl 12(%[[R1]]), %[[R2:e[abcd]x]] -; CHECK-NEXT: movl %[[R2]], (%[[R0]]) +; CHECK-LABEL: t3: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl 12(%ecx), %ecx +; CHECK-NEXT: movl %ecx, (%eax) ; CHECK-NEXT: retl %X = load <4 x i32>* %P1 @@ -40,9 +48,10 @@ define void @t3(i32* %R, <4 x i32>* %P1) nounwind { } define i32 @t4(<4 x i32>* %P1) nounwind { -; CHECK-LABEL: @t4 -; CHECK: movl 4(%esp), %[[R0:e[abcd]x]] -; CHECK-NEXT: movl 12(%[[R0]]), %eax +; CHECK-LABEL: t4: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl 12(%eax), %eax ; CHECK-NEXT: retl %X = load <4 x i32>* %P1 diff --git a/test/CodeGen/X86/vec_extract.ll b/test/CodeGen/X86/vec_extract.ll index 6391ef61682..f677c00355e 100644 --- a/test/CodeGen/X86/vec_extract.ll +++ b/test/CodeGen/X86/vec_extract.ll @@ -2,8 +2,12 @@ define void @test1(<4 x float>* %F, float* %f) nounwind { ; CHECK-LABEL: test1: -; CHECK: addps %[[X:xmm[0-9]+]], %[[X]] -; CHECK-NEXT: movss %[[X]], {{.*}}(%{{.*}}) +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movaps (%ecx), %xmm0 +; CHECK-NEXT: addps %xmm0, %xmm0 +; CHECK-NEXT: movss %xmm0, (%eax) ; CHECK-NEXT: retl entry: %tmp = load <4 x float>* %F ; <<4 x float>> [#uses=2] @@ -15,10 +19,16 @@ entry: define float @test2(<4 x float>* %F, float* %f) nounwind { ; CHECK-LABEL: test2: -; CHECK: addps %[[X:xmm[0-9]+]], %[[X]] -; CHECK-NEXT: movhlps %[[X]], %[[X2:xmm[0-9]+]] -; CHECK-NEXT: movss %[[X2]], [[mem:.*\(%.*\)]] -; CHECK-NEXT: flds [[mem]] +; CHECK: # BB#0: # %entry +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movaps (%eax), %xmm0 +; CHECK-NEXT: addps %xmm0, %xmm0 +; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: 
movss %xmm0, (%esp) +; CHECK-NEXT: flds (%esp) +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl entry: %tmp = load <4 x float>* %F ; <<4 x float>> [#uses=2] %tmp7 = fadd <4 x float> %tmp, %tmp ; <<4 x float>> [#uses=1] @@ -28,8 +38,11 @@ entry: define void @test3(float* %R, <4 x float>* %P1) nounwind { ; CHECK-LABEL: test3: -; CHECK: movss {{.*}}(%{{.*}}), %[[X:xmm[0-9]+]] -; CHECK-NEXT: movss %[[X]], {{.*}}(%{{.*}}) +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movss 12(%ecx), %xmm0 +; CHECK-NEXT: movss %xmm0, (%eax) ; CHECK-NEXT: retl entry: %X = load <4 x float>* %P1 ; <<4 x float>> [#uses=1] @@ -40,11 +53,15 @@ entry: define double @test4(double %A) nounwind { ; CHECK-LABEL: test4: -; CHECK: calll {{.*}}foo -; CHECK-NEXT: movhlps %[[X:xmm[0-9]+]], %[[X]] -; CHECK-NEXT: addsd {{.*}}(%{{.*}}), %[[X2]] -; CHECK-NEXT: movsd %[[X2]], [[mem:.*\(%.*\)]] -; CHECK-NEXT: fldl [[mem]] +; CHECK: # BB#0: # %entry +; CHECK-NEXT: subl $12, %esp +; CHECK-NEXT: calll foo +; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: addsd {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: movsd %xmm0, (%esp) +; CHECK-NEXT: fldl (%esp) +; CHECK-NEXT: addl $12, %esp +; CHECK-NEXT: retl entry: %tmp1 = call <2 x double> @foo( ) ; <<2 x double>> [#uses=1] %tmp2 = extractelement <2 x double> %tmp1, i32 1 ; [#uses=1] diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll index 05ca3a478a1..f2933431aee 100644 --- a/test/CodeGen/X86/vec_insert-5.ll +++ b/test/CodeGen/X86/vec_insert-5.ll @@ -8,7 +8,7 @@ define void @t1(i32 %a, x86_mmx* %P) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: shll $12, %ecx ; CHECK-NEXT: movd %ecx, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1] ; CHECK-NEXT: movlpd %xmm0, (%eax) ; CHECK-NEXT: retl %tmp12 = shl i32 %a, 12 @@ -23,8 +23,10 @@ define <4 x float> @t2(<4 x float>* %P) nounwind { ; CHECK-LABEL: t2: ; CHECK: # BB#0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movdqa (%eax), %xmm0 -; CHECK-NEXT: pslldq $12, %xmm0 +; CHECK-NEXT: movaps (%eax), %xmm1 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] ; CHECK-NEXT: retl %tmp1 = load <4 x float>* %P %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 > @@ -35,8 +37,9 @@ define <4 x float> @t3(<4 x float>* %P) nounwind { ; CHECK-LABEL: t3: ; CHECK: # BB#0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movdqa (%eax), %xmm0 -; CHECK-NEXT: psrldq $8, %xmm0 +; CHECK-NEXT: movaps (%eax), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,0] ; CHECK-NEXT: retl %tmp1 = load <4 x float>* %P %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 > @@ -47,8 +50,10 @@ define <4 x float> @t4(<4 x float>* %P) nounwind { ; CHECK-LABEL: t4: ; CHECK: # BB#0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movdqa (%eax), %xmm0 -; CHECK-NEXT: psrldq $12, %xmm0 +; CHECK-NEXT: movaps (%eax), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0] ; CHECK-NEXT: retl %tmp1 = load <4 x float>* %P %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 
> @@ -58,7 +63,10 @@ define <4 x float> @t4(<4 x float>* %P) nounwind { define <16 x i8> @t5(<16 x i8> %x) nounwind { ; CHECK-LABEL: t5: ; CHECK: # BB#0: -; CHECK-NEXT: psrldq $1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,1] +; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero +; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: retl %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %s @@ -76,7 +84,7 @@ define <16 x i8> @t6(<16 x i8> %x) nounwind { define <16 x i8> @t7(<16 x i8> %x) nounwind { ; CHECK-LABEL: t7: ; CHECK: # BB#0: -; CHECK-NEXT: pslldq $13, %xmm0 +; CHECK-NEXT: palignr {{.*#+}} xmm0 = xmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2] ; CHECK-NEXT: retl %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> ret <16 x i8> %s diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll index b38b8bfb81f..a13c813ea7b 100644 --- a/test/CodeGen/X86/vec_set-3.ll +++ b/test/CodeGen/X86/vec_set-3.ll @@ -1,15 +1,9 @@ ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | FileCheck %s -; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-EXP define <4 x float> @test(float %a) { ; CHECK-LABEL: test: -; CHECK: movss {{.*}}, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; CHECK: insertps $29, {{.*}}, %xmm0 ; CHECK-NEXT: retl -; -; CHECK-EXP-LABEL: test: -; CHECK-EXP: insertps $29, {{.*}}, %xmm0 -; CHECK-EXP-NEXT: retl entry: %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1 @@ -23,11 +17,6 @@ define <2 x i64> @test2(i32 %a) { ; CHECK: movd {{.*}}, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] ; CHECK-NEXT: retl -; -; CHECK-EXP-LABEL: test2: -; CHECK-EXP: movd {{.*}}, %xmm0 -; CHECK-EXP-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] -; CHECK-EXP-NEXT: retl entry: %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2 @@ -38,14 +27,8 @@ entry: define <4 x float> @test3(<4 x float> %A) { ; CHECK-LABEL: test3: -; CHECK: xorps %[[X1:xmm[0-9]+]], %[[X1]] -; CHECK-NEXT: blendps $1, %xmm0, %[[X1]] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = [[X1]][1,0,1,1] +; CHECK: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero ; CHECK-NEXT: retl -; -; CHECK-EXP-LABEL: test3: -; CHECK-EXP: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero -; CHECK-EXP-NEXT: retl %tmp0 = extractelement <4 x float> %A, i32 0 %tmp1 = insertelement <4 x float> , float %tmp0, i32 1 diff --git a/test/CodeGen/X86/vec_set-5.ll b/test/CodeGen/X86/vec_set-5.ll deleted file mode 100644 index f811a7404a2..00000000000 --- a/test/CodeGen/X86/vec_set-5.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 -o %t -; RUN: grep movlhps %t | count 1 -; RUN: grep movq %t | count 2 - -define <4 x float> @test1(float %a, float %b) nounwind { - %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0 ; <<4 x float>> [#uses=1] - %tmp6 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1] - %tmp8 = insertelement <4 x float> %tmp6, float %b, i32 2 ; <<4 x float>> [#uses=1] - %tmp9 = insertelement <4 x float> %tmp8, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1] - ret <4 x float> %tmp9 -} - -define <4 x float> @test2(float %a, float %b) nounwind { - %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0 ; <<4 x float>> [#uses=1] - %tmp7 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1] - %tmp8 = 
insertelement <4 x float> %tmp7, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1] - %tmp9 = insertelement <4 x float> %tmp8, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1] - ret <4 x float> %tmp9 -} - -define <2 x i64> @test3(i32 %a, i32 %b) nounwind { - %tmp = insertelement <4 x i32> zeroinitializer, i32 %a, i32 0 ; <<4 x i32>> [#uses=1] - %tmp6 = insertelement <4 x i32> %tmp, i32 %b, i32 1 ; <<4 x i32>> [#uses=1] - %tmp8 = insertelement <4 x i32> %tmp6, i32 0, i32 2 ; <<4 x i32>> [#uses=1] - %tmp10 = insertelement <4 x i32> %tmp8, i32 0, i32 3 ; <<4 x i32>> [#uses=1] - %tmp11 = bitcast <4 x i32> %tmp10 to <2 x i64> ; <<2 x i64>> [#uses=1] - ret <2 x i64> %tmp11 -} diff --git a/test/CodeGen/X86/vec_set-9.ll b/test/CodeGen/X86/vec_set-9.ll deleted file mode 100644 index a73909097c1..00000000000 --- a/test/CodeGen/X86/vec_set-9.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -march=x86-64 -mattr=-avx,-pad-short-functions | FileCheck %s - -; CHECK: test3 -; CHECK: movd -; CHECK-NOT: movd -; CHECK: {{movlhps.*%xmm0, %xmm0}} -; CHECK-NEXT: ret - -define <2 x i64> @test3(i64 %A) nounwind { -entry: - %B = insertelement <2 x i64> undef, i64 %A, i32 1 - ret <2 x i64> %B -} - diff --git a/test/CodeGen/X86/vec_set-E.ll b/test/CodeGen/X86/vec_set-E.ll deleted file mode 100644 index d78be669fc7..00000000000 --- a/test/CodeGen/X86/vec_set-E.ll +++ /dev/null @@ -1,9 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movq - -define <4 x float> @t(float %X) nounwind { - %tmp11 = insertelement <4 x float> undef, float %X, i32 0 - %tmp12 = insertelement <4 x float> %tmp11, float %X, i32 1 - %tmp27 = insertelement <4 x float> %tmp12, float 0.000000e+00, i32 2 - %tmp28 = insertelement <4 x float> %tmp27, float 0.000000e+00, i32 3 - ret <4 x float> %tmp28 -} diff --git a/test/CodeGen/X86/vec_set-I.ll b/test/CodeGen/X86/vec_set-I.ll deleted file mode 100644 index c5d6ab88a35..00000000000 --- a/test/CodeGen/X86/vec_set-I.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s - -; CHECK-NOT: xorp -; CHECK: movd -; CHECK-NOT: xorp - -define void @t1() nounwind { - %tmp298.i.i = load <4 x float>* null, align 16 - %tmp304.i.i = bitcast <4 x float> %tmp298.i.i to <4 x i32> - %tmp305.i.i = and <4 x i32> %tmp304.i.i, < i32 -1, i32 0, i32 0, i32 0 > - store <4 x i32> %tmp305.i.i, <4 x i32>* null, align 16 - unreachable -} diff --git a/test/CodeGen/X86/vec_set-J.ll b/test/CodeGen/X86/vec_set-J.ll deleted file mode 100644 index d90ab85b8cf..00000000000 --- a/test/CodeGen/X86/vec_set-J.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movss -; PR2472 - -define <4 x i32> @a(<4 x i32> %a) nounwind { -entry: - %vecext = extractelement <4 x i32> %a, i32 0 - insertelement <4 x i32> zeroinitializer, i32 %vecext, i32 0 - %add = add <4 x i32> %a, %0 - ret <4 x i32> %add -} diff --git a/test/CodeGen/X86/vec_shuffle-11.ll b/test/CodeGen/X86/vec_shuffle-11.ll deleted file mode 100644 index 640745ae264..00000000000 --- a/test/CodeGen/X86/vec_shuffle-11.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 -; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin | not grep mov - -define <4 x i32> @test() nounwind { - %tmp131 = call <2 x i64> @llvm.x86.sse2.psrl.dq( <2 x i64> < i64 -1, i64 -1 >, i32 96 ) ; <<2 x i64>> [#uses=1] - %tmp137 = bitcast <2 x i64> %tmp131 to <4 x i32> ; <<4 x i32>> [#uses=1] - %tmp138 = and <4 x i32> %tmp137, bitcast (<2 x i64> < i64 -1, i64 -1 > to <4 x i32>) ; <<4 x i32>> [#uses=1] - ret <4 x 
i32> %tmp138 -} - -declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) diff --git a/test/CodeGen/X86/vec_shuffle-14.ll b/test/CodeGen/X86/vec_shuffle-14.ll deleted file mode 100644 index 8f2519728b7..00000000000 --- a/test/CodeGen/X86/vec_shuffle-14.ll +++ /dev/null @@ -1,70 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s -check-prefix=X86-32 -; RUN: llc < %s -march=x86-64 -mattr=+sse2,-avx | FileCheck %s -check-prefix=X86-64 - -define <4 x i32> @t1(i32 %a) nounwind { -entry: - %tmp = insertelement <4 x i32> undef, i32 %a, i32 0 - %tmp6 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp, <4 x i32> < i32 4, i32 1, i32 2, i32 3 > ; <<4 x i32>> [#uses=1] - ret <4 x i32> %tmp6 - -; X86-32-LABEL: t1: -; X86-32: movd 4(%esp), %xmm0 - -; X86-64-LABEL: t1: -; X86-64: movd %e{{..}}, %xmm0 -} - -define <2 x i64> @t2(i64 %a) nounwind { -entry: - %tmp = insertelement <2 x i64> undef, i64 %a, i32 0 - %tmp6 = shufflevector <2 x i64> zeroinitializer, <2 x i64> %tmp, <2 x i32> < i32 2, i32 1 > ; <<4 x i32>> [#uses=1] - ret <2 x i64> %tmp6 - -; X86-32-LABEL: t2: -; X86-32: movq 4(%esp), %xmm0 - -; X86-64-LABEL: t2: -; X86-64: movd %r{{..}}, %xmm0 -} - -define <2 x i64> @t3(<2 x i64>* %a) nounwind { -entry: - %tmp4 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1] - %tmp6 = bitcast <2 x i64> %tmp4 to <4 x i32> ; <<4 x i32>> [#uses=1] - %tmp7 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp6, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x i32>> [#uses=1] - %tmp8 = bitcast <4 x i32> %tmp7 to <2 x i64> ; <<2 x i64>> [#uses=1] - ret <2 x i64> %tmp8 - -; X86-32-LABEL: t3: -; X86-32: movl 4(%esp) -; X86-32: movq - -; X86-64-LABEL: t3: -; X86-64: movq ({{.*}}), %xmm0 -} - -define <2 x i64> @t4(<2 x i64> %a) nounwind { -entry: - %tmp5 = bitcast <2 x i64> %a to <4 x i32> ; <<4 x i32>> [#uses=1] - %tmp6 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp5, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x i32>> [#uses=1] - %tmp7 = bitcast <4 x i32> %tmp6 to <2 x i64> ; <<2 x i64>> [#uses=1] - ret <2 x i64> %tmp7 - -; X86-32-LABEL: t4: -; X86-32: movq %xmm0, %xmm0 - -; X86-64-LABEL: t4: -; X86-64: movq {{.*}}, %xmm0 -} - -define <2 x i64> @t5(<2 x i64> %a) nounwind { -entry: - %tmp6 = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <2 x i32> < i32 2, i32 1 > ; <<4 x i32>> [#uses=1] - ret <2 x i64> %tmp6 - -; X86-32-LABEL: t5: -; X86-32: movq %xmm0, %xmm0 - -; X86-64-LABEL: t5: -; X86-64: movq {{.*}}, %xmm0 -} diff --git a/test/CodeGen/X86/vec_shuffle-15.ll b/test/CodeGen/X86/vec_shuffle-15.ll deleted file mode 100644 index 5a9b8fd3457..00000000000 --- a/test/CodeGen/X86/vec_shuffle-15.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 - -define <2 x i64> @t00(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 0 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t01(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 1 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t02(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 2 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t03(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 3 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t10(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 0 > - ret <2 
x i64> %tmp -} - -define <2 x i64> @t11(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 1 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t12(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 2 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t13(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 3 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t20(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 0 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t21(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 1 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t22(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 2 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t23(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 3 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t30(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 0 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t31(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 1 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t32(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 2 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t33(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 3 > - ret <2 x i64> %tmp -} diff --git a/test/CodeGen/X86/vec_shuffle-16.ll b/test/CodeGen/X86/vec_shuffle-16.ll deleted file mode 100644 index 9aeb94289c8..00000000000 --- a/test/CodeGen/X86/vec_shuffle-16.ll +++ /dev/null @@ -1,43 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse,-sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse2 - -; sse-LABEL: t1: -; sse2-LABEL: t1: -define <4 x float> @t1(<4 x float> %a, <4 x float> %b) nounwind { -; sse: shufps -; sse2: pshufd -; sse2-NEXT: ret - %tmp1 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer - ret <4 x float> %tmp1 -} - -; sse-LABEL: t2: -; sse2-LABEL: t2: -define <4 x float> @t2(<4 x float> %A, <4 x float> %B) nounwind { -; sse: shufps -; sse2: pshufd -; sse2-NEXT: ret - %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 3, i32 3, i32 3, i32 3 > - ret <4 x float> %tmp -} - -; sse-LABEL: t3: -; sse2-LABEL: t3: -define <4 x float> @t3(<4 x float> %A, <4 x float> %B) nounwind { -; sse: shufps -; sse2: pshufd -; sse2-NEXT: ret - %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 4, i32 4, i32 4, i32 4 > - ret <4 x float> %tmp -} - -; sse-LABEL: t4: -; sse2-LABEL: t4: -define <4 x float> @t4(<4 x float> %A, <4 x float> %B) nounwind { - -; sse: shufps -; sse2: pshufd -; sse2-NEXT: ret - %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 1, i32 3, i32 2, i32 0 > - ret <4 x float> %tmp -} diff --git a/test/CodeGen/X86/vec_shuffle-17.ll b/test/CodeGen/X86/vec_shuffle-17.ll deleted file mode 100644 index f2f96ba94af..00000000000 --- 
a/test/CodeGen/X86/vec_shuffle-17.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-win32 -mattr=-avx | FileCheck %s -; CHECK-NOT: xor -; CHECK: movd {{%rdi|%rcx}}, %xmm0 -; CHECK-NOT: xor -; PR2108 - -define <2 x i64> @doload64(i64 %x) nounwind { -entry: - %tmp717 = bitcast i64 %x to double ; [#uses=1] - %tmp8 = insertelement <2 x double> undef, double %tmp717, i32 0 ; <<2 x double>> [#uses=1] - %tmp9 = insertelement <2 x double> %tmp8, double 0.000000e+00, i32 1 ; <<2 x double>> [#uses=1] - %tmp11 = bitcast <2 x double> %tmp9 to <2 x i64> ; <<2 x i64>> [#uses=1] - ret <2 x i64> %tmp11 -} - diff --git a/test/CodeGen/X86/vec_shuffle-18.ll b/test/CodeGen/X86/vec_shuffle-18.ll deleted file mode 100644 index 1104a4a8856..00000000000 --- a/test/CodeGen/X86/vec_shuffle-18.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin8.8.0 | grep mov | count 7 - - %struct.vector4_t = type { <4 x float> } - -define void @swizzle(i8* %a, %struct.vector4_t* %b, %struct.vector4_t* %c) nounwind { -entry: - %tmp9 = getelementptr %struct.vector4_t* %b, i32 0, i32 0 ; <<4 x float>*> [#uses=2] - %tmp10 = load <4 x float>* %tmp9, align 16 ; <<4 x float>> [#uses=1] - %tmp14 = bitcast i8* %a to double* ; [#uses=1] - %tmp15 = load double* %tmp14 ; [#uses=1] - %tmp16 = insertelement <2 x double> undef, double %tmp15, i32 0 ; <<2 x double>> [#uses=1] - %tmp18 = bitcast <2 x double> %tmp16 to <4 x float> ; <<4 x float>> [#uses=1] - %tmp19 = shufflevector <4 x float> %tmp10, <4 x float> %tmp18, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x float>> [#uses=1] - store <4 x float> %tmp19, <4 x float>* %tmp9, align 16 - %tmp28 = getelementptr %struct.vector4_t* %c, i32 0, i32 0 ; <<4 x float>*> [#uses=2] - %tmp29 = load <4 x float>* %tmp28, align 16 ; <<4 x float>> [#uses=1] - %tmp26 = getelementptr i8* %a, i32 8 ; [#uses=1] - %tmp33 = bitcast i8* %tmp26 to double* ; [#uses=1] - %tmp34 = load double* %tmp33 ; [#uses=1] - %tmp35 = insertelement <2 x double> undef, double %tmp34, i32 0 ; <<2 x double>> [#uses=1] - %tmp37 = bitcast <2 x double> %tmp35 to <4 x float> ; <<4 x float>> [#uses=1] - %tmp38 = shufflevector <4 x float> %tmp29, <4 x float> %tmp37, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x float>> [#uses=1] - store <4 x float> %tmp38, <4 x float>* %tmp28, align 16 - ret void -} diff --git a/test/CodeGen/X86/vec_shuffle-19.ll b/test/CodeGen/X86/vec_shuffle-19.ll deleted file mode 100644 index 48db8de0d93..00000000000 --- a/test/CodeGen/X86/vec_shuffle-19.ll +++ /dev/null @@ -1,9 +0,0 @@ -; REQUIRES: asserts -; RUN: llc < %s -o /dev/null -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 4 -; PR2485 - -define <4 x i32> @t(<4 x i32> %a, <4 x i32> %b) nounwind { -entry: - %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> < i32 4, i32 0, i32 0, i32 0 > ; <<4 x i32>> [#uses=1] - ret <4 x i32> %shuffle -} diff --git a/test/CodeGen/X86/vec_shuffle-20.ll b/test/CodeGen/X86/vec_shuffle-20.ll deleted file mode 100644 index 5a2c4449456..00000000000 --- a/test/CodeGen/X86/vec_shuffle-20.ll +++ /dev/null @@ -1,8 +0,0 @@ -; REQUIRES: asserts -; RUN: llc < %s -o /dev/null -march=x86 -mcpu=corei7 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 2 - -define <4 x float> @func(<4 x float> %fp0, <4 x float> %fp1) nounwind { -entry: - shufflevector <4 x float> %fp0, <4 x float> %fp1, <4 x i32> < 
i32 0, i32 1, i32 2, i32 7 > ; <<4 x float>>:0 [#uses=1] - ret <4 x float> %0 -} diff --git a/test/CodeGen/X86/vec_shuffle-22.ll b/test/CodeGen/X86/vec_shuffle-22.ll deleted file mode 100644 index 6807e4d6390..00000000000 --- a/test/CodeGen/X86/vec_shuffle-22.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=pentium-m | FileCheck %s - -define <4 x float> @t1(<4 x float> %a) nounwind { -; CHECK: movlhps - %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> < i32 0, i32 1, i32 0, i32 1 > ; <<4 x float>> [#uses=1] - ret <4 x float> %tmp1 -} - -define <4 x i32> @t2(<4 x i32>* %a) nounwind { -; CHECK: pshufd -; CHECK: ret - %tmp1 = load <4 x i32>* %a - %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> < i32 0, i32 1, i32 0, i32 1 > ; <<4 x i32>> [#uses=1] - ret <4 x i32> %tmp2 -} diff --git a/test/CodeGen/X86/vec_shuffle-23.ll b/test/CodeGen/X86/vec_shuffle-23.ll deleted file mode 100644 index 24687359cc5..00000000000 --- a/test/CodeGen/X86/vec_shuffle-23.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep punpck -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep pshufd - -define i32 @t() nounwind { -entry: - %a = alloca <4 x i32> ; <<4 x i32>*> [#uses=2] - %b = alloca <4 x i32> ; <<4 x i32>*> [#uses=5] - store volatile <4 x i32> < i32 0, i32 1, i32 2, i32 3 >, <4 x i32>* %a - %tmp = load <4 x i32>* %a ; <<4 x i32>> [#uses=1] - store <4 x i32> %tmp, <4 x i32>* %b - %tmp1 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %tmp2 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %punpckldq = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x i32>> [#uses=1] - store <4 x i32> %punpckldq, <4 x i32>* %b - %tmp3 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %result = extractelement <4 x i32> %tmp3, i32 0 ; [#uses=1] - ret i32 %result -} diff --git a/test/CodeGen/X86/vec_shuffle-24.ll b/test/CodeGen/X86/vec_shuffle-24.ll deleted file mode 100644 index d038dafaf29..00000000000 --- a/test/CodeGen/X86/vec_shuffle-24.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s - -define i32 @t() nounwind optsize { -entry: -; CHECK: punpckldq - %a = alloca <4 x i32> ; <<4 x i32>*> [#uses=2] - %b = alloca <4 x i32> ; <<4 x i32>*> [#uses=5] - store volatile <4 x i32> < i32 0, i32 1, i32 2, i32 3 >, <4 x i32>* %a - %tmp = load <4 x i32>* %a ; <<4 x i32>> [#uses=1] - store <4 x i32> %tmp, <4 x i32>* %b - %tmp1 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %tmp2 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %punpckldq = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x i32>> [#uses=1] - store <4 x i32> %punpckldq, <4 x i32>* %b - %tmp3 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %result = extractelement <4 x i32> %tmp3, i32 0 ; [#uses=1] - ret i32 %result -} diff --git a/test/CodeGen/X86/vec_shuffle-25.ll b/test/CodeGen/X86/vec_shuffle-25.ll deleted file mode 100644 index 3f42a132ef2..00000000000 --- a/test/CodeGen/X86/vec_shuffle-25.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=sse4.1 -o %t -; RUN: grep unpcklps %t | count 3 -; RUN: grep unpckhps %t | count 1 - -; Transpose example using the more generic vector shuffle. We return -; float8 instead of float16 since x86 can return that in register. 
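; [Editor's aside, not part of the patch: a minimal sketch of the idea the
; comment above describes. One interleave step of the transpose is a generic
; shufflevector whose mask <0,4,1,5> takes the low lanes of the two sources
; alternately -- exactly the semantics of SSE unpcklps (and <2,6,3,7> of
; unpckhps), which is what the unpcklps/unpckhps grep counts in the RUN lines
; above rely on. The function name and RUN line below are illustrative, not
; taken from the tree.]
;
; RUN: llc < %s -march=x86 -mattr=+sse2 | grep unpcklps
define <4 x float> @interleave_lo(<4 x float> %a, <4 x float> %b) nounwind {
  ; result lanes: a0, b0, a1, b1 -> expected to lower to a single unpcklps
  %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >
  ret <4 x float> %r
}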
-; ModuleID = 'transpose2_opt.bc' -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" -target triple = "i386-apple-cl.1.0" -@r0 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r1 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r2 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r3 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] - -define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind { -entry: - %unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2] - %unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2] - %unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2] - %unpckhps11 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2] - %unpcklps14 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1] - %unpcklps14a = shufflevector <4 x float> %unpcklps14, <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %unpckhps17 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] - %unpckhps17a = shufflevector <4 x float> %unpckhps17, <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %r1 = shufflevector <16 x float> %unpcklps14a, <16 x float> %unpckhps17a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> - %unpcklps20 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1] - %unpcklps20a = shufflevector <4 x float> %unpcklps20, <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %r2 = shufflevector <16 x float> %r1, <16 x float> %unpcklps20a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15> - %unpckhps23 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] - %unpckhps23a = shufflevector <4 x float> %unpckhps23, <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %r3 = shufflevector <16 x float> %r2, <16 x float> %unpckhps23a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19> - %r4 = shufflevector <16 x float> %r3, <16 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> - ret <8 x float> %r4 -} diff --git 
a/test/CodeGen/X86/vec_shuffle-26.ll b/test/CodeGen/X86/vec_shuffle-26.ll deleted file mode 100644 index 00e8e73e184..00000000000 --- a/test/CodeGen/X86/vec_shuffle-26.ll +++ /dev/null @@ -1,68 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse4.1 | FileCheck %s -; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s - -; Transpose example using the more generic vector shuffle. Return float8 -; instead of float16 -; ModuleID = 'transpose2_opt.bc' -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" -target triple = "i386-apple-cl.1.0" -@r0 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r1 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r2 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r3 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] - -define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind { -entry: -; CHECK: transpose2 -; CHECK: unpckhps -; CHECK: unpckhps -; CHECK: unpcklps -; CHECK: unpckhps -; Different instruction order for Atom. -; ATOM: transpose2 -; ATOM: unpckhps -; ATOM: unpckhps -; ATOM: unpckhps -; ATOM: unpcklps - %unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2] - %unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2] - %unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2] - %unpckhps11 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2] - %unpcklps14 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1] - %unpckhps17 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] - %r1 = shufflevector <4 x float> %unpcklps14, <4 x float> %unpckhps17, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 > - %unpcklps20 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1] - %unpckhps23 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] - %r2 = shufflevector <4 x float> %unpcklps20, <4 x float> %unpckhps23, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 > -; %r3 = shufflevector <8 x float> %r1, <8 x float> %r2, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15 >; - ret <8 x float> %r2 -} - -define <2 x i64> @lo_hi_shift(float* nocapture %x, float* nocapture %y) nounwind { -entry: -; movhps should happen before extractps to assure it gets the correct value. 
-; CHECK: lo_hi_shift -; CHECK: movhps ([[BASEREG:%[a-z]+]]), -; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) -; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) -; ATOM: lo_hi_shift -; ATOM: movhps ([[BASEREG:%[a-z]+]]), -; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) -; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) - %v.i = bitcast float* %y to <4 x float>* - %0 = load <4 x float>* %v.i, align 1 - %1 = bitcast float* %x to <1 x i64>* - %.val = load <1 x i64>* %1, align 1 - %2 = bitcast <1 x i64> %.val to <2 x float> - %shuffle.i = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> - %shuffle1.i = shufflevector <4 x float> %0, <4 x float> %shuffle.i, <4 x i32> - %cast.i = bitcast <4 x float> %0 to <2 x i64> - %extract.i = extractelement <2 x i64> %cast.i, i32 1 - %3 = bitcast float* %x to i64* - store i64 %extract.i, i64* %3, align 4 - %4 = bitcast <4 x float> %0 to <16 x i8> - %5 = bitcast <4 x float> %shuffle1.i to <16 x i8> - %palignr = shufflevector <16 x i8> %5, <16 x i8> %4, <16 x i32> - %6 = bitcast <16 x i8> %palignr to <2 x i64> - ret <2 x i64> %6 -} diff --git a/test/CodeGen/X86/vec_shuffle-27.ll b/test/CodeGen/X86/vec_shuffle-27.ll deleted file mode 100644 index c9b2fb51d78..00000000000 --- a/test/CodeGen/X86/vec_shuffle-27.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse4.1 | FileCheck %s - -; ModuleID = 'vec_shuffle-27.bc' -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" -target triple = "i686-apple-cl.1.0" - -define <8 x float> @my2filter4_1d(<4 x float> %a, <8 x float> %T0, <8 x float> %T1) nounwind readnone { -entry: -; CHECK: subps -; CHECK: subps -; CHECK: mulps -; CHECK: mulps -; CHECK: addps -; CHECK: addps - %tmp7 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3 > ; <<8 x float>> [#uses=1] - %sub = fsub <8 x float> %T1, %T0 ; <<8 x float>> [#uses=1] - %mul = fmul <8 x float> %sub, %tmp7 ; <<8 x float>> [#uses=1] - %add = fadd <8 x float> %mul, %T0 ; <<8 x float>> [#uses=1] - ret <8 x float> %add -} - -; Test case for r122206 -define void @test2(<4 x i64>* %ap, <4 x i64>* %bp) nounwind { -entry: -; CHECK: movdqa - %a = load <4 x i64> * %ap - %b = load <4 x i64> * %bp - %mulaa = mul <4 x i64> %a, %a - %mulbb = mul <4 x i64> %b, %b - %mulab = mul <4 x i64> %a, %b - %vect1271 = shufflevector <4 x i64> %mulaa, <4 x i64> %mulbb, <4 x i32> - %vect1272 = shufflevector <4 x i64> %mulaa, <4 x i64> %mulbb, <4 x i32> - %vect1487 = shufflevector <4 x i64> %vect1271, <4 x i64> %mulab, <4 x i32> - %vect1488 = shufflevector <4 x i64> %vect1272, <4 x i64> %mulab, <4 x i32> - store <4 x i64> %vect1487, <4 x i64>* %ap - store <4 x i64> %vect1488, <4 x i64>* %bp - ret void; -} diff --git a/test/CodeGen/X86/vec_shuffle-28.ll b/test/CodeGen/X86/vec_shuffle-28.ll deleted file mode 100644 index ebf557762cb..00000000000 --- a/test/CodeGen/X86/vec_shuffle-28.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s - -; CHECK: pshufb -; CHECK-NOT: pshufb - -; FIXME: this test has a superfluous punpcklqdq pre-pshufb currently. -; Don't XFAIL it because it's still better than the previous code. - -; Pack various elements via shuffles. 
-define <8 x i16> @shuf1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
-  %tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >
-  ret <8 x i16> %tmp7
-}
diff --git a/test/CodeGen/X86/vec_shuffle-30.ll b/test/CodeGen/X86/vec_shuffle-30.ll
deleted file mode 100644
index f5f88426058..00000000000
--- a/test/CodeGen/X86/vec_shuffle-30.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s
-
-; CHECK: test
-; Test case when creating pshufhw, we incorrectly set the higher order bit
-; for an undef,
-define void @test(<8 x i16>* %dest, <8 x i16> %in) nounwind {
-entry:
-; CHECK-NOT: vmovaps
-; CHECK: vmovlpd
-; CHECK: vpshufhw $-95
-  %0 = load <8 x i16>* %dest
-  %1 = shufflevector <8 x i16> %0, <8 x i16> %in, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 13, i32 undef, i32 14, i32 14>
-  store <8 x i16> %1, <8 x i16>* %dest
-  ret void
-}
-
-; CHECK: test2
-; A test case where we shouldn't generate a punpckldq but a pshufd and a pslldq
-define void @test2(<4 x i32>* %dest, <4 x i32> %in) nounwind {
-entry:
-; CHECK-NOT: pslldq
-; CHECK: shufps
-  %0 = shufflevector <4 x i32> %in, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> < i32 undef, i32 5, i32 undef, i32 2>
-  store <4 x i32> %0, <4 x i32>* %dest
-  ret void
-}
diff --git a/test/CodeGen/X86/vec_shuffle-31.ll b/test/CodeGen/X86/vec_shuffle-31.ll
deleted file mode 100644
index bb06e15425b..00000000000
--- a/test/CodeGen/X86/vec_shuffle-31.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 -o %t
-; RUN: grep pshufb %t | count 1
-
-define <8 x i16> @shuf3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
-  %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef >
-  ret <8 x i16> %tmp9
-}
diff --git a/test/CodeGen/X86/vec_shuffle-34.ll b/test/CodeGen/X86/vec_shuffle-34.ll
deleted file mode 100644
index d057b3fa7ea..00000000000
--- a/test/CodeGen/X86/vec_shuffle-34.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 | grep pshufb | count 2
-
-define <8 x i16> @shuf2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
-  %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
-  ret <8 x i16> %tmp8
-}
diff --git a/test/CodeGen/X86/vec_shuffle-35.ll b/test/CodeGen/X86/vec_shuffle-35.ll
deleted file mode 100644
index f5083b4b801..00000000000
--- a/test/CodeGen/X86/vec_shuffle-35.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -stack-alignment=16 -o %t
-; RUN: grep pextrw %t | count 12
-; RUN: grep pinsrw %t | count 13
-; RUN: grep rolw %t | count 13
-; RUN: not grep esp %t
-; RUN: not grep ebp %t
-; RUN: llc < %s -march=x86 -mcpu=core2 -stack-alignment=16 -o %t
-; RUN: grep pshufb %t | count 3
-
-define <16 x i8> @shuf1(<16 x i8> %T0) nounwind readnone {
-entry:
-  %tmp8 = shufflevector <16 x i8> %T0, <16 x i8> undef, <16 x i32> < i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 >
-  ret <16 x i8> %tmp8
-}
-
-define <16 x i8> @shuf2(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
-entry:
-  %tmp8 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> < i32 undef, i32 undef, i32 3, i32 2, i32 17, i32 16, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 >
-  ret <16 x i8> %tmp8
-}
diff --git a/test/CodeGen/X86/vec_shuffle-36.ll b/test/CodeGen/X86/vec_shuffle-36.ll
deleted file mode 100644
index f1d0f939e60..00000000000
--- a/test/CodeGen/X86/vec_shuffle-36.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mcpu=penryn -mattr=sse4.1 | FileCheck %s
-
-define <8 x i16> @shuf6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-; CHECK: pshufb
-; CHECK-NOT: pshufb
-; CHECK: ret
-entry:
-  %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 3, i32 2, i32 0, i32 2, i32 1, i32 5, i32 6 , i32 undef >
-  ret <8 x i16> %tmp9
-}
-
-define <8 x i16> @shuf7(<8 x i16> %t0) {
-; CHECK: pshufd
-  %tmp10 = shufflevector <8 x i16> %t0, <8 x i16> undef, <8 x i32> < i32 undef, i32 2, i32 2, i32 2, i32 2, i32 2, i32 undef, i32 undef >
-  ret <8 x i16> %tmp10
-}
diff --git a/test/CodeGen/X86/vec_shuffle-37.ll b/test/CodeGen/X86/vec_shuffle-37.ll
deleted file mode 100644
index ed285f93fe1..00000000000
--- a/test/CodeGen/X86/vec_shuffle-37.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=core2 | FileCheck %s
-; RUN: llc -O0 < %s -march=x86 -mcpu=core2 | FileCheck %s --check-prefix=CHECK_O0
-
-define <4 x i32> @t00(<4 x i32>* %a0) nounwind ssp {
-entry:
-; CHECK: movaps ({{%rdi|%rcx}}), %[[XMM0:xmm[0-9]+]]
-; CHECK: movaps %[[XMM0]], %[[XMM1:xmm[0-9]+]]
-; CHECK-NEXT: movss %xmm{{[0-9]+}}, %[[XMM1]]
-; CHECK-NEXT: shufps $36, %[[XMM1]], %[[XMM0]]
-  %0 = load <4 x i32>* undef, align 16
-  %1 = load <4 x i32>* %a0, align 16
-  %2 = shufflevector <4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-  ret <4 x i32> %2
-}
-
-define void @t01(double* %a0) nounwind ssp {
-entry:
-; CHECK_O0: movsd (%eax), %xmm0
-; CHECK_O0: unpcklpd %xmm0, %xmm0
-  %tmp93 = load double* %a0, align 8
-  %vecinit94 = insertelement <2 x double> undef, double %tmp93, i32 1
-  store <2 x double> %vecinit94, <2 x double>* undef
-  ret void
-}
-
-define void @t02(<8 x i32>* %source, <2 x i32>* %dest) nounwind noinline {
-entry:
-; CHECK: t02
-; CHECK: movaps
-; CHECK: shufps
-; CHECK: pshufd
-; CHECK: movq
-; CHECK: ret
-  %0 = bitcast <8 x i32>* %source to <4 x i32>*
-  %arrayidx = getelementptr inbounds <4 x i32>* %0, i64 3
-  %tmp2 = load <4 x i32>* %arrayidx, align 16
-  %tmp3 = extractelement <4 x i32> %tmp2, i32 0
-  %tmp5 = insertelement <2 x i32> , i32 %tmp3, i32 0
-  %arrayidx7 = getelementptr inbounds <8 x i32>* %source, i64 1
-  %1 = bitcast <8 x i32>* %arrayidx7 to <4 x i32>*
-  %tmp8 = load <4 x i32>* %1, align 16
-  %tmp9 = extractelement <4 x i32> %tmp8, i32 1
-  %tmp11 = insertelement <2 x i32> %tmp5, i32 %tmp9, i32 1
-  store <2 x i32> %tmp11, <2 x i32>* %dest, align 8
-  ret void
-}
diff --git a/test/CodeGen/X86/vec_shuffle-38.ll b/test/CodeGen/X86/vec_shuffle-38.ll
deleted file mode 100644
index e76860ffafa..00000000000
--- a/test/CodeGen/X86/vec_shuffle-38.ll
+++ /dev/null
@@ -1,77 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s
-
-define <2 x double> @ld(<2 x double> %p) nounwind optsize ssp {
-; CHECK: unpcklpd
-  %shuffle = shufflevector <2 x double> %p, <2 x double> undef, <2 x i32> zeroinitializer
-  ret <2 x double> %shuffle
-}
-
-define <2 x double> @hd(<2 x double> %p) nounwind optsize ssp {
-; CHECK: movhlps
-  %shuffle = shufflevector <2 x double> %p, <2 x double> undef, <2 x i32> <i32 1, i32 1>
-  ret <2 x double> %shuffle
-}
-
-define <2 x i64> @ldi(<2 x i64> %p) nounwind optsize ssp {
-; CHECK: punpcklqdq
-  %shuffle = shufflevector <2 x i64> %p, <2 x i64> undef, <2 x i32> zeroinitializer
-  ret <2 x i64> %shuffle
-}
-
-define <2 x i64> @hdi(<2 x i64> %p) nounwind optsize ssp {
-; CHECK: punpckhqdq
-  %shuffle = shufflevector <2 x i64> %p, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
-  ret <2 x i64> %shuffle
-}
-
-; rdar://10050549
-%struct.Float2 = type { float, float }
-
-define <4 x float> @loadhpi(%struct.Float2* %vPtr, <4 x float> %vecin1) nounwind readonly ssp {
-entry:
-; CHECK: loadhpi
-; CHECK-NOT: movq
-; CHECK: movhps (
-  %tmp1 = bitcast %struct.Float2* %vPtr to <1 x i64>*
-  %addptr7 = getelementptr inbounds <1 x i64>* %tmp1, i64 0
-  %tmp2 = bitcast <1 x i64>* %addptr7 to float*
-  %tmp3 = load float* %tmp2, align 4
-  %vec = insertelement <4 x float> undef, float %tmp3, i32 0
-  %addptr.i12 = getelementptr inbounds float* %tmp2, i64 1
-  %tmp4 = load float* %addptr.i12, align 4
-  %vecin2 = insertelement <4 x float> %vec, float %tmp4, i32 1
-  %shuffle = shufflevector <4 x float> %vecin1, <4 x float> %vecin2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  ret <4 x float> %shuffle
-}
-
-; rdar://10119696
-; CHECK: f
-define <4 x float> @f(<4 x float> %x, double* nocapture %y) nounwind readonly ssp {
-entry:
-  ; CHECK: movlps (%{{rdi|rdx}}), %xmm0
-  %u110.i = load double* %y, align 1
-  %tmp8.i = insertelement <2 x double> undef, double %u110.i, i32 0
-  %tmp9.i = bitcast <2 x double> %tmp8.i to <4 x float>
-  %shuffle.i = shufflevector <4 x float> %x, <4 x float> %tmp9.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-  ret <4 x float> %shuffle.i
-}
-
-define <4 x float> @loadhpi2(%struct.Float2* nocapture %vHiCoefPtr_0, %struct.Float2* nocapture %vLoCoefPtr_0, i32 %s) nounwind readonly ssp {
-entry:
-; CHECK: loadhpi2
-; CHECK: movhps (
-; CHECK-NOT: movlhps
-  %0 = bitcast %struct.Float2* %vHiCoefPtr_0 to <1 x i64>*
-  %idx.ext = sext i32 %s to i64
-  %add.ptr = getelementptr inbounds <1 x i64>* %0, i64 %idx.ext
-  %add.ptr.val = load <1 x i64>* %add.ptr, align 1
-  %1 = bitcast <1 x i64> %add.ptr.val to <2 x float>
-  %shuffle.i = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  %2 = bitcast %struct.Float2* %vLoCoefPtr_0 to <1 x i64>*
-  %add.ptr2 = getelementptr inbounds <1 x i64>* %2, i64 %idx.ext
-  %add.ptr2.val = load <1 x i64>* %add.ptr2, align 1
-  %3 = bitcast <1 x i64> %add.ptr2.val to <2 x float>
-  %shuffle.i4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  %shuffle1.i5 = shufflevector <4 x float> %shuffle.i, <4 x float> %shuffle.i4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  ret <4 x float> %shuffle1.i5
-}
diff --git a/test/CodeGen/X86/vec_shuffle-39.ll b/test/CodeGen/X86/vec_shuffle-39.ll
deleted file mode 100644
index 8fd9a5cd023..00000000000
--- a/test/CodeGen/X86/vec_shuffle-39.ll
+++ /dev/null
@@ -1,86 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn | FileCheck %s
-; rdar://10050222, rdar://10134392
-
-define <4 x float> @t1(<4 x float> %a, <1 x i64>* nocapture %p) nounwind {
-entry:
-; CHECK-LABEL: t1:
-; CHECK: movlps (%rdi), %xmm0
-; CHECK: ret
-  %p.val = load <1 x i64>* %p, align 1
-  %0 = bitcast <1 x i64> %p.val to <2 x float>
-  %shuffle.i = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  %shuffle1.i = shufflevector <4 x float> %a, <4 x float> %shuffle.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-  ret <4 x float> %shuffle1.i
-}
-
-define <4 x float> @t1a(<4 x float> %a, <1 x i64>* nocapture %p) nounwind {
-entry:
-; CHECK-LABEL: t1a:
-; CHECK: movlps (%rdi), %xmm0
-; CHECK: ret
-  %0 = bitcast <1 x i64>* %p to double*
-  %1 = load double* %0
-  %2 = insertelement <2 x double> undef, double %1, i32 0
-  %3 = bitcast <2 x double> %2 to <4 x float>
-  %4 = shufflevector <4 x float> %a, <4 x float> %3, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-  ret <4 x float> %4
-}
-
-define void @t2(<1 x i64>* nocapture %p, <4 x float> %a) nounwind {
-entry:
-; CHECK-LABEL: t2:
-; CHECK: movlps %xmm0, (%rdi)
-; CHECK: ret
-  %cast.i = bitcast <4 x float> %a to <2 x i64>
-  %extract.i = extractelement <2 x i64> %cast.i, i32 0
-  %0 = getelementptr inbounds <1 x i64>* %p, i64 0, i64 0
-  store i64 %extract.i, i64* %0, align 8
-  ret void
-}
-
-define void @t2a(<1 x i64>* nocapture %p, <4 x float> %a) nounwind {
-entry:
-; CHECK-LABEL: t2a:
-; CHECK: movlps %xmm0, (%rdi)
-; CHECK: ret
-  %0 = bitcast <1 x i64>* %p to double*
-  %1 = bitcast <4 x float> %a to <2 x double>
-  %2 = extractelement <2 x double> %1, i32 0
-  store double %2, double* %0
-  ret void
-}
-
-; rdar://10436044
-define <2 x double> @t3() nounwind readonly {
-bb:
-; CHECK-LABEL: t3:
-; CHECK: movq (%rax), %xmm1
-; CHECK: punpcklqdq %xmm2, %xmm0
-; CHECK: movsd %xmm1, %xmm0
-  %tmp0 = load i128* null, align 1
-  %tmp1 = load <2 x i32>* undef, align 8
-  %tmp2 = bitcast i128 %tmp0 to <16 x i8>
-  %tmp3 = bitcast <2 x i32> %tmp1 to i64
-  %tmp4 = insertelement <2 x i64> undef, i64 %tmp3, i32 0
-  %tmp5 = bitcast <16 x i8> %tmp2 to <2 x double>
-  %tmp6 = bitcast <2 x i64> %tmp4 to <2 x double>
-  %tmp7 = shufflevector <2 x double> %tmp5, <2 x double> %tmp6, <2 x i32> <i32 2, i32 1>
-  ret <2 x double> %tmp7
-}
-
-; rdar://10450317
-define <2 x i64> @t4() nounwind readonly {
-bb:
-; CHECK-LABEL: t4:
-; CHECK: movq (%rax), %xmm0
-; CHECK: punpcklqdq %{{xmm.}}, %[[XMM:xmm[0-9]]]
-; CHECK: movsd %[[XMM]], %xmm0
-  %tmp0 = load i128* null, align 1
-  %tmp1 = load <2 x i32>* undef, align 8
-  %tmp2 = bitcast i128 %tmp0 to <16 x i8>
-  %tmp3 = bitcast <2 x i32> %tmp1 to i64
-  %tmp4 = insertelement <2 x i64> undef, i64 %tmp3, i32 0
-  %tmp5 = bitcast <16 x i8> %tmp2 to <2 x i64>
-  %tmp6 = shufflevector <2 x i64> %tmp4, <2 x i64> %tmp5, <2 x i32> <i32 0, i32 3>
-  ret <2 x i64> %tmp6
-}
diff --git a/test/CodeGen/X86/vec_shuffle-40.ll b/test/CodeGen/X86/vec_shuffle-40.ll
deleted file mode 100644
index 75b45e3df11..00000000000
--- a/test/CodeGen/X86/vec_shuffle-40.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
-
-define void @shuffle_v16i16(<16 x i16>* %a) {
-; CHECK-LABEL: shuffle_v16i16:
-; CHECK: vpshufb {{.*}}%ymm
-; CHECK-NOT: vpshufb {{.*}}%xmm
-entry:
-  %0 = load <16 x i16>* %a, align 32
-  %shuffle = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> 
-  store <16 x i16> %shuffle, <16 x i16>* %a, align 32
-  ret void
-}
-
-define void @shuffle_v16i16_lanecrossing(<16 x i16>* %a) {
-; CHECK-LABEL: shuffle_v16i16_lanecrossing:
-; CHECK-NOT: vpshufb {{.*}}%ymm
-entry:
-  %0 = load <16 x i16>* %a, align 32
-  %shuffle = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> 
-  store <16 x i16> %shuffle, <16 x i16>* %a, align 32
-  ret void
-}
diff --git a/test/CodeGen/X86/vec_shuffle-41.ll b/test/CodeGen/X86/vec_shuffle-41.ll
deleted file mode 100644
index 28fdd2f5ce1..00000000000
--- a/test/CodeGen/X86/vec_shuffle-41.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
-
-; Use buildFromShuffleMostly which allows this to be generated as two 128-bit
-; shuffles and an insert.
-
-; This is the (somewhat questionable) LLVM IR that is generated for:
-; x8.s0123456 = x8.s1234567; // x8 is a <8 x float> type
-; x8.s7 = f; // f is float
-
-
-define <8 x float> @test1(<8 x float> %a, float %b) {
-; CHECK-LABEL: test1:
-; CHECK: vinsertps
-; CHECK-NOT: vinsertps
-entry:
-  %shift = shufflevector <8 x float> %a, <8 x float> undef, <7 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %extend = shufflevector <7 x float> %shift, <7 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef>
-  %insert = insertelement <8 x float> %extend, float %b, i32 7
-
-  ret <8 x float> %insert
-}
diff --git a/test/CodeGen/X86/vec_shuffle.ll b/test/CodeGen/X86/vec_shuffle.ll
deleted file mode 100644
index 65995984859..00000000000
--- a/test/CodeGen/X86/vec_shuffle.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: llc < %s -mtriple=i686-linux -mcpu=core2 | FileCheck %s
-
-; CHECK: test_v4sf
-; CHECK: movq 8(%esp)
-; CHECK: pshufd $80
-define void @test_v4sf(<4 x float>* %P, float %X, float %Y) nounwind {
-  %tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0 ; <<4 x float>> [#uses=1]
-  %tmp2 = insertelement <4 x float> %tmp, float %X, i32 1 ; <<4 x float>> [#uses=1]
-  %tmp4 = insertelement <4 x float> %tmp2, float %Y, i32 2 ; <<4 x float>> [#uses=1]
-  %tmp6 = insertelement <4 x float> %tmp4, float %Y, i32 3 ; <<4 x float>> [#uses=1]
-  store <4 x float> %tmp6, <4 x float>* %P
-  ret void
-}
-
-; CHECK: test_v2sd
-; CHECK: movups 8(%esp)
-; CHECK: movaps
-define void @test_v2sd(<2 x double>* %P, double %X, double %Y) nounwind {
-  %tmp = insertelement <2 x double> zeroinitializer, double %X, i32 0 ; <<2 x double>> [#uses=1]
-  %tmp2 = insertelement <2 x double> %tmp, double %Y, i32 1 ; <<2 x double>> [#uses=1]
-  store <2 x double> %tmp2, <2 x double>* %P
-  ret void
-}
-
-; CHECK: test_v8i16
-; CHECK: pshufhw $-58
-; CHECK: movdqa
-define void @test_v8i16(<2 x i64>* %res, <2 x i64>* %A) nounwind {
-  %tmp = load <2 x i64>* %A ; <<2 x i64>> [#uses=1]
-  %tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16> ; <<8 x i16>> [#uses=8]
-  %tmp.upgrd.2 = extractelement <8 x i16> %tmp.upgrd.1, i32 0 ; <i16> [#uses=1]
-  %tmp1 = extractelement <8 x i16> %tmp.upgrd.1, i32 1 ; <i16> [#uses=1]
-  %tmp2 = extractelement <8 x i16> %tmp.upgrd.1, i32 2 ; <i16> [#uses=1]
-  %tmp3 = extractelement <8 x i16> %tmp.upgrd.1, i32 3 ; <i16> [#uses=1]
-  %tmp4 = extractelement <8 x i16> %tmp.upgrd.1, i32 6 ; <i16> [#uses=1]
-  %tmp5 = extractelement <8 x i16> %tmp.upgrd.1, i32 5 ; <i16> [#uses=1]
-  %tmp6 = extractelement <8 x i16> %tmp.upgrd.1, i32 4 ; <i16> [#uses=1]
-  %tmp7 = extractelement <8 x i16> %tmp.upgrd.1, i32 7 ; <i16> [#uses=1]
-  %tmp8 = insertelement <8 x i16> undef, i16 %tmp.upgrd.2, i32 0 ; <<8 x i16>> [#uses=1]
-  %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 1 ; <<8 x i16>> [#uses=1]
-  %tmp10 = insertelement <8 x i16> %tmp9, i16 %tmp2, i32 2 ; <<8 x i16>> [#uses=1]
-  %tmp11 = insertelement <8 x i16> %tmp10, i16 %tmp3, i32 3 ; <<8 x i16>> [#uses=1]
-  %tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp4, i32 4 ; <<8 x i16>> [#uses=1]
-  %tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, i32 5 ; <<8 x i16>> [#uses=1]
-  %tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp6, i32 6 ; <<8 x i16>> [#uses=1]
-  %tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 7 ; <<8 x i16>> [#uses=1]
-  %tmp15.upgrd.3 = bitcast <8 x i16> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1]
-  store <2 x i64> %tmp15.upgrd.3, <2 x i64>* %res
-  ret void
-}
diff --git a/test/CodeGen/X86/vec_splat-2.ll b/test/CodeGen/X86/vec_splat-2.ll
deleted file mode 100644
index 9d82f97dca1..00000000000
--- a/test/CodeGen/X86/vec_splat-2.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; 
RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s - -define void @test(<2 x i64>* %P, i8 %x) nounwind { - %tmp = insertelement <16 x i8> zeroinitializer, i8 %x, i32 0 ; <<16 x i8>> [#uses=1] - %tmp36 = insertelement <16 x i8> %tmp, i8 %x, i32 1 ; <<16 x i8>> [#uses=1] - %tmp38 = insertelement <16 x i8> %tmp36, i8 %x, i32 2 ; <<16 x i8>> [#uses=1] - %tmp40 = insertelement <16 x i8> %tmp38, i8 %x, i32 3 ; <<16 x i8>> [#uses=1] - %tmp42 = insertelement <16 x i8> %tmp40, i8 %x, i32 4 ; <<16 x i8>> [#uses=1] - %tmp44 = insertelement <16 x i8> %tmp42, i8 %x, i32 5 ; <<16 x i8>> [#uses=1] - %tmp46 = insertelement <16 x i8> %tmp44, i8 %x, i32 6 ; <<16 x i8>> [#uses=1] - %tmp48 = insertelement <16 x i8> %tmp46, i8 %x, i32 7 ; <<16 x i8>> [#uses=1] - %tmp50 = insertelement <16 x i8> %tmp48, i8 %x, i32 8 ; <<16 x i8>> [#uses=1] - %tmp52 = insertelement <16 x i8> %tmp50, i8 %x, i32 9 ; <<16 x i8>> [#uses=1] - %tmp54 = insertelement <16 x i8> %tmp52, i8 %x, i32 10 ; <<16 x i8>> [#uses=1] - %tmp56 = insertelement <16 x i8> %tmp54, i8 %x, i32 11 ; <<16 x i8>> [#uses=1] - %tmp58 = insertelement <16 x i8> %tmp56, i8 %x, i32 12 ; <<16 x i8>> [#uses=1] - %tmp60 = insertelement <16 x i8> %tmp58, i8 %x, i32 13 ; <<16 x i8>> [#uses=1] - %tmp62 = insertelement <16 x i8> %tmp60, i8 %x, i32 14 ; <<16 x i8>> [#uses=1] - %tmp64 = insertelement <16 x i8> %tmp62, i8 %x, i32 15 ; <<16 x i8>> [#uses=1] - %tmp68 = load <2 x i64>* %P ; <<2 x i64>> [#uses=1] - %tmp71 = bitcast <2 x i64> %tmp68 to <16 x i8> ; <<16 x i8>> [#uses=1] - %tmp73 = add <16 x i8> %tmp71, %tmp64 ; <<16 x i8>> [#uses=1] - %tmp73.upgrd.1 = bitcast <16 x i8> %tmp73 to <2 x i64> ; <<2 x i64>> [#uses=1] - store <2 x i64> %tmp73.upgrd.1, <2 x i64>* %P - ret void - -; CHECK-LABEL: test: -; CHECK-NOT: pshufd -; CHECK: punpcklbw -; CHECK: punpcklbw -; CHECK: pshufd $0 -; CHECK-NOT: pshufd -} diff --git a/test/CodeGen/X86/vec_splat-3.ll b/test/CodeGen/X86/vec_splat-3.ll deleted file mode 100644 index abdfd436769..00000000000 --- a/test/CodeGen/X86/vec_splat-3.ll +++ /dev/null @@ -1,201 +0,0 @@ -; RUN: llc <%s -mtriple=i686-unknown-unknown -mcpu=penryn -mattr=sse4.1 | FileCheck %s - -; Splat test for v8i16 -define <8 x i16> @shuf_8i16_0(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_0: -; CHECK: pshuflw $0 -} - -define <8 x i16> @shuf_8i16_1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_1: -; CHECK: pshuflw $5 -} - -define <8 x i16> @shuf_8i16_2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_2: -; CHECK: punpcklwd -; CHECK-NEXT: pshufd $-86 -} - -define <8 x i16> @shuf_8i16_3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_3: -; CHECK: pshuflw $15 -} - -define <8 x i16> @shuf_8i16_4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_4: -; CHECK: movhlps -} - -define <8 x i16> @shuf_8i16_5(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_5: -; CHECK: punpckhwd 
-; CHECK-NEXT: pshufd $85 -} - -define <8 x i16> @shuf_8i16_6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_6: -; CHECK: punpckhwd -; CHECK-NEXT: pshufd $-86 -} - -define <8 x i16> @shuf_8i16_7(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_7: -; CHECK: punpckhwd -; CHECK-NEXT: pshufd $-1 -} - -; Splat test for v16i8 -define <16 x i8> @shuf_16i8_8(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_8: -; CHECK: pxor %[[X:xmm[0-9]+]], %[[X]] -; CHECK-NEXT: pshufb %[[X]], %xmm0 -} - -define <16 x i8> @shuf_16i8_9(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_9: -; CHECK: pshufb {{.*}} # xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -} - -define <16 x i8> @shuf_16i8_10(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_10: -; CHECK: pshufb {{.*}} # xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -} - -define <16 x i8> @shuf_16i8_11(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_11: -; CHECK: pshufb {{.*}} # xmm0 = xmm0[3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] -} - - -define <16 x i8> @shuf_16i8_12(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_12: -; CHECK: pshufd $5 -} - -define <16 x i8> @shuf_16i8_13(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_13: -; CHECK: pshufb {{.*}} # xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] -} - -define <16 x i8> @shuf_16i8_14(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_14: -; CHECK: pshufb {{.*}} # xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] -} - -define <16 x i8> @shuf_16i8_15(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_15: -; CHECK: pshufb {{.*}} # xmm0 = xmm0[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -} - -define <16 x i8> @shuf_16i8_16(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_16: -; CHECK: pshufb {{.*}} # xmm0 = xmm0[8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -} - -define <16 x i8> @shuf_16i8_17(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_17: -; CHECK: pshufb {{.*}} # xmm0 = xmm0[9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] -} - -define <16 x i8> @shuf_16i8_18(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_18: -; CHECK: pshufb {{.*}} # xmm0 = 
xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] -} - -define <16 x i8> @shuf_16i8_19(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_19: -; CHECK: pshufb {{.*}} # xmm0 = xmm0[11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] -} - -define <16 x i8> @shuf_16i8_20(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_20: -; CHECK: pshufb {{.*}} # xmm0 = xmm0[12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] -} - -define <16 x i8> @shuf_16i8_21(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_21: -; CHECK: pshufb {{.*}} # xmm0 = xmm0[13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] -} - -define <16 x i8> @shuf_16i8_22(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_22: -; CHECK: pshufb {{.*}} # xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] -} - -define <16 x i8> @shuf_16i8_23(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_23: -; CHECK: pshufb {{.*}} # xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -} diff --git a/test/CodeGen/X86/vec_splat.ll b/test/CodeGen/X86/vec_splat.ll deleted file mode 100644 index 07eeb3575c7..00000000000 --- a/test/CodeGen/X86/vec_splat.ll +++ /dev/null @@ -1,68 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s -check-prefix=SSE2 -; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse3 | FileCheck %s -check-prefix=SSE3 -; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s -check-prefix=AVX - -define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind { - %tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0 ; <<4 x float>> [#uses=1] - %tmp2 = insertelement <4 x float> %tmp, float %X, i32 1 ; <<4 x float>> [#uses=1] - %tmp4 = insertelement <4 x float> %tmp2, float %X, i32 2 ; <<4 x float>> [#uses=1] - %tmp6 = insertelement <4 x float> %tmp4, float %X, i32 3 ; <<4 x float>> [#uses=1] - %tmp8 = load <4 x float>* %Q ; <<4 x float>> [#uses=1] - %tmp10 = fmul <4 x float> %tmp8, %tmp6 ; <<4 x float>> [#uses=1] - store <4 x float> %tmp10, <4 x float>* %P - ret void - -; SSE2-LABEL: test_v4sf: -; SSE2: pshufd $0 - -; SSE3-LABEL: test_v4sf: -; SSE3: pshufd $0 -} - -define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind { - %tmp = insertelement <2 x double> zeroinitializer, double %X, i32 0 ; <<2 x double>> [#uses=1] - %tmp2 = insertelement <2 x double> %tmp, double %X, i32 1 ; <<2 x double>> [#uses=1] - %tmp4 = load <2 x double>* %Q ; <<2 x double>> [#uses=1] - %tmp6 = fmul <2 x double> %tmp4, %tmp2 ; <<2 x double>> [#uses=1] - store <2 x double> %tmp6, <2 x double>* %P - ret void - -; SSE2-LABEL: test_v2sd: -; SSE2: movlhps - -; SSE3-LABEL: test_v2sd: -; SSE3: movddup -} - -; Fold extract of a load into the load's address computation. This avoids spilling to the stack. 
-define <4 x float> @load_extract_splat(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind { - %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i - %2 = load <4 x float>* %1, align 16 - %3 = trunc i64 %j to i32 - %4 = extractelement <4 x float> %2, i32 %3 - %5 = insertelement <4 x float> undef, float %4, i32 0 - %6 = insertelement <4 x float> %5, float %4, i32 1 - %7 = insertelement <4 x float> %6, float %4, i32 2 - %8 = insertelement <4 x float> %7, float %4, i32 3 - ret <4 x float> %8 - -; AVX-LABEL: load_extract_splat -; AVX-NOT: rsp -; AVX: vbroadcastss -} - -; Fold extract of a load into the load's address computation. This avoids spilling to the stack. -define <4 x float> @load_extract_splat1(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind { - %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i - %2 = load <4 x float>* %1, align 16 - %3 = extractelement <4 x float> %2, i64 %j - %4 = insertelement <4 x float> undef, float %3, i32 0 - %5 = insertelement <4 x float> %4, float %3, i32 1 - %6 = insertelement <4 x float> %5, float %3, i32 2 - %7 = insertelement <4 x float> %6, float %3, i32 3 - ret <4 x float> %7 - -; AVX-LABEL: load_extract_splat1 -; AVX-NOT: movs -; AVX: vbroadcastss -} diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll index 25d3806b991..b66ed708b0d 100644 --- a/test/CodeGen/X86/vector-blend.ll +++ b/test/CodeGen/X86/vector-blend.ll @@ -9,15 +9,15 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) { ; SSE2-LABEL: vsel_float: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: andps {{.*}}, %xmm1 -; SSE2-NEXT: andps {{.*}}, %xmm0 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: vsel_float: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: andps {{.*}}, %xmm1 -; SSSE3-NEXT: andps {{.*}}, %xmm0 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 ; SSSE3-NEXT: orps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; @@ -54,26 +54,26 @@ entry: define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) { ; SSE2-LABEL: vsel_4xi8: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: andps {{.*}}, %xmm1 -; SSE2-NEXT: andps {{.*}}, %xmm0 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: vsel_4xi8: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: andps {{.*}}, %xmm1 -; SSSE3-NEXT: andps {{.*}}, %xmm0 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 ; SSSE3-NEXT: orps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: vsel_4xi8: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: vsel_4xi8: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: vsel_4xi8: @@ -88,26 +88,26 @@ entry: define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) { ; SSE2-LABEL: vsel_4xi16: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: andps {{.*}}, %xmm1 -; SSE2-NEXT: andps {{.*}}, %xmm0 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: vsel_4xi16: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: andps {{.*}}, %xmm1 -; SSSE3-NEXT: andps {{.*}}, %xmm0 +; SSSE3-NEXT: andps 
{{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 ; SSSE3-NEXT: orps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: vsel_4xi16: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: vsel_4xi16: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: vsel_4xi16: @@ -122,26 +122,26 @@ entry: define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) { ; SSE2-LABEL: vsel_i32: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: andps {{.*}}, %xmm1 -; SSE2-NEXT: andps {{.*}}, %xmm0 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: vsel_i32: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: andps {{.*}}, %xmm1 -; SSSE3-NEXT: andps {{.*}}, %xmm0 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 ; SSSE3-NEXT: orps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: vsel_i32: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: vsel_i32: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: vsel_i32: @@ -188,21 +188,22 @@ entry: define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) { ; SSE2-LABEL: vsel_8xi16: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: andps {{.*}}, %xmm1 -; SSE2-NEXT: andps {{.*}}, %xmm0 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: vsel_8xi16: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: andps {{.*}}, %xmm1 -; SSSE3-NEXT: andps {{.*}}, %xmm0 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 ; SSSE3-NEXT: orps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: vsel_8xi16: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: vsel_8xi16: @@ -217,15 +218,15 @@ entry: define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) { ; SSE2-LABEL: vsel_i8: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: andps {{.*}}, %xmm1 -; SSE2-NEXT: andps {{.*}}, %xmm0 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: vsel_i8: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: andps {{.*}}, %xmm1 -; SSSE3-NEXT: andps {{.*}}, %xmm0 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 ; SSSE3-NEXT: orps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; @@ -359,11 +360,17 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) { ; SSE41-NEXT: blendpd {{.*#+}} xmm3 = xmm7[0,1] ; SSE41-NEXT: retq ; -; AVX-LABEL: vsel_i648: -; AVX: # BB#0: # %entry -; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3] -; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: vsel_i648: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3] +; 
AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vsel_i648:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
+; AVX2-NEXT: retq
 entry:
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2
   ret <8 x i64> %vsel
@@ -511,8 +518,10 @@ define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd)
 ;
 ; SSE41-LABEL: constant_blendvps_avx:
 ; SSE41: # BB#0: # %entry
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3]
+; SSE41-NEXT: blendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3]
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: movaps %xmm3, %xmm1
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: constant_blendvps_avx:
@@ -561,8 +570,8 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
 ;
 ; AVX1-LABEL: constant_pblendvb_avx2:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vandps {{.*}}, %ymm1, %ymm1
-; AVX1-NEXT: vandps {{.*}}, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
@@ -584,13 +593,13 @@ define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b)
 ; SSE2-LABEL: blend_shufflevector_4xfloat:
 ; SSE2: # BB#0: # %entry
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: blend_shufflevector_4xfloat:
 ; SSSE3: # BB#0: # %entry
 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: blend_shufflevector_4xfloat:
@@ -628,8 +637,8 @@ define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b)
 ;
 ; SSE41-LABEL: blend_shufflevector_8xfloat:
 ; SSE41: # BB#0: # %entry
+; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
 ; SSE41-NEXT: blendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; SSE41-NEXT: movss %xmm0, %xmm2
 ; SSE41-NEXT: movaps %xmm2, %xmm0
 ; SSE41-NEXT: movaps %xmm3, %xmm1
 ; SSE41-NEXT: retq
@@ -644,11 +653,22 @@ entry:
 }
 
 define <4 x double> @blend_shufflevector_4xdouble(<4 x double> %a, <4 x double> %b) {
-; SSE-LABEL: blend_shufflevector_4xdouble:
-; SSE: # BB#0: # %entry
-; SSE-NEXT: movsd %xmm0, %xmm2
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: blend_shufflevector_4xdouble:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movsd %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: blend_shufflevector_4xdouble:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movsd %xmm0, %xmm2
+; SSSE3-NEXT: movaps %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: blend_shufflevector_4xdouble:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: blend_shufflevector_4xdouble:
 ; AVX: # BB#0: # %entry
@@ -660,16 +680,34 @@ entry:
 }
 
 define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) {
-; SSE-LABEL: blend_shufflevector_4xi64:
-; SSE: # BB#0: # %entry
-; SSE-NEXT: movsd %xmm2, %xmm0
-; SSE-NEXT: movaps %xmm3, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: blend_shufflevector_4xi64:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movsd %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm3, %xmm1
+; SSE2-NEXT: retq
 ;
-; AVX-LABEL: blend_shufflevector_4xi64:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
-; AVX-NEXT: retq
+; SSSE3-LABEL: blend_shufflevector_4xi64:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movsd %xmm2, %xmm0
+; SSSE3-NEXT: movaps %xmm3, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: blend_shufflevector_4xi64:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: movaps %xmm3, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: blend_shufflevector_4xi64:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: blend_shufflevector_4xi64:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: retq
 entry:
   %select = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
   ret <4 x i64> %select
diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll
index bad1fa7643e..4b269dc923c 100644
--- a/test/CodeGen/X86/vector-idiv.ll
+++ b/test/CodeGen/X86/vector-idiv.ll
@@ -10,14 +10,14 @@ define <4 x i32> @test1(<4 x i32> %a) {
 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: pmuludq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,0,3,0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,0,3,0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; SSE41-NEXT: pmuludq %xmm1, %xmm3
 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3]
-; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT: psubd %xmm2, %xmm0
 ; SSE41-NEXT: psrld $1, %xmm0
-; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: paddd %xmm2, %xmm0
 ; SSE41-NEXT: psrld $2, %xmm0
 ; SSE41-NEXT: retq
 ;
@@ -26,14 +26,14 @@ define <4 x i32> @test1(<4 x i32> %a) {
 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
 ; SSE-NEXT: movdqa %xmm0, %xmm2
 ; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,0,3,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,0,3,0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; SSE-NEXT: pmuludq %xmm1, %xmm3
 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3]
-; SSE-NEXT: psubd %xmm1, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE-NEXT: psubd %xmm2, %xmm0
 ; SSE-NEXT: psrld $1, %xmm0
-; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: paddd %xmm2, %xmm0
 ; SSE-NEXT: psrld $2, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -41,11 +41,11 @@ define <4 x i32> @test1(<4 x i32> %a) {
 ; AVX: # BB#0:
 ; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
 ; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,0]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,0,3,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; AVX-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
 ; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpsrld $1, %xmm0, %xmm0
 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -61,20 +61,20 @@ define <8 x i32> @test2(<8 x i32> %a) {
 ; SSE41-NEXT: movdqa
{{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pmuludq %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,0,3,0] -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,0,3,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] ; SSE41-NEXT: pmuludq %xmm4, %xmm5 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] ; SSE41-NEXT: psubd %xmm3, %xmm0 ; SSE41-NEXT: psrld $1, %xmm0 ; SSE41-NEXT: paddd %xmm3, %xmm0 ; SSE41-NEXT: psrld $2, %xmm0 ; SSE41-NEXT: pmuludq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,0,3,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; SSE41-NEXT: pmuludq %xmm4, %xmm3 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE41-NEXT: psubd %xmm2, %xmm1 ; SSE41-NEXT: psrld $1, %xmm1 ; SSE41-NEXT: paddd %xmm2, %xmm1 @@ -86,20 +86,20 @@ define <8 x i32> @test2(<8 x i32> %a) { ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,0,3,0] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,0,3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] ; SSE-NEXT: pmuludq %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] ; SSE-NEXT: psubd %xmm3, %xmm0 ; SSE-NEXT: psrld $1, %xmm0 ; SSE-NEXT: paddd %xmm3, %xmm0 ; SSE-NEXT: psrld $2, %xmm0 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,0,3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; SSE-NEXT: pmuludq %xmm4, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: psubd %xmm2, %xmm1 ; SSE-NEXT: psrld $1, %xmm1 ; SSE-NEXT: paddd %xmm2, %xmm1 @@ -109,11 +109,11 @@ define <8 x i32> @test2(<8 x i32> %a) { ; AVX-LABEL: test2: ; AVX: # BB#0: ; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; AVX-NEXT: vpalignr {{.*#+}} ymm2 = ymm1[4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3],ymm1[20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19] -; AVX-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,20,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19] +; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] ; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 ; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 -; AVX-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3],ymm1[20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19] +; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] ; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vpsrld $1, %ymm0, %ymm0 @@ -822,14 +822,14 @@ define <16 x i8> @test7(<16 x i8> %a) { define <4 x i32> @test8(<4 x i32> %a) { ; SSE41-LABEL: test8: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmuldq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[1,0,3,0] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,0,3,0] -; SSE41-NEXT: pmuldq %xmm1, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pmuldq %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm2, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE41-NEXT: paddd %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: psrld $31, %xmm0 @@ -840,21 +840,21 @@ define <4 x i32> @test8(<4 x i32> %a) { ; ; SSE-LABEL: test8: ; SSE: # BB#0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: psrad $31, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: paddd %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,0,3,0] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,0,3,0] -; SSE-NEXT: pmuludq %xmm1, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[1,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: paddd %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pmuludq %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: psubd %xmm3, %xmm1 ; SSE-NEXT: paddd %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -868,11 +868,11 @@ define <4 x i32> @test8(<4 x i32> %a) { ; AVX: # BB#0: ; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 ; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,0] -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,0,3,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vpmuldq %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3] ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpsrld $31, %xmm0, %xmm1 ; AVX-NEXT: vpsrad $2, %xmm0, %xmm0 @@ -885,85 +885,85 @@ define <4 x i32> @test8(<4 x i32> %a) { define <8 x i32> @test9(<8 x i32> %a) { ; SSE41-LABEL: test9: ; SSE41: # BB#0: -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: pmuldq %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,0,3,0] -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,0,3,0] +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; SSE41-NEXT: # kill: XMM0 XMM3 +; SSE41-NEXT: pmuldq %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] ; SSE41-NEXT: pmuldq %xmm4, %xmm5 -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm5[1,3] -; 
SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE41-NEXT: paddd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: psrld $31, %xmm0 -; SSE41-NEXT: psrad $2, %xmm2 -; SSE41-NEXT: paddd %xmm0, %xmm2 -; SSE41-NEXT: pmuldq %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,0] -; SSE41-NEXT: pmuldq %xmm4, %xmm0 -; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm0[1,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE41-NEXT: paddd %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: psrld $31, %xmm0 -; SSE41-NEXT: psrad $2, %xmm3 -; SSE41-NEXT: paddd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm5[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE41-NEXT: paddd %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrld $31, %xmm3 +; SSE41-NEXT: psrad $2, %xmm0 +; SSE41-NEXT: paddd %xmm3, %xmm0 +; SSE41-NEXT: pmuldq %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm4, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrld $31, %xmm2 +; SSE41-NEXT: psrad $2, %xmm1 +; SSE41-NEXT: paddd %xmm2, %xmm1 ; SSE41-NEXT: retq ; ; SSE-LABEL: test9: ; SSE: # BB#0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: psrad $31, %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: psrad $31, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pand %xmm1, %xmm5 ; SSE-NEXT: paddd %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pmuludq %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,0,3,0] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,0,3,0] +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] ; SSE-NEXT: pmuludq %xmm6, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm7[1,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: psubd %xmm5, %xmm0 -; SSE-NEXT: paddd %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrld $31, %xmm2 +; SSE-NEXT: paddd %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrld $31, %xmm3 ; SSE-NEXT: psrad $2, %xmm0 -; SSE-NEXT: paddd %xmm2, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: psrad $31, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: paddd %xmm4, %xmm5 -; SSE-NEXT: pmuludq %xmm1, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,0] -; SSE-NEXT: pmuludq %xmm6, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] -; SSE-NEXT: psubd %xmm5, %xmm2 -; SSE-NEXT: paddd %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrld $31, %xmm1 -; SSE-NEXT: psrad $2, %xmm2 -; SSE-NEXT: paddd %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: paddd %xmm3, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm3 
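For reference, the 2454267027 repeated through these test8/test9 hunks is 0x92492493 = ceil(2^34 / 7), the magic multiplier for signed division by 7; read as a signed 32-bit value it is negative, which is why the lowered code adds the dividend back in (paddd) after taking the high half of the multiply. A minimal scalar sketch of the sequence the CHECK lines encode, in hypothetical C that is not part of the patch:

    #include <stdint.h>

    /* Scalar model of the vector sdiv-by-7 sequence: pmuldq plus the
       surrounding shuffles keep the high 32 bits of the product, paddd
       corrects for the negative magic constant, psrad $2 applies the
       post-shift, and adding the sign bit (psrld $31) turns the
       arithmetic shift's floor into truncation toward zero. */
    static int32_t sdiv7(int32_t x) {
      int32_t magic = (int32_t)0x92492493;                /* 2454267027 */
      int32_t hi = (int32_t)(((int64_t)x * magic) >> 32); /* high multiply half */
      hi += x;                                            /* paddd correction */
      return (hi >> 2) + (int32_t)((uint32_t)hi >> 31);   /* psrad $2 + psrld $31 */
    }

The 613566757 (= ceil(2^32 / 7)) in the test10 hunks and the 1431655766 (= ceil(2^32 / 3)) in the PR20355 hunks below are the unsigned-by-7 and signed-by-3 analogues of the same multiply-high trick; the [7,7,7,7] multiply and psubd in test10/test11 then recover the remainder.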
+; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: paddd %xmm4, %xmm3 +; SSE-NEXT: pmuludq %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE-NEXT: pmuludq %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: psubd %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrld $31, %xmm2 +; SSE-NEXT: psrad $2, %xmm1 +; SSE-NEXT: paddd %xmm2, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: test9: ; AVX: # BB#0: ; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; AVX-NEXT: vpalignr {{.*#+}} ymm2 = ymm1[4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3],ymm1[20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19] -; AVX-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,20,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19] +; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] ; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 ; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 -; AVX-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3],ymm1[20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19] +; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] ; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vpsrld $31, %ymm0, %ymm1 @@ -980,11 +980,11 @@ define <8 x i32> @test10(<8 x i32> %a) { ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pmuludq %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,0,3,0] -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,0,3,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] ; SSE41-NEXT: pmuludq %xmm4, %xmm5 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] ; SSE41-NEXT: movdqa %xmm0, %xmm5 ; SSE41-NEXT: psubd %xmm3, %xmm5 ; SSE41-NEXT: psrld $1, %xmm5 @@ -994,10 +994,10 @@ define <8 x i32> @test10(<8 x i32> %a) { ; SSE41-NEXT: pmulld %xmm3, %xmm5 ; SSE41-NEXT: psubd %xmm5, %xmm0 ; SSE41-NEXT: pmuludq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,0,3,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] ; SSE41-NEXT: pmuludq %xmm4, %xmm5 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm5[1,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: psubd %xmm2, %xmm4 ; SSE41-NEXT: psrld $1, %xmm4 @@ -1012,49 +1012,49 @@ define <8 x i32> @test10(<8 x i32> %a) { ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,0,3,0] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,0,3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] ; SSE-NEXT: pmuludq %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] ; SSE-NEXT: movdqa %xmm0, %xmm5 ; SSE-NEXT: psubd %xmm3, %xmm5 ; SSE-NEXT: psrld $1, %xmm5 ; SSE-NEXT: paddd %xmm3, %xmm5 ; SSE-NEXT: psrld $2, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7] -; SSE-NEXT: pshufd 
{{.*#+}} xmm6 = xmm5[1,0,3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] ; SSE-NEXT: pmuludq %xmm3, %xmm5 ; SSE-NEXT: pmuludq %xmm3, %xmm6 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,1,3] ; SSE-NEXT: psubd %xmm5, %xmm0 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,0,3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] ; SSE-NEXT: pmuludq %xmm4, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm5[1,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: psubd %xmm2, %xmm4 ; SSE-NEXT: psrld $1, %xmm4 ; SSE-NEXT: paddd %xmm2, %xmm4 ; SSE-NEXT: psrld $2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,0,3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; SSE-NEXT: pmuludq %xmm3, %xmm4 ; SSE-NEXT: pmuludq %xmm3, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,1,3] -; SSE-NEXT: psubd %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: psubd %xmm4, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: test10: ; AVX: # BB#0: ; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; AVX-NEXT: vpalignr {{.*#+}} ymm2 = ymm1[4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3],ymm1[20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19] -; AVX-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,20,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19] +; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] ; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 ; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 -; AVX-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3],ymm1[20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19] +; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] ; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm2 ; AVX-NEXT: vpsrld $1, %ymm2, %ymm2 @@ -1074,11 +1074,11 @@ define <8 x i32> @test11(<8 x i32> %a) { ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] ; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: pmuldq %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,0,3,0] -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,0,3,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] ; SSE41-NEXT: pmuldq %xmm4, %xmm5 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] ; SSE41-NEXT: paddd %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm3, %xmm5 ; SSE41-NEXT: psrld $31, %xmm5 @@ -1088,10 +1088,10 @@ define <8 x i32> @test11(<8 x i32> %a) { ; SSE41-NEXT: pmulld %xmm5, %xmm3 ; SSE41-NEXT: psubd %xmm3, %xmm0 ; SSE41-NEXT: pmuldq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,0,3,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; SSE41-NEXT: pmuldq %xmm4, %xmm3 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE41-NEXT: paddd %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrld $31, %xmm3 @@ -1112,13 +1112,13 @@ define <8 x i32> @test11(<8 x i32> %a) { ; SSE-NEXT: psrad $31, %xmm6 ; SSE-NEXT: pand %xmm2, %xmm6 ; 
SSE-NEXT: paddd %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pmuludq %xmm2, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,0,3,0] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,0,3,0] -; SSE-NEXT: pmuludq %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm7[1,3] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,2,1,3] +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pmuludq %xmm2, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3] ; SSE-NEXT: psubd %xmm6, %xmm7 ; SSE-NEXT: paddd %xmm0, %xmm7 ; SSE-NEXT: movdqa %xmm7, %xmm4 @@ -1126,44 +1126,44 @@ define <8 x i32> @test11(<8 x i32> %a) { ; SSE-NEXT: psrad $2, %xmm7 ; SSE-NEXT: paddd %xmm4, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,0,3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] ; SSE-NEXT: pmuludq %xmm4, %xmm7 ; SSE-NEXT: pmuludq %xmm4, %xmm6 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,2,1,3] -; SSE-NEXT: psubd %xmm6, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3] +; SSE-NEXT: psubd %xmm7, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: psrad $31, %xmm6 ; SSE-NEXT: pand %xmm2, %xmm6 ; SSE-NEXT: paddd %xmm3, %xmm6 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,0,3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; SSE-NEXT: pmuludq %xmm5, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: psubd %xmm6, %xmm2 ; SSE-NEXT: paddd %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: psrld $31, %xmm3 ; SSE-NEXT: psrad $2, %xmm2 ; SSE-NEXT: paddd %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; SSE-NEXT: pmuludq %xmm4, %xmm2 ; SSE-NEXT: pmuludq %xmm4, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: psubd %xmm2, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: test11: ; AVX: # BB#0: ; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; AVX-NEXT: vpalignr {{.*#+}} ymm2 = ymm1[4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3],ymm1[20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19] -; AVX-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,20,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19] +; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] ; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 ; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 -; AVX-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3],ymm1[20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19] +; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] ; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm1 ; AVX-NEXT: vpsrld $31, %ymm1, %ymm2 @@ -1202,15 +1202,16 @@ define <4 x i32> @PR20355(<4 x i32> %a) { ; SSE41-LABEL: PR20355: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = 
xmm0[1,1,3,3] ; SSE41-NEXT: pmuldq %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,0,3,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE41-NEXT: pmuldq %xmm2, %xmm1 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: psrld $31, %xmm1 +; SSE41-NEXT: paddd %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $31, %xmm0 -; SSE41-NEXT: paddd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; SSE-LABEL: PR20355: @@ -1223,27 +1224,28 @@ define <4 x i32> @PR20355(<4 x i32> %a) { ; SSE-NEXT: psrad $31, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: paddd %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,0,3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE-NEXT: pmuludq %xmm2, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] -; SSE-NEXT: psubd %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: psubd %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $31, %xmm1 +; SSE-NEXT: paddd %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $31, %xmm0 -; SSE-NEXT: paddd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: PR20355: ; AVX: # BB#0: # %entry ; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 ; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,0] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,3],xmm0[1,3] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: vpsrld $31, %xmm0, %xmm1 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll index ee4313c179b..7a329d7670d 100644 --- a/test/CodeGen/X86/vector-sext.ll +++ b/test/CodeGen/X86/vector-sext.ll @@ -11,11 +11,11 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp ; SSE2-LABEL: sext_8i16_to_8i32: ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: # kill: XMM0 XMM1 +; SSE2-NEXT: # kill: XMM0 XMM1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $16, %xmm0 ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: retq @@ -23,11 +23,11 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp ; SSSE3-LABEL: sext_8i16_to_8i32: ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: # kill: XMM0 XMM1 +; SSSE3-NEXT: # kill: XMM0 XMM1 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $16, %xmm0 ; SSSE3-NEXT: psrad $16, %xmm0 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $16, %xmm1 ; SSSE3-NEXT: psrad $16, %xmm1 ; SSSE3-NEXT: retq @@ -36,17 +36,17 @@ define <8 x i32> 
@sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp ; SSE41: # BB#0: # %entry ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pmovzxwd %xmm1, %xmm0 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE41-NEXT: pslld $16, %xmm1 -; SSE41-NEXT: psrad $16, %xmm1 ; SSE41-NEXT: pslld $16, %xmm0 ; SSE41-NEXT: psrad $16, %xmm0 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE41-NEXT: pslld $16, %xmm1 +; SSE41-NEXT: psrad $16, %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: sext_8i16_to_8i32: ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -60,11 +60,11 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp ; X32-SSE41: # BB#0: # %entry ; X32-SSE41-NEXT: movdqa %xmm0, %xmm1 ; X32-SSE41-NEXT: pmovzxwd %xmm1, %xmm0 -; X32-SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE41-NEXT: pslld $16, %xmm1 -; X32-SSE41-NEXT: psrad $16, %xmm1 ; X32-SSE41-NEXT: pslld $16, %xmm0 ; X32-SSE41-NEXT: psrad $16, %xmm0 +; X32-SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; X32-SSE41-NEXT: pslld $16, %xmm1 +; X32-SSE41-NEXT: psrad $16, %xmm1 ; X32-SSE41-NEXT: retl entry: %B = sext <8 x i16> %A to <8 x i32> @@ -74,20 +74,20 @@ entry: define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: sext_4i32_to_4i64: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] ; SSE2-NEXT: movd %xmm1, %rax ; SSE2-NEXT: cltq ; SSE2-NEXT: movd %rax, %xmm2 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE2-NEXT: movd %xmm1, %rax ; SSE2-NEXT: cltq ; SSE2-NEXT: movd %rax, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE2-NEXT: movd %xmm0, %rax ; SSE2-NEXT: cltq ; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE2-NEXT: movd %xmm0, %rax ; SSE2-NEXT: cltq ; SSE2-NEXT: movd %rax, %xmm0 @@ -97,20 +97,20 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; ; SSSE3-LABEL: sext_4i32_to_4i64: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] ; SSSE3-NEXT: movd %xmm1, %rax ; SSSE3-NEXT: cltq ; SSSE3-NEXT: movd %rax, %xmm2 -; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSSE3-NEXT: movd %xmm1, %rax ; SSSE3-NEXT: cltq ; SSSE3-NEXT: movd %rax, %xmm1 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSSE3-NEXT: movd %xmm0, %rax ; SSSE3-NEXT: cltq ; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSSE3-NEXT: movd %xmm0, %rax ; SSSE3-NEXT: cltq ; SSSE3-NEXT: movd %rax, %xmm0 @@ -128,7 +128,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; SSE41-NEXT: cltq ; SSE41-NEXT: movd %rax, %xmm2 ; 
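The sext lowerings in this file all share one idiom: move each narrow element into the low bits of a wider lane (punpcklwd/punpckhwd or pmovzx), then shift left and arithmetic-shift right by the width difference. A scalar sketch in hypothetical C, not part of the patch:

    #include <stdint.h>

    /* Scalar model of the widen-then-shift sign extension seen in the
       sext_8i16_to_8i32 hunks above: pslld $16 moves the zero-extended
       value's sign bit up to bit 31, and psrad $16 brings it back down,
       replicating it across the high half of the lane. */
    static int32_t sext_i16_to_i32(uint16_t w) {
      int32_t lane = (int32_t)w;    /* lane contents after pmovzxwd / punpcklwd */
      return (lane << 16) >> 16;    /* pslld $16, then psrad $16 */
    }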
SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE41-NEXT: pextrq $1, %xmm0, %rax ; SSE41-NEXT: cltq ; SSE41-NEXT: movd %rax, %xmm3 @@ -142,7 +142,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; AVX1-LABEL: sext_4i32_to_4i64: ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -161,7 +161,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2 ; X32-SSE41-NEXT: sarl $31, %ecx ; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,0,3,0] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] ; X32-SSE41-NEXT: movd %xmm1, %eax ; X32-SSE41-NEXT: sarl $31, %eax ; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx @@ -214,24 +214,17 @@ entry: define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) { ; SSE2-LABEL: load_sext_test2: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movl (%rdi), %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shll $8, %ecx -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pextrw $1, %xmm0, %edx -; SSE2-NEXT: pinsrw $1, %ecx, %xmm0 -; SSE2-NEXT: pinsrw $3, %eax, %xmm0 -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: shll $8, %eax -; SSE2-NEXT: pinsrw $5, %eax, %xmm0 -; SSE2-NEXT: pinsrw $7, %edx, %xmm0 +; SSE2-NEXT: movd (%rdi), %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_test2: ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movd (%rdi), %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: psrad $24, %xmm0 ; SSSE3-NEXT: retq ; @@ -417,20 +410,20 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; SSE2: # BB#0: ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] ; SSE2-NEXT: movd %xmm1, %rax ; SSE2-NEXT: cltq ; SSE2-NEXT: movd %rax, %xmm2 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE2-NEXT: movd %xmm1, %rax ; SSE2-NEXT: cltq ; SSE2-NEXT: movd %rax, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE2-NEXT: movd %xmm0, %rax ; SSE2-NEXT: cltq ; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE2-NEXT: movd %xmm0, %rax ; SSE2-NEXT: cltq ; SSE2-NEXT: movd %rax, %xmm0 @@ -442,20 +435,20 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; SSSE3: # BB#0: ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] ; SSSE3-NEXT: movd %xmm1, %rax ; SSSE3-NEXT: cltq ; SSSE3-NEXT: movd %rax, %xmm2 -; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1,1] +; SSSE3-NEXT: 
pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSSE3-NEXT: movd %xmm1, %rax ; SSSE3-NEXT: cltq ; SSSE3-NEXT: movd %rax, %xmm1 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSSE3-NEXT: movd %xmm0, %rax ; SSSE3-NEXT: cltq ; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSSE3-NEXT: movd %xmm0, %rax ; SSSE3-NEXT: cltq ; SSSE3-NEXT: movd %rax, %xmm0 @@ -475,7 +468,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; SSE41-NEXT: cltq ; SSE41-NEXT: movd %rax, %xmm2 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE41-NEXT: pextrq $1, %xmm0, %rax ; SSE41-NEXT: cltq ; SSE41-NEXT: movd %rax, %xmm3 @@ -491,7 +484,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -514,7 +507,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2 ; X32-SSE41-NEXT: sarl $31, %ecx ; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,0,3,0] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] ; X32-SSE41-NEXT: movd %xmm1, %eax ; X32-SSE41-NEXT: sarl $31, %eax ; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx @@ -535,7 +528,7 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) { ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psllw $8, %xmm0 ; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psllw $8, %xmm1 ; SSE2-NEXT: psraw $8, %xmm1 ; SSE2-NEXT: retq @@ -547,7 +540,7 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) { ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: psllw $8, %xmm0 ; SSSE3-NEXT: psraw $8, %xmm0 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: psllw $8, %xmm1 ; SSSE3-NEXT: psraw $8, %xmm1 ; SSSE3-NEXT: retq @@ -556,18 +549,18 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) { ; SSE41: # BB#0: # %entry ; SSE41-NEXT: movdqa (%rdi), %xmm1 ; SSE41-NEXT: pmovzxbw %xmm1, %xmm0 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: psraw $8, %xmm1 ; SSE41-NEXT: psllw $8, %xmm0 ; SSE41-NEXT: psraw $8, %xmm0 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: psllw $8, %xmm1 +; SSE41-NEXT: psraw $8, %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: sext_16i8_to_16i16: ; AVX1: # BB#0: # %entry ; 
AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 -; AVX1-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -583,11 +576,11 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) { ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE41-NEXT: movdqa (%eax), %xmm1 ; X32-SSE41-NEXT: pmovzxbw %xmm1, %xmm0 -; X32-SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; X32-SSE41-NEXT: psllw $8, %xmm1 -; X32-SSE41-NEXT: psraw $8, %xmm1 ; X32-SSE41-NEXT: psllw $8, %xmm0 ; X32-SSE41-NEXT: psraw $8, %xmm0 +; X32-SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-SSE41-NEXT: psllw $8, %xmm1 +; X32-SSE41-NEXT: psraw $8, %xmm1 ; X32-SSE41-NEXT: retl entry: %X = load <16 x i8>* %ptr @@ -600,20 +593,20 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { ; SSE2: # BB#0: ; SSE2-NEXT: pslld $24, %xmm0 ; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] ; SSE2-NEXT: movd %xmm1, %rax ; SSE2-NEXT: cltq ; SSE2-NEXT: movd %rax, %xmm2 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE2-NEXT: movd %xmm1, %rax ; SSE2-NEXT: cltq ; SSE2-NEXT: movd %rax, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE2-NEXT: movd %xmm0, %rax ; SSE2-NEXT: cltq ; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE2-NEXT: movd %xmm0, %rax ; SSE2-NEXT: cltq ; SSE2-NEXT: movd %rax, %xmm0 @@ -625,20 +618,20 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { ; SSSE3: # BB#0: ; SSSE3-NEXT: pslld $24, %xmm0 ; SSSE3-NEXT: psrad $24, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] ; SSSE3-NEXT: movd %xmm1, %rax ; SSSE3-NEXT: cltq ; SSSE3-NEXT: movd %rax, %xmm2 -; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSSE3-NEXT: movd %xmm1, %rax ; SSSE3-NEXT: cltq ; SSSE3-NEXT: movd %rax, %xmm1 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSSE3-NEXT: movd %xmm0, %rax ; SSSE3-NEXT: cltq ; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSSE3-NEXT: movd %xmm0, %rax ; SSSE3-NEXT: cltq ; SSSE3-NEXT: movd %rax, %xmm0 @@ -658,7 +651,7 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { ; SSE41-NEXT: cltq ; SSE41-NEXT: movd %rax, %xmm2 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSE41-NEXT: pextrq $1, %xmm0, %rax ; SSE41-NEXT: cltq ; SSE41-NEXT: movd %rax, %xmm3 @@ -674,7 +667,7 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { ; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 ; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm0[2,3,0,1] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -697,7 +690,7 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { ; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2 ; X32-SSE41-NEXT: sarl $31, %ecx ; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,0,3,0] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] ; X32-SSE41-NEXT: movd %xmm1, %eax ; X32-SSE41-NEXT: sarl $31, %eax ; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx @@ -713,29 +706,23 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { ; SSE2-LABEL: load_sext_4i8_to_4i64: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: movl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pextrw $1, %xmm1, %ecx -; SSE2-NEXT: pinsrw $0, %eax, %xmm1 -; SSE2-NEXT: movzbl %ah, %eax -; SSE2-NEXT: pinsrw $2, %eax, %xmm1 -; SSE2-NEXT: pinsrw $4, %ecx, %xmm1 -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: pinsrw $6, %ecx, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,0] +; SSE2-NEXT: movd (%rdi), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] ; SSE2-NEXT: movd %xmm2, %rax ; SSE2-NEXT: movsbq %al, %rax ; SSE2-NEXT: movd %rax, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; SSE2-NEXT: movd %xmm2, %rax ; SSE2-NEXT: movsbq %al, %rax ; SSE2-NEXT: movd %rax, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,0,3,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; SSE2-NEXT: movd %xmm2, %rax ; SSE2-NEXT: movsbq %al, %rax ; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; SSE2-NEXT: movd %xmm2, %rax ; SSE2-NEXT: movsbq %al, %rax ; SSE2-NEXT: movd %rax, %xmm2 @@ -745,21 +732,22 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { ; SSSE3-LABEL: load_sext_4i8_to_4i64: ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movd (%rdi), %xmm1 -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,0] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] ; SSSE3-NEXT: movd %xmm2, %rax ; SSSE3-NEXT: movsbq %al, %rax ; SSSE3-NEXT: movd %rax, %xmm0 -; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; SSSE3-NEXT: movd %xmm2, %rax ; SSSE3-NEXT: movsbq %al, %rax ; SSSE3-NEXT: movd %rax, %xmm2 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,0,3,0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; SSSE3-NEXT: movd %xmm2, %rax ; SSSE3-NEXT: movsbq %al, %rax ; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; SSSE3-NEXT: movd %xmm2, %rax ; SSSE3-NEXT: movsbq %al, %rax ; 
SSSE3-NEXT: movd %rax, %xmm2 @@ -768,9 +756,8 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { ; ; SSE41-LABEL: load_sext_4i8_to_4i64: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: movd (%rdi), %xmm0 -; SSE41-NEXT: pmovzxbd %xmm0, %xmm1 -; SSE41-NEXT: pmovzxbq %xmm0, %xmm0 +; SSE41-NEXT: pmovzxbd (%rdi), %xmm1 +; SSE41-NEXT: pmovzxdq %xmm1, %xmm0 ; SSE41-NEXT: pextrq $1, %xmm0, %rax ; SSE41-NEXT: movsbq %al, %rax ; SSE41-NEXT: movd %rax, %xmm2 @@ -778,7 +765,7 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { ; SSE41-NEXT: movsbq %al, %rax ; SSE41-NEXT: movd %rax, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,0,3,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; SSE41-NEXT: pextrq $1, %xmm1, %rax ; SSE41-NEXT: movsbq %al, %rax ; SSE41-NEXT: movd %rax, %xmm2 @@ -792,7 +779,7 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -818,7 +805,7 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { ; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0 ; X32-SSE41-NEXT: sarl $31, %eax ; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,0,3,0] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; X32-SSE41-NEXT: movd %xmm2, %eax ; X32-SSE41-NEXT: movsbl %al, %eax ; X32-SSE41-NEXT: movd %eax, %xmm1 @@ -841,20 +828,20 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movq (%rdi), %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] ; SSE2-NEXT: movd %xmm2, %rax ; SSE2-NEXT: movswq %ax, %rax ; SSE2-NEXT: movd %rax, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; SSE2-NEXT: movd %xmm2, %rax ; SSE2-NEXT: movswq %ax, %rax ; SSE2-NEXT: movd %rax, %xmm2 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,0,3,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; SSE2-NEXT: movd %xmm2, %rax ; SSE2-NEXT: movswq %ax, %rax ; SSE2-NEXT: movd %rax, %xmm1 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; SSE2-NEXT: movd %xmm2, %rax ; SSE2-NEXT: movswq %ax, %rax ; SSE2-NEXT: movd %rax, %xmm2 @@ -865,20 +852,20 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movq (%rdi), %xmm1 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] ; SSSE3-NEXT: movd %xmm2, %rax ; SSSE3-NEXT: movswq %ax, %rax ; SSSE3-NEXT: movd %rax, %xmm0 -; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; SSSE3-NEXT: movd %xmm2, %rax ; SSSE3-NEXT: movswq %ax, %rax ; SSSE3-NEXT: movd %rax, %xmm2 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,0,3,0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; SSSE3-NEXT: movd %xmm2, %rax ; 
SSSE3-NEXT: movswq %ax, %rax ; SSSE3-NEXT: movd %rax, %xmm1 -; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; SSSE3-NEXT: movd %xmm2, %rax ; SSSE3-NEXT: movswq %ax, %rax ; SSSE3-NEXT: movd %rax, %xmm2 @@ -897,7 +884,7 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { ; SSE41-NEXT: movswq %ax, %rax ; SSE41-NEXT: movd %rax, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,0,3,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; SSE41-NEXT: pextrq $1, %xmm1, %rax ; SSE41-NEXT: movswq %ax, %rax ; SSE41-NEXT: movd %rax, %xmm2 @@ -911,7 +898,7 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -937,7 +924,7 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { ; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0 ; X32-SSE41-NEXT: sarl $31, %eax ; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,0,3,0] +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] ; X32-SSE41-NEXT: movd %xmm2, %eax ; X32-SSE41-NEXT: cwtl ; X32-SSE41-NEXT: movd %eax, %xmm1 diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll index 30d9176c362..5822fc91535 100644 --- a/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/test/CodeGen/X86/vector-shuffle-combining.ll @@ -277,26 +277,26 @@ define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i3 ; SSE2: # BB#0: ; SSE2-NEXT: andps %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_bitwise_ops_test1b: ; SSSE3: # BB#0: ; SSSE3-NEXT: andps %xmm1, %xmm0 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_bitwise_ops_test1b: ; SSE41: # BB#0: -; SSE41-NEXT: andps %xmm1, %xmm0 -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_bitwise_ops_test1b: ; AVX1: # BB#0: -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_bitwise_ops_test1b: @@ -315,26 +315,26 @@ define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i3 ; SSE2: # BB#0: ; SSE2-NEXT: orps %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_bitwise_ops_test2b: ; SSSE3: # BB#0: ; SSSE3-NEXT: orps %xmm1, %xmm0 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSSE3-NEXT: retq ; ; 
SSE41-LABEL: combine_bitwise_ops_test2b: ; SSE41: # BB#0: -; SSE41-NEXT: orps %xmm1, %xmm0 -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_bitwise_ops_test2b: ; AVX1: # BB#0: -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_bitwise_ops_test2b: @@ -354,7 +354,7 @@ define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i3 ; SSE2-NEXT: xorps %xmm1, %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_bitwise_ops_test3b: @@ -362,21 +362,21 @@ define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i3 ; SSSE3-NEXT: xorps %xmm1, %xmm0 ; SSSE3-NEXT: xorps %xmm1, %xmm1 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_bitwise_ops_test3b: ; SSE41: # BB#0: -; SSE41-NEXT: xorps %xmm1, %xmm0 -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_bitwise_ops_test3b: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_bitwise_ops_test3b: @@ -408,15 +408,15 @@ define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i3 ; ; SSE41-LABEL: combine_bitwise_ops_test4b: ; SSE41: # BB#0: -; SSE41-NEXT: andps %xmm1, %xmm0 -; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_bitwise_ops_test4b: ; AVX1: # BB#0: -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_bitwise_ops_test4b: @@ -447,15 +447,15 @@ define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i3 ; ; SSE41-LABEL: combine_bitwise_ops_test5b: ; SSE41: # BB#0: -; SSE41-NEXT: orps %xmm1, %xmm0 -; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_bitwise_ops_test5b: ; AVX1: # BB#0: -; AVX1-NEXT: vorps 
%xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_bitwise_ops_test5b: @@ -475,7 +475,8 @@ define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i3 ; SSE2-NEXT: xorps %xmm1, %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_bitwise_ops_test6b: @@ -483,22 +484,23 @@ define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i3 ; SSSE3-NEXT: xorps %xmm1, %xmm0 ; SSSE3-NEXT: xorps %xmm1, %xmm1 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_bitwise_ops_test6b: ; SSE41: # BB#0: -; SSE41-NEXT: xorps %xmm1, %xmm0 -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_bitwise_ops_test6b: ; AVX1: # BB#0: -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_bitwise_ops_test6b: @@ -631,12 +633,12 @@ define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i3 define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) { ; SSE-LABEL: combine_nested_undef_test1: ; SSE: # BB#0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_nested_undef_test1: ; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,0,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -646,12 +648,12 @@ define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) { ; SSE-LABEL: combine_nested_undef_test2: ; SSE: # BB#0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_nested_undef_test2: ; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,0,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -661,12 +663,12 @@ define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) { ; SSE-LABEL: combine_nested_undef_test3: ; SSE: # BB#0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[2,1,0,3] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_nested_undef_test3: ; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,0,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -676,13 +678,18 @@ define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) { ; SSE-LABEL: combine_nested_undef_test4: ; SSE: # BB#0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test4: -; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test4: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test4: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -691,12 +698,12 @@ define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) { ; SSE-LABEL: combine_nested_undef_test5: ; SSE: # BB#0: -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_nested_undef_test5: ; AVX: # BB#0: -; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -706,12 +713,12 @@ define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) { ; SSE-LABEL: combine_nested_undef_test6: ; SSE: # BB#0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_nested_undef_test6: ; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -736,12 +743,12 @@ define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) { ; SSE-LABEL: combine_nested_undef_test8: ; SSE: # BB#0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_nested_undef_test8: ; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -751,12 +758,12 @@ define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) { ; SSE-LABEL: combine_nested_undef_test9: ; SSE: # BB#0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_nested_undef_test9: ; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,0,2] +; 
AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -766,12 +773,12 @@ define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) { ; SSE-LABEL: combine_nested_undef_test10: ; SSE: # BB#0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_nested_undef_test10: ; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -781,12 +788,12 @@ define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) { ; SSE-LABEL: combine_nested_undef_test11: ; SSE: # BB#0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_nested_undef_test11: ; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -796,17 +803,17 @@ define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) { ; SSE-LABEL: combine_nested_undef_test12: ; SSE: # BB#0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; ; AVX1-LABEL: combine_nested_undef_test12: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test12: ; AVX2: # BB#0: -; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -849,14 +856,14 @@ define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) { ; SSE-LABEL: combine_nested_undef_test15: ; SSE: # BB#0: -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,0,3] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_nested_undef_test15: ; AVX: # BB#0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[3,1] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; AVX-NEXT: retq @@ -869,24 +876,26 @@ define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) { ; SSE2-LABEL: combine_nested_undef_test16: ; SSE2: # BB#0: ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_nested_undef_test16: ; SSSE3: # BB#0: ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] -; SSSE3-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[1,2,0,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_nested_undef_test16: ; SSE41: # BB#0: -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_nested_undef_test16: ; AVX1: # BB#0: -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; AVX1-NEXT: retq ; @@ -937,14 +946,14 @@ define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) { ; SSE-LABEL: combine_nested_undef_test19: ; SSE: # BB#0: -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_nested_undef_test19: ; AVX: # BB#0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,0,0] ; AVX-NEXT: retq @@ -1026,12 +1035,12 @@ define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) { ; SSE-LABEL: combine_nested_undef_test24: ; SSE: # BB#0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_nested_undef_test24: ; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1044,10 +1053,15 @@ define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) { ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test25: -; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test25: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test25: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -1056,12 +1070,12 @@ define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) { define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) { ; SSE-LABEL: combine_nested_undef_test26: ; SSE: # BB#0: -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_nested_undef_test26: ; AVX: # BB#0: -; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1074,10 +1088,15 @@ define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) { ; SSE-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_nested_undef_test27: -; AVX: # BB#0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: retq +; AVX1-LABEL: combine_nested_undef_test27: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test27: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> ret <4 x i32> %2 @@ -1101,10 +1120,11 @@ define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) { define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: combine_test1: ; SSE2: # BB#0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_test1: @@ -1130,7 +1150,7 @@ define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: combine_test2: ; SSE2: # BB#0: ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSE2-NEXT: retq @@ -1138,20 +1158,20 @@ define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) { ; SSSE3-LABEL: combine_test2: ; SSSE3: # BB#0: ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_test2: ; SSE41: # BB#0: -; SSE41-NEXT: movss %xmm0, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_test2: ; AVX: # BB#0: -; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -1161,12 +1181,12 @@ define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) { define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: combine_test3: ; SSE: # BB#0: -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_test3: ; AVX: # BB#0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -1176,12 +1196,13 @@ define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) { define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: combine_test4: ; SSE: # BB#0: -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = 
xmm1[1],xmm0[1] +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_test4: ; AVX: # BB#0: -; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -1191,23 +1212,28 @@ define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) { define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: combine_test5: ; SSE2: # BB#0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_test5: ; SSSE3: # BB#0: -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSSE3-NEXT: movaps %xmm1, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSSE3-NEXT: movaps %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_test5: ; SSE41: # BB#0: -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_test5: @@ -1222,10 +1248,11 @@ define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) { define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: combine_test6: ; SSE2: # BB#0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_test6: @@ -1251,7 +1278,7 @@ define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: combine_test7: ; SSE2: # BB#0: ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSE2-NEXT: retq @@ -1259,21 +1286,26 @@ define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) { ; SSSE3-LABEL: combine_test7: ; SSSE3: # BB#0: ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_test7: ; SSE41: # BB#0: 
-; SSE41-NEXT: movss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test7: -; AVX: # BB#0: -; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_test7: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test7: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> ret <4 x i32> %2 @@ -1282,12 +1314,12 @@ define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) { ; SSE-LABEL: combine_test8: ; SSE: # BB#0: -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_test8: ; AVX: # BB#0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> @@ -1297,12 +1329,13 @@ define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) { ; SSE-LABEL: combine_test9: ; SSE: # BB#0: -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_test9: ; AVX: # BB#0: -; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> @@ -1312,28 +1345,33 @@ define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) { ; SSE2-LABEL: combine_test10: ; SSE2: # BB#0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_test10: ; SSSE3: # BB#0: -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSSE3-NEXT: movaps %xmm1, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSSE3-NEXT: movaps %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_test10: ; SSE41: # BB#0: -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_test10: ; AVX1: # BB#0: -; AVX1-NEXT: 
vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_test10: @@ -1371,13 +1409,13 @@ define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) { ; ; SSE41-LABEL: combine_test12: ; SSE41: # BB#0: -; SSE41-NEXT: movss %xmm0, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_test12: ; AVX: # BB#0: -; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> @@ -1387,12 +1425,12 @@ define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) { define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: combine_test13: ; SSE: # BB#0: -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_test13: ; AVX: # BB#0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> @@ -1402,13 +1440,12 @@ define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) { define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: combine_test14: ; SSE: # BB#0: -; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_test14: ; AVX: # BB#0: -; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> @@ -1436,7 +1473,8 @@ define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) { ; ; SSE41-LABEL: combine_test15: ; SSE41: # BB#0: -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_test15: @@ -1474,14 +1512,19 @@ define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) { ; ; SSE41-LABEL: combine_test17: ; SSE41: # BB#0: -; SSE41-NEXT: movss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test17: -; AVX: # BB#0: -; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_test17: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test17: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> ret <4 x i32> %2 @@ -1490,12 +1533,12 @@ define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) { ; SSE-LABEL: combine_test18: ; SSE: # BB#0: -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX-LABEL: 
combine_test18: ; AVX: # BB#0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> @@ -1505,13 +1548,12 @@ define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) { ; SSE-LABEL: combine_test19: ; SSE: # BB#0: -; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_test19: ; AVX: # BB#0: -; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> @@ -1539,12 +1581,13 @@ define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) { ; ; SSE41-LABEL: combine_test20: ; SSE41: # BB#0: -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_test20: ; AVX1: # BB#0: -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_test20: @@ -1563,27 +1606,30 @@ define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) { define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: combine_test1b: ; SSE2: # BB#0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_test1b: ; SSSE3: # BB#0: -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSSE3-NEXT: movaps %xmm1, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_test1b: ; SSE41: # BB#0: -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0] +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0] ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -1601,32 +1647,35 @@ define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) { define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: combine_test2b: ; SSE2: # BB#0: -; SSE2-NEXT: shufps {{.*#+}} 
xmm0 = xmm0[1,3],xmm1[0,2] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_test2b: ; SSSE3: # BB#0: -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: movaps %xmm1, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_test2b: ; SSE41: # BB#0: -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_test2b: ; AVX: # BB#0: ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -1640,7 +1689,7 @@ define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) { ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_test3b: @@ -1648,7 +1697,7 @@ define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) { ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,0],xmm0[3,0] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2] ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -1658,27 +1707,30 @@ define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) { define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: combine_test4b: ; SSE2: # BB#0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[0,2] +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[0,2] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_test4b: ; 
SSSE3: # BB#0: -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[0,2] +; SSSE3-NEXT: movaps %xmm1, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[0,2] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_test4b: ; SSE41: # BB#0: -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[0,2] +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[0,2] ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -1699,26 +1751,14 @@ define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) { define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) { ; SSE2-LABEL: combine_test1c: ; SSE2: # BB#0: -; SSE2-NEXT: movl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pextrw $1, %xmm0, %ecx -; SSE2-NEXT: pinsrw $0, %eax, %xmm0 -; SSE2-NEXT: movzbl %ah, %eax -; SSE2-NEXT: pinsrw $2, %eax, %xmm0 -; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: pinsrw $6, %ecx, %xmm0 -; SSE2-NEXT: movl (%rsi), %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pextrw $1, %xmm1, %ecx -; SSE2-NEXT: pinsrw $0, %eax, %xmm1 -; SSE2-NEXT: movzbl %ah, %eax -; SSE2-NEXT: pinsrw $2, %eax, %xmm1 -; SSE2-NEXT: pinsrw $4, %ecx, %xmm1 -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: pinsrw $6, %ecx, %xmm1 +; SSE2-NEXT: movd (%rdi), %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movd (%rsi), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSE2-NEXT: retq @@ -1726,29 +1766,37 @@ define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) { ; SSSE3-LABEL: combine_test1c: ; SSSE3: # BB#0: ; SSSE3-NEXT: movd (%rdi), %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,1,128,128,128,2,128,128,128,3,128,128,128] -; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: movd (%rsi), %xmm2 -; SSSE3-NEXT: pshufb %xmm1, %xmm2 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movd (%rsi), %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_test1c: ; SSE41: # BB#0: ; SSE41-NEXT: pmovzxbd (%rdi), %xmm1 ; SSE41-NEXT: pmovzxbd (%rsi), %xmm0 -; SSE41-NEXT: movss %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test1c: -; AVX: # BB#0: -; AVX-NEXT: vpmovzxbd (%rdi), %xmm0 -; AVX-NEXT: vpmovzxbd (%rsi), %xmm1 -; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_test1c: +; AVX1: # BB#0: +; AVX1-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX1-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test1c: +; AVX2: # BB#0: +; AVX2-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX2-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq %A = load <4 x i8>* %a %B = load <4 x i8>* %b %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> @@ -1759,49 +1807,38 @@ define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) { define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) { ; SSE2-LABEL: combine_test2c: ; SSE2: # BB#0: -; SSE2-NEXT: movl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pextrw $1, %xmm0, %ecx -; SSE2-NEXT: pinsrw $0, %eax, %xmm0 -; SSE2-NEXT: movzbl %ah, %eax -; SSE2-NEXT: pinsrw $2, %eax, %xmm0 -; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: pinsrw $6, %ecx, %xmm0 -; SSE2-NEXT: movl (%rsi), %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pextrw $1, %xmm1, %ecx -; SSE2-NEXT: pinsrw $0, %eax, %xmm1 -; SSE2-NEXT: movzbl %ah, %eax -; SSE2-NEXT: pinsrw $2, %eax, %xmm1 -; SSE2-NEXT: pinsrw $4, %ecx, %xmm1 -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: pinsrw $6, %ecx, %xmm1 -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movd (%rdi), %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movd (%rsi), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_test2c: ; SSSE3: # BB#0: ; SSSE3-NEXT: movd (%rdi), %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,1,128,128,128,2,128,128,128,3,128,128,128] -; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: movd (%rsi), %xmm2 -; SSSE3-NEXT: pshufb %xmm1, %xmm2 -; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movd (%rsi), %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 
= xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_test2c: ; SSE41: # BB#0: ; SSE41-NEXT: pmovzxbd (%rdi), %xmm0 ; SSE41-NEXT: pmovzxbd (%rsi), %xmm1 -; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_test2c: ; AVX: # BB#0: ; AVX-NEXT: vpmovzxbd (%rdi), %xmm0 ; AVX-NEXT: vpmovzxbd (%rsi), %xmm1 -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %A = load <4 x i8>* %a %B = load <4 x i8>* %b @@ -1813,49 +1850,38 @@ define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) { define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) { ; SSE2-LABEL: combine_test3c: ; SSE2: # BB#0: -; SSE2-NEXT: movl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pextrw $1, %xmm0, %ecx -; SSE2-NEXT: pinsrw $0, %eax, %xmm0 -; SSE2-NEXT: movzbl %ah, %eax -; SSE2-NEXT: pinsrw $2, %eax, %xmm0 -; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: pinsrw $6, %ecx, %xmm0 -; SSE2-NEXT: movl (%rsi), %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pextrw $1, %xmm1, %ecx -; SSE2-NEXT: pinsrw $0, %eax, %xmm1 -; SSE2-NEXT: movzbl %ah, %eax -; SSE2-NEXT: pinsrw $2, %eax, %xmm1 -; SSE2-NEXT: pinsrw $4, %ecx, %xmm1 -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: pinsrw $6, %ecx, %xmm1 -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE2-NEXT: movd (%rdi), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd (%rsi), %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_test3c: ; SSSE3: # BB#0: -; SSSE3-NEXT: movd (%rdi), %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,1,128,128,128,2,128,128,128,3,128,128,128] -; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: movd (%rsi), %xmm2 -; SSSE3-NEXT: pshufb %xmm1, %xmm2 -; SSSE3-NEXT: movhlps {{.*#+}} xmm0 = xmm2[1],xmm0[1] +; SSSE3-NEXT: movd (%rdi), %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movd (%rsi), %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_test3c: ; SSE41: # BB#0: -; SSE41-NEXT: pmovzxbd (%rdi), %xmm0 -; SSE41-NEXT: pmovzxbd (%rsi), %xmm1 -; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE41-NEXT: pmovzxbd (%rdi), %xmm1 +; SSE41-NEXT: pmovzxbd (%rsi), %xmm0 +; SSE41-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_test3c: ; AVX: # BB#0: ; AVX-NEXT: vpmovzxbd (%rdi), %xmm0 ; AVX-NEXT: vpmovzxbd (%rsi), %xmm1 -; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = 
xmm1[1],xmm0[1] ; AVX-NEXT: retq %A = load <4 x i8>* %a %B = load <4 x i8>* %b @@ -1867,55 +1893,46 @@ define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) { define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) { ; SSE2-LABEL: combine_test4c: ; SSE2: # BB#0: -; SSE2-NEXT: movl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pextrw $1, %xmm0, %ecx -; SSE2-NEXT: pinsrw $0, %eax, %xmm0 -; SSE2-NEXT: movzbl %ah, %eax -; SSE2-NEXT: pinsrw $2, %eax, %xmm0 -; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: pinsrw $6, %ecx, %xmm0 -; SSE2-NEXT: movl (%rsi), %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: pextrw $1, %xmm1, %ecx -; SSE2-NEXT: pinsrw $0, %eax, %xmm1 -; SSE2-NEXT: movzbl %ah, %eax -; SSE2-NEXT: pinsrw $2, %eax, %xmm1 -; SSE2-NEXT: pinsrw $4, %ecx, %xmm1 -; SSE2-NEXT: shrl $8, %ecx -; SSE2-NEXT: pinsrw $6, %ecx, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSE2-NEXT: movd (%rdi), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd (%rsi), %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_test4c: ; SSSE3: # BB#0: -; SSSE3-NEXT: movd (%rdi), %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,128,128,128,1,128,128,128,2,128,128,128,3,128,128,128] -; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: movd (%rdi), %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSSE3-NEXT: movd (%rsi), %xmm2 -; SSSE3-NEXT: pshufb %xmm1, %xmm2 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[0,2] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_test4c: ; SSE41: # BB#0: -; SSE41-NEXT: pmovzxbd (%rdi), %xmm0 -; SSE41-NEXT: pmovzxbd (%rsi), %xmm1 -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: pmovzxbd (%rdi), %xmm1 +; SSE41-NEXT: pmovzxbd (%rsi), %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = 
xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_test4c: ; AVX1: # BB#0: ; AVX1-NEXT: vpmovzxbd (%rdi), %xmm0 ; AVX1-NEXT: vpmovzxbd (%rsi), %xmm1 -; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_test4c: @@ -1964,7 +1981,7 @@ define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) { define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: combine_blend_01: ; SSE2: # BB#0: -; SSE2-NEXT: movss %xmm1, %xmm0 +; SSE2-NEXT: movsd %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 @@ -1972,7 +1989,7 @@ define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) { ; ; SSSE3-LABEL: combine_blend_01: ; SSSE3: # BB#0: -; SSSE3-NEXT: movss %xmm1, %xmm0 +; SSSE3-NEXT: movsd %xmm1, %xmm0 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] ; SSSE3-NEXT: movaps %xmm1, %xmm0 @@ -1980,12 +1997,13 @@ define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) { ; ; SSE41-LABEL: combine_blend_01: ; SSE41: # BB#0: -; SSE41-NEXT: movsd %xmm1, %xmm0 +; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm0[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_blend_01: ; AVX: # BB#0: -; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> @@ -2009,7 +2027,8 @@ define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) { ; ; SSE41-LABEL: combine_blend_02: ; SSE41: # BB#0: -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_blend_02: @@ -2025,28 +2044,30 @@ define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: combine_blend_123: ; SSE2: # BB#0: ; SSE2-NEXT: movaps %xmm1, %xmm2 -; SSE2-NEXT: movss %xmm0, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] +; SSE2-NEXT: movsd %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_blend_123: ; SSSE3: # BB#0: ; SSSE3-NEXT: movaps %xmm1, %xmm2 -; SSSE3-NEXT: movss %xmm0, %xmm2 -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] -; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] +; SSSE3-NEXT: movsd %xmm2, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_blend_123: ; SSE41: # BB#0: -; SSE41-NEXT: movss %xmm0, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_blend_123: ; AVX: # BB#0: -; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> @@ -2057,12 +2078,13 @@ 
define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) { define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) { ; SSE-LABEL: combine_test_movhl_1: ; SSE: # BB#0: -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_test_movhl_1: ; AVX: # BB#0: -; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> @@ -2072,12 +2094,13 @@ define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) { ; SSE-LABEL: combine_test_movhl_2: ; SSE: # BB#0: -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_test_movhl_2: ; AVX: # BB#0: -; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> @@ -2087,12 +2110,13 @@ define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) { ; SSE-LABEL: combine_test_movhl_3: ; SSE: # BB#0: -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_test_movhl_3: ; AVX: # BB#0: -; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> @@ -2106,26 +2130,27 @@ define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) { define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: combine_undef_input_test1: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,2] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_undef_input_test1: ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,1] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,2] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_undef_input_test1: ; SSE41: # BB#0: -; SSE41-NEXT: movsd %xmm1, %xmm0 +; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm0[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_undef_input_test1: ; AVX: # BB#0: -; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -2135,12 +2160,12 @@ define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) { define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: combine_undef_input_test2: ; SSE: # BB#0: -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; 
AVX-LABEL: combine_undef_input_test2: ; AVX: # BB#0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -2150,12 +2175,12 @@ define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) { define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: combine_undef_input_test3: ; SSE: # BB#0: -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_undef_input_test3: ; AVX: # BB#0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -2165,12 +2190,13 @@ define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) { define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: combine_undef_input_test4: ; SSE: # BB#0: -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_undef_input_test4: ; AVX: # BB#0: -; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -2180,22 +2206,24 @@ define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) { define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: combine_undef_input_test5: ; SSE2: # BB#0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_undef_input_test5: ; SSSE3: # BB#0: -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSSE3-NEXT: movsd %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_undef_input_test5: ; SSE41: # BB#0: -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_undef_input_test5: ; AVX: # BB#0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -2292,26 +2320,27 @@ define <4 x float> @combine_undef_input_test10(<4 x float> %a) { define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: combine_undef_input_test11: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,2] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_undef_input_test11: ; SSSE3: # BB#0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,1] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,2] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_undef_input_test11: ; SSE41: # BB#0: -; 
SSE41-NEXT: movsd %xmm1, %xmm0 +; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm0[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_undef_input_test11: ; AVX: # BB#0: -; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> @@ -2321,12 +2350,12 @@ define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) { define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: combine_undef_input_test12: ; SSE: # BB#0: -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_undef_input_test12: ; AVX: # BB#0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> @@ -2336,12 +2365,12 @@ define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) { define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: combine_undef_input_test13: ; SSE: # BB#0: -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX-LABEL: combine_undef_input_test13: ; AVX: # BB#0: -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> @@ -2351,12 +2380,13 @@ define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) { define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: combine_undef_input_test14: ; SSE: # BB#0: -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_undef_input_test14: ; AVX: # BB#0: -; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> @@ -2366,22 +2396,24 @@ define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) { define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) { ; SSE2-LABEL: combine_undef_input_test15: ; SSE2: # BB#0: -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_undef_input_test15: ; SSSE3: # BB#0: -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSSE3-NEXT: movsd %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_undef_input_test15: ; SSE41: # BB#0: -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_undef_input_test15: ; AVX: # BB#0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 
x float> %b, <4 x float> %1, <4 x i32> diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll index 84df04376df..0ce3ecf5d61 100644 --- a/test/CodeGen/X86/vector-zext.ll +++ b/test/CodeGen/X86/vector-zext.ll @@ -39,10 +39,11 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp ; ; AVX1-LABEL: zext_8i16_to_8i32: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-NEXT: vpmovzxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_8i16_to_8i32: @@ -57,20 +58,20 @@ entry: define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: zext_4i32_to_4i64: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] ; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,0,3,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] ; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: zext_4i32_to_4i64: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] ; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,0,3,0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] ; SSSE3-NEXT: pand %xmm3, %xmm1 ; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq @@ -80,7 +81,7 @@ define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; SSE41-NEXT: pmovzxdq %xmm0, %xmm2 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] ; SSE41-NEXT: pand %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,0,3,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] ; SSE41-NEXT: pand %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -88,9 +89,9 @@ define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp ; AVX1-LABEL: zext_4i32_to_4i64: ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpmovzxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_4i32_to_4i64: @@ -137,16 +138,16 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) { ; ; AVX1-LABEL: zext_8i8_to_8i32: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpmovzxwd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vandps .{{.*}}, %ymm0, %ymm0 +; AVX1-NEXT: vpmovzxwd %xmm0, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps {{.*}}, %ymm0, 
%ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_8i8_to_8i32: ; AVX2: # BB#0: # %entry ; AVX2-NEXT: vpmovzxwd %xmm0, %ymm0 -; AVX2-NEXT: vpbroadcastd .{{.*}}, %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*}}, %ymm1 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq entry: @@ -190,10 +191,11 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) { ; ; AVX1-LABEL: zext_16i8_to_16i16: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpmovzxbw %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_16i8_to_16i16: diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll index be4dc1e002d..a98b49a688c 100644 --- a/test/CodeGen/X86/vselect.ll +++ b/test/CodeGen/X86/vselect.ll @@ -247,8 +247,8 @@ define <2 x i64> @test25(<2 x i64> %a, <2 x i64> %b) { define <4 x float> @select_of_shuffles_0(<2 x float> %a0, <2 x float> %b0, <2 x float> %a1, <2 x float> %b1) { ; CHECK-LABEL: select_of_shuffles_0: ; CHECK: # BB#0: -; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; CHECK-NEXT: subps %xmm1, %xmm0 ; CHECK-NEXT: retq %1 = shufflevector <2 x float> %a0, <2 x float> undef, <4 x i32> diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll index 595f346e408..70fdbb7c9c8 100644 --- a/test/CodeGen/X86/widen_shuffle-1.ll +++ b/test/CodeGen/X86/widen_shuffle-1.ll @@ -45,8 +45,8 @@ define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst ; CHECK-LABEL: shuf3: ; CHECK: # BB#0: # %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] -; CHECK-NEXT: movdqa %xmm0, (%eax) +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; CHECK-NEXT: movaps %xmm1, (%eax) ; CHECK-NEXT: retl entry: %shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> @@ -68,9 +68,10 @@ entry: define <8 x i8> @shuf4(<4 x i8> %a, <4 x i8> %b) nounwind readnone { ; CHECK-LABEL: shuf4: ; CHECK: # BB#0: -; CHECK-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,xmm1[4],zero,xmm1[8],zero,xmm1[12],zero -; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[4],zero,xmm0[8],zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; CHECK-NEXT: pshufb %xmm2, %xmm1 +; CHECK-NEXT: pshufb %xmm2, %xmm0 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retl %vshuf = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> ret <8 x i8> %vshuf @@ -82,7 +83,7 @@ define void @shuf5(<8 x i8>* %p) nounwind { ; CHECK: # BB#0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; 
CHECK-NEXT: movdqa {{.*#+}} xmm0 = <4,33,u,u,u,u,u,u> -; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,0,0,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; CHECK-NEXT: movlpd %xmm0, (%eax) ; CHECK-NEXT: retl %v = shufflevector <2 x i8> , <2 x i8> undef, <8 x i32> -- 2.34.1
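
The combine_nested_undef_* hunks above all exercise one fold: when the outer shufflevector reads only lanes of the inner shuffle that originate from a single source, the pair collapses into one pshufd (or a single vpbroadcastq when the surviving mask is a low-qword splat, as in test12, test25, and test27). A minimal standalone sketch of the pattern, with an illustrative function name and masks that are not taken from the test file, which llc (for example with -mtriple=x86_64-unknown-unknown -mcpu=corei7) should reduce to a single pshufd of %A:

define <4 x i32> @nested_undef_sketch(<4 x i32> %A, <4 x i32> %B) {
  ; The inner shuffle mixes both inputs: its lanes are A0, B0, A3, A1.
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
  ; The outer shuffle reads only lanes 0, 3, and 2 of %1, all of which came
  ; from %A, so the combined result is A0, A1, A3, undef: one pshufd.
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 undef>
  ret <4 x i32> %2
}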
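The combine_test2/test7/test17 and combine_blend_* hunks replace movss/movsd with explicit blends, and pick the blend in the matching domain: blendps/blendpd for floating-point vectors, pblendw (or vpblendd on AVX2) for integer vectors, which avoids a domain-crossing penalty. The IR pattern behind all of them is a two-source shuffle that takes element 0 from one input and the remaining elements from the other; a hypothetical reduced case:

define <4 x i32> @blend_low_lane(<4 x i32> %a, <4 x i32> %b) {
  ; Result is a0, b1, b2, b3, exactly what movss used to produce. Under the
  ; new lowering this should become pblendw on SSE4.1 and vpblendd on AVX2,
  ; keeping the i32 data in the integer domain.
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i32> %r
}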
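The vector-zext.ll hunks show the other recurring change: zero extension is now lowered with pmovzx* for the low half plus an unpack-against-zero for the high half, instead of two unpacks. The IR input for those tests is an ordinary vector zext; a sketch, with the RUN-line flags assumed rather than quoted from the file:

; e.g. llc -mtriple=x86_64-unknown-unknown -mattr=+avx
define <4 x i64> @zext_sketch(<4 x i32> %x) {
  ; On AVX1 the low two lanes should come from vpmovzxdq and the high two
  ; from vpunpckhdq against a zeroed register, joined by vinsertf128.
  %r = zext <4 x i32> %x to <4 x i64>
  ret <4 x i64> %r
}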