From 561088eb5d0aa7a2cfdbcaca23d60a26d6b3859e Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Sun, 11 Jan 2015 01:36:20 +0000 Subject: [PATCH] [x86] Remove some windows line endings that snuck into the tests here. Folks on Windows, remember to set up your subversion to strip these when submitting... git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225593 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/avx-intrinsics-x86.ll | 60 +++--- test/CodeGen/X86/avx2-intrinsics-x86.ll | 60 +++--- test/CodeGen/X86/fold-tied-op.ll | 168 ++++++++-------- test/CodeGen/X86/movtopush.ll | 224 ++++++++++----------- test/CodeGen/X86/pr22103.ll | 38 ++-- test/CodeGen/X86/sse2-intrinsics-x86.ll | 60 +++--- test/CodeGen/X86/vec_extract-avx.ll | 164 +++++++-------- test/CodeGen/X86/vector-shuffle-128-v16.ll | 152 +++++++------- 8 files changed, 463 insertions(+), 463 deletions(-) diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll index ef3e83fc7ad..bb9354cff03 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -455,21 +455,21 @@ define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) { ret <4 x i32> %res } declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) { - ; CHECK: vpslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] - %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) { - ; CHECK: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8] - %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} + + +define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) { + ; CHECK: vpslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone + + +define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) { + ; CHECK: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8] + %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone @@ -551,21 +551,21 @@ define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) { ret <4 x i32> %res } declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) { - ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] - %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) { - ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero - %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} + + +define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) { + ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone + + +define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) { + ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero + %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll index 84b22b76bf6..79a3361bfe8 100644 --- a/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -158,21 +158,21 @@ define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) { ret <8 x i32> %res } declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone - - -define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) { - ; CHECK: vpslldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] - %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone - - -define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) { - ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24] - %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} + + +define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) { + ; CHECK: vpslldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] + %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone + + +define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) { + ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24] + %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone @@ -254,21 +254,21 @@ define <8 x i32> @test_x86_avx2_psrl_d(<8 x i32> %a0, <4 x i32> %a1) { ret <8 x i32> %res } declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone - - -define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) { - ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] - %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone - - -define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) { - ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero - %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} + + +define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) { + ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] + %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone + + +define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) { + ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero + %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll index 65b8aeb74cb..5bf5dbd1a9c 100644 --- a/test/CodeGen/X86/fold-tied-op.ll +++ b/test/CodeGen/X86/fold-tied-op.ll @@ -1,84 +1,84 @@ -; RUN: llc -verify-machineinstrs -mtriple=i386--netbsd < %s | FileCheck %s -; Regression test for http://reviews.llvm.org/D5701 - -; ModuleID = 'xxhash.i' -target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" -target triple = "i386--netbsd" - -; CHECK-LABEL: fn1 -; CHECK: shldl {{.*#+}} 4-byte Folded Spill -; CHECK: orl {{.*#+}} 4-byte Folded Reload -; CHECK: shldl {{.*#+}} 4-byte Folded Spill -; CHECK: orl {{.*#+}} 4-byte Folded Reload -; CHECK: addl {{.*#+}} 4-byte Folded Reload -; CHECK: imull {{.*#+}} 4-byte Folded Reload -; CHECK: orl {{.*#+}} 4-byte Folded Reload -; CHECK: retl - -%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 } - -@a = common global i32 0, align 4 -@b = common global i64 0, align 8 - -; Function Attrs: nounwind uwtable -define i64 @fn1() #0 { -entry: - %0 = load i32* @a, align 4, !tbaa !1 - %1 = inttoptr i32 %0 to %struct.XXH_state64_t* - %total_len = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 0 - %2 = load i32* %total_len, align 4, !tbaa !5 - %tobool = icmp eq i32 %2, 0 - br i1 %tobool, label %if.else, label %if.then - -if.then: ; preds = %entry - %v3 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 3 - %3 = load i64* %v3, align 4, !tbaa !8 - %v4 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 4 - %4 = load i64* %v4, align 4, !tbaa !9 - %v2 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 2 - %5 = load i64* %v2, align 4, !tbaa !10 - %shl = shl i64 %5, 1 - %or = or i64 %shl, %5 - %shl2 = shl i64 %3, 2 - %shr = lshr i64 %3, 1 - %or3 = or i64 %shl2, %shr - %add = add i64 %or, %or3 - %mul = mul i64 %4, -4417276706812531889 - %shl4 = mul i64 %4, -8834553413625063778 - %shr5 = ashr i64 %mul, 3 - %or6 = or i64 %shr5, %shl4 - %mul7 = mul nsw i64 %or6, 1400714785074694791 - %xor = xor i64 %add, %mul7 - store i64 %xor, i64* @b, align 8, !tbaa !11 - %mul8 = mul nsw i64 %xor, 1400714785074694791 - br label %if.end - -if.else: ; preds = %entry - %6 = load i64* @b, align 8, !tbaa !11 - %xor10 = xor i64 %6, -4417276706812531889 - %mul11 = mul nsw i64 %xor10, 400714785074694791 - br label %if.end - -if.end: ; preds = %if.else, %if.then - %storemerge.in = phi i64 [ %mul11, %if.else ], [ %mul8, %if.then ] - %storemerge = add i64 %storemerge.in, -8796714831421723037 - store i64 %storemerge, i64* @b, align 8, !tbaa !11 - ret i64 undef -} - -attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!llvm.ident = !{!0} - -!0 = !{!"clang version 3.6 (trunk 219587)"} -!1 = !{!2, !2, i64 0} -!2 = !{!"int", !3, i64 0} -!3 = !{!"omnipotent char", !4, i64 0} -!4 = !{!"Simple C/C++ TBAA"} -!5 = !{!6, !2, i64 0} -!6 = !{!"XXH_state64_t", !2, i64 0, !2, i64 4, !7, i64 8, !7, i64 16, !7, i64 24} -!7 = !{!"long long", !3, i64 0} -!8 = !{!6, !7, i64 16} -!9 = !{!6, !7, i64 24} -!10 = !{!6, !7, i64 8} -!11 = !{!7, !7, i64 0} +; RUN: llc -verify-machineinstrs -mtriple=i386--netbsd < %s | FileCheck %s +; Regression test for http://reviews.llvm.org/D5701 + +; ModuleID = 'xxhash.i' +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i386--netbsd" + +; CHECK-LABEL: fn1 +; CHECK: shldl {{.*#+}} 4-byte Folded Spill +; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: shldl {{.*#+}} 4-byte Folded Spill +; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: addl {{.*#+}} 4-byte Folded Reload +; CHECK: imull {{.*#+}} 4-byte Folded Reload +; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: retl + +%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 } + +@a = common global i32 0, align 4 +@b = common global i64 0, align 8 + +; Function Attrs: nounwind uwtable +define i64 @fn1() #0 { +entry: + %0 = load i32* @a, align 4, !tbaa !1 + %1 = inttoptr i32 %0 to %struct.XXH_state64_t* + %total_len = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 0 + %2 = load i32* %total_len, align 4, !tbaa !5 + %tobool = icmp eq i32 %2, 0 + br i1 %tobool, label %if.else, label %if.then + +if.then: ; preds = %entry + %v3 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 3 + %3 = load i64* %v3, align 4, !tbaa !8 + %v4 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 4 + %4 = load i64* %v4, align 4, !tbaa !9 + %v2 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 2 + %5 = load i64* %v2, align 4, !tbaa !10 + %shl = shl i64 %5, 1 + %or = or i64 %shl, %5 + %shl2 = shl i64 %3, 2 + %shr = lshr i64 %3, 1 + %or3 = or i64 %shl2, %shr + %add = add i64 %or, %or3 + %mul = mul i64 %4, -4417276706812531889 + %shl4 = mul i64 %4, -8834553413625063778 + %shr5 = ashr i64 %mul, 3 + %or6 = or i64 %shr5, %shl4 + %mul7 = mul nsw i64 %or6, 1400714785074694791 + %xor = xor i64 %add, %mul7 + store i64 %xor, i64* @b, align 8, !tbaa !11 + %mul8 = mul nsw i64 %xor, 1400714785074694791 + br label %if.end + +if.else: ; preds = %entry + %6 = load i64* @b, align 8, !tbaa !11 + %xor10 = xor i64 %6, -4417276706812531889 + %mul11 = mul nsw i64 %xor10, 400714785074694791 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %storemerge.in = phi i64 [ %mul11, %if.else ], [ %mul8, %if.then ] + %storemerge = add i64 %storemerge.in, -8796714831421723037 + store i64 %storemerge, i64* @b, align 8, !tbaa !11 + ret i64 undef +} + +attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 3.6 (trunk 219587)"} +!1 = !{!2, !2, i64 0} +!2 = !{!"int", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} +!5 = !{!6, !2, i64 0} +!6 = !{!"XXH_state64_t", !2, i64 0, !2, i64 4, !7, i64 8, !7, i64 16, !7, i64 24} +!7 = !{!"long long", !3, i64 0} +!8 = !{!6, !7, i64 16} +!9 = !{!6, !7, i64 24} +!10 = !{!6, !7, i64 8} +!11 = !{!7, !7, i64 0} diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll index fd4094a7a1e..cb48ed747be 100644 --- a/test/CodeGen/X86/movtopush.ll +++ b/test/CodeGen/X86/movtopush.ll @@ -1,112 +1,112 @@ -; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL -; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED -declare void @good(i32 %a, i32 %b, i32 %c, i32 %d) -declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d) - -; Here, we should have a reserved frame, so we don't expect pushes -; NORMAL-LABEL: test1 -; NORMAL: subl $16, %esp -; NORMAL-NEXT: movl $4, 12(%esp) -; NORMAL-NEXT: movl $3, 8(%esp) -; NORMAL-NEXT: movl $2, 4(%esp) -; NORMAL-NEXT: movl $1, (%esp) -; NORMAL-NEXT: call -define void @test1() { -entry: - call void @good(i32 1, i32 2, i32 3, i32 4) - ret void -} - -; Here, we expect a sequence of 4 immediate pushes -; NORMAL-LABEL: test2 -; NORMAL-NOT: subl {{.*}} %esp -; NORMAL: pushl $4 -; NORMAL-NEXT: pushl $3 -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl $1 -; NORMAL-NEXT: call -define void @test2(i32 %k) { -entry: - %a = alloca i32, i32 %k - call void @good(i32 1, i32 2, i32 3, i32 4) - ret void -} - -; Again, we expect a sequence of 4 immediate pushes -; Checks that we generate the right pushes for >8bit immediates -; NORMAL-LABEL: test2b -; NORMAL-NOT: subl {{.*}} %esp -; NORMAL: pushl $4096 -; NORMAL-NEXT: pushl $3072 -; NORMAL-NEXT: pushl $2048 -; NORMAL-NEXT: pushl $1024 -; NORMAL-NEXT: call -define void @test2b(i32 %k) { -entry: - %a = alloca i32, i32 %k - call void @good(i32 1024, i32 2048, i32 3072, i32 4096) - ret void -} - -; The first push should push a register -; NORMAL-LABEL: test3 -; NORMAL-NOT: subl {{.*}} %esp -; NORMAL: pushl $4 -; NORMAL-NEXT: pushl $3 -; NORMAL-NEXT: pushl $2 -; NORMAL-NEXT: pushl %e{{..}} -; NORMAL-NEXT: call -define void @test3(i32 %k) { -entry: - %a = alloca i32, i32 %k - call void @good(i32 %k, i32 2, i32 3, i32 4) - ret void -} - -; We don't support weird calling conventions -; NORMAL-LABEL: test4 -; NORMAL: subl $12, %esp -; NORMAL-NEXT: movl $4, 8(%esp) -; NORMAL-NEXT: movl $3, 4(%esp) -; NORMAL-NEXT: movl $1, (%esp) -; NORMAL-NEXT: movl $2, %eax -; NORMAL-NEXT: call -define void @test4(i32 %k) { -entry: - %a = alloca i32, i32 %k - call void @inreg(i32 1, i32 2, i32 3, i32 4) - ret void -} - -; Check that additional alignment is added when the pushes -; don't add up to the required alignment. -; ALIGNED-LABEL: test5 -; ALIGNED: subl $16, %esp -; ALIGNED-NEXT: pushl $4 -; ALIGNED-NEXT: pushl $3 -; ALIGNED-NEXT: pushl $2 -; ALIGNED-NEXT: pushl $1 -; ALIGNED-NEXT: call -define void @test5(i32 %k) { -entry: - %a = alloca i32, i32 %k - call void @good(i32 1, i32 2, i32 3, i32 4) - ret void -} - -; Check that pushing the addresses of globals (Or generally, things that -; aren't exactly immediates) isn't broken. -; Fixes PR21878. -; NORMAL-LABEL: test6 -; NORMAL: pushl $_ext -; NORMAL-NEXT: call -declare void @f(i8*) -@ext = external constant i8 - -define void @test6() { - call void @f(i8* @ext) - br label %bb -bb: - alloca i32 - ret void -} +; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL +; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED +declare void @good(i32 %a, i32 %b, i32 %c, i32 %d) +declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d) + +; Here, we should have a reserved frame, so we don't expect pushes +; NORMAL-LABEL: test1 +; NORMAL: subl $16, %esp +; NORMAL-NEXT: movl $4, 12(%esp) +; NORMAL-NEXT: movl $3, 8(%esp) +; NORMAL-NEXT: movl $2, 4(%esp) +; NORMAL-NEXT: movl $1, (%esp) +; NORMAL-NEXT: call +define void @test1() { +entry: + call void @good(i32 1, i32 2, i32 3, i32 4) + ret void +} + +; Here, we expect a sequence of 4 immediate pushes +; NORMAL-LABEL: test2 +; NORMAL-NOT: subl {{.*}} %esp +; NORMAL: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $2 +; NORMAL-NEXT: pushl $1 +; NORMAL-NEXT: call +define void @test2(i32 %k) { +entry: + %a = alloca i32, i32 %k + call void @good(i32 1, i32 2, i32 3, i32 4) + ret void +} + +; Again, we expect a sequence of 4 immediate pushes +; Checks that we generate the right pushes for >8bit immediates +; NORMAL-LABEL: test2b +; NORMAL-NOT: subl {{.*}} %esp +; NORMAL: pushl $4096 +; NORMAL-NEXT: pushl $3072 +; NORMAL-NEXT: pushl $2048 +; NORMAL-NEXT: pushl $1024 +; NORMAL-NEXT: call +define void @test2b(i32 %k) { +entry: + %a = alloca i32, i32 %k + call void @good(i32 1024, i32 2048, i32 3072, i32 4096) + ret void +} + +; The first push should push a register +; NORMAL-LABEL: test3 +; NORMAL-NOT: subl {{.*}} %esp +; NORMAL: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $2 +; NORMAL-NEXT: pushl %e{{..}} +; NORMAL-NEXT: call +define void @test3(i32 %k) { +entry: + %a = alloca i32, i32 %k + call void @good(i32 %k, i32 2, i32 3, i32 4) + ret void +} + +; We don't support weird calling conventions +; NORMAL-LABEL: test4 +; NORMAL: subl $12, %esp +; NORMAL-NEXT: movl $4, 8(%esp) +; NORMAL-NEXT: movl $3, 4(%esp) +; NORMAL-NEXT: movl $1, (%esp) +; NORMAL-NEXT: movl $2, %eax +; NORMAL-NEXT: call +define void @test4(i32 %k) { +entry: + %a = alloca i32, i32 %k + call void @inreg(i32 1, i32 2, i32 3, i32 4) + ret void +} + +; Check that additional alignment is added when the pushes +; don't add up to the required alignment. +; ALIGNED-LABEL: test5 +; ALIGNED: subl $16, %esp +; ALIGNED-NEXT: pushl $4 +; ALIGNED-NEXT: pushl $3 +; ALIGNED-NEXT: pushl $2 +; ALIGNED-NEXT: pushl $1 +; ALIGNED-NEXT: call +define void @test5(i32 %k) { +entry: + %a = alloca i32, i32 %k + call void @good(i32 1, i32 2, i32 3, i32 4) + ret void +} + +; Check that pushing the addresses of globals (Or generally, things that +; aren't exactly immediates) isn't broken. +; Fixes PR21878. +; NORMAL-LABEL: test6 +; NORMAL: pushl $_ext +; NORMAL-NEXT: call +declare void @f(i8*) +@ext = external constant i8 + +define void @test6() { + call void @f(i8* @ext) + br label %bb +bb: + alloca i32 + ret void +} diff --git a/test/CodeGen/X86/pr22103.ll b/test/CodeGen/X86/pr22103.ll index 3af1d44987f..77c0751e219 100644 --- a/test/CodeGen/X86/pr22103.ll +++ b/test/CodeGen/X86/pr22103.ll @@ -1,19 +1,19 @@ -; RUN: llc < %s | FileCheck %s -; Don't try to emit a direct call through a TLS global. -; This fixes PR22103 - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@a = external thread_local global i64 - -; Function Attrs: nounwind -define void @_Z1fv() { -; CHECK-NOT: callq *$a -; CHECK: movq %fs:0, [[RAX:%r..]] -; CHECK-NEXT: addq a@GOTTPOFF(%rip), [[RAX]] -; CHECK-NEXT: callq *[[RAX]] -entry: - call void bitcast (i64* @a to void ()*)() - ret void -} +; RUN: llc < %s | FileCheck %s +; Don't try to emit a direct call through a TLS global. +; This fixes PR22103 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = external thread_local global i64 + +; Function Attrs: nounwind +define void @_Z1fv() { +; CHECK-NOT: callq *$a +; CHECK: movq %fs:0, [[RAX:%r..]] +; CHECK-NEXT: addq a@GOTTPOFF(%rip), [[RAX]] +; CHECK-NEXT: callq *[[RAX]] +entry: + call void bitcast (i64* @a to void ()*)() + ret void +} diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll index c4d9e6d7e28..ddb04211ec7 100644 --- a/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -408,21 +408,21 @@ define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) { ret <4 x i32> %res } declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) { - ; CHECK: pslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] - %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) { - ; CHECK: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8] - %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} + + +define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) { + ; CHECK: pslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone + + +define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) { + ; CHECK: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8] + %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone @@ -504,21 +504,21 @@ define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) { ret <4 x i32> %res } declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) { - ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] - %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone - - -define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) { - ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero - %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} + + +define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) { + ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone + + +define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) { + ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero + %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone diff --git a/test/CodeGen/X86/vec_extract-avx.ll b/test/CodeGen/X86/vec_extract-avx.ll index e5e79375fff..fbb84170dc8 100644 --- a/test/CodeGen/X86/vec_extract-avx.ll +++ b/test/CodeGen/X86/vec_extract-avx.ll @@ -1,82 +1,82 @@ -target triple = "x86_64-unknown-unknown" - -; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s - -; When extracting multiple consecutive elements from a larger -; vector into a smaller one, do it efficiently. We should use -; an EXTRACT_SUBVECTOR node internally rather than a bunch of -; single element extractions. - -; Extracting the low elements only requires using the right kind of store. -define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) { - %ext0 = extractelement <8 x float> %v, i32 0 - %ext1 = extractelement <8 x float> %v, i32 1 - %ext2 = extractelement <8 x float> %v, i32 2 - %ext3 = extractelement <8 x float> %v, i32 3 - %ins0 = insertelement <4 x float> undef, float %ext0, i32 0 - %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1 - %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2 - %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3 - store <4 x float> %ins3, <4 x float>* %ptr, align 16 - ret void - -; CHECK-LABEL: low_v8f32_to_v4f32 -; CHECK: vmovaps -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -} - -; Extracting the high elements requires just one AVX instruction. -define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) { - %ext0 = extractelement <8 x float> %v, i32 4 - %ext1 = extractelement <8 x float> %v, i32 5 - %ext2 = extractelement <8 x float> %v, i32 6 - %ext3 = extractelement <8 x float> %v, i32 7 - %ins0 = insertelement <4 x float> undef, float %ext0, i32 0 - %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1 - %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2 - %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3 - store <4 x float> %ins3, <4 x float>* %ptr, align 16 - ret void - -; CHECK-LABEL: high_v8f32_to_v4f32 -; CHECK: vextractf128 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -} - -; Make sure element type doesn't alter the codegen. Note that -; if we were actually using the vector in this function and -; have AVX2, we should generate vextracti128 (the int version). -define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) { - %ext0 = extractelement <8 x i32> %v, i32 4 - %ext1 = extractelement <8 x i32> %v, i32 5 - %ext2 = extractelement <8 x i32> %v, i32 6 - %ext3 = extractelement <8 x i32> %v, i32 7 - %ins0 = insertelement <4 x i32> undef, i32 %ext0, i32 0 - %ins1 = insertelement <4 x i32> %ins0, i32 %ext1, i32 1 - %ins2 = insertelement <4 x i32> %ins1, i32 %ext2, i32 2 - %ins3 = insertelement <4 x i32> %ins2, i32 %ext3, i32 3 - store <4 x i32> %ins3, <4 x i32>* %ptr, align 16 - ret void - -; CHECK-LABEL: high_v8i32_to_v4i32 -; CHECK: vextractf128 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -} - -; Make sure that element size doesn't alter the codegen. -define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) { - %ext0 = extractelement <4 x double> %v, i32 2 - %ext1 = extractelement <4 x double> %v, i32 3 - %ins0 = insertelement <2 x double> undef, double %ext0, i32 0 - %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1 - store <2 x double> %ins1, <2 x double>* %ptr, align 16 - ret void - -; CHECK-LABEL: high_v4f64_to_v2f64 -; CHECK: vextractf128 -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq -} +target triple = "x86_64-unknown-unknown" + +; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s + +; When extracting multiple consecutive elements from a larger +; vector into a smaller one, do it efficiently. We should use +; an EXTRACT_SUBVECTOR node internally rather than a bunch of +; single element extractions. + +; Extracting the low elements only requires using the right kind of store. +define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) { + %ext0 = extractelement <8 x float> %v, i32 0 + %ext1 = extractelement <8 x float> %v, i32 1 + %ext2 = extractelement <8 x float> %v, i32 2 + %ext3 = extractelement <8 x float> %v, i32 3 + %ins0 = insertelement <4 x float> undef, float %ext0, i32 0 + %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1 + %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2 + %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3 + store <4 x float> %ins3, <4 x float>* %ptr, align 16 + ret void + +; CHECK-LABEL: low_v8f32_to_v4f32 +; CHECK: vmovaps +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +} + +; Extracting the high elements requires just one AVX instruction. +define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) { + %ext0 = extractelement <8 x float> %v, i32 4 + %ext1 = extractelement <8 x float> %v, i32 5 + %ext2 = extractelement <8 x float> %v, i32 6 + %ext3 = extractelement <8 x float> %v, i32 7 + %ins0 = insertelement <4 x float> undef, float %ext0, i32 0 + %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1 + %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2 + %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3 + store <4 x float> %ins3, <4 x float>* %ptr, align 16 + ret void + +; CHECK-LABEL: high_v8f32_to_v4f32 +; CHECK: vextractf128 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +} + +; Make sure element type doesn't alter the codegen. Note that +; if we were actually using the vector in this function and +; have AVX2, we should generate vextracti128 (the int version). +define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) { + %ext0 = extractelement <8 x i32> %v, i32 4 + %ext1 = extractelement <8 x i32> %v, i32 5 + %ext2 = extractelement <8 x i32> %v, i32 6 + %ext3 = extractelement <8 x i32> %v, i32 7 + %ins0 = insertelement <4 x i32> undef, i32 %ext0, i32 0 + %ins1 = insertelement <4 x i32> %ins0, i32 %ext1, i32 1 + %ins2 = insertelement <4 x i32> %ins1, i32 %ext2, i32 2 + %ins3 = insertelement <4 x i32> %ins2, i32 %ext3, i32 3 + store <4 x i32> %ins3, <4 x i32>* %ptr, align 16 + ret void + +; CHECK-LABEL: high_v8i32_to_v4i32 +; CHECK: vextractf128 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +} + +; Make sure that element size doesn't alter the codegen. +define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) { + %ext0 = extractelement <4 x double> %v, i32 2 + %ext1 = extractelement <4 x double> %v, i32 3 + %ins0 = insertelement <2 x double> undef, double %ext0, i32 0 + %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1 + store <2 x double> %ins1, <2 x double>* %ptr, align 16 + ret void + +; CHECK-LABEL: high_v4f64_to_v2f64 +; CHECK: vextractf128 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +} diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index fd25cc96330..14058c91286 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -467,23 +467,23 @@ define <16 x i8> @PR20540(<8 x i8> %a) { ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: retq -; -; SSSE3-LABEL: PR20540: -; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: PR20540: -; SSE41: # BB#0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq -; -; AVX-LABEL: PR20540: -; AVX: # BB#0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: retq - %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> - ret <16 x i8> %shuffle +; +; SSSE3-LABEL: PR20540: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: PR20540: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: PR20540: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { @@ -493,25 +493,25 @@ define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz( ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: retq ; -; SSSE3-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; SSSE3: # BB#0: -; SSSE3-NEXT: movd %edi, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; SSE41: # BB#0: -; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq -; -; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX: # BB#0: -; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: retq - %a = insertelement <16 x i8> undef, i8 %i, i32 0 - %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> +; SSSE3-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE41: # BB#0: +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq + %a = insertelement <16 x i8> undef, i8 %i, i32 0 + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle } @@ -523,25 +523,25 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz( ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10] ; SSE2-NEXT: retq ; -; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; SSSE3: # BB#0: -; SSSE3-NEXT: movd %edi, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; SSE41: # BB#0: -; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq -; -; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX: # BB#0: -; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: retq - %a = insertelement <16 x i8> undef, i8 %i, i32 0 - %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> +; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE41: # BB#0: +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq + %a = insertelement <16 x i8> undef, i8 %i, i32 0 + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle } @@ -571,27 +571,27 @@ define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz( ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; SSSE3: # BB#0: -; SSSE3-NEXT: movd %edi, %xmm0 -; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; SSE41: # BB#0: -; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq -; -; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX: # BB#0: -; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: retq - %a = insertelement <16 x i8> undef, i8 %i, i32 3 - %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> +; SSSE3: # BB#0: +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE41: # BB#0: +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq + %a = insertelement <16 x i8> undef, i8 %i, i32 3 + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle } -- 2.34.1