1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
; Swap the two 128-bit halves of %a: the mask takes elements 4-7 followed by
; elements 0-3, every index referring to the first operand. Both run
; configurations expect a single vperm2f128.
4 define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
6 ; ALL: ## BB#0: ## %entry
7 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
10 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
11 ret <8 x float> %shuffle
; Low four floats of %a combined with the high four floats of %b (mask indices
; 12-15 address elements 4-7 of the second operand). The expectation is a
; vblendpd rather than a vperm2f128.
14 define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
16 ; ALL: ## BB#0: ## %entry
17 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
20 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
21 ret <8 x float> %shuffle
; Duplicate the low four floats of %a into both halves of the result (mask
; 0-3 twice). The expectation is a vinsertf128 of xmm0 into ymm0, not a
; vperm2f128.
24 define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
26 ; ALL: ## BB#0: ## %entry
27 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
30 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
31 ret <8 x float> %shuffle
; Duplicate the high four floats of %a into both halves of the result (mask
; 4-7 twice). Both run configurations expect a single vperm2f128.
34 define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
36 ; ALL: ## BB#0: ## %entry
37 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
40 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
41 ret <8 x float> %shuffle
; Byte-vector variant of the high-half duplication: elements 16-31 of %a are
; selected twice. No integer arithmetic feeds the shuffle, and both run
; configurations expect the floating-point form vperm2f128.
44 define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
46 ; ALL: ## BB#0: ## %entry
47 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
50 %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
51 ret <32 x i8> %shuffle
; Two-operand case on <4 x i64>: mask 6,7,0,1 takes the high half of %b
; followed by the low half of %a. Both run configurations expect a single
; vperm2f128 reading ymm1 then ymm0.
54 define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
56 ; ALL: ## BB#0: ## %entry
57 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
60 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
61 ret <4 x i64> %shuffle
; Same shuffle mask as @E, but the first operand is produced by an integer
; add of 1 to every byte.
;  - avx run:  the add is expected as two 128-bit vpaddb ops that are
;              re-joined with vinsertf128, followed by vperm2f128.
;  - avx2 run: the add is expected as one 256-bit vpaddb, followed by the
;              integer form vperm2i128.
64 define <32 x i8> @Ei(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
66 ; AVX1: ## BB#0: ## %entry
67 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
68 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
69 ; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
70 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
71 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
72 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
76 ; AVX2: ## BB#0: ## %entry
77 ; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
78 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
81 ; add forces execution domain
82 %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
83 %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
84 ret <32 x i8> %shuffle
; Same shuffle mask as @E2 (high half of %b, then low half of the first
; operand), but the first operand is produced by an integer add of 1.
;  - avx run:  two 128-bit vpaddq ops re-joined with vinsertf128, then
;              vperm2f128.
;  - avx2 run: a broadcast constant, one 256-bit vpaddq, then the integer
;              form vperm2i128.
87 define <4 x i64> @E2i(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
89 ; AVX1: ## BB#0: ## %entry
90 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
91 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1]
92 ; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
93 ; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
94 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
95 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
99 ; AVX2: ## BB#0: ## %entry
100 ; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
101 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
102 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
105 ; add forces execution domain
106 %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
107 %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
108 ret <4 x i64> %shuffle
; Integer <8 x i32> case with undef entries in the low lane of the mask: the
; defined low-lane entries (5 and 7) come from the high half of the added
; first operand, and the high lane is the high half of %b (12-15).
;  - avx run:  two 128-bit vpaddd ops re-joined with vinsertf128, then
;              vperm2f128.
;  - avx2 run: a broadcast constant, one 256-bit vpaddd, then the integer
;              form vperm2i128.
111 define <8 x i32> @E3i(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
113 ; AVX1: ## BB#0: ## %entry
114 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
115 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
116 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
117 ; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
118 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
119 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
123 ; AVX2: ## BB#0: ## %entry
124 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
125 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
126 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
129 ; add forces execution domain
130 %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
131 %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
132 ret <8 x i32> %shuffle
; <16 x i16> case where the mask takes the low half of %b (16-23) followed by
; the low half of the added first operand (0-7); the high half of the add
; result is never used.
;  - avx run:  a single 128-bit vpaddw on xmm0, then vinsertf128 into ymm1.
;  - avx2 run: a 256-bit vpaddw, then the integer form vinserti128 into ymm1.
135 define <16 x i16> @E4i(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
137 ; AVX1: ## BB#0: ## %entry
138 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
139 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
143 ; AVX2: ## BB#0: ## %entry
144 ; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
145 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
148 ; add forces execution domain
149 %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
150 %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
151 ret <16 x i16> %shuffle
; Same add and shuffle mask as @E4i, but both vector operands are loaded
; through the pointer arguments, so the expected sequences start with the
; loads from (%rdi) and (%rsi).
;  - avx run:  128-bit vpaddw on the loaded first operand, then vinsertf128
;              into the loaded second operand.
;  - avx2 run: 256-bit vpaddw, then the integer form vinserti128.
; NOTE(review): the function is marked readnone although its body loads from
; memory -- confirm the attribute is intentional for this test.
154 define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
156 ; AVX1: ## BB#0: ## %entry
157 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0
158 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
159 ; AVX1-NEXT: vmovaps (%rsi), %ymm1
160 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
164 ; AVX2: ## BB#0: ## %entry
165 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
166 ; AVX2-NEXT: vmovdqa (%rsi), %ymm1
167 ; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
168 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
171 %c = load <16 x i16>* %a
172 %d = load <16 x i16>* %b
173 %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
174 %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
175 ret <16 x i16> %shuffle
178 ;;;; Cases with undef indices mixed in the mask
; Mask with undef entries <u,u,6,7,u,9,u,11>: the defined low-lane entries are
; elements 6-7 of %a and the defined high-lane entries are elements 1 and 3 of
; %b. Both run configurations expect a single vperm2f128.
; NOTE(review): the expected operand comment below names only ymm1
; (ymm1[0,1,0,1]) while the mask also requests elements of %a -- confirm this
; expectation is intentional and not a recorded miscompile.
180 define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
182 ; ALL: ## BB#0: ## %entry
183 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1,0,1]
186 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
187 ret <8 x float> %shuffle
190 ;;;; Cases where a single vperm2f128 must not be selected
; Mask with undef entries <u,u,6,7,u,12,u,15>: elements 6-7 of %a in the low
; lane, elements 4 and 7 of %b in the high lane. The expectation is a
; three-instruction sequence (vperm2f128, vpermilps, vblendpd) rather than a
; lone vperm2f128.
; NOTE(review): the expected vperm2f128 comment reads ymm0[0,1,0,1] although
; the mask requests elements 6-7 of %a -- confirm against current llc output.
192 define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
194 ; ALL: ## BB#0: ## %entry
195 ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
196 ; ALL-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7]
197 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
200 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
201 ret <8 x float> %shuffle