1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK-SKX
; test1: single-source <16 x float> shuffle (second operand is undef) with an
; irregular, partially-undef mask.
7 define <16 x float> @test1(<16 x float> %a) nounwind {
8 %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
; test2: <16 x i32> counterpart of test1 -- same single-source, partially-undef
; mask applied to an integer vector.
15 define <16 x i32> @test2(<16 x i32> %a) nounwind {
16 %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
; test3: single-source <8 x i64> shuffle with an irregular mask; result
; elements 3 and 5 are undef.
23 define <8 x i64> @test3(<8 x i64> %a) nounwind {
24 %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 5, i32 1, i32 undef, i32 7, i32 undef, i32 3, i32 1>
; test4: single-source <8 x double> shuffle where only result elements 0 and 1
; are defined (taken from source elements 1 and 3); the rest are undef.
31 define <8 x double> @test4(<8 x double> %a) nounwind {
32 %c = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; test5: two-source <8 x double> shuffle. Mask indices 8 and 10 pick elements
; 0 and 2 of %b; every other result element comes from %a.
39 define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
40 %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
; test6: single-source <8 x i64> shuffle. The pattern 2,3,1,0 is applied to the
; low four elements and repeated (offset by 4) on the high four.
47 define <8 x i64> @test6(<8 x i64> %a) nounwind {
48 %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
; test7: <8 x i64> counterpart of test5 -- same two-source mask (indices 8 and
; 10 select from %b) on an integer vector.
55 define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
56 %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
; test8: two-source <16 x i32> shuffle. Even result positions take elements of
; %a (indices < 16), odd positions take elements of %b (indices >= 16), in an
; irregular order.
63 define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
64 %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
; test9: <16 x float> counterpart of test8 -- same alternating %a/%b mask on a
; floating-point vector.
71 define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
72 %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
; test10: same alternating two-source mask as test9, but the second shuffle
; operand is loaded from memory through %b.
76 ; CHECK-LABEL: test10:
79 define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
80 %c = load <16 x float>, <16 x float>* %b
81 %d = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
; test11: <16 x i32> counterpart of test10 -- second shuffle operand is loaded
; from memory through %b, same alternating mask.
85 ; CHECK-LABEL: test11:
88 define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
89 %c = load <16 x i32>, <16 x i32>* %b
90 %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
; test13: single-source <16 x float> shuffle that swaps each adjacent pair of
; elements (1,0,3,2 repeated per group of four).
; Immediate 177 = 0b10110001 = 1 | 0<<2 | 3<<4 | 2<<6, i.e. selectors 1,0,3,2.
95 ; CHECK: vpermilps $177, %zmm
97 define <16 x float> @test13(<16 x float> %a) {
98 %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
; test14: single-source <8 x double> shuffle that stays within each pair of
; elements. Immediate 203 = 0b11001011: reading from bit 0 upward, the bits
; 1,1,0,1,0,0,1,1 are the low bits of mask elements 1,1,2,3,4,4,7,7.
102 ; CHECK-LABEL: test14
103 ; CHECK: vpermilpd $203, %zmm
105 define <8 x double> @test14(<8 x double> %a) {
106 %b = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32><i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 7>
; test15: <16 x i32> counterpart of test13 -- swaps each adjacent pair of
; elements; the integer form is expected to lower to vpshufd with the same
; immediate.
110 ; CHECK-LABEL: test15
111 ; CHECK: vpshufd $177, %zmm
113 define <16 x i32> @test15(<16 x i32> %a) {
114 ; mask 1-0-3-2 = 10110001 = 0xb1 = 177
115 %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
; test16: two-source <8 x double> shuffle whose mask is the eight consecutive
; indices 3..10 of the %a:%b concatenation, i.e. a shift by three elements
; (immediate 3 of the expected valignq).
118 ; CHECK-LABEL: test16
119 ; CHECK: valignq $3, %zmm0, %zmm1
121 define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind {
122 %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
; test17: two-source <8 x double> shuffle alternating %a and %b elements
; (1 = a[1], 9 = b[1], 2 = a[2], 10 = b[2], 5 = a[5]; the rest undef).
; Immediate 19 = 0b00010011: the low bits of the defined mask elements
; 1,9,2,10,5 are 1,1,0,0,1 (bit 0 upward); undef positions contribute 0.
126 ; CHECK-LABEL: test17
127 ; CHECK: vshufpd $19, %zmm1, %zmm0
129 define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind {
130 %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 2, i32 10, i32 5, i32 undef, i32 undef, i32 undef>
; test18: two-source <16 x i32> interleave. For every group of four elements
; the mask interleaves elements 2 and 3 of %a with elements 2 and 3 of %c
; (index + 16): a2,c2,a3,c3.
134 ; CHECK-LABEL: test18
135 ; CHECK: vpunpckhdq %zmm
137 define <16 x i32> @test18(<16 x i32> %a, <16 x i32> %c) {
138 %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
; test19: two-source <16 x i32> interleave. For every group of four elements
; the mask interleaves elements 0 and 1 of %a with elements 0 and 1 of %c
; (index + 16): a0,c0,a1,c1.
142 ; CHECK-LABEL: test19
143 ; CHECK: vpunpckldq %zmm
145 define <16 x i32> @test19(<16 x i32> %a, <16 x i32> %c) {
146 %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
; test20: two-source <8 x i64> interleave of the odd elements: a1,c1,a3,c3,
; a5,c5,a7,c7 (indices >= 8 refer to %c).
150 ; CHECK-LABEL: test20
151 ; CHECK: vpunpckhqdq %zmm
153 define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) {
154 %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32><i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
; test21: all-zero mask, so every result element is element 0 of %a (a splat);
; %b is never selected. Expected to lower to a scalar-double broadcast.
158 ; CHECK-LABEL: test21
159 ; CHECK: vbroadcastsd %xmm0, %zmm
161 define <8 x double> @test21(<8 x double> %a, <8 x double> %b) {
162 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
163 ret <8 x double> %shuffle
; test22: <8 x i64> counterpart of test21 -- all-zero mask splats element 0 of
; %a; %b is never selected. Expected to lower to a quadword broadcast.
166 ; CHECK-LABEL: test22
167 ; CHECK: vpbroadcastq %xmm0, %zmm
169 define <8 x i64> @test22(<8 x i64> %a, <8 x i64> %b) {
170 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
171 ret <8 x i64> %shuffle
; test23: two-source <16 x i32> shuffle where only the first four result
; elements are defined: a0, a1, a2 and b3 (index 19); the other twelve are
; undef.
174 ; CHECK-LABEL: @test23
178 define <16 x i32> @test23(<16 x i32> %a, <16 x i32> %b) nounwind {
179 %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; test24: like test23 (a0, a1, a2, b3) but with a fifth defined element, index
; 25 = b9, which comes from a different group of four than the first ones.
183 ; CHECK-LABEL: @test24
186 define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind {
187 %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 25, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; test25: two-source <16 x i32> shuffle with a per-group-of-four pattern
; a0,a1,b3,b0 (first group: 0,1,19,16); later groups repeat it with some
; positions left undef.
191 ; CHECK-LABEL: @test25
194 define <16 x i32> @test25(<16 x i32> %a, <16 x i32> %b) nounwind {
195 ; mask 0-1-3-0 = 00110100 = 0x34 = 52  (0 | 1<<2 | 3<<4 | 0<<6)
196 %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 19, i32 16, i32 4, i32 5, i32 23, i32 undef, i32 8, i32 9, i32 27, i32 undef, i32 12, i32 13, i32 undef, i32 undef>
; test26: single-source <16 x i32> shuffle that duplicates the odd elements
; (1,1,3,3,5,5,7,_,9,9,_,11,13,_,_,_), with several positions left undef.
200 ; CHECK-LABEL: @test26
203 define <16 x i32> @test26(<16 x i32> %a) nounwind {
204 %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 undef, i32 9, i32 9, i32 undef, i32 11, i32 13, i32 undef, i32 undef, i32 undef>
; test27: widening shuffle from <4 x i32> to <16 x i32>. Only result elements
; 0 and 1 are defined (source elements 0 and 1); the remaining fourteen are
; undef.
208 ; CHECK-LABEL: @test27
210 define <16 x i32> @test27(<4 x i32>%a) {
211 %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; test28: single-source <16 x i16> (256-bit) shuffle. In each group of eight
; words the low four stay in place (0-3, 8-11) and the high four are swapped
; pairwise (5,4,7,6 and 13,12,15,14).
; Immediate 177 = 0b10110001 encodes the selectors 1,0,3,2 for the high words.
215 ; CHECK-LABEL: test28
216 ; CHECK: vpshufhw $177, %ymm
218 define <16 x i16> @test28(<16 x i16> %a) {
219 %b = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32><i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 7, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 12, i32 15, i32 14>
; test29: two-source <16 x float> interleave. For every group of four elements
; the mask interleaves elements 0 and 1 of %a with elements 0 and 1 of %c
; (index + 16): a0,c0,a1,c1.
223 ; CHECK-LABEL: test29
224 ; CHECK: vunpcklps %zmm
226 define <16 x float> @test29(<16 x float> %a, <16 x float> %c) {
227 %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
; test30: two-source <16 x float> shuffle with the per-group-of-four pattern
; a0,a0,c1,c2 (first group: 0,0,17,18).
; Immediate 144 = 0b10010000 = 0 | 0<<2 | 1<<4 | 2<<6, i.e. selectors 0,0,1,2.
231 ; CHECK-LABEL: @test30
232 ; CHECK: vshufps $144, %zmm
234 define <16 x float> @test30(<16 x float> %a, <16 x float> %c) {
235 %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 0, i32 17, i32 18, i32 4, i32 4, i32 21, i32 22, i32 8, i32 8, i32 25, i32 26, i32 12, i32 12, i32 29, i32 30>
; test31: two-source <16 x i32> shuffle whose mask is the consecutive indices
; 3..18 of the %a:%b concatenation (a shift by three elements), with result
; element 3 left undef. The undef must not prevent the valignd match.
239 ; CHECK-LABEL: test31
240 ; CHECK: valignd $3, %zmm0, %zmm1
242 define <16 x i32> @test31(<16 x i32> %a, <16 x i32> %b) nounwind {
243 %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
; test32: two-source <8 x double> shuffle alternating %b and %a elements,
; starting with %b (9 = b[1], 1 = a[1], 10 = b[2], 2 = a[2], _, 5 = a[5],
; 15 = b[7], _). The expected instruction lists the register operands in the
; opposite order to test17.
; Immediate 99 = 0b01100011: low bits of the mask elements are 1,1,0,0,_,1,1,_
; (bit 0 upward); undef positions contribute 0.
247 ; CHECK-LABEL: test32
248 ; CHECK: vshufpd $99, %zmm0, %zmm1
250 define <8 x double> @test32(<8 x double> %a, <8 x double> %b) nounwind {
251 %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 1, i32 10, i32 2, i32 undef, i32 5, i32 15, i32 undef>
255 ; (Redundant second copy of the KNL llc | FileCheck invocation removed; the identical line at the top of the file already covers the tests below.)
; test_vshuff64x2_512: <8 x double> shuffle that moves whole pairs of elements
; (128-bit chunks): result chunks are source chunks 0,2,0,2.
; Immediate 136 = 0b10001000 = 0 | 2<<2 | 0<<4 | 2<<6.
; All mask indices are < 8, so only %x is selected; accordingly the expected
; instruction uses %zmm0 for both sources and %x1 is unused.
256 define <8 x double> @test_vshuff64x2_512(<8 x double> %x, <8 x double> %x1) nounwind {
257 ; CHECK-LABEL: test_vshuff64x2_512:
259 ; CHECK-NEXT: vshuff64x2 $136, %zmm0, %zmm0, %zmm0
261 %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 0, i32 1, i32 4, i32 5>
262 ret <8 x double> %res
; test_vshuff64x2_512_mask: same chunk shuffle as test_vshuff64x2_512 (chunks
; 0,2,0,2 -> immediate 136), followed by a select against zeroinitializer.
; The expected code materialises the <8 x i1> mask into %k1 (sign-extend, and,
; test) and folds the select into the shuffle as zero-masking ({%k1} {z}).
; Only %x is selected by the shuffle mask; %x1 is unused.
265 define <8 x double> @test_vshuff64x2_512_mask(<8 x double> %x, <8 x double> %x1, <8 x i1> %mask) nounwind {
266 ; CHECK-LABEL: test_vshuff64x2_512_mask:
268 ; CHECK-NEXT: vpmovsxwq %xmm2, %zmm1
269 ; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
270 ; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1
271 ; CHECK-NEXT: vshuff64x2 $136, %zmm0, %zmm0, %zmm0 {%k1} {z}
273 %y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 0, i32 1, i32 4, i32 5>
274 %res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer
275 ret <8 x double> %res
; test_vshufi64x2_512_mask: integer (<8 x i64>) chunk shuffle with result
; chunks 0,2,2,2 of %x. Immediate 168 = 0b10101000 = 0 | 2<<2 | 2<<4 | 2<<6.
; The select falls back to %x rather than zero, so the expected instruction
; uses merge-masking ({%k1} without {z}) into the register holding %x.
278 define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1> %mask) nounwind {
279 ; CHECK-LABEL: test_vshufi64x2_512_mask:
281 ; CHECK-NEXT: vpmovsxwq %xmm2, %zmm1
282 ; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
283 ; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1
284 ; CHECK-NEXT: vshufi64x2 $168, %zmm0, %zmm0, %zmm0 {%k1}
286 %y = shufflevector <8 x i64> %x, <8 x i64> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 4, i32 5>
287 %res = select <8 x i1> %mask, <8 x i64> %y, <8 x i64> %x
; test_vshuff64x2_512_mem: <8 x double> chunk shuffle with result chunks
; 0,2,2,0. Immediate 40 = 0b00101000 = 0 | 2<<2 | 2<<4 | 0<<6.
; NOTE(review): every mask index is < 8, so the value loaded from %ptr is never
; selected and the expected instruction has no memory operand. Despite the
; "_mem" name this does not exercise load folding -- confirm whether the mask
; was meant to reference %x1.
291 define <8 x double> @test_vshuff64x2_512_mem(<8 x double> %x, <8 x double> *%ptr) nounwind {
292 ; CHECK-LABEL: test_vshuff64x2_512_mem:
294 ; CHECK-NEXT: vshuff64x2 $40, %zmm0, %zmm0, %zmm0
296 %x1 = load <8 x double>,<8 x double> *%ptr,align 1
297 %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 0, i32 1>
298 ret <8 x double> %res
; test_vshuff32x4_512_mem: <16 x float> shuffle moving whole groups of four
; floats (128-bit chunks): result chunks 0,1,1,0.
; Immediate 20 = 0b00010100 = 0 | 1<<2 | 1<<4 | 0<<6.
; The expected mnemonic is the 64x2 form even though the element type is
; float; the chunk granularity is the same.
; NOTE(review): every mask index is < 16, so the value loaded from %ptr is
; never selected and the expected instruction has no memory operand -- confirm
; whether the mask was meant to reference %x1.
301 define <16 x float> @test_vshuff32x4_512_mem(<16 x float> %x, <16 x float> *%ptr) nounwind {
302 ; CHECK-LABEL: test_vshuff32x4_512_mem:
304 ; CHECK-NEXT: vshuff64x2 $20, %zmm0, %zmm0, %zmm0
306 %x1 = load <16 x float>,<16 x float> *%ptr,align 1
307 %res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
308 ret <16 x float> %res
; test_align_v16i32_rr: register/register form. The mask is the consecutive
; indices 3..18 of the %a:%b concatenation (element 3 undef), i.e. a shift by
; three elements; same IR pattern as test31 but with the full three-operand
; instruction pinned.
311 define <16 x i32> @test_align_v16i32_rr(<16 x i32> %a, <16 x i32> %b) nounwind {
312 ; CHECK-LABEL: test_align_v16i32_rr:
314 ; CHECK-NEXT: valignd $3, %zmm0, %zmm1, %zmm0
316 %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
; test_align_v16i32_rm: register/memory form. The first shuffle operand is
; loaded from %a.ptr, and the expected instruction folds that load as the
; (%rdi) memory operand of valignd.
320 define <16 x i32> @test_align_v16i32_rm(<16 x i32>* %a.ptr, <16 x i32> %b) nounwind {
321 ; CHECK-LABEL: test_align_v16i32_rm:
323 ; CHECK-NEXT: valignd $3, (%rdi), %zmm0, %zmm0
325 %a = load <16 x i32>, <16 x i32>* %a.ptr
326 %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
; test_align_v16i32_rm_mask: masked variant of test_align_v16i32_rm. The select
; falls back to the loaded value %a, so the expected code loads %a into a
; register, merge-masks the valignd result into it ({%k1}) and then copies it
; to the return register -- the load is not folded here.
; The two runs differ only in how the <16 x i1> mask reaches %k1: the KNL run
; expects sign-extend + and + test, the SKX run expects a single vpmovb2m.
330 define <16 x i32> @test_align_v16i32_rm_mask(<16 x i32>* %a.ptr, <16 x i32> %b, <16 x i1> %mask) nounwind {
331 ; CHECK-LABEL: test_align_v16i32_rm_mask:
333 ; CHECK-NEXT: vpmovsxbd %xmm1, %zmm1
334 ; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1
335 ; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1
336 ; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
337 ; CHECK-NEXT: valignd $3, %zmm1, %zmm0, %zmm1 {%k1}
338 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
341 ; CHECK-SKX-LABEL: test_align_v16i32_rm_mask:
342 ; CHECK-SKX: ## BB#0:
343 ; CHECK-SKX-NEXT: vpmovb2m %xmm1, %k1
344 ; CHECK-SKX-NEXT: vmovdqa32 (%rdi), %zmm1
345 ; CHECK-SKX-NEXT: valignd $3, %zmm1, %zmm0, %zmm1 {%k1}
346 ; CHECK-SKX-NEXT: vmovaps %zmm1, %zmm0
347 ; CHECK-SKX-NEXT: retq
348 %a = load <16 x i32>, <16 x i32>* %a.ptr
349 %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
350 %res = select <16 x i1> %mask,<16 x i32> %c, <16 x i32> %a
; test_align_v8f64_rr: register/register form for <8 x double>. The mask is the
; consecutive indices 3..10 of the %a:%b concatenation (a shift by three
; elements); same IR pattern as test16 but with the full three-operand
; instruction pinned.
354 define <8 x double> @test_align_v8f64_rr(<8 x double> %a, <8 x double> %b) nounwind {
355 ; CHECK-LABEL: test_align_v8f64_rr:
357 ; CHECK-NEXT: valignq $3, %zmm0, %zmm1, %zmm0
359 %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
; test_align_v18f64_rm: register/memory form for <8 x double>. The first
; shuffle operand is loaded from %a.ptr and the expected instruction folds the
; load as the (%rdi) memory operand of valignq.
; NOTE(review): "v18f64" in the name looks like a typo for "v8f64" (the
; operands are <8 x double>); renaming would also require updating the label
; line below.
363 define <8 x double> @test_align_v18f64_rm(<8 x double>* %a.ptr, <8 x double> %b) nounwind {
364 ; CHECK-LABEL: test_align_v18f64_rm:
366 ; CHECK-NEXT: valignq $3, (%rdi), %zmm0, %zmm0
368 %a = load <8 x double>, <8 x double>* %a.ptr
369 %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
; test_align_v18f64_rm_mask: masked register/memory form for <8 x double>. The
; select falls back to zeroinitializer, so the expected instruction keeps the
; folded (%rdi) load and uses zero-masking ({%k1} {z}).
; The two runs differ only in how the <8 x i1> mask reaches %k1: the KNL run
; expects sign-extend + and + test, the SKX run expects a single vpmovw2m.
; NOTE(review): "v18f64" in the name looks like a typo for "v8f64"; renaming
; would also require updating both label lines below.
373 define <8 x double> @test_align_v18f64_rm_mask(<8 x double>* %a.ptr, <8 x double> %b, <8 x i1> %mask) nounwind {
374 ; CHECK-LABEL: test_align_v18f64_rm_mask:
376 ; CHECK-NEXT: vpmovsxwq %xmm1, %zmm1
377 ; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
378 ; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1
379 ; CHECK-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z}
382 ; CHECK-SKX-LABEL: test_align_v18f64_rm_mask:
383 ; CHECK-SKX: ## BB#0:
384 ; CHECK-SKX-NEXT: vpmovw2m %xmm1, %k1
385 ; CHECK-SKX-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z}
386 ; CHECK-SKX-NEXT: retq
387 %a = load <8 x double>, <8 x double>* %a.ptr
388 %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
389 %res = select <8 x i1> %mask,<8 x double> %c, <8 x double> zeroinitializer
390 ret <8 x double> %res