; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s --check-prefix=AVX512
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX2
; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s --check-prefix=AVX_SCALAR
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=skx < %s | FileCheck %s --check-prefix=SKX
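
; This file exercises lowering of the llvm.masked.load / llvm.masked.store
; intrinsics on x86: the AVX512 (knl) and SKX runs expect masked vector moves
; under a %k mask register, the AVX2 run expects the vmaskmov/vpmaskmov
; fallback, and the CodeGenPrepare run on plain AVX (AVX_SCALAR) expects
; masked operations the target cannot lower to be scalarized.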

; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}

; AVX2: vpmaskmovd {{.*}}(%rdi)
; AVX2: vpmaskmovd {{.*}}(%rdi)

; AVX_SCALAR-LABEL: test1
; AVX_SCALAR-NOT: masked
; AVX_SCALAR: extractelement
; AVX_SCALAR: insertelement
; AVX_SCALAR: extractelement
; AVX_SCALAR: insertelement
define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1> %mask, <16 x i32> undef)
  ret <16 x i32> %res
}

; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}

; AVX2: vpmaskmovd {{.*}}(%rdi)
; AVX2: vpmaskmovd {{.*}}(%rdi)
define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1> %mask, <16 x i32> zeroinitializer)
  ret <16 x i32> %res
}

; AVX512: vmovdqu32 %zmm1, (%rdi) {%k1}

; AVX_SCALAR-LABEL: test3
; AVX_SCALAR-NOT: masked
; AVX_SCALAR: extractelement
; AVX_SCALAR: extractelement
; AVX_SCALAR: extractelement
define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v16i32(<16 x i32> %val, <16 x i32>* %addr, i32 4, <16 x i1> %mask)
  ret void
}

; AVX512: vmovups (%rdi), %zmm{{.*{%k[1-7]}}}

; AVX2: vmaskmovps {{.*}}(%rdi)
; AVX2: vmaskmovps {{.*}}(%rdi)
define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %dst) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %addr, i32 4, <16 x i1> %mask, <16 x float> %dst)
  ret <16 x float> %res
}

; AVX512: vmovupd (%rdi), %zmm1 {%k1}
define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double> %dst) {
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  %res = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %addr, i32 4, <8 x i1> %mask, <8 x double> %dst)
  ret <8 x double> %res
}

; SKX: vmovupd {{.*}}{%k1}
define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
  %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
  ret <2 x double> %res
}

; AVX2: vmaskmovps {{.*}}(%rdi)

; SKX: vmovups (%rdi){{.*}}{%k1}
define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1> %mask, <4 x float> %dst)
  ret <4 x float> %res
}

; AVX2: vpmaskmovd {{.*}}(%rdi)

; SKX: vmovdqu32 (%rdi){{.*}}{%k1}
define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
  ret <4 x i32> %res
}

; AVX2: vpmaskmovd %xmm

; SKX: vmovdqu32 %xmm{{.*}}{%k1}
define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask)
  ret void
}
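
; test10 passes an alignment of 32, so SKX is expected to pick the aligned
; vmovapd form rather than the unaligned vmovupd used elsewhere.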

; AVX2: vmaskmovpd (%rdi), %ymm

; SKX: vmovapd {{.*}}{%k1}
define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 32, <4 x i1> %mask, <4 x double> %dst)
  ret <4 x double> %res
}
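
; The test11* cases use 256-bit vectors: KNL (no AVX512VL) has no masked ops
; at that width, so the checks below expect the 8-bit mask to be moved into a
; 16-bit %k register (kshiftlw/kshiftrw by 8) and a 512-bit masked move to be
; used, while SKX can use the 256-bit form directly.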

; AVX2-LABEL: test11a

; SKX: vmovaps (%rdi), %ymm1 {%k1}

; AVX512-LABEL: test11a
; AVX512: kshiftlw $8
; AVX512: kshiftrw $8
; AVX512: vmovups (%rdi), %zmm1 {%k1}
define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> %dst)
  ret <8 x float> %res
}

; SKX: vmovdqu32 (%rdi), %ymm1 {%k1}

; AVX512-LABEL: test11b
; AVX512: kshiftlw $8
; AVX512: kshiftrw $8
; AVX512: vmovdqu32 (%rdi), %zmm1 {%k1}
define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
  %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> %dst)
  ret <8 x i32> %res
}

; SKX: vmovaps (%rdi), %ymm0 {%k1} {z}

; AVX512-LABEL: test11c
; AVX512: kshiftlw $8
; AVX512: kshiftrw $8
; AVX512: vmovups (%rdi), %zmm0 {%k1} {z}
define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) {
  %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
  ret <8 x float> %res
}

; SKX: vmovdqu32 (%rdi), %ymm0 {%k1} {z}

; AVX512-LABEL: test11d
; AVX512: kshiftlw $8
; AVX512: kshiftrw $8
; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) {
  %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
  ret <8 x i32> %res
}

; AVX2: vpmaskmovd %ymm

; SKX: vmovdqu32 {{.*}}{%k1}
define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v8i32(<8 x i32> %val, <8 x i32>* %addr, i32 4, <8 x i1> %mask)
  ret void
}

; AVX512-LABEL: test13
; AVX512: vmovups %zmm1, (%rdi) {%k1}
define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v16f32(<16 x float> %val, <16 x float>* %addr, i32 4, <16 x i1> %mask)
  ret void
}
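
; The following tests use illegal <2 x i32>/<2 x float> types. The operands
; are widened, so the checks expect the trigger to be rebuilt with
; vpblendd/vpcmpeqq before the masked operation (and a truncating vpmovqd
; store for <2 x i32> in test15).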

; SKX: vmovups {{.*}}{%k1}
define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
  ret void
}

; SKX-LABEL: test15:
; SKX: ## BB#0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
; SKX-NEXT: vpmovqd %xmm1, (%rdi) {%k1}
define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
  ret void
}

; SKX: vmovups {{.*}}{%k1}
define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
  ret <2 x float> %res
}

; SKX: vmovdqu32 {{.*}}{%k1}
define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
  ret <2 x i32> %res
}

define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
; SKX-LABEL: test18:
; SKX: ## BB#0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
; SKX-NEXT: kshiftlw $2, %k0, %k0
; SKX-NEXT: kshiftrw $2, %k0, %k1
; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> undef)
  ret <2 x float> %res
}
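
; test19-test22 use constant masks. With an all-true mask the AVX_SCALAR run
; expects the intrinsic to fold to a plain load/store; with a partly-true mask
; it expects scalar element accesses (plus a select against the load's
; passthru value).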

; AVX_SCALAR-LABEL: test19
; AVX_SCALAR: load <4 x float>, <4 x float>* %addr, align 4
define <4 x float> @test19(<4 x i32> %trigger, <4 x float>* %addr) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %res
}

; AVX_SCALAR-LABEL: test20
; AVX_SCALAR: load float, {{.*}}, align 4
; AVX_SCALAR: insertelement <4 x float> undef, float
; AVX_SCALAR: select <4 x i1> <i1 true, i1 false, i1 true, i1 true>
define <4 x float> @test20(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %src0) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 16, <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x float> %src0)
  ret <4 x float> %res
}

; AVX_SCALAR-LABEL: test21
; AVX_SCALAR: store <4 x i32> %val
define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; AVX_SCALAR-LABEL: test22
; AVX_SCALAR: extractelement <4 x i32> %val, i32 0
; AVX_SCALAR: store i32
define void @test22(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>)
  ret void
}

declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
declare <8 x i32> @llvm.masked.load.v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)

declare <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>*, i32, <16 x i1>, <16 x i32*>)
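
; Masked loads of pointer vectors: <16 x i32*> legalizes like <16 x i64>, so
; the checks expect the load to be split into two 512-bit halves on AVX512.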

; AVX512-LABEL: test23
; AVX512: vmovdqu64 64(%rdi), %zmm1 {%k2} {z}
; AVX512: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
define <16 x i32*> @test23(<16 x i32*> %trigger, <16 x i32*>* %addr) {
  %mask = icmp eq <16 x i32*> %trigger, zeroinitializer
  %res = call <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>* %addr, i32 4, <16 x i1> %mask, <16 x i32*> zeroinitializer)
  ret <16 x i32*> %res
}

%mystruct = type { i16, i16, [1 x i8*] }

declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>)
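
; The same works for pointers to an arbitrary struct type. On AVX2 the
; <16 x i1> mask has to be expanded per 4-element group (vpmovzxbd, then
; vpslld/vpsrad by 31 to sign-extend, then vpmovsxdq) to feed four
; vpmaskmovq loads.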

define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
; AVX512-LABEL: test24:
; AVX512: ## BB#0:
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
; AVX512-NEXT: kshiftrw $8, %k1, %k1
; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
;
; AVX2-LABEL: test24:
; AVX2: ## BB#0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
; AVX2-NEXT: vpmaskmovq (%rdi), %ymm1, %ymm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
; AVX2-NEXT: vpmaskmovq 96(%rdi), %ymm1, %ymm3
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
; AVX2-NEXT: vpmaskmovq 64(%rdi), %ymm1, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm0, %ymm1
; AVX2-NEXT: vmovdqa %ymm4, %ymm0
;
; SKX-LABEL: test24:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
  %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1> %mask, <16 x %mystruct*> zeroinitializer)
  ret <16 x %mystruct*> %res
}
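
; 1024-bit operations: <16 x i64>/<16 x double> masked ops are split into two
; 512-bit halves on AVX512/SKX (the upper mask half obtained via kshiftrw $8)
; and into four 256-bit vpmaskmov/vmaskmov operations on AVX2.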

define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
; AVX512-LABEL: test_store_16i64:
; AVX512: ## BB#0:
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
; AVX512-NEXT: kshiftrw $8, %k1, %k1
; AVX512-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1}
;
; AVX2-LABEL: test_store_16i64:
; AVX2: ## BB#0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm5, %xmm5
; AVX2-NEXT: vpsrad $31, %xmm5, %xmm5
; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5
; AVX2-NEXT: vpmaskmovq %ymm1, %ymm5, (%rdi)
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
; AVX2-NEXT: vpmaskmovq %ymm4, %ymm1, 96(%rdi)
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
; AVX2-NEXT: vpmaskmovq %ymm3, %ymm1, 64(%rdi)
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vpmaskmovq %ymm2, %ymm0, 32(%rdi)
; AVX2-NEXT: vzeroupper
;
; SKX-LABEL: test_store_16i64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1}
  call void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask)
  ret void
}

declare void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask)

define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
; AVX512-LABEL: test_store_16f64:
; AVX512: ## BB#0:
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512-NEXT: vmovupd %zmm1, (%rdi) {%k1}
; AVX512-NEXT: kshiftrw $8, %k1, %k1
; AVX512-NEXT: vmovupd %zmm2, 64(%rdi) {%k1}
;
; AVX2-LABEL: test_store_16f64:
; AVX2: ## BB#0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm5, %xmm5
; AVX2-NEXT: vpsrad $31, %xmm5, %xmm5
; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5
; AVX2-NEXT: vmaskmovpd %ymm1, %ymm5, (%rdi)
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
; AVX2-NEXT: vmaskmovpd %ymm4, %ymm1, 96(%rdi)
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
; AVX2-NEXT: vmaskmovpd %ymm3, %ymm1, 64(%rdi)
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vmaskmovpd %ymm2, %ymm0, 32(%rdi)
; AVX2-NEXT: vzeroupper
;
; SKX-LABEL: test_store_16f64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vmovupd %zmm1, (%rdi) {%k1}
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovupd %zmm2, 64(%rdi) {%k1}
  call void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask)
  ret void
}

declare void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask)

define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
; AVX512-LABEL: test_load_16i64:
; AVX512: ## BB#0:
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
; AVX512-NEXT: kshiftrw $8, %k1, %k1
; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
; AVX512-NEXT: vmovaps %zmm1, %zmm0
; AVX512-NEXT: vmovaps %zmm2, %zmm1
;
; AVX2-LABEL: test_load_16i64:
; AVX2: ## BB#0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm5, %xmm5
; AVX2-NEXT: vpsrad $31, %xmm5, %xmm5
; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5
; AVX2-NEXT: vpmaskmovq (%rdi), %ymm5, %ymm9
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm7, %xmm7
; AVX2-NEXT: vpsrad $31, %xmm7, %xmm7
; AVX2-NEXT: vpmovsxdq %xmm7, %ymm7
; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm7, %ymm8
; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm6, %xmm6
; AVX2-NEXT: vpsrad $31, %xmm6, %xmm6
; AVX2-NEXT: vpmovsxdq %xmm6, %ymm6
; AVX2-NEXT: vpmaskmovq 64(%rdi), %ymm6, %ymm10
; AVX2-NEXT: vblendvpd %ymm5, %ymm9, %ymm1, %ymm5
; AVX2-NEXT: vblendvpd %ymm7, %ymm8, %ymm2, %ymm1
; AVX2-NEXT: vblendvpd %ymm6, %ymm10, %ymm3, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vpmaskmovq 96(%rdi), %ymm0, %ymm3
; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vmovapd %ymm5, %ymm0
;
; SKX-LABEL: test_load_16i64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: vmovaps %zmm2, %zmm1
  %res = call <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
  ret <16 x i64> %res
}

declare <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)

define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
; AVX512-LABEL: test_load_16f64:
; AVX512: ## BB#0:
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512-NEXT: vmovupd (%rdi), %zmm1 {%k1}
; AVX512-NEXT: kshiftrw $8, %k1, %k1
; AVX512-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
; AVX512-NEXT: vmovaps %zmm1, %zmm0
; AVX512-NEXT: vmovaps %zmm2, %zmm1
;
; AVX2-LABEL: test_load_16f64:
; AVX2: ## BB#0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm5, %xmm5
; AVX2-NEXT: vpsrad $31, %xmm5, %xmm5
; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5
; AVX2-NEXT: vmaskmovpd (%rdi), %ymm5, %ymm9
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm7, %xmm7
; AVX2-NEXT: vpsrad $31, %xmm7, %xmm7
; AVX2-NEXT: vpmovsxdq %xmm7, %ymm7
; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm7, %ymm8
; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm6, %xmm6
; AVX2-NEXT: vpsrad $31, %xmm6, %xmm6
; AVX2-NEXT: vpmovsxdq %xmm6, %ymm6
; AVX2-NEXT: vmaskmovpd 64(%rdi), %ymm6, %ymm10
; AVX2-NEXT: vblendvpd %ymm5, %ymm9, %ymm1, %ymm5
; AVX2-NEXT: vblendvpd %ymm7, %ymm8, %ymm2, %ymm1
; AVX2-NEXT: vblendvpd %ymm6, %ymm10, %ymm3, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vmaskmovpd 96(%rdi), %ymm0, %ymm3
; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vmovapd %ymm5, %ymm0
;
; SKX-LABEL: test_load_16f64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: vmovaps %zmm2, %zmm1
  %res = call <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
  ret <16 x double> %res
}

declare <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
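
; A 32-element mask no longer fits in 16 bits: SKX is expected to materialize
; it from a ymm register (vpmovb2m %ymm0) and split it with kshiftrd/kshiftrw.
; On AVX2 the <32 x double> result is returned through a hidden sret pointer,
; which is why the loads below use (%rsi) and the results are stored to (%rdi).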

define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) {
; AVX512-LABEL: test_load_32f64:
; AVX512: ## BB#0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX512-NEXT: vpmovsxbd %xmm5, %zmm5
; AVX512-NEXT: vpslld $31, %zmm5, %zmm5
; AVX512-NEXT: vptestmd %zmm5, %zmm5, %k1
; AVX512-NEXT: vmovupd 128(%rdi), %zmm3 {%k1}
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512-NEXT: vmovupd (%rdi), %zmm1 {%k2}
; AVX512-NEXT: kshiftrw $8, %k1, %k1
; AVX512-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
; AVX512-NEXT: kshiftrw $8, %k2, %k1
; AVX512-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
; AVX512-NEXT: vmovaps %zmm1, %zmm0
; AVX512-NEXT: vmovaps %zmm2, %zmm1
; AVX512-NEXT: vmovaps %zmm3, %zmm2
; AVX512-NEXT: vmovaps %zmm4, %zmm3
;
; AVX2-LABEL: test_load_32f64:
; AVX2: ## BB#0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: .cfi_def_cfa_register %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $32, %rsp
; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm8, %xmm8
; AVX2-NEXT: vpsrad $31, %xmm8, %xmm8
; AVX2-NEXT: vpmovsxdq %xmm8, %ymm8
; AVX2-NEXT: vmaskmovpd 32(%rsi), %ymm8, %ymm9
; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm10, %xmm10
; AVX2-NEXT: vpsrad $31, %xmm10, %xmm10
; AVX2-NEXT: vpmovsxdq %xmm10, %ymm10
; AVX2-NEXT: vmaskmovpd 64(%rsi), %ymm10, %ymm11
; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[3,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm12, %xmm12
; AVX2-NEXT: vpsrad $31, %xmm12, %xmm12
; AVX2-NEXT: vpmovsxdq %xmm12, %ymm12
; AVX2-NEXT: vmaskmovpd 96(%rsi), %ymm12, %ymm13
; AVX2-NEXT: vblendvpd %ymm8, %ymm9, %ymm2, %ymm8
; AVX2-NEXT: vblendvpd %ymm10, %ymm11, %ymm3, %ymm9
; AVX2-NEXT: vblendvpd %ymm12, %ymm13, %ymm4, %ymm11
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm3, %xmm3
; AVX2-NEXT: vpsrad $31, %xmm3, %xmm3
; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
; AVX2-NEXT: vmaskmovpd 160(%rsi), %ymm3, %ymm10
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm4, %xmm4
; AVX2-NEXT: vpsrad $31, %xmm4, %xmm4
; AVX2-NEXT: vpmovsxdq %xmm4, %ymm4
; AVX2-NEXT: vmaskmovpd 192(%rsi), %ymm4, %ymm12
; AVX2-NEXT: vblendvpd %ymm3, %ymm10, %ymm6, %ymm3
; AVX2-NEXT: vmovapd 16(%rbp), %ymm6
; AVX2-NEXT: vblendvpd %ymm4, %ymm12, %ymm7, %ymm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[3,1,2,3]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm7, %xmm7
; AVX2-NEXT: vpsrad $31, %xmm7, %xmm7
; AVX2-NEXT: vpmovsxdq %xmm7, %ymm7
; AVX2-NEXT: vmaskmovpd 224(%rsi), %ymm7, %ymm10
; AVX2-NEXT: vblendvpd %ymm7, %ymm10, %ymm6, %ymm6
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vmaskmovpd (%rsi), %ymm0, %ymm7
; AVX2-NEXT: vblendvpd %ymm0, %ymm7, %ymm1, %ymm0
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
; AVX2-NEXT: vmaskmovpd 128(%rsi), %ymm1, %ymm2
; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm5, %ymm1
; AVX2-NEXT: vmovapd %ymm1, 128(%rdi)
; AVX2-NEXT: vmovapd %ymm0, (%rdi)
; AVX2-NEXT: vmovapd %ymm6, 224(%rdi)
; AVX2-NEXT: vmovapd %ymm4, 192(%rdi)
; AVX2-NEXT: vmovapd %ymm3, 160(%rdi)
; AVX2-NEXT: vmovapd %ymm11, 96(%rdi)
; AVX2-NEXT: vmovapd %ymm9, 64(%rdi)
; AVX2-NEXT: vmovapd %ymm8, 32(%rdi)
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vzeroupper
;
; SKX-LABEL: test_load_32f64:
; SKX: ## BB#0:
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k1
; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
; SKX-NEXT: kshiftrd $16, %k1, %k2
; SKX-NEXT: vmovupd 128(%rdi), %zmm3 {%k2}
; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
; SKX-NEXT: kshiftrw $8, %k2, %k1
; SKX-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: vmovaps %zmm2, %zmm1
; SKX-NEXT: vmovaps %zmm3, %zmm2
; SKX-NEXT: vmovaps %zmm4, %zmm3
  %res = call <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
  ret <32 x double> %res
}

declare <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)