1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
; --- kortest tests ---
; kortestz: OR of two i16 k-masks, result is 1 iff the OR is all-zero (ZF -> sete).
; kortestc: result derived from CF (all-ones) via sbb/and.
; NOTE(review): the trailing ret/'}' lines of each function were elided by extraction.
4 declare i32 @llvm.x86.avx512.kortestz.w(i16, i16) nounwind readnone
5 define i32 @test_kortestz(i16 %a0, i16 %a1) {
6 ; CHECK-LABEL: test_kortestz:
8 ; CHECK-NEXT: kmovw %esi, %k0
9 ; CHECK-NEXT: kmovw %edi, %k1
10 ; CHECK-NEXT: kortestw %k0, %k1
11 ; CHECK-NEXT: sete %al
12 ; CHECK-NEXT: kmovw %eax, %k0
13 ; CHECK-NEXT: kmovw %k0, %eax
14 ; CHECK-NEXT: andl $1, %eax
16 %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %a0, i16 %a1)
20 declare i32 @llvm.x86.avx512.kortestc.w(i16, i16) nounwind readnone
21 define i32 @test_kortestc(i16 %a0, i16 %a1) {
22 ; CHECK-LABEL: test_kortestc:
24 ; CHECK-NEXT: kmovw %esi, %k0
25 ; CHECK-NEXT: kmovw %edi, %k1
26 ; CHECK-NEXT: kortestw %k0, %k1
27 ; CHECK-NEXT: sbbl %eax, %eax
28 ; CHECK-NEXT: andl $1, %eax
30 %res = call i32 @llvm.x86.avx512.kortestc.w(i16 %a0, i16 %a1)
; --- k-register logic tests: kand (with an immediate 8 mask), knot, kunpckbw ---
; NOTE(review): trailing ret/'}' lines elided by extraction.
34 declare i16 @llvm.x86.avx512.kand.w(i16, i16) nounwind readnone
35 define i16 @test_kand(i16 %a0, i16 %a1) {
36 ; CHECK-LABEL: test_kand:
38 ; CHECK-NEXT: movw $8, %ax
39 ; CHECK-NEXT: kmovw %eax, %k0
40 ; CHECK-NEXT: kmovw %edi, %k1
41 ; CHECK-NEXT: kandw %k0, %k1, %k0
42 ; CHECK-NEXT: kmovw %esi, %k1
43 ; CHECK-NEXT: kandw %k1, %k0, %k0
44 ; CHECK-NEXT: kmovw %k0, %eax
46 %t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8)
47 %t2 = call i16 @llvm.x86.avx512.kand.w(i16 %t1, i16 %a1)
51 declare i16 @llvm.x86.avx512.knot.w(i16) nounwind readnone
52 define i16 @test_knot(i16 %a0) {
53 ; CHECK-LABEL: test_knot:
55 ; CHECK-NEXT: kmovw %edi, %k0
56 ; CHECK-NEXT: knotw %k0, %k0
57 ; CHECK-NEXT: kmovw %k0, %eax
59 %res = call i16 @llvm.x86.avx512.knot.w(i16 %a0)
63 declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
65 define i16 @unpckbw_test(i16 %a0, i16 %a1) {
66 ; CHECK-LABEL: unpckbw_test:
68 ; CHECK-NEXT: kmovw %edi, %k0
69 ; CHECK-NEXT: kmovw %esi, %k1
70 ; CHECK-NEXT: kunpckbw %k1, %k0, %k0
71 ; CHECK-NEXT: kmovw %k0, %eax
73 %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
; --- packed reciprocal (rcp14) and round-scale (rndscale, imm 11) tests ---
; All calls use an all-ones mask (-1) and a zero passthru, so plain unmasked
; instructions are expected. Rounding argument i32 4 = CUR_DIRECTION.
; NOTE(review): trailing ret/'}' lines elided by extraction.
77 define <16 x float> @test_rcp_ps_512(<16 x float> %a0) {
78 ; CHECK-LABEL: test_rcp_ps_512:
80 ; CHECK-NEXT: vrcp14ps %zmm0, %zmm0
82 %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
85 declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
87 define <8 x double> @test_rcp_pd_512(<8 x double> %a0) {
88 ; CHECK-LABEL: test_rcp_pd_512:
90 ; CHECK-NEXT: vrcp14pd %zmm0, %zmm0
92 %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1]
95 declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone
97 declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
99 define <8 x double> @test7(<8 x double> %a) {
100 ; CHECK-LABEL: test7:
102 ; CHECK-NEXT: vrndscalepd $11, %zmm0, %zmm0
104 %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4)
108 declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
110 define <16 x float> @test8(<16 x float> %a) {
111 ; CHECK-LABEL: test8:
113 ; CHECK-NEXT: vrndscaleps $11, %zmm0, %zmm0
115 %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4)
; --- rsqrt14 (packed + scalar) and rcp14 scalar tests, unmasked forms ---
; NOTE(review): some trailing ret/'}' lines elided by extraction.
119 define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
120 ; CHECK-LABEL: test_rsqrt_ps_512:
122 ; CHECK-NEXT: vrsqrt14ps %zmm0, %zmm0
124 %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
125 ret <16 x float> %res
127 declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
129 define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
130 ; CHECK-LABEL: test_rsqrt14_ss:
132 ; CHECK-NEXT: vrsqrt14ss %xmm0, %xmm0, %xmm0
134 %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
137 declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
139 define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
140 ; CHECK-LABEL: test_rcp14_ss:
142 ; CHECK-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0
144 %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
147 declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
; --- packed sqrt tests; the round variant uses i32 3 (rz-sae, round toward zero) ---
; NOTE(review): some trailing ret/'}' lines elided by extraction.
149 define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
150 ; CHECK-LABEL: test_sqrt_pd_512:
152 ; CHECK-NEXT: vsqrtpd %zmm0, %zmm0
154 %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4)
155 ret <8 x double> %res
157 declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
159 define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
160 ; CHECK-LABEL: test_sqrt_ps_512:
162 ; CHECK-NEXT: vsqrtps %zmm0, %zmm0
164 %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
165 ret <16 x float> %res
167 define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) {
168 ; CHECK-LABEL: test_sqrt_round_ps_512:
170 ; CHECK-NEXT: vsqrtps {rz-sae}, %zmm0, %zmm0
172 %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 3)
173 ret <16 x float> %res
175 declare <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
; --- getexp tests: i32 4 = current rounding, i32 8 = {sae} (suppress-all-exceptions) ---
; NOTE(review): some trailing ret/'}' lines elided by extraction.
177 define <8 x double> @test_getexp_pd_512(<8 x double> %a0) {
178 ; CHECK-LABEL: test_getexp_pd_512:
180 ; CHECK-NEXT: vgetexppd %zmm0, %zmm0
182 %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4)
183 ret <8 x double> %res
185 define <8 x double> @test_getexp_round_pd_512(<8 x double> %a0) {
186 ; CHECK-LABEL: test_getexp_round_pd_512:
188 ; CHECK-NEXT: vgetexppd {sae}, %zmm0, %zmm0
190 %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
191 ret <8 x double> %res
193 declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
195 define <16 x float> @test_getexp_ps_512(<16 x float> %a0) {
196 ; CHECK-LABEL: test_getexp_ps_512:
198 ; CHECK-NEXT: vgetexpps %zmm0, %zmm0
200 %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
201 ret <16 x float> %res
204 define <16 x float> @test_getexp_round_ps_512(<16 x float> %a0) {
205 ; CHECK-LABEL: test_getexp_round_ps_512:
207 ; CHECK-NEXT: vgetexpps {sae}, %zmm0, %zmm0
209 %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
210 ret <16 x float> %res
212 declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
; --- masked scalar sqrt (ss/sd) exercising all four call shapes: merge-masked,
; merge-masked with {rd-sae}, zero-masked with {ru-sae}, and unmasked with {rz-sae};
; the four results are summed so each instruction stays live.
; NOTE(review): trailing ret/'}' lines elided by extraction.
214 declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
216 define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
217 ; CHECK-LABEL: test_sqrt_ss:
219 ; CHECK-NEXT: andl $1, %edi
220 ; CHECK-NEXT: kmovw %edi, %k1
221 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
222 ; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
223 ; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
224 ; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
225 ; CHECK-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
226 ; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1
227 ; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0
228 ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
230 %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
231 %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
232 %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 2)
233 %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 3)
235 %res.1 = fadd <4 x float> %res0, %res1
236 %res.2 = fadd <4 x float> %res2, %res3
237 %res = fadd <4 x float> %res.1, %res.2
241 declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
243 define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
244 ; CHECK-LABEL: test_sqrt_sd:
246 ; CHECK-NEXT: andl $1, %edi
247 ; CHECK-NEXT: kmovw %edi, %k1
248 ; CHECK-NEXT: vmovaps %zmm2, %zmm3
249 ; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
250 ; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
251 ; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
252 ; CHECK-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
253 ; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1
254 ; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0
255 ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
257 %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
258 %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
259 %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 2)
260 %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 3)
262 %res.1 = fadd <2 x double> %res0, %res1
263 %res.2 = fadd <2 x double> %res2, %res3
264 %res = fadd <2 x double> %res.1, %res.2
265 ret <2 x double> %res
; --- double <-> integer conversion tests: sse2 cvtsd2si64 / cvtsi642sd, and the
; AVX-512 truncating forms (signed/unsigned, 32/64-bit) at both the current
; rounding mode (i32 4) and {sae} (i32 8); results are added to keep both live.
; NOTE(review): some trailing ret/'}' lines elided by extraction.
268 define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
269 ; CHECK-LABEL: test_x86_sse2_cvtsd2si64:
271 ; CHECK-NEXT: vcvtsd2si %xmm0, %rax
273 %res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
276 declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
278 define <2 x double> @test_x86_sse2_cvtsi642sd(<2 x double> %a0, i64 %a1) {
279 ; CHECK-LABEL: test_x86_sse2_cvtsi642sd:
281 ; CHECK-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0
283 %res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1]
284 ret <2 x double> %res
286 declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
288 define i64 @test_x86_avx512_cvttsd2si64(<2 x double> %a0) {
289 ; CHECK-LABEL: test_x86_avx512_cvttsd2si64:
291 ; CHECK-NEXT: vcvttsd2si %xmm0, %rcx
292 ; CHECK-NEXT: vcvttsd2si {sae}, %xmm0, %rax
293 ; CHECK-NEXT: addq %rcx, %rax
295 %res0 = call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> %a0, i32 4) ;
296 %res1 = call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> %a0, i32 8) ;
297 %res2 = add i64 %res0, %res1
300 declare i64 @llvm.x86.avx512.cvttsd2si64(<2 x double>, i32) nounwind readnone
302 define i32 @test_x86_avx512_cvttsd2usi(<2 x double> %a0) {
303 ; CHECK-LABEL: test_x86_avx512_cvttsd2usi:
305 ; CHECK-NEXT: vcvttsd2usi %xmm0, %ecx
306 ; CHECK-NEXT: vcvttsd2usi {sae}, %xmm0, %eax
307 ; CHECK-NEXT: addl %ecx, %eax
309 %res0 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 4) ;
310 %res1 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 8) ;
311 %res2 = add i32 %res0, %res1
314 declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32) nounwind readnone
316 define i32 @test_x86_avx512_cvttsd2si(<2 x double> %a0) {
317 ; CHECK-LABEL: test_x86_avx512_cvttsd2si:
319 ; CHECK-NEXT: vcvttsd2si %xmm0, %ecx
320 ; CHECK-NEXT: vcvttsd2si {sae}, %xmm0, %eax
321 ; CHECK-NEXT: addl %ecx, %eax
323 %res0 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 4) ;
324 %res1 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 8) ;
325 %res2 = add i32 %res0, %res1
328 declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) nounwind readnone
332 define i64 @test_x86_avx512_cvttsd2usi64(<2 x double> %a0) {
333 ; CHECK-LABEL: test_x86_avx512_cvttsd2usi64:
335 ; CHECK-NEXT: vcvttsd2usi %xmm0, %rcx
336 ; CHECK-NEXT: vcvttsd2usi {sae}, %xmm0, %rax
337 ; CHECK-NEXT: addq %rcx, %rax
339 %res0 = call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> %a0, i32 4) ;
340 %res1 = call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> %a0, i32 8) ;
341 %res2 = add i64 %res0, %res1
344 declare i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double>, i32) nounwind readnone
; --- float <-> integer conversion tests (ss forms), mirroring the sd group above,
; plus cvtsd2usi64. i32 4 = current rounding, i32 8 = {sae}.
; NOTE(review): some trailing ret/'}' lines elided by extraction.
346 define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
347 ; CHECK-LABEL: test_x86_sse_cvtss2si64:
349 ; CHECK-NEXT: vcvtss2si %xmm0, %rax
351 %res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; <i64> [#uses=1]
354 declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
357 define <4 x float> @test_x86_sse_cvtsi642ss(<4 x float> %a0, i64 %a1) {
358 ; CHECK-LABEL: test_x86_sse_cvtsi642ss:
360 ; CHECK-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
362 %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ; <<4 x float>> [#uses=1]
365 declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
368 define i32 @test_x86_avx512_cvttss2si(<4 x float> %a0) {
369 ; CHECK-LABEL: test_x86_avx512_cvttss2si:
371 ; CHECK-NEXT: vcvttss2si {sae}, %xmm0, %ecx
372 ; CHECK-NEXT: vcvttss2si %xmm0, %eax
373 ; CHECK-NEXT: addl %ecx, %eax
375 %res0 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 8) ;
376 %res1 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 4) ;
377 %res2 = add i32 %res0, %res1
380 declare i32 @llvm.x86.avx512.cvttss2si(<4 x float>, i32) nounwind readnone
382 define i64 @test_x86_avx512_cvttss2si64(<4 x float> %a0) {
383 ; CHECK-LABEL: test_x86_avx512_cvttss2si64:
385 ; CHECK-NEXT: vcvttss2si %xmm0, %rcx
386 ; CHECK-NEXT: vcvttss2si {sae}, %xmm0, %rax
387 ; CHECK-NEXT: addq %rcx, %rax
389 %res0 = call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> %a0, i32 4) ;
390 %res1 = call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> %a0, i32 8) ;
391 %res2 = add i64 %res0, %res1
394 declare i64 @llvm.x86.avx512.cvttss2si64(<4 x float>, i32) nounwind readnone
396 define i32 @test_x86_avx512_cvttss2usi(<4 x float> %a0) {
397 ; CHECK-LABEL: test_x86_avx512_cvttss2usi:
399 ; CHECK-NEXT: vcvttss2usi {sae}, %xmm0, %ecx
400 ; CHECK-NEXT: vcvttss2usi %xmm0, %eax
401 ; CHECK-NEXT: addl %ecx, %eax
403 %res0 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 8) ;
404 %res1 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 4) ;
405 %res2 = add i32 %res0, %res1
408 declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone
410 define i64 @test_x86_avx512_cvttss2usi64(<4 x float> %a0) {
411 ; CHECK-LABEL: test_x86_avx512_cvttss2usi64:
413 ; CHECK-NEXT: vcvttss2usi %xmm0, %rcx
414 ; CHECK-NEXT: vcvttss2usi {sae}, %xmm0, %rax
415 ; CHECK-NEXT: addq %rcx, %rax
417 %res0 = call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> %a0, i32 4) ;
418 %res1 = call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> %a0, i32 8) ;
419 %res2 = add i64 %res0, %res1
422 declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone
424 define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
425 ; CHECK-LABEL: test_x86_avx512_cvtsd2usi64:
427 ; CHECK-NEXT: vcvtsd2usi %xmm0, %rax
429 %res = call i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double> %a0) ; <i64> [#uses=1]
432 declare i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double>) nounwind readnone
; --- half-precision conversion tests: vcvtph2ps (plain, {sae}, merge-masked,
; zero-masked) and vcvtps2ph with imm 2 ---
; NOTE(review): some trailing ret/'}' lines elided by extraction.
434 define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) {
435 ; CHECK-LABEL: test_x86_vcvtph2ps_512:
437 ; CHECK-NEXT: vcvtph2ps %ymm0, %zmm0
439 %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
440 ret <16 x float> %res
443 define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) {
444 ; CHECK-LABEL: test_x86_vcvtph2ps_512_sae:
446 ; CHECK-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0
448 %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
449 ret <16 x float> %res
452 define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, i16 %mask) {
453 ; CHECK-LABEL: test_x86_vcvtph2ps_512_rrk:
455 ; CHECK-NEXT: kmovw %edi, %k1
456 ; CHECK-NEXT: vcvtph2ps %ymm0, %zmm1 {%k1}
457 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
459 %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> %a1, i16 %mask, i32 4)
460 ret <16 x float> %res
463 define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) {
464 ; CHECK-LABEL: test_x86_vcvtph2ps_512_sae_rrkz:
466 ; CHECK-NEXT: kmovw %edi, %k1
467 ; CHECK-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z}
469 %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 8)
470 ret <16 x float> %res
473 define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) {
474 ; CHECK-LABEL: test_x86_vcvtph2ps_512_rrkz:
476 ; CHECK-NEXT: kmovw %edi, %k1
477 ; CHECK-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z}
479 %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4)
480 ret <16 x float> %res
483 declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly
486 define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0) {
487 ; CHECK-LABEL: test_x86_vcvtps2ph_256:
489 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm0
491 %res = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
495 declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly
; --- broadcast tests: memory-operand vbroadcastss/sd, then masked register
; forms (unmasked, merge-masked, zero-masked results summed together) ---
; NOTE(review): some trailing ret/'}' lines elided by extraction.
497 define <16 x float> @test_x86_vbroadcast_ss_512(i8* %a0) {
498 ; CHECK-LABEL: test_x86_vbroadcast_ss_512:
500 ; CHECK-NEXT: vbroadcastss (%rdi), %zmm0
502 %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8* %a0) ; <<16 x float>> [#uses=1]
503 ret <16 x float> %res
505 declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8*) nounwind readonly
507 define <8 x double> @test_x86_vbroadcast_sd_512(i8* %a0) {
508 ; CHECK-LABEL: test_x86_vbroadcast_sd_512:
510 ; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0
512 %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8* %a0) ; <<8 x double>> [#uses=1]
513 ret <8 x double> %res
515 declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly
517 define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) {
518 ; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512:
519 ; CHECK: kmovw %edi, %k1
520 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
521 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm2 {%k1} {z}
522 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
523 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
525 %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 -1)
526 %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask)
527 %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask)
528 %res3 = fadd <16 x float> %res, %res1
529 %res4 = fadd <16 x float> %res2, %res3
530 ret <16 x float> %res4
532 declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly
535 define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask ) {
536 ; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512:
537 ; CHECK: kmovw %eax, %k1
538 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
539 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2 {%k1} {z}
540 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
541 ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
543 %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 -1)
544 %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask)
545 %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask)
546 %res3 = fadd <8 x double> %res, %res1
547 %res4 = fadd <8 x double> %res2, %res3
548 ret <8 x double> %res4
550 declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8 x double>, i8) nounwind readonly
; --- integer broadcast tests: vpbroadcastd/q from xmm (all mask shapes) and
; from a GPR (edi/rdi) ---
; NOTE(review): some trailing ret/'}' lines elided by extraction.
552 define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) {
553 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_512:
555 ; CHECK-NEXT: kmovw %edi, %k1
556 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1}
557 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2 {%k1} {z}
558 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0
559 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
560 ; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
562 %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
563 %res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask)
564 %res2 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
565 %res3 = add <16 x i32> %res, %res1
566 %res4 = add <16 x i32> %res2, %res3
569 declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16)
571 define <16 x i32> @test_x86_pbroadcastd_i32_512(i32 %a0) {
572 ; CHECK-LABEL: test_x86_pbroadcastd_i32_512:
574 ; CHECK-NEXT: vpbroadcastd %edi, %zmm0
576 %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32 %a0) ; <<16 x i32>> [#uses=1]
579 declare <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32) nounwind readonly
581 define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) {
582 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512:
584 ; CHECK-NEXT: movzbl %dil, %eax
585 ; CHECK-NEXT: kmovw %eax, %k1
586 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1}
587 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2 {%k1} {z}
588 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0
589 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
590 ; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
592 %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1)
593 %res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask)
594 %res2 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer,i8 %mask)
595 %res3 = add <8 x i64> %res, %res1
596 %res4 = add <8 x i64> %res2, %res3
599 declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8)
601 define <8 x i64> @test_x86_pbroadcastq_i64_512(i64 %a0) {
602 ; CHECK-LABEL: test_x86_pbroadcastq_i64_512:
604 ; CHECK-NEXT: vpbroadcastq %rdi, %zmm0
606 %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64 %a0) ; <<8 x i64>> [#uses=1]
609 declare <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64) nounwind readonly
; --- CDI tests: vpconflictd/q and vplzcntd/q in unmasked, zero-masked, and
; merge-masked forms; i8 masks get movzbl before kmovw ---
; NOTE(review): some trailing ret/'}' lines elided by extraction.
611 define <16 x i32> @test_conflict_d(<16 x i32> %a) {
612 ; CHECK-LABEL: test_conflict_d:
614 ; CHECK-NEXT: vpconflictd %zmm0, %zmm0
616 %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
620 declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
622 define <8 x i64> @test_conflict_q(<8 x i64> %a) {
623 ; CHECK-LABEL: test_conflict_q:
625 ; CHECK-NEXT: vpconflictq %zmm0, %zmm0
627 %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
631 declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
633 define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
634 ; CHECK-LABEL: test_maskz_conflict_d:
636 ; CHECK-NEXT: kmovw %edi, %k1
637 ; CHECK-NEXT: vpconflictd %zmm0, %zmm0 {%k1} {z}
639 %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 %mask)
643 define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
644 ; CHECK-LABEL: test_mask_conflict_q:
646 ; CHECK-NEXT: movzbl %dil, %eax
647 ; CHECK-NEXT: kmovw %eax, %k1
648 ; CHECK-NEXT: vpconflictq %zmm0, %zmm1 {%k1}
649 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
651 %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
655 define <16 x i32> @test_lzcnt_d(<16 x i32> %a) {
656 ; CHECK-LABEL: test_lzcnt_d:
658 ; CHECK-NEXT: vplzcntd %zmm0, %zmm0
660 %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
664 declare <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
666 define <8 x i64> @test_lzcnt_q(<8 x i64> %a) {
667 ; CHECK-LABEL: test_lzcnt_q:
669 ; CHECK-NEXT: vplzcntq %zmm0, %zmm0
671 %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
675 declare <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
678 define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
679 ; CHECK-LABEL: test_mask_lzcnt_d:
681 ; CHECK-NEXT: kmovw %edi, %k1
682 ; CHECK-NEXT: vplzcntd %zmm0, %zmm1 {%k1}
683 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
685 %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
689 define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
690 ; CHECK-LABEL: test_mask_lzcnt_q:
692 ; CHECK-NEXT: movzbl %dil, %eax
693 ; CHECK-NEXT: kmovw %eax, %k1
694 ; CHECK-NEXT: vplzcntq %zmm0, %zmm1 {%k1}
695 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
697 %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
; --- masked blend tests (ps/pd/d/q), including a memory-operand pd form ---
; NOTE(review): some trailing ret/'}' lines elided by extraction.
701 define <16 x float> @test_x86_mask_blend_ps_512(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
702 ; CHECK-LABEL: test_x86_mask_blend_ps_512:
704 ; CHECK-NEXT: kmovw %edi, %k1
705 ; CHECK-NEXT: vblendmps %zmm1, %zmm0, %zmm0 {%k1}
707 %res = call <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float> %a1, <16 x float> %a2, i16 %a0) ; <<16 x float>> [#uses=1]
708 ret <16 x float> %res
711 declare <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float>, <16 x float>, i16) nounwind readonly
713 define <8 x double> @test_x86_mask_blend_pd_512(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
714 ; CHECK-LABEL: test_x86_mask_blend_pd_512:
716 ; CHECK-NEXT: movzbl %dil, %eax
717 ; CHECK-NEXT: kmovw %eax, %k1
718 ; CHECK-NEXT: vblendmpd %zmm1, %zmm0, %zmm0 {%k1}
720 %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a1, <8 x double> %a2, i8 %a0) ; <<8 x double>> [#uses=1]
721 ret <8 x double> %res
724 define <8 x double> @test_x86_mask_blend_pd_512_memop(<8 x double> %a, <8 x double>* %ptr, i8 %mask) {
725 ; CHECK-LABEL: test_x86_mask_blend_pd_512_memop:
727 ; CHECK-NEXT: movzbl %sil, %eax
728 ; CHECK-NEXT: kmovw %eax, %k1
729 ; CHECK-NEXT: vblendmpd (%rdi), %zmm0, %zmm0 {%k1}
731 %b = load <8 x double>, <8 x double>* %ptr
732 %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a, <8 x double> %b, i8 %mask) ; <<8 x double>> [#uses=1]
733 ret <8 x double> %res
735 declare <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double>, <8 x double>, i8) nounwind readonly
737 define <16 x i32> @test_x86_mask_blend_d_512(i16 %a0, <16 x i32> %a1, <16 x i32> %a2) {
738 ; CHECK-LABEL: test_x86_mask_blend_d_512:
740 ; CHECK-NEXT: kmovw %edi, %k1
741 ; CHECK-NEXT: vpblendmd %zmm1, %zmm0, %zmm0 {%k1}
743 %res = call <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i32> %a1, <16 x i32> %a2, i16 %a0) ; <<16 x i32>> [#uses=1]
746 declare <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
748 define <8 x i64> @test_x86_mask_blend_q_512(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
749 ; CHECK-LABEL: test_x86_mask_blend_q_512:
751 ; CHECK-NEXT: movzbl %dil, %eax
752 ; CHECK-NEXT: kmovw %eax, %k1
753 ; CHECK-NEXT: vpblendmq %zmm1, %zmm0, %zmm0 {%k1}
755 %res = call <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64> %a1, <8 x i64> %a2, i8 %a0) ; <<8 x i64>> [#uses=1]
758 declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
; --- compare-to-mask tests (cmp.ps imm 2 = le with {sae}; cmp.pd imm 4 = neq)
; and unmasked vmaxpd/vminpd ---
; NOTE(review): some trailing ret/'}' lines elided by extraction.
760 define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
761 ; CHECK-LABEL: test_cmpps:
763 ; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
764 ; CHECK-NEXT: kmovw %k0, %eax
766 %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
769 declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32)
771 define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
772 ; CHECK-LABEL: test_cmppd:
774 ; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0
775 ; CHECK-NEXT: kmovw %k0, %eax
777 %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
780 declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)
783 define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
784 ; CHECK-LABEL: test_vmaxpd:
786 ; CHECK-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
788 %res = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a0, <8 x double> %a1,
789 <8 x double>zeroinitializer, i8 -1, i32 4)
790 ret <8 x double> %res
792 declare <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double>, <8 x double>,
793 <8 x double>, i8, i32)
795 define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) {
796 ; CHECK-LABEL: test_vminpd:
798 ; CHECK-NEXT: vminpd %zmm1, %zmm0, %zmm0
800 %res = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a0, <8 x double> %a1,
801 <8 x double>zeroinitializer, i8 -1, i32 4)
802 ret <8 x double> %res
804 declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>,
805 <8 x double>, i8, i32)
; --- vpabsd/q (masked + unmasked, results summed) and vptestmd/q tests ---
; NOTE(review): some trailing ret/'}' lines elided by extraction.
807 declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16)
809 define <16 x i32>@test_int_x86_avx512_mask_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
810 ; CHECK-LABEL: test_int_x86_avx512_mask_pabs_d_512:
812 ; CHECK-NEXT: kmovw %edi, %k1
813 ; CHECK-NEXT: vpabsd %zmm0, %zmm1 {%k1}
814 ; CHECK-NEXT: vpabsd %zmm0, %zmm0
815 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
817 %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
818 %res1 = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 -1)
819 %res2 = add <16 x i32> %res, %res1
823 declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8)
825 define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
826 ; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_512:
828 ; CHECK-NEXT: movzbl %dil, %eax
829 ; CHECK-NEXT: kmovw %eax, %k1
830 ; CHECK-NEXT: vpabsq %zmm0, %zmm1 {%k1}
831 ; CHECK-NEXT: vpabsq %zmm0, %zmm0
832 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
834 %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
835 %res1 = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 -1)
836 %res2 = add <8 x i64> %res, %res1
840 define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1) {
841 ; CHECK-LABEL: test_vptestmq:
843 ; CHECK-NEXT: vptestmq %zmm1, %zmm0, %k0
844 ; CHECK-NEXT: kmovw %k0, %eax
846 %res = call i8 @llvm.x86.avx512.mask.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
849 declare i8 @llvm.x86.avx512.mask.ptestm.q.512(<8 x i64>, <8 x i64>, i8)
851 define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1) {
852 ; CHECK-LABEL: test_vptestmd:
854 ; CHECK-NEXT: vptestmd %zmm1, %zmm0, %k0
855 ; CHECK-NEXT: kmovw %k0, %eax
857 %res = call i16 @llvm.x86.avx512.mask.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1)
860 declare i16 @llvm.x86.avx512.mask.ptestm.d.512(<16 x i32>, <16 x i32>, i16)
862 define void @test_store1(<16 x float> %data, i8* %ptr, i16 %mask) {
863 ; CHECK-LABEL: test_store1:
865 ; CHECK-NEXT: kmovw %esi, %k1
866 ; CHECK-NEXT: vmovups %zmm0, (%rdi) {%k1}
868 call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
872 declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16 )
874 define void @test_store2(<8 x double> %data, i8* %ptr, i8 %mask) {
875 ; CHECK-LABEL: test_store2:
877 ; CHECK-NEXT: kmovw %esi, %k1
878 ; CHECK-NEXT: vmovupd %zmm0, (%rdi) {%k1}
880 call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
884 declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)
886 define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
887 ; CHECK-LABEL: test_mask_store_aligned_ps:
889 ; CHECK-NEXT: kmovw %esi, %k1
890 ; CHECK-NEXT: vmovaps %zmm0, (%rdi) {%k1}
892 call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
896 declare void @llvm.x86.avx512.mask.store.ps.512(i8*, <16 x float>, i16 )
898 define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
899 ; CHECK-LABEL: test_mask_store_aligned_pd:
901 ; CHECK-NEXT: kmovw %esi, %k1
902 ; CHECK-NEXT: vmovapd %zmm0, (%rdi) {%k1}
904 call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
908 declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)
910 define <16 x float> @test_maskz_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
911 ; CHECK-LABEL: test_maskz_load_aligned_ps:
913 ; CHECK-NEXT: kmovw %esi, %k1
914 ; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
916 %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
917 ret <16 x float> %res
920 declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)
922 define <8 x double> @test_maskz_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
923 ; CHECK-LABEL: test_maskz_load_aligned_pd:
925 ; CHECK-NEXT: kmovw %esi, %k1
926 ; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z}
928 %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
929 ret <8 x double> %res
932 declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
934 define <16 x float> @test_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
935 ; CHECK-LABEL: test_load_aligned_ps:
937 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
939 %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
940 ret <16 x float> %res
943 define <8 x double> @test_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
944 ; CHECK-LABEL: test_load_aligned_pd:
946 ; CHECK-NEXT: vmovapd (%rdi), %zmm0
948 %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
949 ret <8 x double> %res
952 declare <8 x i64> @llvm.x86.avx512.movntdqa(i8*)
954 define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
955 ; CHECK-LABEL: test_valign_q:
957 ; CHECK-NEXT: valignq $2, %zmm1, %zmm0, %zmm0
959 %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> zeroinitializer, i8 -1)
963 define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) {
964 ; CHECK-LABEL: test_mask_valign_q:
966 ; CHECK-NEXT: movzbl %dil, %eax
967 ; CHECK-NEXT: kmovw %eax, %k1
968 ; CHECK-NEXT: valignq $2, %zmm1, %zmm0, %zmm2 {%k1}
969 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
971 %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> %src, i8 %mask)
975 declare <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
977 define <16 x i32> @test_maskz_valign_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
978 ; CHECK-LABEL: test_maskz_valign_d:
980 ; CHECK-NEXT: kmovw %edi, %k1
981 ; CHECK-NEXT: valignd $5, %zmm1, %zmm0, %zmm0 {%k1} {z}
983 %res = call <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32> %a, <16 x i32> %b, i32 5, <16 x i32> zeroinitializer, i16 %mask)
987 declare <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
989 define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) {
990 ; CHECK-LABEL: test_mask_store_ss:
992 ; CHECK-NEXT: kmovw %esi, %k1
993 ; CHECK-NEXT: vmovss %xmm0, (%rdi) {%k1}
995 call void @llvm.x86.avx512.mask.store.ss(i8* %ptr, <4 x float> %data, i8 %mask)
999 declare void @llvm.x86.avx512.mask.store.ss(i8*, <4 x float>, i8 )
1001 define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
1002 ; CHECK-LABEL: test_pcmpeq_d:
1004 ; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
1005 ; CHECK-NEXT: kmovw %k0, %eax
1007 %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
1011 define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
1012 ; CHECK-LABEL: test_mask_pcmpeq_d:
1014 ; CHECK-NEXT: kmovw %edi, %k1
1015 ; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
1016 ; CHECK-NEXT: kmovw %k0, %eax
1018 %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
1022 declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16)
1024 define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
1025 ; CHECK-LABEL: test_pcmpeq_q:
1027 ; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
1028 ; CHECK-NEXT: kmovw %k0, %eax
1030 %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
1034 define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
1035 ; CHECK-LABEL: test_mask_pcmpeq_q:
1037 ; CHECK-NEXT: movzbl %dil, %eax
1038 ; CHECK-NEXT: kmovw %eax, %k1
1039 ; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
1040 ; CHECK-NEXT: kmovw %k0, %eax
1042 %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
1046 declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8)
1048 define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
1049 ; CHECK-LABEL: test_pcmpgt_d:
1051 ; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
1052 ; CHECK-NEXT: kmovw %k0, %eax
1054 %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
1058 define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
1059 ; CHECK-LABEL: test_mask_pcmpgt_d:
1061 ; CHECK-NEXT: kmovw %edi, %k1
1062 ; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
1063 ; CHECK-NEXT: kmovw %k0, %eax
1065 %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
1069 declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16)
1071 define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
1072 ; CHECK-LABEL: test_pcmpgt_q:
1074 ; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
1075 ; CHECK-NEXT: kmovw %k0, %eax
1077 %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
1081 define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
1082 ; CHECK-LABEL: test_mask_pcmpgt_q:
1084 ; CHECK-NEXT: movzbl %dil, %eax
1085 ; CHECK-NEXT: kmovw %eax, %k1
1086 ; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
1087 ; CHECK-NEXT: kmovw %k0, %eax
1089 %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
1093 declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)
1095 define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
1096 ; CHECK-LABEL: test_cmp_d_512:
1098 ; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
1099 ; CHECK-NEXT: kmovw %k0, %r8d
1100 ; CHECK-NEXT: vpcmpltd %zmm1, %zmm0, %k0
1101 ; CHECK-NEXT: kmovw %k0, %r9d
1102 ; CHECK-NEXT: vpcmpled %zmm1, %zmm0, %k0
1103 ; CHECK-NEXT: kmovw %k0, %r10d
1104 ; CHECK-NEXT: vpcmpunordd %zmm1, %zmm0, %k0
1105 ; CHECK-NEXT: kmovw %k0, %esi
1106 ; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
1107 ; CHECK-NEXT: kmovw %k0, %edi
1108 ; CHECK-NEXT: vpcmpnltd %zmm1, %zmm0, %k0
1109 ; CHECK-NEXT: kmovw %k0, %eax
1110 ; CHECK-NEXT: vpcmpnled %zmm1, %zmm0, %k0
1111 ; CHECK-NEXT: kmovw %k0, %ecx
1112 ; CHECK-NEXT: vpcmpordd %zmm1, %zmm0, %k0
1113 ; CHECK-NEXT: kmovw %k0, %edx
1114 ; CHECK-NEXT: vmovd %r8d, %xmm0
1115 ; CHECK-NEXT: vpinsrw $1, %r9d, %xmm0, %xmm0
1116 ; CHECK-NEXT: vpinsrw $2, %r10d, %xmm0, %xmm0
1117 ; CHECK-NEXT: vpinsrw $3, %esi, %xmm0, %xmm0
1118 ; CHECK-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0
1119 ; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
1120 ; CHECK-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
1121 ; CHECK-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
1123 %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
1124 %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
1125 %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
1126 %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
1127 %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
1128 %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
1129 %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
1130 %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
1131 %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
1132 %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
1133 %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
1134 %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
1135 %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
1136 %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
1137 %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
1138 %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
1142 define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1143 ; CHECK-LABEL: test_mask_cmp_d_512:
1145 ; CHECK-NEXT: kmovw %edi, %k1
1146 ; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
1147 ; CHECK-NEXT: kmovw %k0, %r8d
1148 ; CHECK-NEXT: vpcmpltd %zmm1, %zmm0, %k0 {%k1}
1149 ; CHECK-NEXT: kmovw %k0, %r9d
1150 ; CHECK-NEXT: vpcmpled %zmm1, %zmm0, %k0 {%k1}
1151 ; CHECK-NEXT: kmovw %k0, %r10d
1152 ; CHECK-NEXT: vpcmpunordd %zmm1, %zmm0, %k0 {%k1}
1153 ; CHECK-NEXT: kmovw %k0, %esi
1154 ; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1}
1155 ; CHECK-NEXT: kmovw %k0, %edi
1156 ; CHECK-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 {%k1}
1157 ; CHECK-NEXT: kmovw %k0, %eax
1158 ; CHECK-NEXT: vpcmpnled %zmm1, %zmm0, %k0 {%k1}
1159 ; CHECK-NEXT: kmovw %k0, %ecx
1160 ; CHECK-NEXT: vpcmpordd %zmm1, %zmm0, %k0 {%k1}
1161 ; CHECK-NEXT: kmovw %k0, %edx
1162 ; CHECK-NEXT: vmovd %r8d, %xmm0
1163 ; CHECK-NEXT: vpinsrw $1, %r9d, %xmm0, %xmm0
1164 ; CHECK-NEXT: vpinsrw $2, %r10d, %xmm0, %xmm0
1165 ; CHECK-NEXT: vpinsrw $3, %esi, %xmm0, %xmm0
1166 ; CHECK-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0
1167 ; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
1168 ; CHECK-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
1169 ; CHECK-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
1171 %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
1172 %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
1173 %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
1174 %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
1175 %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
1176 %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
1177 %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
1178 %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
1179 %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
1180 %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
1181 %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
1182 %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
1183 %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
1184 %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
1185 %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
1186 %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
1190 declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
1192 define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
1193 ; CHECK-LABEL: test_ucmp_d_512:
1195 ; CHECK-NEXT: vpcmpequd %zmm1, %zmm0, %k0
1196 ; CHECK-NEXT: kmovw %k0, %r8d
1197 ; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0
1198 ; CHECK-NEXT: kmovw %k0, %r9d
1199 ; CHECK-NEXT: vpcmpleud %zmm1, %zmm0, %k0
1200 ; CHECK-NEXT: kmovw %k0, %r10d
1201 ; CHECK-NEXT: vpcmpunordud %zmm1, %zmm0, %k0
1202 ; CHECK-NEXT: kmovw %k0, %esi
1203 ; CHECK-NEXT: vpcmpnequd %zmm1, %zmm0, %k0
1204 ; CHECK-NEXT: kmovw %k0, %edi
1205 ; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k0
1206 ; CHECK-NEXT: kmovw %k0, %eax
1207 ; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
1208 ; CHECK-NEXT: kmovw %k0, %ecx
1209 ; CHECK-NEXT: vpcmpordud %zmm1, %zmm0, %k0
1210 ; CHECK-NEXT: kmovw %k0, %edx
1211 ; CHECK-NEXT: vmovd %r8d, %xmm0
1212 ; CHECK-NEXT: vpinsrw $1, %r9d, %xmm0, %xmm0
1213 ; CHECK-NEXT: vpinsrw $2, %r10d, %xmm0, %xmm0
1214 ; CHECK-NEXT: vpinsrw $3, %esi, %xmm0, %xmm0
1215 ; CHECK-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0
1216 ; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
1217 ; CHECK-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
1218 ; CHECK-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
1220 %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
1221 %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
1222 %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
1223 %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
1224 %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
1225 %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
1226 %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
1227 %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
1228 %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
1229 %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
1230 %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
1231 %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
1232 %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
1233 %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
1234 %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
1235 %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
1239 define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1240 ; CHECK-LABEL: test_mask_ucmp_d_512:
1242 ; CHECK-NEXT: kmovw %edi, %k1
1243 ; CHECK-NEXT: vpcmpequd %zmm1, %zmm0, %k0 {%k1}
1244 ; CHECK-NEXT: kmovw %k0, %r8d
1245 ; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
1246 ; CHECK-NEXT: kmovw %k0, %r9d
1247 ; CHECK-NEXT: vpcmpleud %zmm1, %zmm0, %k0 {%k1}
1248 ; CHECK-NEXT: kmovw %k0, %r10d
1249 ; CHECK-NEXT: vpcmpunordud %zmm1, %zmm0, %k0 {%k1}
1250 ; CHECK-NEXT: kmovw %k0, %esi
1251 ; CHECK-NEXT: vpcmpnequd %zmm1, %zmm0, %k0 {%k1}
1252 ; CHECK-NEXT: kmovw %k0, %edi
1253 ; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 {%k1}
1254 ; CHECK-NEXT: kmovw %k0, %eax
1255 ; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 {%k1}
1256 ; CHECK-NEXT: kmovw %k0, %ecx
1257 ; CHECK-NEXT: vpcmpordud %zmm1, %zmm0, %k0 {%k1}
1258 ; CHECK-NEXT: kmovw %k0, %edx
1259 ; CHECK-NEXT: vmovd %r8d, %xmm0
1260 ; CHECK-NEXT: vpinsrw $1, %r9d, %xmm0, %xmm0
1261 ; CHECK-NEXT: vpinsrw $2, %r10d, %xmm0, %xmm0
1262 ; CHECK-NEXT: vpinsrw $3, %esi, %xmm0, %xmm0
1263 ; CHECK-NEXT: vpinsrw $4, %edi, %xmm0, %xmm0
1264 ; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
1265 ; CHECK-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
1266 ; CHECK-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
1268 %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
1269 %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
1270 %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
1271 %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
1272 %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
1273 %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
1274 %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
1275 %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
1276 %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
1277 %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
1278 %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
1279 %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
1280 %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
1281 %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
1282 %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
1283 %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
1287 declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
1289 define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
1290 ; CHECK-LABEL: test_cmp_q_512:
1292 ; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
1293 ; CHECK-NEXT: kmovw %k0, %r8d
1294 ; CHECK-NEXT: vpcmpltq %zmm1, %zmm0, %k0
1295 ; CHECK-NEXT: kmovw %k0, %r9d
1296 ; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k0
1297 ; CHECK-NEXT: kmovw %k0, %r10d
1298 ; CHECK-NEXT: vpcmpunordq %zmm1, %zmm0, %k0
1299 ; CHECK-NEXT: kmovw %k0, %r11d
1300 ; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k0
1301 ; CHECK-NEXT: kmovw %k0, %edi
1302 ; CHECK-NEXT: vpcmpnltq %zmm1, %zmm0, %k0
1303 ; CHECK-NEXT: kmovw %k0, %eax
1304 ; CHECK-NEXT: vpcmpnleq %zmm1, %zmm0, %k0
1305 ; CHECK-NEXT: kmovw %k0, %ecx
1306 ; CHECK-NEXT: vpcmpordq %zmm1, %zmm0, %k0
1307 ; CHECK-NEXT: kmovw %k0, %edx
1308 ; CHECK-NEXT: movzbl %r8b, %esi
1309 ; CHECK-NEXT: vpinsrb $0, %esi, %xmm0, %xmm0
1310 ; CHECK-NEXT: movzbl %r9b, %esi
1311 ; CHECK-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0
1312 ; CHECK-NEXT: movzbl %r10b, %esi
1313 ; CHECK-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
1314 ; CHECK-NEXT: movzbl %r11b, %esi
1315 ; CHECK-NEXT: vpinsrb $6, %esi, %xmm0, %xmm0
1316 ; CHECK-NEXT: movzbl %dil, %esi
1317 ; CHECK-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
1318 ; CHECK-NEXT: movzbl %al, %eax
1319 ; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
1320 ; CHECK-NEXT: movzbl %cl, %eax
1321 ; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
1322 ; CHECK-NEXT: movzbl %dl, %eax
1323 ; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
1325 %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
1326 %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
1327 %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
1328 %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
1329 %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
1330 %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
1331 %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
1332 %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
1333 %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
1334 %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
1335 %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
1336 %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
1337 %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
1338 %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
1339 %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
1340 %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
1344 define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1345 ; CHECK-LABEL: test_mask_cmp_q_512:
1347 ; CHECK-NEXT: movzbl %dil, %eax
1348 ; CHECK-NEXT: kmovw %eax, %k1
1349 ; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
1350 ; CHECK-NEXT: kmovw %k0, %r8d
1351 ; CHECK-NEXT: vpcmpltq %zmm1, %zmm0, %k0 {%k1}
1352 ; CHECK-NEXT: kmovw %k0, %r9d
1353 ; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k0 {%k1}
1354 ; CHECK-NEXT: kmovw %k0, %r10d
1355 ; CHECK-NEXT: vpcmpunordq %zmm1, %zmm0, %k0 {%k1}
1356 ; CHECK-NEXT: kmovw %k0, %r11d
1357 ; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 {%k1}
1358 ; CHECK-NEXT: kmovw %k0, %edi
1359 ; CHECK-NEXT: vpcmpnltq %zmm1, %zmm0, %k0 {%k1}
1360 ; CHECK-NEXT: kmovw %k0, %eax
1361 ; CHECK-NEXT: vpcmpnleq %zmm1, %zmm0, %k0 {%k1}
1362 ; CHECK-NEXT: kmovw %k0, %ecx
1363 ; CHECK-NEXT: vpcmpordq %zmm1, %zmm0, %k0 {%k1}
1364 ; CHECK-NEXT: kmovw %k0, %edx
1365 ; CHECK-NEXT: movzbl %r8b, %esi
1366 ; CHECK-NEXT: vpinsrb $0, %esi, %xmm0, %xmm0
1367 ; CHECK-NEXT: movzbl %r9b, %esi
1368 ; CHECK-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0
1369 ; CHECK-NEXT: movzbl %r10b, %esi
1370 ; CHECK-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
1371 ; CHECK-NEXT: movzbl %r11b, %esi
1372 ; CHECK-NEXT: vpinsrb $6, %esi, %xmm0, %xmm0
1373 ; CHECK-NEXT: movzbl %dil, %esi
1374 ; CHECK-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
1375 ; CHECK-NEXT: movzbl %al, %eax
1376 ; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
1377 ; CHECK-NEXT: movzbl %cl, %eax
1378 ; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
1379 ; CHECK-NEXT: movzbl %dl, %eax
1380 ; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
1382 %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
1383 %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
1384 %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
1385 %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
1386 %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
1387 %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
1388 %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
1389 %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
1390 %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
1391 %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
1392 %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
1393 %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
1394 %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
1395 %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
1396 %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
1397 %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
1401 declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
1403 define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
1404 ; CHECK-LABEL: test_ucmp_q_512:
1406 ; CHECK-NEXT: vpcmpequq %zmm1, %zmm0, %k0
1407 ; CHECK-NEXT: kmovw %k0, %r8d
1408 ; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
1409 ; CHECK-NEXT: kmovw %k0, %r9d
1410 ; CHECK-NEXT: vpcmpleuq %zmm1, %zmm0, %k0
1411 ; CHECK-NEXT: kmovw %k0, %r10d
1412 ; CHECK-NEXT: vpcmpunorduq %zmm1, %zmm0, %k0
1413 ; CHECK-NEXT: kmovw %k0, %r11d
1414 ; CHECK-NEXT: vpcmpnequq %zmm1, %zmm0, %k0
1415 ; CHECK-NEXT: kmovw %k0, %edi
1416 ; CHECK-NEXT: vpcmpnltuq %zmm1, %zmm0, %k0
1417 ; CHECK-NEXT: kmovw %k0, %eax
1418 ; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0
1419 ; CHECK-NEXT: kmovw %k0, %ecx
1420 ; CHECK-NEXT: vpcmporduq %zmm1, %zmm0, %k0
1421 ; CHECK-NEXT: kmovw %k0, %edx
1422 ; CHECK-NEXT: movzbl %r8b, %esi
1423 ; CHECK-NEXT: vpinsrb $0, %esi, %xmm0, %xmm0
1424 ; CHECK-NEXT: movzbl %r9b, %esi
1425 ; CHECK-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0
1426 ; CHECK-NEXT: movzbl %r10b, %esi
1427 ; CHECK-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
1428 ; CHECK-NEXT: movzbl %r11b, %esi
1429 ; CHECK-NEXT: vpinsrb $6, %esi, %xmm0, %xmm0
1430 ; CHECK-NEXT: movzbl %dil, %esi
1431 ; CHECK-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
1432 ; CHECK-NEXT: movzbl %al, %eax
1433 ; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
1434 ; CHECK-NEXT: movzbl %cl, %eax
1435 ; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
1436 ; CHECK-NEXT: movzbl %dl, %eax
1437 ; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
1439 %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
1440 %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
1441 %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
1442 %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
1443 %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
1444 %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
1445 %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
1446 %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
1447 %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
1448 %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
1449 %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
1450 %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
1451 %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
1452 %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
1453 %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
1454 %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
1458 define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1459 ; CHECK-LABEL: test_mask_ucmp_q_512:
1461 ; CHECK-NEXT: movzbl %dil, %eax
1462 ; CHECK-NEXT: kmovw %eax, %k1
1463 ; CHECK-NEXT: vpcmpequq %zmm1, %zmm0, %k0 {%k1}
1464 ; CHECK-NEXT: kmovw %k0, %r8d
1465 ; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
1466 ; CHECK-NEXT: kmovw %k0, %r9d
1467 ; CHECK-NEXT: vpcmpleuq %zmm1, %zmm0, %k0 {%k1}
1468 ; CHECK-NEXT: kmovw %k0, %r10d
1469 ; CHECK-NEXT: vpcmpunorduq %zmm1, %zmm0, %k0 {%k1}
1470 ; CHECK-NEXT: kmovw %k0, %r11d
1471 ; CHECK-NEXT: vpcmpnequq %zmm1, %zmm0, %k0 {%k1}
1472 ; CHECK-NEXT: kmovw %k0, %edi
1473 ; CHECK-NEXT: vpcmpnltuq %zmm1, %zmm0, %k0 {%k1}
1474 ; CHECK-NEXT: kmovw %k0, %eax
1475 ; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0 {%k1}
1476 ; CHECK-NEXT: kmovw %k0, %ecx
1477 ; CHECK-NEXT: vpcmporduq %zmm1, %zmm0, %k0 {%k1}
1478 ; CHECK-NEXT: kmovw %k0, %edx
1479 ; CHECK-NEXT: movzbl %r8b, %esi
1480 ; CHECK-NEXT: vpinsrb $0, %esi, %xmm0, %xmm0
1481 ; CHECK-NEXT: movzbl %r9b, %esi
1482 ; CHECK-NEXT: vpinsrb $2, %esi, %xmm0, %xmm0
1483 ; CHECK-NEXT: movzbl %r10b, %esi
1484 ; CHECK-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
1485 ; CHECK-NEXT: movzbl %r11b, %esi
1486 ; CHECK-NEXT: vpinsrb $6, %esi, %xmm0, %xmm0
1487 ; CHECK-NEXT: movzbl %dil, %esi
1488 ; CHECK-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
1489 ; CHECK-NEXT: movzbl %al, %eax
1490 ; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
1491 ; CHECK-NEXT: movzbl %cl, %eax
1492 ; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
1493 ; CHECK-NEXT: movzbl %dl, %eax
1494 ; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
1496 %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
1497 %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
1498 %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
1499 %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
1500 %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
1501 %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
1502 %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
1503 %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
1504 %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
1505 %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
1506 %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
1507 %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
1508 %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
1509 %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
1510 %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
1511 %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
1515 declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
1517 define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
1518 ; CHECK-LABEL: test_mask_vextractf32x4:
1520 ; CHECK-NEXT: kmovw %edi, %k1
1521 ; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm0 {%k1}
1523 %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask)
1524 ret <4 x float> %res
1527 declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8)
1529 define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) {
1530 ; CHECK-LABEL: test_mask_vextracti64x4:
1532 ; CHECK-NEXT: kmovw %edi, %k1
1533 ; CHECK-NEXT: vextracti64x4 $2, %zmm1, %ymm0 {%k1}
1535 %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 2, <4 x i64> %b, i8 %mask)
1539 declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8)
1541 define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
1542 ; CHECK-LABEL: test_maskz_vextracti32x4:
1544 ; CHECK-NEXT: kmovw %edi, %k1
1545 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z}
1547 %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask)
1551 declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8)
1553 define <4 x double> @test_vextractf64x4(<8 x double> %a) {
1554 ; CHECK-LABEL: test_vextractf64x4:
1556 ; CHECK-NEXT: vextractf64x4 $2, %zmm0, %ymm0
1558 %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 2, <4 x double> zeroinitializer, i8 -1)
1559 ret <4 x double> %res
1562 declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8)
; Shift-left-by-immediate tests (vpslld / vpsllq) covering the three masking
; forms of llvm.x86.avx512.mask.pslli.{d,q}:
;   unmasked      -> mask = -1, plain instruction;
;   merge-masked  -> passthru operand %a1, {%k1} writemask, result copied
;                    back to %zmm0 with vmovaps;
;   zero-masked   -> zeroinitializer passthru, {%k1} {z}.
; The .q variants zero-extend the i8 mask (movzbl) before kmovw.
; unmasked i32 shift
1564 define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
1565 ; CHECK-LABEL: test_x86_avx512_pslli_d:
1567 ; CHECK-NEXT: vpslld $7, %zmm0, %zmm0
1569 %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
; merge-masked i32 shift into %a1
1573 define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1574 ; CHECK-LABEL: test_x86_avx512_mask_pslli_d:
1576 ; CHECK-NEXT: kmovw %edi, %k1
1577 ; CHECK-NEXT: vpslld $7, %zmm0, %zmm1 {%k1}
1578 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
1580 %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
; zero-masked i32 shift
1584 define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) {
1585 ; CHECK-LABEL: test_x86_avx512_maskz_pslli_d:
1587 ; CHECK-NEXT: kmovw %edi, %k1
1588 ; CHECK-NEXT: vpslld $7, %zmm0, %zmm0 {%k1} {z}
1590 %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
1594 declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
; unmasked i64 shift
1596 define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
1597 ; CHECK-LABEL: test_x86_avx512_pslli_q:
1599 ; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0
1601 %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
; merge-masked i64 shift (i8 mask extended via movzbl)
1605 define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1606 ; CHECK-LABEL: test_x86_avx512_mask_pslli_q:
1608 ; CHECK-NEXT: movzbl %dil, %eax
1609 ; CHECK-NEXT: kmovw %eax, %k1
1610 ; CHECK-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1}
1611 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
1613 %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
; zero-masked i64 shift
1617 define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
1618 ; CHECK-LABEL: test_x86_avx512_maskz_pslli_q:
1620 ; CHECK-NEXT: movzbl %dil, %eax
1621 ; CHECK-NEXT: kmovw %eax, %k1
1622 ; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
1624 %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
1628 declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
; Logical shift-right-by-immediate tests (vpsrld / vpsrlq): unmasked,
; merge-masked, and zero-masked forms of llvm.x86.avx512.mask.psrli.{d,q}.
; Same structure as the pslli tests above this family in the file.
; unmasked i32 shift
1630 define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
1631 ; CHECK-LABEL: test_x86_avx512_psrli_d:
1633 ; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0
1635 %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
; merge-masked i32 shift
1639 define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1640 ; CHECK-LABEL: test_x86_avx512_mask_psrli_d:
1642 ; CHECK-NEXT: kmovw %edi, %k1
1643 ; CHECK-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1}
1644 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
1646 %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
; zero-masked i32 shift
1650 define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) {
1651 ; CHECK-LABEL: test_x86_avx512_maskz_psrli_d:
1653 ; CHECK-NEXT: kmovw %edi, %k1
1654 ; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
1656 %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
1660 declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
; unmasked i64 shift
1662 define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
1663 ; CHECK-LABEL: test_x86_avx512_psrli_q:
1665 ; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0
1667 %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
; merge-masked i64 shift (i8 mask extended via movzbl)
1671 define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1672 ; CHECK-LABEL: test_x86_avx512_mask_psrli_q:
1674 ; CHECK-NEXT: movzbl %dil, %eax
1675 ; CHECK-NEXT: kmovw %eax, %k1
1676 ; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1}
1677 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
1679 %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
; zero-masked i64 shift
1683 define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
1684 ; CHECK-LABEL: test_x86_avx512_maskz_psrli_q:
1686 ; CHECK-NEXT: movzbl %dil, %eax
1687 ; CHECK-NEXT: kmovw %eax, %k1
1688 ; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
1690 %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
1694 declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
; Arithmetic shift-right-by-immediate tests (vpsrad / vpsraq). Note vpsraq
; on 64-bit elements is new in AVX-512 (no SSE/AVX2 equivalent). Covers
; unmasked, merge-masked, and zero-masked llvm.x86.avx512.mask.psrai.{d,q}.
; unmasked i32 shift
1696 define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
1697 ; CHECK-LABEL: test_x86_avx512_psrai_d:
1699 ; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0
1701 %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
; merge-masked i32 shift
1705 define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1706 ; CHECK-LABEL: test_x86_avx512_mask_psrai_d:
1708 ; CHECK-NEXT: kmovw %edi, %k1
1709 ; CHECK-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1}
1710 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
1712 %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
; zero-masked i32 shift
1716 define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) {
1717 ; CHECK-LABEL: test_x86_avx512_maskz_psrai_d:
1719 ; CHECK-NEXT: kmovw %edi, %k1
1720 ; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
1722 %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
1726 declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
; unmasked i64 shift
1728 define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
1729 ; CHECK-LABEL: test_x86_avx512_psrai_q:
1731 ; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0
1733 %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
; merge-masked i64 shift (i8 mask extended via movzbl)
1737 define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1738 ; CHECK-LABEL: test_x86_avx512_mask_psrai_q:
1740 ; CHECK-NEXT: movzbl %dil, %eax
1741 ; CHECK-NEXT: kmovw %eax, %k1
1742 ; CHECK-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1}
1743 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
1745 %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
; zero-masked i64 shift
1749 define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
1750 ; CHECK-LABEL: test_x86_avx512_maskz_psrai_q:
1752 ; CHECK-NEXT: movzbl %dil, %eax
1753 ; CHECK-NEXT: kmovw %eax, %k1
1754 ; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
1756 %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
1760 declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
; Shift-left tests where the (uniform) shift count comes from the low
; element of an XMM register rather than an immediate: vpslld/vpsllq
; with an %xmm1 count operand. Unmasked, merge-masked, and zero-masked
; forms of llvm.x86.avx512.mask.psll.{d,q}.
; unmasked i32 shift by xmm count
1762 define <16 x i32> @test_x86_avx512_psll_d(<16 x i32> %a0, <4 x i32> %a1) {
1763 ; CHECK-LABEL: test_x86_avx512_psll_d:
1765 ; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm0
1767 %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
; merge-masked i32 shift
1771 define <16 x i32> @test_x86_avx512_mask_psll_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
1772 ; CHECK-LABEL: test_x86_avx512_mask_psll_d:
1774 ; CHECK-NEXT: kmovw %edi, %k1
1775 ; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm2 {%k1}
1776 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1778 %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
; zero-masked i32 shift
1782 define <16 x i32> @test_x86_avx512_maskz_psll_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
1783 ; CHECK-LABEL: test_x86_avx512_maskz_psll_d:
1785 ; CHECK-NEXT: kmovw %edi, %k1
1786 ; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm0 {%k1} {z}
1788 %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
1792 declare <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
; unmasked i64 shift by xmm count
1794 define <8 x i64> @test_x86_avx512_psll_q(<8 x i64> %a0, <2 x i64> %a1) {
1795 ; CHECK-LABEL: test_x86_avx512_psll_q:
1797 ; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm0
1799 %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
; merge-masked i64 shift (i8 mask extended via movzbl)
1803 define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
1804 ; CHECK-LABEL: test_x86_avx512_mask_psll_q:
1806 ; CHECK-NEXT: movzbl %dil, %eax
1807 ; CHECK-NEXT: kmovw %eax, %k1
1808 ; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm2 {%k1}
1809 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1811 %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
; zero-masked i64 shift
1815 define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
1816 ; CHECK-LABEL: test_x86_avx512_maskz_psll_q:
1818 ; CHECK-NEXT: movzbl %dil, %eax
1819 ; CHECK-NEXT: kmovw %eax, %k1
1820 ; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
1822 %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
1826 declare <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
; Logical shift-right tests with the shift count in the low element of an
; XMM register (vpsrld / vpsrlq %xmm1 form). Unmasked, merge-masked and
; zero-masked variants of llvm.x86.avx512.mask.psrl.{d,q}.
; unmasked i32 shift by xmm count
1828 define <16 x i32> @test_x86_avx512_psrl_d(<16 x i32> %a0, <4 x i32> %a1) {
1829 ; CHECK-LABEL: test_x86_avx512_psrl_d:
1831 ; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm0
1833 %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
; merge-masked i32 shift
1837 define <16 x i32> @test_x86_avx512_mask_psrl_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
1838 ; CHECK-LABEL: test_x86_avx512_mask_psrl_d:
1840 ; CHECK-NEXT: kmovw %edi, %k1
1841 ; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm2 {%k1}
1842 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1844 %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
; zero-masked i32 shift
1848 define <16 x i32> @test_x86_avx512_maskz_psrl_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
1849 ; CHECK-LABEL: test_x86_avx512_maskz_psrl_d:
1851 ; CHECK-NEXT: kmovw %edi, %k1
1852 ; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z}
1854 %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
1858 declare <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
; unmasked i64 shift by xmm count
1860 define <8 x i64> @test_x86_avx512_psrl_q(<8 x i64> %a0, <2 x i64> %a1) {
1861 ; CHECK-LABEL: test_x86_avx512_psrl_q:
1863 ; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
1865 %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
; merge-masked i64 shift (i8 mask extended via movzbl)
1869 define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
1870 ; CHECK-LABEL: test_x86_avx512_mask_psrl_q:
1872 ; CHECK-NEXT: movzbl %dil, %eax
1873 ; CHECK-NEXT: kmovw %eax, %k1
1874 ; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
1875 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1877 %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
; zero-masked i64 shift
1881 define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
1882 ; CHECK-LABEL: test_x86_avx512_maskz_psrl_q:
1884 ; CHECK-NEXT: movzbl %dil, %eax
1885 ; CHECK-NEXT: kmovw %eax, %k1
1886 ; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
1888 %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
1892 declare <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
; Arithmetic shift-right tests with the count in an XMM register
; (vpsrad / vpsraq %xmm1 form; vpsraq is AVX-512-only). Unmasked,
; merge-masked and zero-masked variants of llvm.x86.avx512.mask.psra.{d,q}.
; unmasked i32 shift by xmm count
1894 define <16 x i32> @test_x86_avx512_psra_d(<16 x i32> %a0, <4 x i32> %a1) {
1895 ; CHECK-LABEL: test_x86_avx512_psra_d:
1897 ; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm0
1899 %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
; merge-masked i32 shift
1903 define <16 x i32> @test_x86_avx512_mask_psra_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
1904 ; CHECK-LABEL: test_x86_avx512_mask_psra_d:
1906 ; CHECK-NEXT: kmovw %edi, %k1
1907 ; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm2 {%k1}
1908 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1910 %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
; zero-masked i32 shift
1914 define <16 x i32> @test_x86_avx512_maskz_psra_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
1915 ; CHECK-LABEL: test_x86_avx512_maskz_psra_d:
1917 ; CHECK-NEXT: kmovw %edi, %k1
1918 ; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z}
1920 %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
1924 declare <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
; unmasked i64 shift by xmm count
1926 define <8 x i64> @test_x86_avx512_psra_q(<8 x i64> %a0, <2 x i64> %a1) {
1927 ; CHECK-LABEL: test_x86_avx512_psra_q:
1929 ; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm0
1931 %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
; merge-masked i64 shift (i8 mask extended via movzbl)
1935 define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
1936 ; CHECK-LABEL: test_x86_avx512_mask_psra_q:
1938 ; CHECK-NEXT: movzbl %dil, %eax
1939 ; CHECK-NEXT: kmovw %eax, %k1
1940 ; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm2 {%k1}
1941 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1943 %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
; zero-masked i64 shift
1947 define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
1948 ; CHECK-LABEL: test_x86_avx512_maskz_psra_q:
1950 ; CHECK-NEXT: movzbl %dil, %eax
1951 ; CHECK-NEXT: kmovw %eax, %k1
1952 ; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
1954 %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
1958 declare <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
; Per-element variable shift-left tests (vpsllvd / vpsllvq): the count
; vector is a full ZMM operand, one count per element. Unmasked,
; merge-masked and zero-masked variants of llvm.x86.avx512.mask.psllv.{d,q}.
; unmasked per-element i32 shift
1960 define <16 x i32> @test_x86_avx512_psllv_d(<16 x i32> %a0, <16 x i32> %a1) {
1961 ; CHECK-LABEL: test_x86_avx512_psllv_d:
1963 ; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
1965 %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
; merge-masked per-element i32 shift
1969 define <16 x i32> @test_x86_avx512_mask_psllv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
1970 ; CHECK-LABEL: test_x86_avx512_mask_psllv_d:
1972 ; CHECK-NEXT: kmovw %edi, %k1
1973 ; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
1974 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
1976 %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
; zero-masked per-element i32 shift
1980 define <16 x i32> @test_x86_avx512_maskz_psllv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1981 ; CHECK-LABEL: test_x86_avx512_maskz_psllv_d:
1983 ; CHECK-NEXT: kmovw %edi, %k1
1984 ; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z}
1986 %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
1990 declare <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
; unmasked per-element i64 shift
1992 define <8 x i64> @test_x86_avx512_psllv_q(<8 x i64> %a0, <8 x i64> %a1) {
1993 ; CHECK-LABEL: test_x86_avx512_psllv_q:
1995 ; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0
1997 %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
; merge-masked per-element i64 shift (i8 mask extended via movzbl)
2001 define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
2002 ; CHECK-LABEL: test_x86_avx512_mask_psllv_q:
2004 ; CHECK-NEXT: movzbl %dil, %eax
2005 ; CHECK-NEXT: kmovw %eax, %k1
2006 ; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
2007 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2009 %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
; zero-masked per-element i64 shift
2013 define <8 x i64> @test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
2014 ; CHECK-LABEL: test_x86_avx512_maskz_psllv_q:
2016 ; CHECK-NEXT: movzbl %dil, %eax
2017 ; CHECK-NEXT: kmovw %eax, %k1
2018 ; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
2020 %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
2024 declare <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
; Per-element variable arithmetic shift-right tests (vpsravd / vpsravq;
; vpsravq is AVX-512-only). Unmasked, merge-masked and zero-masked
; variants of llvm.x86.avx512.mask.psrav.{d,q}.
; unmasked per-element i32 shift
2027 define <16 x i32> @test_x86_avx512_psrav_d(<16 x i32> %a0, <16 x i32> %a1) {
2028 ; CHECK-LABEL: test_x86_avx512_psrav_d:
2030 ; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0
2032 %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
; merge-masked per-element i32 shift
2036 define <16 x i32> @test_x86_avx512_mask_psrav_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
2037 ; CHECK-LABEL: test_x86_avx512_mask_psrav_d:
2039 ; CHECK-NEXT: kmovw %edi, %k1
2040 ; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm2 {%k1}
2041 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2043 %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
; zero-masked per-element i32 shift
2047 define <16 x i32> @test_x86_avx512_maskz_psrav_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
2048 ; CHECK-LABEL: test_x86_avx512_maskz_psrav_d:
2050 ; CHECK-NEXT: kmovw %edi, %k1
2051 ; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z}
2053 %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
2057 declare <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
; unmasked per-element i64 shift
2059 define <8 x i64> @test_x86_avx512_psrav_q(<8 x i64> %a0, <8 x i64> %a1) {
2060 ; CHECK-LABEL: test_x86_avx512_psrav_q:
2062 ; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0
2064 %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
; merge-masked per-element i64 shift (i8 mask extended via movzbl)
2068 define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
2069 ; CHECK-LABEL: test_x86_avx512_mask_psrav_q:
2071 ; CHECK-NEXT: movzbl %dil, %eax
2072 ; CHECK-NEXT: kmovw %eax, %k1
2073 ; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm2 {%k1}
2074 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2076 %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
; zero-masked per-element i64 shift
2080 define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
2081 ; CHECK-LABEL: test_x86_avx512_maskz_psrav_q:
2083 ; CHECK-NEXT: movzbl %dil, %eax
2084 ; CHECK-NEXT: kmovw %eax, %k1
2085 ; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
2087 %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
2091 declare <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
; Per-element variable logical shift-right tests (vpsrlvd / vpsrlvq):
; unmasked, merge-masked and zero-masked llvm.x86.avx512.mask.psrlv.{d,q},
; plus a memory-operand test checking the shift-count load is folded
; directly into the instruction ((%rdi) operand).
; unmasked per-element i32 shift
2093 define <16 x i32> @test_x86_avx512_psrlv_d(<16 x i32> %a0, <16 x i32> %a1) {
2094 ; CHECK-LABEL: test_x86_avx512_psrlv_d:
2096 ; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
2098 %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
; merge-masked per-element i32 shift
2102 define <16 x i32> @test_x86_avx512_mask_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
2103 ; CHECK-LABEL: test_x86_avx512_mask_psrlv_d:
2105 ; CHECK-NEXT: kmovw %edi, %k1
2106 ; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
2107 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2109 %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
; zero-masked per-element i32 shift
2113 define <16 x i32> @test_x86_avx512_maskz_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
2114 ; CHECK-LABEL: test_x86_avx512_maskz_psrlv_d:
2116 ; CHECK-NEXT: kmovw %edi, %k1
2117 ; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z}
2119 %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
2123 declare <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
; unmasked per-element i64 shift
2125 define <8 x i64> @test_x86_avx512_psrlv_q(<8 x i64> %a0, <8 x i64> %a1) {
2126 ; CHECK-LABEL: test_x86_avx512_psrlv_q:
2128 ; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0
2130 %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
; merge-masked per-element i64 shift (i8 mask extended via movzbl)
2134 define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
2135 ; CHECK-LABEL: test_x86_avx512_mask_psrlv_q:
2137 ; CHECK-NEXT: movzbl %dil, %eax
2138 ; CHECK-NEXT: kmovw %eax, %k1
2139 ; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
2140 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2142 %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
; zero-masked per-element i64 shift
2146 define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
2147 ; CHECK-LABEL: test_x86_avx512_maskz_psrlv_q:
2149 ; CHECK-NEXT: movzbl %dil, %eax
2150 ; CHECK-NEXT: kmovw %eax, %k1
2151 ; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
2153 %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
2157 declare <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
; load-folding: the <8 x i64> count loaded from %ptr must be used as a
; memory operand of vpsrlvq rather than a separate vmovdqa + shift
2159 define <8 x i64> @test_x86_avx512_psrlv_q_memop(<8 x i64> %a0, <8 x i64>* %ptr) {
2160 ; CHECK-LABEL: test_x86_avx512_psrlv_q_memop:
2162 ; CHECK-NEXT: vpsrlvq (%rdi), %zmm0, %zmm0
2164 %b = load <8 x i64>, <8 x i64>* %ptr
2165 %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
; Static-rounding-mode FP arithmetic. The trailing i32 of these intrinsics
; selects the embedded rounding control: 0 = {rn-sae} (round to nearest),
; 1 = {rd-sae} (toward -inf), 2 = {ru-sae} (toward +inf), 3 = {rz-sae}
; (toward zero); the CHECK lines verify the matching EVEX rounding
; annotation is emitted on vsubps / vmulps. All tests here are unmasked
; (mask = -1, zero passthru).
2169 declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
2170 declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
2171 declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
; vsubps, round-to-nearest
2173 define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) {
2174 ; CHECK-LABEL: test_vsubps_rn:
2176 ; CHECK-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
2178 %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
2179 <16 x float> zeroinitializer, i16 -1, i32 0)
2180 ret <16 x float> %res
; vsubps, round-down
2183 define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) {
2184 ; CHECK-LABEL: test_vsubps_rd:
2186 ; CHECK-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
2188 %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
2189 <16 x float> zeroinitializer, i16 -1, i32 1)
2190 ret <16 x float> %res
; vsubps, round-up
2193 define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) {
2194 ; CHECK-LABEL: test_vsubps_ru:
2196 ; CHECK-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
2198 %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
2199 <16 x float> zeroinitializer, i16 -1, i32 2)
2200 ret <16 x float> %res
; vsubps, round-toward-zero
2203 define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) {
2204 ; CHECK-LABEL: test_vsubps_rz:
2206 ; CHECK-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
2208 %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
2209 <16 x float> zeroinitializer, i16 -1, i32 3)
2210 ret <16 x float> %res
; vmulps, round-to-nearest
2213 define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) {
2214 ; CHECK-LABEL: test_vmulps_rn:
2216 ; CHECK-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm0
2218 %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
2219 <16 x float> zeroinitializer, i16 -1, i32 0)
2220 ret <16 x float> %res
; vmulps, round-down
2223 define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) {
2224 ; CHECK-LABEL: test_vmulps_rd:
2226 ; CHECK-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm0
2228 %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
2229 <16 x float> zeroinitializer, i16 -1, i32 1)
2230 ret <16 x float> %res
; vmulps, round-up
2233 define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) {
2234 ; CHECK-LABEL: test_vmulps_ru:
2236 ; CHECK-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm0
2238 %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
2239 <16 x float> zeroinitializer, i16 -1, i32 2)
2240 ret <16 x float> %res
; vmulps, round-toward-zero
2243 define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) {
2244 ; CHECK-LABEL: test_vmulps_rz:
2246 ; CHECK-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm0
2248 %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
2249 <16 x float> zeroinitializer, i16 -1, i32 3)
2250 ret <16 x float> %res
; Masked vmulps with static rounding. First group: zero-masking (passthru
; is zeroinitializer -> {%k1} {z} in-place). Second group ("With Passthru
; value"): merge-masking into %passthru (%zmm2), so the result is computed
; into %zmm2 under {%k1} and then copied to the return register %zmm0.
; Rounding i32: 0={rn-sae}, 1={rd-sae}, 2={ru-sae}, 3={rz-sae}.
2254 define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2255 ; CHECK-LABEL: test_vmulps_mask_rn:
2257 ; CHECK-NEXT: kmovw %edi, %k1
2258 ; CHECK-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2260 %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
2261 <16 x float> zeroinitializer, i16 %mask, i32 0)
2262 ret <16 x float> %res
2265 define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2266 ; CHECK-LABEL: test_vmulps_mask_rd:
2268 ; CHECK-NEXT: kmovw %edi, %k1
2269 ; CHECK-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2271 %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
2272 <16 x float> zeroinitializer, i16 %mask, i32 1)
2273 ret <16 x float> %res
2276 define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2277 ; CHECK-LABEL: test_vmulps_mask_ru:
2279 ; CHECK-NEXT: kmovw %edi, %k1
2280 ; CHECK-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2282 %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
2283 <16 x float> zeroinitializer, i16 %mask, i32 2)
2284 ret <16 x float> %res
2287 define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
2288 ; CHECK-LABEL: test_vmulps_mask_rz:
2290 ; CHECK-NEXT: kmovw %edi, %k1
2291 ; CHECK-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2293 %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
2294 <16 x float> zeroinitializer, i16 %mask, i32 3)
2295 ret <16 x float> %res
2298 ;; With Passthru value
2299 define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
2300 ; CHECK-LABEL: test_vmulps_mask_passthru_rn:
2302 ; CHECK-NEXT: kmovw %edi, %k1
2303 ; CHECK-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2304 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2306 %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
2307 <16 x float> %passthru, i16 %mask, i32 0)
2308 ret <16 x float> %res
2311 define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
2312 ; CHECK-LABEL: test_vmulps_mask_passthru_rd:
2314 ; CHECK-NEXT: kmovw %edi, %k1
2315 ; CHECK-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2316 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2318 %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
2319 <16 x float> %passthru, i16 %mask, i32 1)
2320 ret <16 x float> %res
2323 define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
2324 ; CHECK-LABEL: test_vmulps_mask_passthru_ru:
2326 ; CHECK-NEXT: kmovw %edi, %k1
2327 ; CHECK-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2328 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2330 %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
2331 <16 x float> %passthru, i16 %mask, i32 2)
2332 ret <16 x float> %res
2335 define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
2336 ; CHECK-LABEL: test_vmulps_mask_passthru_rz:
2338 ; CHECK-NEXT: kmovw %edi, %k1
2339 ; CHECK-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
2340 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2342 %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
2343 <16 x float> %passthru, i16 %mask, i32 3)
2344 ret <16 x float> %res
; Zero-masked vmulpd with static rounding (double-precision analogue of
; the vmulps group). The i8 mask (8 lanes of f64) is zero-extended with
; movzbl before kmovw; rounding i32 0..3 maps to {rn,rd,ru,rz}-sae.
2348 define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
2349 ; CHECK-LABEL: test_vmulpd_mask_rn:
2351 ; CHECK-NEXT: movzbl %dil, %eax
2352 ; CHECK-NEXT: kmovw %eax, %k1
2353 ; CHECK-NEXT: vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2355 %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
2356 <8 x double> zeroinitializer, i8 %mask, i32 0)
2357 ret <8 x double> %res
2360 define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
2361 ; CHECK-LABEL: test_vmulpd_mask_rd:
2363 ; CHECK-NEXT: movzbl %dil, %eax
2364 ; CHECK-NEXT: kmovw %eax, %k1
2365 ; CHECK-NEXT: vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2367 %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
2368 <8 x double> zeroinitializer, i8 %mask, i32 1)
2369 ret <8 x double> %res
2372 define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
2373 ; CHECK-LABEL: test_vmulpd_mask_ru:
2375 ; CHECK-NEXT: movzbl %dil, %eax
2376 ; CHECK-NEXT: kmovw %eax, %k1
2377 ; CHECK-NEXT: vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2379 %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
2380 <8 x double> zeroinitializer, i8 %mask, i32 2)
2381 ret <8 x double> %res
2384 define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
2385 ; CHECK-LABEL: test_vmulpd_mask_rz:
2387 ; CHECK-NEXT: movzbl %dil, %eax
2388 ; CHECK-NEXT: kmovw %eax, %k1
2389 ; CHECK-NEXT: vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
2391 %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
2392 <8 x double> zeroinitializer, i8 %mask, i32 3)
2393 ret <8 x double> %res
; Masked 512-bit bitwise logic on i32 elements. The unmasked (mask = -1)
; forms must still select the AVX-512 d-suffixed opcodes (vpxord / vpord /
; vpandd) on KNL; the masked forms merge into %passThru (%zmm2) under
; {%k1} and copy the result back to %zmm0.
; xor, unmasked
2396 define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
2397 ; CHECK-LABEL: test_xor_epi32:
2399 ; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm0
2401 %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
2402 ret < 16 x i32> %res
; xor, merge-masked into %passThru
2405 define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
2406 ; CHECK-LABEL: test_mask_xor_epi32:
2408 ; CHECK-NEXT: kmovw %edi, %k1
2409 ; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm2 {%k1}
2410 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2412 %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
2413 ret < 16 x i32> %res
2416 declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
; or, unmasked
2418 define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
2419 ; CHECK-LABEL: test_or_epi32:
2421 ; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0
2423 %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
2424 ret < 16 x i32> %res
; or, merge-masked into %passThru
2427 define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
2428 ; CHECK-LABEL: test_mask_or_epi32:
2430 ; CHECK-NEXT: kmovw %edi, %k1
2431 ; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm2 {%k1}
2432 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2434 %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
2435 ret < 16 x i32> %res
2438 declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
; and, unmasked
2440 define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
2441 ; CHECK-LABEL: test_and_epi32:
2443 ; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm0
2445 %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
2446 ret < 16 x i32> %res
; and, merge-masked into %passThru
2449 define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
2450 ; CHECK-LABEL: test_mask_and_epi32:
2452 ; CHECK-NEXT: kmovw %edi, %k1
2453 ; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm2 {%k1}
2454 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2456 %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
2457 ret < 16 x i32> %res
2460 declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
; Masked 512-bit bitwise logic on i64 elements (q-suffixed opcodes
; vpxorq / vporq / vpandq). Same pattern as the epi32 group: unmasked
; (mask = -1) plus merge-masked into %passThru, with the i8 mask
; zero-extended (movzbl) before kmovw.
; xor, unmasked
2462 define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) {
2463 ; CHECK-LABEL: test_xor_epi64:
2465 ; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
2467 %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
; xor, merge-masked into %passThru
2471 define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
2472 ; CHECK-LABEL: test_mask_xor_epi64:
2474 ; CHECK-NEXT: movzbl %dil, %eax
2475 ; CHECK-NEXT: kmovw %eax, %k1
2476 ; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm2 {%k1}
2477 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2479 %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
2483 declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
; or, unmasked
2485 define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) {
2486 ; CHECK-LABEL: test_or_epi64:
2488 ; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0
2490 %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
; or, merge-masked into %passThru
2494 define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
2495 ; CHECK-LABEL: test_mask_or_epi64:
2497 ; CHECK-NEXT: movzbl %dil, %eax
2498 ; CHECK-NEXT: kmovw %eax, %k1
2499 ; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm2 {%k1}
2500 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2502 %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
2506 declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
; and, unmasked
2508 define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) {
2509 ; CHECK-LABEL: test_and_epi64:
2511 ; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0
2513 %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
; and, merge-masked into %passThru
2517 define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
2518 ; CHECK-LABEL: test_mask_and_epi64:
2520 ; CHECK-NEXT: movzbl %dil, %eax
2521 ; CHECK-NEXT: kmovw %eax, %k1
2522 ; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm2 {%k1}
2523 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2525 %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
2529 declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
; llvm.x86.avx512.mask.padd.d.512 coverage across all operand forms:
;   rr   = reg/reg, unmasked            rrk  = merge-masked   rrkz = zero-masked
;   rm*  = full 512-bit memory operand  rmb* = i32 scalar broadcast {1to16}
; Masked-with-passthru forms must select into %zmm1/%zmm2 then vmovaps to %zmm0;
; broadcast forms must fold the load into vpaddd with the {1to16} modifier.
2532 define <16 x i32> @test_mask_add_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
2533 ; CHECK-LABEL: test_mask_add_epi32_rr:
2535 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
2537 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
2538 ret < 16 x i32> %res
2541 define <16 x i32> @test_mask_add_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
2542 ; CHECK-LABEL: test_mask_add_epi32_rrk:
2544 ; CHECK-NEXT: kmovw %edi, %k1
2545 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm2 {%k1}
2546 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2548 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
2549 ret < 16 x i32> %res
2552 define <16 x i32> @test_mask_add_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
2553 ; CHECK-LABEL: test_mask_add_epi32_rrkz:
2555 ; CHECK-NEXT: kmovw %edi, %k1
2556 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
2558 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
2559 ret < 16 x i32> %res
2562 define <16 x i32> @test_mask_add_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
2563 ; CHECK-LABEL: test_mask_add_epi32_rm:
2565 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
2567 %b = load <16 x i32>, <16 x i32>* %ptr_b
2568 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
2569 ret < 16 x i32> %res
2572 define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
2573 ; CHECK-LABEL: test_mask_add_epi32_rmk:
2575 ; CHECK-NEXT: kmovw %esi, %k1
2576 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm1 {%k1}
2577 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
2579 %b = load <16 x i32>, <16 x i32>* %ptr_b
2580 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
2581 ret < 16 x i32> %res
2584 define <16 x i32> @test_mask_add_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
2585 ; CHECK-LABEL: test_mask_add_epi32_rmkz:
2587 ; CHECK-NEXT: kmovw %esi, %k1
2588 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z}
2590 %b = load <16 x i32>, <16 x i32>* %ptr_b
2591 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
2592 ret < 16 x i32> %res
2595 define <16 x i32> @test_mask_add_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
2596 ; CHECK-LABEL: test_mask_add_epi32_rmb:
2598 ; CHECK-NEXT: vpaddd (%rdi){1to16}, %zmm0, %zmm0
2600 %q = load i32, i32* %ptr_b
2601 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
2602 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
2603 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
2604 ret < 16 x i32> %res
2607 define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
2608 ; CHECK-LABEL: test_mask_add_epi32_rmbk:
2610 ; CHECK-NEXT: kmovw %esi, %k1
2611 ; CHECK-NEXT: vpaddd (%rdi){1to16}, %zmm0, %zmm1 {%k1}
2612 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
2614 %q = load i32, i32* %ptr_b
2615 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
2616 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
2617 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
2618 ret < 16 x i32> %res
2621 define <16 x i32> @test_mask_add_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
2622 ; CHECK-LABEL: test_mask_add_epi32_rmbkz:
2624 ; CHECK-NEXT: kmovw %esi, %k1
2625 ; CHECK-NEXT: vpaddd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
2627 %q = load i32, i32* %ptr_b
2628 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
2629 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
2630 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
2631 ret < 16 x i32> %res
2634 declare <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
; llvm.x86.avx512.mask.psub.d.512: same rr/rrk/rrkz/rm/rmk/rmkz/rmb/rmbk/rmbkz
; matrix as the padd.d tests above, checking vpsubd selection, mask merging,
; zero-masking, load folding, and the {1to16} broadcast memory form.
2636 define <16 x i32> @test_mask_sub_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
2637 ; CHECK-LABEL: test_mask_sub_epi32_rr:
2639 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
2641 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
2642 ret < 16 x i32> %res
2645 define <16 x i32> @test_mask_sub_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
2646 ; CHECK-LABEL: test_mask_sub_epi32_rrk:
2648 ; CHECK-NEXT: kmovw %edi, %k1
2649 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm2 {%k1}
2650 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2652 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
2653 ret < 16 x i32> %res
2656 define <16 x i32> @test_mask_sub_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
2657 ; CHECK-LABEL: test_mask_sub_epi32_rrkz:
2659 ; CHECK-NEXT: kmovw %edi, %k1
2660 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z}
2662 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
2663 ret < 16 x i32> %res
2666 define <16 x i32> @test_mask_sub_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
2667 ; CHECK-LABEL: test_mask_sub_epi32_rm:
2669 ; CHECK-NEXT: vpsubd (%rdi), %zmm0, %zmm0
2671 %b = load <16 x i32>, <16 x i32>* %ptr_b
2672 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
2673 ret < 16 x i32> %res
2676 define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
2677 ; CHECK-LABEL: test_mask_sub_epi32_rmk:
2679 ; CHECK-NEXT: kmovw %esi, %k1
2680 ; CHECK-NEXT: vpsubd (%rdi), %zmm0, %zmm1 {%k1}
2681 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
2683 %b = load <16 x i32>, <16 x i32>* %ptr_b
2684 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
2685 ret < 16 x i32> %res
2688 define <16 x i32> @test_mask_sub_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
2689 ; CHECK-LABEL: test_mask_sub_epi32_rmkz:
2691 ; CHECK-NEXT: kmovw %esi, %k1
2692 ; CHECK-NEXT: vpsubd (%rdi), %zmm0, %zmm0 {%k1} {z}
2694 %b = load <16 x i32>, <16 x i32>* %ptr_b
2695 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
2696 ret < 16 x i32> %res
2699 define <16 x i32> @test_mask_sub_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
2700 ; CHECK-LABEL: test_mask_sub_epi32_rmb:
2702 ; CHECK-NEXT: vpsubd (%rdi){1to16}, %zmm0, %zmm0
2704 %q = load i32, i32* %ptr_b
2705 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
2706 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
2707 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
2708 ret < 16 x i32> %res
2711 define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
2712 ; CHECK-LABEL: test_mask_sub_epi32_rmbk:
2714 ; CHECK-NEXT: kmovw %esi, %k1
2715 ; CHECK-NEXT: vpsubd (%rdi){1to16}, %zmm0, %zmm1 {%k1}
2716 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
2718 %q = load i32, i32* %ptr_b
2719 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
2720 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
2721 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
2722 ret < 16 x i32> %res
2725 define <16 x i32> @test_mask_sub_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
2726 ; CHECK-LABEL: test_mask_sub_epi32_rmbkz:
2728 ; CHECK-NEXT: kmovw %esi, %k1
2729 ; CHECK-NEXT: vpsubd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
2731 %q = load i32, i32* %ptr_b
2732 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
2733 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
2734 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
2735 ret < 16 x i32> %res
2738 declare <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
; llvm.x86.avx512.mask.padd.q.512: qword-element variant of the add matrix.
; Differences from the dword tests: the i8 mask needs movzbl before kmovw, and
; the broadcast memory form uses an i64 scalar with {1to8}.
2740 define <8 x i64> @test_mask_add_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
2741 ; CHECK-LABEL: test_mask_add_epi64_rr:
2743 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
2745 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
2749 define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
2750 ; CHECK-LABEL: test_mask_add_epi64_rrk:
2752 ; CHECK-NEXT: movzbl %dil, %eax
2753 ; CHECK-NEXT: kmovw %eax, %k1
2754 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm2 {%k1}
2755 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2757 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
2761 define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
2762 ; CHECK-LABEL: test_mask_add_epi64_rrkz:
2764 ; CHECK-NEXT: movzbl %dil, %eax
2765 ; CHECK-NEXT: kmovw %eax, %k1
2766 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z}
2768 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
2772 define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
2773 ; CHECK-LABEL: test_mask_add_epi64_rm:
2775 ; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm0
2777 %b = load <8 x i64>, <8 x i64>* %ptr_b
2778 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
2782 define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
2783 ; CHECK-LABEL: test_mask_add_epi64_rmk:
2785 ; CHECK-NEXT: movzbl %sil, %eax
2786 ; CHECK-NEXT: kmovw %eax, %k1
2787 ; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm1 {%k1}
2788 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
2790 %b = load <8 x i64>, <8 x i64>* %ptr_b
2791 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
2795 define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
2796 ; CHECK-LABEL: test_mask_add_epi64_rmkz:
2798 ; CHECK-NEXT: movzbl %sil, %eax
2799 ; CHECK-NEXT: kmovw %eax, %k1
2800 ; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm0 {%k1} {z}
2802 %b = load <8 x i64>, <8 x i64>* %ptr_b
2803 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
2807 define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
2808 ; CHECK-LABEL: test_mask_add_epi64_rmb:
2810 ; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0
2812 %q = load i64, i64* %ptr_b
2813 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
2814 %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
2815 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
2819 define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
2820 ; CHECK-LABEL: test_mask_add_epi64_rmbk:
2822 ; CHECK-NEXT: movzbl %sil, %eax
2823 ; CHECK-NEXT: kmovw %eax, %k1
2824 ; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
2825 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
2827 %q = load i64, i64* %ptr_b
2828 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
2829 %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
2830 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
2834 define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
2835 ; CHECK-LABEL: test_mask_add_epi64_rmbkz:
2837 ; CHECK-NEXT: movzbl %sil, %eax
2838 ; CHECK-NEXT: kmovw %eax, %k1
2839 ; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
2841 %q = load i64, i64* %ptr_b
2842 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
2843 %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
2844 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
2848 declare <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
; llvm.x86.avx512.mask.psub.q.512: qword subtract, mirroring the padd.q matrix
; (vpsubq selection, i8 mask zero-extension, load folding, {1to8} broadcast).
2850 define <8 x i64> @test_mask_sub_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
2851 ; CHECK-LABEL: test_mask_sub_epi64_rr:
2853 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
2855 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
2859 define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
2860 ; CHECK-LABEL: test_mask_sub_epi64_rrk:
2862 ; CHECK-NEXT: movzbl %dil, %eax
2863 ; CHECK-NEXT: kmovw %eax, %k1
2864 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm2 {%k1}
2865 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2867 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
2871 define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
2872 ; CHECK-LABEL: test_mask_sub_epi64_rrkz:
2874 ; CHECK-NEXT: movzbl %dil, %eax
2875 ; CHECK-NEXT: kmovw %eax, %k1
2876 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z}
2878 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
2882 define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
2883 ; CHECK-LABEL: test_mask_sub_epi64_rm:
2885 ; CHECK-NEXT: vpsubq (%rdi), %zmm0, %zmm0
2887 %b = load <8 x i64>, <8 x i64>* %ptr_b
2888 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
2892 define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
2893 ; CHECK-LABEL: test_mask_sub_epi64_rmk:
2895 ; CHECK-NEXT: movzbl %sil, %eax
2896 ; CHECK-NEXT: kmovw %eax, %k1
2897 ; CHECK-NEXT: vpsubq (%rdi), %zmm0, %zmm1 {%k1}
2898 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
2900 %b = load <8 x i64>, <8 x i64>* %ptr_b
2901 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
2905 define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
2906 ; CHECK-LABEL: test_mask_sub_epi64_rmkz:
2908 ; CHECK-NEXT: movzbl %sil, %eax
2909 ; CHECK-NEXT: kmovw %eax, %k1
2910 ; CHECK-NEXT: vpsubq (%rdi), %zmm0, %zmm0 {%k1} {z}
2912 %b = load <8 x i64>, <8 x i64>* %ptr_b
2913 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
2917 define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
2918 ; CHECK-LABEL: test_mask_sub_epi64_rmb:
2920 ; CHECK-NEXT: vpsubq (%rdi){1to8}, %zmm0, %zmm0
2922 %q = load i64, i64* %ptr_b
2923 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
2924 %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
2925 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
2929 define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
2930 ; CHECK-LABEL: test_mask_sub_epi64_rmbk:
2932 ; CHECK-NEXT: movzbl %sil, %eax
2933 ; CHECK-NEXT: kmovw %eax, %k1
2934 ; CHECK-NEXT: vpsubq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
2935 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
2937 %q = load i64, i64* %ptr_b
2938 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
2939 %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
2940 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
2944 define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
2945 ; CHECK-LABEL: test_mask_sub_epi64_rmbkz:
2947 ; CHECK-NEXT: movzbl %sil, %eax
2948 ; CHECK-NEXT: kmovw %eax, %k1
2949 ; CHECK-NEXT: vpsubq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
2951 %q = load i64, i64* %ptr_b
2952 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
2953 %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
2954 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
2958 declare <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
; llvm.x86.avx512.mask.pmul.dq.512 (signed widening multiply of the even dword
; lanes -> <8 x i64>, i.e. vpmuldq). The rmb* forms broadcast an i64 scalar and
; bitcast it to <16 x i32> so the low dword of every qword lane is replicated,
; which matches vpmuldq's {1to8} qword broadcast.
2960 define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
2961 ; CHECK-LABEL: test_mask_mul_epi32_rr:
2963 ; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
2965 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
2969 define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
2970 ; CHECK-LABEL: test_mask_mul_epi32_rrk:
2972 ; CHECK-NEXT: movzbl %dil, %eax
2973 ; CHECK-NEXT: kmovw %eax, %k1
2974 ; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 {%k1}
2975 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
2977 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
2981 define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
2982 ; CHECK-LABEL: test_mask_mul_epi32_rrkz:
2984 ; CHECK-NEXT: movzbl %dil, %eax
2985 ; CHECK-NEXT: kmovw %eax, %k1
2986 ; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z}
2988 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
2992 define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
2993 ; CHECK-LABEL: test_mask_mul_epi32_rm:
2995 ; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm0
2997 %b = load <16 x i32>, <16 x i32>* %ptr_b
2998 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
3002 define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
3003 ; CHECK-LABEL: test_mask_mul_epi32_rmk:
3005 ; CHECK-NEXT: movzbl %sil, %eax
3006 ; CHECK-NEXT: kmovw %eax, %k1
3007 ; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm1 {%k1}
3008 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
3010 %b = load <16 x i32>, <16 x i32>* %ptr_b
3011 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
3015 define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
3016 ; CHECK-LABEL: test_mask_mul_epi32_rmkz:
3018 ; CHECK-NEXT: movzbl %sil, %eax
3019 ; CHECK-NEXT: kmovw %eax, %k1
3020 ; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z}
3022 %b = load <16 x i32>, <16 x i32>* %ptr_b
3023 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
3027 define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
3028 ; CHECK-LABEL: test_mask_mul_epi32_rmb:
3030 ; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm0
3032 %q = load i64, i64* %ptr_b
3033 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
3034 %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
3035 %b = bitcast <8 x i64> %b64 to <16 x i32>
3036 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
3040 define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
3041 ; CHECK-LABEL: test_mask_mul_epi32_rmbk:
3043 ; CHECK-NEXT: movzbl %sil, %eax
3044 ; CHECK-NEXT: kmovw %eax, %k1
3045 ; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
3046 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
3048 %q = load i64, i64* %ptr_b
3049 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
3050 %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
3051 %b = bitcast <8 x i64> %b64 to <16 x i32>
3052 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
3056 define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
3057 ; CHECK-LABEL: test_mask_mul_epi32_rmbkz:
3059 ; CHECK-NEXT: movzbl %sil, %eax
3060 ; CHECK-NEXT: kmovw %eax, %k1
3061 ; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
3063 %q = load i64, i64* %ptr_b
3064 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
3065 %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
3066 %b = bitcast <8 x i64> %b64 to <16 x i32>
3067 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
3071 declare <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
; llvm.x86.avx512.mask.pmulu.dq.512 (unsigned widening multiply -> vpmuludq).
; Same operand matrix as the signed pmul.dq tests above, including the i64
; broadcast + bitcast pattern for the {1to8} memory form.
3073 define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
3074 ; CHECK-LABEL: test_mask_mul_epu32_rr:
3076 ; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
3078 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
3082 define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
3083 ; CHECK-LABEL: test_mask_mul_epu32_rrk:
3085 ; CHECK-NEXT: movzbl %dil, %eax
3086 ; CHECK-NEXT: kmovw %eax, %k1
3087 ; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 {%k1}
3088 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
3090 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
3094 define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
3095 ; CHECK-LABEL: test_mask_mul_epu32_rrkz:
3097 ; CHECK-NEXT: movzbl %dil, %eax
3098 ; CHECK-NEXT: kmovw %eax, %k1
3099 ; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z}
3101 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
3105 define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
3106 ; CHECK-LABEL: test_mask_mul_epu32_rm:
3108 ; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm0
3110 %b = load <16 x i32>, <16 x i32>* %ptr_b
3111 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
3115 define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
3116 ; CHECK-LABEL: test_mask_mul_epu32_rmk:
3118 ; CHECK-NEXT: movzbl %sil, %eax
3119 ; CHECK-NEXT: kmovw %eax, %k1
3120 ; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm1 {%k1}
3121 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
3123 %b = load <16 x i32>, <16 x i32>* %ptr_b
3124 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
3128 define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
3129 ; CHECK-LABEL: test_mask_mul_epu32_rmkz:
3131 ; CHECK-NEXT: movzbl %sil, %eax
3132 ; CHECK-NEXT: kmovw %eax, %k1
3133 ; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z}
3135 %b = load <16 x i32>, <16 x i32>* %ptr_b
3136 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
3140 define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
3141 ; CHECK-LABEL: test_mask_mul_epu32_rmb:
3143 ; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm0
3145 %q = load i64, i64* %ptr_b
3146 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
3147 %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
3148 %b = bitcast <8 x i64> %b64 to <16 x i32>
3149 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
3153 define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
3154 ; CHECK-LABEL: test_mask_mul_epu32_rmbk:
3156 ; CHECK-NEXT: movzbl %sil, %eax
3157 ; CHECK-NEXT: kmovw %eax, %k1
3158 ; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
3159 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
3161 %q = load i64, i64* %ptr_b
3162 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
3163 %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
3164 %b = bitcast <8 x i64> %b64 to <16 x i32>
3165 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
3169 define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
3170 ; CHECK-LABEL: test_mask_mul_epu32_rmbkz:
3172 ; CHECK-NEXT: movzbl %sil, %eax
3173 ; CHECK-NEXT: kmovw %eax, %k1
3174 ; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
3176 %q = load i64, i64* %ptr_b
3177 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
3178 %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
3179 %b = bitcast <8 x i64> %b64 to <16 x i32>
3180 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
3184 declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
; llvm.x86.avx512.mask.pmull.d.512 (low 32-bit lane-wise multiply -> vpmulld),
; same rr/rrk/rrkz/rm/rmk/rmkz/rmb/rmbk/rmbkz matrix with an i16 mask and
; {1to16} dword broadcast for the rmb* forms.
3186 define <16 x i32> @test_mask_mullo_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
3187 ; CHECK-LABEL: test_mask_mullo_epi32_rr_512:
3189 ; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0
3191 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
3195 define <16 x i32> @test_mask_mullo_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
3196 ; CHECK-LABEL: test_mask_mullo_epi32_rrk_512:
3198 ; CHECK-NEXT: kmovw %edi, %k1
3199 ; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm2 {%k1}
3200 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
3202 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
3203 ret < 16 x i32> %res
3206 define <16 x i32> @test_mask_mullo_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
3207 ; CHECK-LABEL: test_mask_mullo_epi32_rrkz_512:
3209 ; CHECK-NEXT: kmovw %edi, %k1
3210 ; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 {%k1} {z}
3212 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
3213 ret < 16 x i32> %res
3216 define <16 x i32> @test_mask_mullo_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
3217 ; CHECK-LABEL: test_mask_mullo_epi32_rm_512:
3219 ; CHECK-NEXT: vpmulld (%rdi), %zmm0, %zmm0
3221 %b = load <16 x i32>, <16 x i32>* %ptr_b
3222 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
3223 ret < 16 x i32> %res
3226 define <16 x i32> @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
3227 ; CHECK-LABEL: test_mask_mullo_epi32_rmk_512:
3229 ; CHECK-NEXT: kmovw %esi, %k1
3230 ; CHECK-NEXT: vpmulld (%rdi), %zmm0, %zmm1 {%k1}
3231 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
3233 %b = load <16 x i32>, <16 x i32>* %ptr_b
3234 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
3235 ret < 16 x i32> %res
3238 define <16 x i32> @test_mask_mullo_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
3239 ; CHECK-LABEL: test_mask_mullo_epi32_rmkz_512:
3241 ; CHECK-NEXT: kmovw %esi, %k1
3242 ; CHECK-NEXT: vpmulld (%rdi), %zmm0, %zmm0 {%k1} {z}
3244 %b = load <16 x i32>, <16 x i32>* %ptr_b
3245 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
3246 ret < 16 x i32> %res
3249 define <16 x i32> @test_mask_mullo_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
3250 ; CHECK-LABEL: test_mask_mullo_epi32_rmb_512:
3252 ; CHECK-NEXT: vpmulld (%rdi){1to16}, %zmm0, %zmm0
3254 %q = load i32, i32* %ptr_b
3255 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
3256 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
3257 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
3258 ret < 16 x i32> %res
3261 define <16 x i32> @test_mask_mullo_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
3262 ; CHECK-LABEL: test_mask_mullo_epi32_rmbk_512:
3264 ; CHECK-NEXT: kmovw %esi, %k1
3265 ; CHECK-NEXT: vpmulld (%rdi){1to16}, %zmm0, %zmm1 {%k1}
3266 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
3268 %q = load i32, i32* %ptr_b
3269 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
3270 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
3271 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
3272 ret < 16 x i32> %res
3275 define <16 x i32> @test_mask_mullo_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
3276 ; CHECK-LABEL: test_mask_mullo_epi32_rmbkz_512:
3278 ; CHECK-NEXT: kmovw %esi, %k1
3279 ; CHECK-NEXT: vpmulld (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
3281 %q = load i32, i32* %ptr_b
3282 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
3283 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
3284 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
3285 ret < 16 x i32> %res
3288 declare <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
; llvm.x86.avx512.mask.add.ps.512 zero-masked with explicit static rounding:
; rounding-control arguments 0..3 must select the {rn,rd,ru,rz}-sae embedded
; rounding encodings on vaddps.
3290 define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3291 ; CHECK-LABEL: test_mm512_maskz_add_round_ps_rn_sae:
3293 ; CHECK-NEXT: kmovw %edi, %k1
3294 ; CHECK-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
3296 %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0)
3297 ret <16 x float> %res
3299 define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3300 ; CHECK-LABEL: test_mm512_maskz_add_round_ps_rd_sae:
3302 ; CHECK-NEXT: kmovw %edi, %k1
3303 ; CHECK-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
3305 %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1)
3306 ret <16 x float> %res
3308 define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3309 ; CHECK-LABEL: test_mm512_maskz_add_round_ps_ru_sae:
3311 ; CHECK-NEXT: kmovw %edi, %k1
3312 ; CHECK-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
3314 %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2)
3315 ret <16 x float> %res
3318 define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3319 ; CHECK-LABEL: test_mm512_maskz_add_round_ps_rz_sae:
3321 ; CHECK-NEXT: kmovw %edi, %k1
3322 ; CHECK-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
3324 %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3)
3325 ret <16 x float> %res
3329 define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
3330 ; CHECK-LABEL: test_mm512_maskz_add_round_ps_current: