1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
; addpd512: fadd of two <8 x double> values (%x + %y) is expected to select
; a single register-register vaddpd on zmm registers.
3 define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) {
4 ; CHECK-LABEL: addpd512:
5 ; CHECK: ## BB#0: ## %entry
6 ; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
9 %add.i = fadd <8 x double> %x, %y
10 ret <8 x double> %add.i
; addpd512fold: fadd of %y with a constant <8 x double> vector. The constant
; is not a splat; the expected vaddpd takes it as a RIP-relative memory
; operand (no broadcast decoration in the expected line).
13 define <8 x double> @addpd512fold(<8 x double> %y) {
14 ; CHECK-LABEL: addpd512fold:
15 ; CHECK: ## BB#0: ## %entry
16 ; CHECK-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0
19 %add.i = fadd <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.800000e+00, double 2.300000e+00, double 1.200000e+00>
20 ret <8 x double> %add.i
; addps512: fadd of two <16 x float> values (%x + %y) is expected to select
; a single register-register vaddps on zmm registers.
23 define <16 x float> @addps512(<16 x float> %y, <16 x float> %x) {
24 ; CHECK-LABEL: addps512:
25 ; CHECK: ## BB#0: ## %entry
26 ; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
29 %add.i = fadd <16 x float> %x, %y
30 ret <16 x float> %add.i
; addps512fold: fadd of %y with a constant <16 x float> vector. The constant
; is not a splat; the expected vaddps takes it as a RIP-relative memory
; operand (no broadcast decoration in the expected line).
33 define <16 x float> @addps512fold(<16 x float> %y) {
34 ; CHECK-LABEL: addps512fold:
35 ; CHECK: ## BB#0: ## %entry
36 ; CHECK-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0
39 %add.i = fadd <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 4.500000e+00, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
40 ret <16 x float> %add.i
; subpd512: fsub of two <8 x double> values (%x - %y) is expected to select
; a single register-register vsubpd on zmm registers.
43 define <8 x double> @subpd512(<8 x double> %y, <8 x double> %x) {
44 ; CHECK-LABEL: subpd512:
45 ; CHECK: ## BB#0: ## %entry
46 ; CHECK-NEXT: vsubpd %zmm0, %zmm1, %zmm0
49 %sub.i = fsub <8 x double> %x, %y
50 ret <8 x double> %sub.i
; subpd512fold: fsub of %y and a <8 x double> loaded through pointer %x.
; The load is expected to be folded into vsubpd as the (%rdi) memory operand.
53 define <8 x double> @subpd512fold(<8 x double> %y, <8 x double>* %x) {
54 ; CHECK-LABEL: subpd512fold:
55 ; CHECK: ## BB#0: ## %entry
56 ; CHECK-NEXT: vsubpd (%rdi), %zmm0, %zmm0
59 %tmp2 = load <8 x double>* %x, align 8
60 %sub.i = fsub <8 x double> %y, %tmp2
61 ret <8 x double> %sub.i
; subps512: fsub of two <16 x float> values (%x - %y) is expected to select
; a single register-register vsubps on zmm registers.
64 define <16 x float> @subps512(<16 x float> %y, <16 x float> %x) {
65 ; CHECK-LABEL: subps512:
66 ; CHECK: ## BB#0: ## %entry
67 ; CHECK-NEXT: vsubps %zmm0, %zmm1, %zmm0
70 %sub.i = fsub <16 x float> %x, %y
71 ret <16 x float> %sub.i
; subps512fold: fsub of %y and a <16 x float> loaded through pointer %x.
; The load is expected to be folded into vsubps as the (%rdi) memory operand.
74 define <16 x float> @subps512fold(<16 x float> %y, <16 x float>* %x) {
75 ; CHECK-LABEL: subps512fold:
76 ; CHECK: ## BB#0: ## %entry
77 ; CHECK-NEXT: vsubps (%rdi), %zmm0, %zmm0
80 %tmp2 = load <16 x float>* %x, align 4
81 %sub.i = fsub <16 x float> %y, %tmp2
82 ret <16 x float> %sub.i
; imulq512: mul of two <8 x i64> values. The expected output is not a single
; multiply instruction but an expansion: three vpmuludq products, with
; vpsrlq/vpsllq by 32 bits and vpaddq combining them into the final result.
85 define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
86 ; CHECK-LABEL: imulq512:
88 ; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm2
89 ; CHECK-NEXT: vpsrlq $32, %zmm0, %zmm3
90 ; CHECK-NEXT: vpmuludq %zmm3, %zmm1, %zmm3
91 ; CHECK-NEXT: vpsllq $32, %zmm3, %zmm3
92 ; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2
93 ; CHECK-NEXT: vpsrlq $32, %zmm1, %zmm1
94 ; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
95 ; CHECK-NEXT: vpsllq $32, %zmm0, %zmm0
96 ; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
98 %z = mul <8 x i64>%x, %y
; mulpd512: fmul of two <8 x double> values is expected to select a single
; register-register vmulpd on zmm registers.
102 define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) {
103 ; CHECK-LABEL: mulpd512:
104 ; CHECK: ## BB#0: ## %entry
105 ; CHECK-NEXT: vmulpd %zmm0, %zmm1, %zmm0
108 %mul.i = fmul <8 x double> %x, %y
109 ret <8 x double> %mul.i
; mulpd512fold: fmul of %y with a constant, non-splat <8 x double> vector.
; The expected vmulpd takes the constant as a RIP-relative memory operand.
112 define <8 x double> @mulpd512fold(<8 x double> %y) {
113 ; CHECK-LABEL: mulpd512fold:
114 ; CHECK: ## BB#0: ## %entry
115 ; CHECK-NEXT: vmulpd {{.*}}(%rip), %zmm0, %zmm0
118 %mul.i = fmul <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
119 ret <8 x double> %mul.i
; mulps512: fmul of two <16 x float> values is expected to select a single
; register-register vmulps on zmm registers.
122 define <16 x float> @mulps512(<16 x float> %y, <16 x float> %x) {
123 ; CHECK-LABEL: mulps512:
124 ; CHECK: ## BB#0: ## %entry
125 ; CHECK-NEXT: vmulps %zmm0, %zmm1, %zmm0
128 %mul.i = fmul <16 x float> %x, %y
129 ret <16 x float> %mul.i
; mulps512fold: fmul of %y with a constant, non-splat <16 x float> vector.
; The expected vmulps takes the constant as a RIP-relative memory operand.
132 define <16 x float> @mulps512fold(<16 x float> %y) {
133 ; CHECK-LABEL: mulps512fold:
134 ; CHECK: ## BB#0: ## %entry
135 ; CHECK-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0
138 %mul.i = fmul <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
139 ret <16 x float> %mul.i
; divpd512: fdiv of two <8 x double> values (%x / %y) is expected to select
; a single register-register vdivpd on zmm registers.
142 define <8 x double> @divpd512(<8 x double> %y, <8 x double> %x) {
143 ; CHECK-LABEL: divpd512:
144 ; CHECK: ## BB#0: ## %entry
145 ; CHECK-NEXT: vdivpd %zmm0, %zmm1, %zmm0
148 %div.i = fdiv <8 x double> %x, %y
149 ret <8 x double> %div.i
; divpd512fold: fdiv of %y by a constant, non-splat <8 x double> vector.
; The expected vdivpd takes the constant as a RIP-relative memory operand.
152 define <8 x double> @divpd512fold(<8 x double> %y) {
153 ; CHECK-LABEL: divpd512fold:
154 ; CHECK: ## BB#0: ## %entry
155 ; CHECK-NEXT: vdivpd {{.*}}(%rip), %zmm0, %zmm0
158 %div.i = fdiv <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
159 ret <8 x double> %div.i
; divps512: fdiv of two <16 x float> values (%x / %y) is expected to select
; a single register-register vdivps on zmm registers.
162 define <16 x float> @divps512(<16 x float> %y, <16 x float> %x) {
163 ; CHECK-LABEL: divps512:
164 ; CHECK: ## BB#0: ## %entry
165 ; CHECK-NEXT: vdivps %zmm0, %zmm1, %zmm0
168 %div.i = fdiv <16 x float> %x, %y
169 ret <16 x float> %div.i
; divps512fold: fdiv of %y by a constant, non-splat <16 x float> vector.
; The expected vdivps takes the constant as a RIP-relative memory operand.
172 define <16 x float> @divps512fold(<16 x float> %y) {
173 ; CHECK-LABEL: divps512fold:
174 ; CHECK: ## BB#0: ## %entry
175 ; CHECK-NEXT: vdivps {{.*}}(%rip), %zmm0, %zmm0
178 %div.i = fdiv <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000>
179 ret <16 x float> %div.i
; vpaddq_test: integer add of two <8 x i64> values is expected to select a
; single register-register vpaddq on zmm registers.
182 define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
183 ; CHECK-LABEL: vpaddq_test:
185 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
187 %x = add <8 x i64> %i, %j
; vpaddq_fold_test: add of %i and a <8 x i64> loaded through pointer %j.
; The load is expected to be folded into vpaddq as the (%rdi) memory operand.
191 define <8 x i64> @vpaddq_fold_test(<8 x i64> %i, <8 x i64>* %j) nounwind {
192 ; CHECK-LABEL: vpaddq_fold_test:
194 ; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm0
196 %tmp = load <8 x i64>* %j, align 4
197 %x = add <8 x i64> %i, %tmp
; vpaddq_broadcast_test: add of %i and a splat constant (i64 1 in every
; lane). The expected vpaddq uses a RIP-relative memory operand carrying the
; {1to8} broadcast decoration.
201 define <8 x i64> @vpaddq_broadcast_test(<8 x i64> %i) nounwind {
202 ; CHECK-LABEL: vpaddq_broadcast_test:
204 ; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
206 %x = add <8 x i64> %i, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
; vpaddq_broadcast2_test: the scalar %tmp is inserted into all eight lanes of
; a <8 x i64> via an insertelement chain and then added to %i. The expected
; vpaddq reads (%rdi) directly with the {1to8} broadcast decoration.
; NOTE(review): %tmp is not defined in the lines shown here; presumably it is
; a scalar i64 load through %j -- confirm against the full file.
210 define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind {
211 ; CHECK-LABEL: vpaddq_broadcast2_test:
213 ; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0
216 %j.0 = insertelement <8 x i64> undef, i64 %tmp, i32 0
217 %j.1 = insertelement <8 x i64> %j.0, i64 %tmp, i32 1
218 %j.2 = insertelement <8 x i64> %j.1, i64 %tmp, i32 2
219 %j.3 = insertelement <8 x i64> %j.2, i64 %tmp, i32 3
220 %j.4 = insertelement <8 x i64> %j.3, i64 %tmp, i32 4
221 %j.5 = insertelement <8 x i64> %j.4, i64 %tmp, i32 5
222 %j.6 = insertelement <8 x i64> %j.5, i64 %tmp, i32 6
223 %j.7 = insertelement <8 x i64> %j.6, i64 %tmp, i32 7
224 %x = add <8 x i64> %i, %j.7
; vpaddd_test: integer add of two <16 x i32> values is expected to select a
; single register-register vpaddd on zmm registers.
228 define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
229 ; CHECK-LABEL: vpaddd_test:
231 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
233 %x = add <16 x i32> %i, %j
; vpaddd_fold_test: add of %i and a <16 x i32> loaded through pointer %j.
; The load is expected to be folded into vpaddd as the (%rdi) memory operand.
237 define <16 x i32> @vpaddd_fold_test(<16 x i32> %i, <16 x i32>* %j) nounwind {
238 ; CHECK-LABEL: vpaddd_fold_test:
240 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
242 %tmp = load <16 x i32>* %j, align 4
243 %x = add <16 x i32> %i, %tmp
; vpaddd_broadcast_test: add of %i and a splat constant (i32 1 in every
; lane). The expected vpaddd uses a RIP-relative memory operand carrying the
; {1to16} broadcast decoration.
247 define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind {
248 ; CHECK-LABEL: vpaddd_broadcast_test:
250 ; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
252 %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; vpaddd_mask_test: per-lane select between (%i + %j) and %i, controlled by
; (%mask1 != 0). Expected output: zero a register with vpxord, compare with
; vpcmpneqd into %k1, then a vpaddd predicated with {%k1} (no {z}), matching
; the select's fallback to %i.
256 define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
257 ; CHECK-LABEL: vpaddd_mask_test:
259 ; CHECK-NEXT: vpxord %zmm3, %zmm3, %zmm3
260 ; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
261 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1}
263 %mask = icmp ne <16 x i32> %mask1, zeroinitializer
264 %x = add <16 x i32> %i, %j
265 %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
; vpaddd_maskz_test: per-lane select between (%i + %j) and zero, controlled
; by (%mask1 != 0). Expected output: vpxord/vpcmpneqd build %k1, then a
; vpaddd predicated with {%k1} {z}, matching the select's zeroinitializer
; fallback.
269 define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
270 ; CHECK-LABEL: vpaddd_maskz_test:
272 ; CHECK-NEXT: vpxord %zmm3, %zmm3, %zmm3
273 ; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
274 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
276 %mask = icmp ne <16 x i32> %mask1, zeroinitializer
277 %x = add <16 x i32> %i, %j
278 %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
; vpaddd_mask_fold_test: like the masked add, but the second addend is loaded
; through %j.ptr. The load is expected to be folded into the {%k1}-predicated
; vpaddd as the (%rdi) memory operand; unselected lanes keep %i.
282 define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
283 ; CHECK-LABEL: vpaddd_mask_fold_test:
285 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
286 ; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
287 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1}
289 %mask = icmp ne <16 x i32> %mask1, zeroinitializer
290 %j = load <16 x i32>* %j.ptr
291 %x = add <16 x i32> %i, %j
292 %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
; vpaddd_mask_broadcast_test: masked add of %i and a splat constant (i32 1).
; The expected vpaddd combines a RIP-relative {1to16} broadcast operand with
; the {%k1} predicate; unselected lanes keep %i.
296 define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
297 ; CHECK-LABEL: vpaddd_mask_broadcast_test:
299 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
300 ; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
301 ; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1}
303 %mask = icmp ne <16 x i32> %mask1, zeroinitializer
304 %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
305 %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
; vpaddd_maskz_fold_test: zero-fallback masked add whose second addend is
; loaded through %j.ptr. The load is expected to be folded into the
; {%k1} {z} vpaddd as the (%rdi) memory operand.
309 define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
310 ; CHECK-LABEL: vpaddd_maskz_fold_test:
312 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
313 ; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
314 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z}
316 %mask = icmp ne <16 x i32> %mask1, zeroinitializer
317 %j = load <16 x i32>* %j.ptr
318 %x = add <16 x i32> %i, %j
319 %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
; vpaddd_maskz_broadcast_test: zero-fallback masked add of %i and a splat
; constant (i32 1). The expected vpaddd combines a RIP-relative {1to16}
; broadcast operand with the {%k1} {z} predicate.
323 define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
324 ; CHECK-LABEL: vpaddd_maskz_broadcast_test:
326 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
327 ; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
328 ; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z}
330 %mask = icmp ne <16 x i32> %mask1, zeroinitializer
331 %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
332 %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
; vpsubq_test: integer sub of two <8 x i64> values (%i - %j) is expected to
; select a single register-register vpsubq on zmm registers.
336 define <8 x i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
337 ; CHECK-LABEL: vpsubq_test:
339 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
341 %x = sub <8 x i64> %i, %j
; vpsubd_test: integer sub of two <16 x i32> values (%i - %j) is expected to
; select a single register-register vpsubd on zmm registers.
345 define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
346 ; CHECK-LABEL: vpsubd_test:
348 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
350 %x = sub <16 x i32> %i, %j
; vpmulld_test: integer mul of two <16 x i32> values is expected to select a
; single register-register vpmulld on zmm registers.
354 define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) {
355 ; CHECK-LABEL: vpmulld_test:
357 ; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0
359 %x = mul <16 x i32> %i, %j
; sqrtA: a tail call to the external function @sqrtf (declared readnone) on a
; scalar float is expected to be emitted as an inline vsqrtss on xmm0.
363 declare float @sqrtf(float) readnone
364 define float @sqrtA(float %a) nounwind uwtable readnone ssp {
365 ; CHECK-LABEL: sqrtA:
366 ; CHECK: ## BB#0: ## %entry
367 ; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
370 %conv1 = tail call float @sqrtf(float %a) nounwind readnone
; sqrtB: a tail call to the external function @sqrt (declared readnone) on a
; scalar double is expected to be emitted as an inline vsqrtsd on xmm0.
374 declare double @sqrt(double) readnone
375 define double @sqrtB(double %a) nounwind uwtable readnone ssp {
376 ; CHECK-LABEL: sqrtB:
377 ; CHECK: ## BB#0: ## %entry
378 ; CHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
381 %call = tail call double @sqrt(double %a) nounwind readnone
; sqrtC: the @llvm.sqrt.f32 intrinsic on a scalar float is expected to be
; emitted as vsqrtss on xmm0.
385 declare float @llvm.sqrt.f32(float)
386 define float @sqrtC(float %a) nounwind {
387 ; CHECK-LABEL: sqrtC:
389 ; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
391 %b = call float @llvm.sqrt.f32(float %a)
; sqrtD: the @llvm.sqrt.v16f32 intrinsic on a <16 x float> is expected to be
; emitted as a single vsqrtps on a zmm register.
395 declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
396 define <16 x float> @sqrtD(<16 x float> %a) nounwind {
397 ; CHECK-LABEL: sqrtD:
399 ; CHECK-NEXT: vsqrtps %zmm0, %zmm0
401 %b = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
; sqrtE: the @llvm.sqrt.v8f64 intrinsic on a <8 x double> is expected to be
; emitted as a single vsqrtpd on a zmm register.
405 declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
406 define <8 x double> @sqrtE(<8 x double> %a) nounwind {
407 ; CHECK-LABEL: sqrtE:
409 ; CHECK-NEXT: vsqrtpd %zmm0, %zmm0
411 %b = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
; fadd_broadcast: fadd of %a and a splat float constant (the same value in
; all 16 lanes). The expected vaddps uses a RIP-relative memory operand
; carrying the {1to16} broadcast decoration.
415 define <16 x float> @fadd_broadcast(<16 x float> %a) nounwind {
416 ; CHECK-LABEL: fadd_broadcast:
418 ; CHECK-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0
420 %b = fadd <16 x float> %a, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
; addq_broadcast: add of %a and a splat constant (i64 2 in every lane). The
; expected vpaddq uses a RIP-relative memory operand carrying the {1to8}
; broadcast decoration.
424 define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind {
425 ; CHECK-LABEL: addq_broadcast:
427 ; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
429 %b = add <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
; orq_broadcast: bitwise or of %a and a splat constant (i64 2 in every lane).
; The expected vporq uses a RIP-relative memory operand carrying the {1to8}
; broadcast decoration.
433 define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
434 ; CHECK-LABEL: orq_broadcast:
436 ; CHECK-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
438 %b = or <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
; andd512fold: bitwise and of %y and a <16 x i32> loaded through pointer %x.
; The load is expected to be folded into vpandd as the (%rdi) memory operand.
442 define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
443 ; CHECK-LABEL: andd512fold:
444 ; CHECK: ## BB#0: ## %entry
445 ; CHECK-NEXT: vpandd (%rdi), %zmm0, %zmm0
448 %a = load <16 x i32>* %x, align 4
449 %b = and <16 x i32> %y, %a
453 define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) {
454 ; CHECK-LABEL: andqbrst:
455 ; CHECK: ## BB#0: ## %entry
456 ; CHECK-NEXT: vpandq (%rdi){1to8}, %zmm0, %zmm0
459 %a = load i64* %ap, align 8
460 %b = insertelement <8 x i64> undef, i64 %a, i32 0
461 %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
462 %d = and <8 x i64> %p1, %c