1 ; RUN: llc -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s
2 ; RUN: llc -mcpu=x86-64 -mattr=+sse4.1 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE41 %s
3 ; RUN: llc -mcpu=x86-64 -mattr=+avx < %s | FileCheck --check-prefix=AVX %s
5 target triple = "x86_64-unknown-unknown"
7 ; Ensure that the backend no longer emits unnecessary vector insert
8 ; instructions immediately after SSE scalar fp instructions
; Scalar fadd on lane 0: extracts element 0 of %a and of %b, adds them, and
; inserts the sum back into lane 0 of %a. The check lines match an addss
; (vaddss in the AVX run) operating on xmm1 and xmm0.
11 define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
12 ; SSE-LABEL: test_add_ss:
14 ; SSE-NEXT: addss %xmm1, %xmm0
17 ; AVX-LABEL: test_add_ss:
19 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
21 %1 = extractelement <4 x float> %b, i32 0
22 %2 = extractelement <4 x float> %a, i32 0
23 %add = fadd float %2, %1
24 %3 = insertelement <4 x float> %a, float %add, i32 0
; Scalar fsub on lane 0: computes a[0] - b[0] and inserts the difference back
; into lane 0 of %a. The check lines match a subss (vsubss in the AVX run).
28 define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
29 ; SSE-LABEL: test_sub_ss:
31 ; SSE-NEXT: subss %xmm1, %xmm0
34 ; AVX-LABEL: test_sub_ss:
36 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
38 %1 = extractelement <4 x float> %b, i32 0
39 %2 = extractelement <4 x float> %a, i32 0
40 %sub = fsub float %2, %1
41 %3 = insertelement <4 x float> %a, float %sub, i32 0
; Scalar fmul on lane 0: computes a[0] * b[0] and inserts the product back
; into lane 0 of %a. The check lines match a mulss (vmulss in the AVX run).
45 define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
46 ; SSE-LABEL: test_mul_ss:
48 ; SSE-NEXT: mulss %xmm1, %xmm0
51 ; AVX-LABEL: test_mul_ss:
53 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
55 %1 = extractelement <4 x float> %b, i32 0
56 %2 = extractelement <4 x float> %a, i32 0
57 %mul = fmul float %2, %1
58 %3 = insertelement <4 x float> %a, float %mul, i32 0
; Scalar fdiv on lane 0: computes a[0] / b[0] and inserts the quotient back
; into lane 0 of %a. The check lines match a divss (vdivss in the AVX run).
62 define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
63 ; SSE-LABEL: test_div_ss:
65 ; SSE-NEXT: divss %xmm1, %xmm0
68 ; AVX-LABEL: test_div_ss:
70 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
72 %1 = extractelement <4 x float> %b, i32 0
73 %2 = extractelement <4 x float> %a, i32 0
74 %div = fdiv float %2, %1
75 %3 = insertelement <4 x float> %a, float %div, i32 0
; Scalar sqrt on lane 0: extracts element 0 of %a, calls llvm.sqrt.f32 on it,
; and inserts the result back into lane 0 of %a. The check lines match two
; instructions per run: sqrtss into xmm1 followed by movss (SSE2 run),
; blendps (SSE4.1 run) or vblendps (AVX run) merging lane 0 into xmm0.
79 define <4 x float> @test_sqrt_ss(<4 x float> %a) {
80 ; SSE2-LABEL: test_sqrt_ss:
82 ; SSE2-NEXT: sqrtss %xmm0, %xmm1
83 ; SSE2-NEXT: movss %xmm1, %xmm0
86 ; SSE41-LABEL: test_sqrt_ss:
88 ; SSE41-NEXT: sqrtss %xmm0, %xmm1
89 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
92 ; AVX-LABEL: test_sqrt_ss:
94 ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm1
95 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
97 %1 = extractelement <4 x float> %a, i32 0
98 %2 = call float @llvm.sqrt.f32(float %1)
99 %3 = insertelement <4 x float> %a, float %2, i32 0
; Declaration of the single-precision sqrt intrinsic called by test_sqrt_ss.
102 declare float @llvm.sqrt.f32(float)
; Double-precision scalar fadd on lane 0: adds a[0] and b[0] and inserts the
; sum back into lane 0 of %a. The check lines match an addsd (vaddsd in the
; AVX run).
104 define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
105 ; SSE-LABEL: test_add_sd:
107 ; SSE-NEXT: addsd %xmm1, %xmm0
110 ; AVX-LABEL: test_add_sd:
112 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
114 %1 = extractelement <2 x double> %b, i32 0
115 %2 = extractelement <2 x double> %a, i32 0
116 %add = fadd double %2, %1
117 %3 = insertelement <2 x double> %a, double %add, i32 0
; Double-precision scalar fsub on lane 0: computes a[0] - b[0] and inserts the
; difference back into lane 0 of %a. The check lines match a subsd (vsubsd in
; the AVX run).
121 define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
122 ; SSE-LABEL: test_sub_sd:
124 ; SSE-NEXT: subsd %xmm1, %xmm0
127 ; AVX-LABEL: test_sub_sd:
129 ; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
131 %1 = extractelement <2 x double> %b, i32 0
132 %2 = extractelement <2 x double> %a, i32 0
133 %sub = fsub double %2, %1
134 %3 = insertelement <2 x double> %a, double %sub, i32 0
; Double-precision scalar fmul on lane 0: computes a[0] * b[0] and inserts the
; product back into lane 0 of %a. The check lines match a mulsd (vmulsd in the
; AVX run).
138 define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
139 ; SSE-LABEL: test_mul_sd:
141 ; SSE-NEXT: mulsd %xmm1, %xmm0
144 ; AVX-LABEL: test_mul_sd:
146 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
148 %1 = extractelement <2 x double> %b, i32 0
149 %2 = extractelement <2 x double> %a, i32 0
150 %mul = fmul double %2, %1
151 %3 = insertelement <2 x double> %a, double %mul, i32 0
; Double-precision scalar fdiv on lane 0: computes a[0] / b[0] and inserts the
; quotient back into lane 0 of %a. The check lines match a divsd (vdivsd in
; the AVX run).
155 define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
156 ; SSE-LABEL: test_div_sd:
158 ; SSE-NEXT: divsd %xmm1, %xmm0
161 ; AVX-LABEL: test_div_sd:
163 ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
165 %1 = extractelement <2 x double> %b, i32 0
166 %2 = extractelement <2 x double> %a, i32 0
167 %div = fdiv double %2, %1
168 %3 = insertelement <2 x double> %a, double %div, i32 0
; Double-precision scalar sqrt on lane 0: extracts element 0 of %a, calls
; llvm.sqrt.f64 on it, and inserts the result back into lane 0 of %a. The
; check lines match two instructions per run: sqrtsd into xmm1 followed by
; movsd into xmm0 (vsqrtsd followed by vmovsd in the AVX run).
172 define <2 x double> @test_sqrt_sd(<2 x double> %a) {
173 ; SSE-LABEL: test_sqrt_sd:
175 ; SSE-NEXT: sqrtsd %xmm0, %xmm1
176 ; SSE-NEXT: movsd %xmm1, %xmm0
179 ; AVX-LABEL: test_sqrt_sd:
181 ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm1
182 ; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0
184 %1 = extractelement <2 x double> %a, i32 0
185 %2 = call double @llvm.sqrt.f64(double %1)
186 %3 = insertelement <2 x double> %a, double %2, i32 0
; Declaration of the double-precision sqrt intrinsic called by test_sqrt_sd.
189 declare double @llvm.sqrt.f64(double)
; Variant of the scalar fadd test in which a[0] + b[0] is inserted into lane 0
; of %b instead of %a. The SSE check lines match addss accumulating into xmm1
; followed by a movaps into xmm0; the AVX check line matches one vaddss.
191 define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
192 ; SSE-LABEL: test2_add_ss:
194 ; SSE-NEXT: addss %xmm0, %xmm1
195 ; SSE-NEXT: movaps %xmm1, %xmm0
198 ; AVX-LABEL: test2_add_ss:
200 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
202 %1 = extractelement <4 x float> %a, i32 0
203 %2 = extractelement <4 x float> %b, i32 0
204 %add = fadd float %1, %2
205 %3 = insertelement <4 x float> %b, float %add, i32 0
; Variant of the scalar fsub test in which b[0] - a[0] is inserted into lane 0
; of %b. The SSE check lines match subss accumulating into xmm1 followed by a
; movaps into xmm0; the AVX check line matches one vsubss.
209 define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
210 ; SSE-LABEL: test2_sub_ss:
212 ; SSE-NEXT: subss %xmm0, %xmm1
213 ; SSE-NEXT: movaps %xmm1, %xmm0
216 ; AVX-LABEL: test2_sub_ss:
218 ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
220 %1 = extractelement <4 x float> %a, i32 0
221 %2 = extractelement <4 x float> %b, i32 0
222 %sub = fsub float %2, %1
223 %3 = insertelement <4 x float> %b, float %sub, i32 0
; Variant of the scalar fmul test in which a[0] * b[0] is inserted into lane 0
; of %b instead of %a. The SSE check lines match mulss accumulating into xmm1
; followed by a movaps into xmm0; the AVX check line matches one vmulss.
227 define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
228 ; SSE-LABEL: test2_mul_ss:
230 ; SSE-NEXT: mulss %xmm0, %xmm1
231 ; SSE-NEXT: movaps %xmm1, %xmm0
234 ; AVX-LABEL: test2_mul_ss:
236 ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
238 %1 = extractelement <4 x float> %a, i32 0
239 %2 = extractelement <4 x float> %b, i32 0
240 %mul = fmul float %1, %2
241 %3 = insertelement <4 x float> %b, float %mul, i32 0
; Variant of the scalar fdiv test in which b[0] / a[0] is inserted into lane 0
; of %b. The SSE check lines match divss accumulating into xmm1 followed by a
; movaps into xmm0; the AVX check line matches one vdivss.
245 define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
246 ; SSE-LABEL: test2_div_ss:
248 ; SSE-NEXT: divss %xmm0, %xmm1
249 ; SSE-NEXT: movaps %xmm1, %xmm0
252 ; AVX-LABEL: test2_div_ss:
254 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
256 %1 = extractelement <4 x float> %a, i32 0
257 %2 = extractelement <4 x float> %b, i32 0
258 %div = fdiv float %2, %1
259 %3 = insertelement <4 x float> %b, float %div, i32 0
; Double-precision variant in which a[0] + b[0] is inserted into lane 0 of %b
; instead of %a. The SSE check lines match addsd accumulating into xmm1
; followed by a movapd into xmm0; the AVX check line matches one vaddsd.
263 define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
264 ; SSE-LABEL: test2_add_sd:
266 ; SSE-NEXT: addsd %xmm0, %xmm1
267 ; SSE-NEXT: movapd %xmm1, %xmm0
270 ; AVX-LABEL: test2_add_sd:
272 ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
274 %1 = extractelement <2 x double> %a, i32 0
275 %2 = extractelement <2 x double> %b, i32 0
276 %add = fadd double %1, %2
277 %3 = insertelement <2 x double> %b, double %add, i32 0
; Double-precision variant in which b[0] - a[0] is inserted into lane 0 of %b.
; The SSE check lines match subsd accumulating into xmm1 followed by a movapd
; into xmm0; the AVX check line matches one vsubsd.
281 define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
282 ; SSE-LABEL: test2_sub_sd:
284 ; SSE-NEXT: subsd %xmm0, %xmm1
285 ; SSE-NEXT: movapd %xmm1, %xmm0
288 ; AVX-LABEL: test2_sub_sd:
290 ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
292 %1 = extractelement <2 x double> %a, i32 0
293 %2 = extractelement <2 x double> %b, i32 0
294 %sub = fsub double %2, %1
295 %3 = insertelement <2 x double> %b, double %sub, i32 0
; Double-precision variant in which a[0] * b[0] is inserted into lane 0 of %b
; instead of %a. The SSE check lines match mulsd accumulating into xmm1
; followed by a movapd into xmm0; the AVX check line matches one vmulsd.
299 define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
300 ; SSE-LABEL: test2_mul_sd:
302 ; SSE-NEXT: mulsd %xmm0, %xmm1
303 ; SSE-NEXT: movapd %xmm1, %xmm0
306 ; AVX-LABEL: test2_mul_sd:
308 ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
310 %1 = extractelement <2 x double> %a, i32 0
311 %2 = extractelement <2 x double> %b, i32 0
312 %mul = fmul double %1, %2
313 %3 = insertelement <2 x double> %b, double %mul, i32 0
; Double-precision variant in which b[0] / a[0] is inserted into lane 0 of %b.
; The SSE check lines match divsd accumulating into xmm1 followed by a movapd
; into xmm0; the AVX check line matches one vdivsd.
317 define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
318 ; SSE-LABEL: test2_div_sd:
320 ; SSE-NEXT: divsd %xmm0, %xmm1
321 ; SSE-NEXT: movapd %xmm1, %xmm0
324 ; AVX-LABEL: test2_div_sd:
326 ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
328 %1 = extractelement <2 x double> %a, i32 0
329 %2 = extractelement <2 x double> %b, i32 0
330 %div = fdiv double %2, %1
331 %3 = insertelement <2 x double> %b, double %div, i32 0
; Two chained scalar fadds: %add = a[0] + b[0], then %add2 = a[0] + %add, with
; %add2 inserted into lane 0 of %a. The check lines match two consecutive
; addss instructions (two vaddss in the AVX run).
335 define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
336 ; SSE-LABEL: test_multiple_add_ss:
338 ; SSE-NEXT: addss %xmm0, %xmm1
339 ; SSE-NEXT: addss %xmm1, %xmm0
342 ; AVX-LABEL: test_multiple_add_ss:
344 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1
345 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
347 %1 = extractelement <4 x float> %b, i32 0
348 %2 = extractelement <4 x float> %a, i32 0
349 %add = fadd float %2, %1
350 %add2 = fadd float %2, %add
351 %3 = insertelement <4 x float> %a, float %add2, i32 0
; Two chained scalar fsubs: %sub = a[0] - b[0], then %sub2 = a[0] - %sub, with
; %sub2 inserted into lane 0 of %a. The SSE check lines match a movaps copy of
; xmm0 into xmm2 followed by two subss; the AVX check lines match two vsubss.
355 define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
356 ; SSE-LABEL: test_multiple_sub_ss:
358 ; SSE-NEXT: movaps %xmm0, %xmm2
359 ; SSE-NEXT: subss %xmm1, %xmm2
360 ; SSE-NEXT: subss %xmm2, %xmm0
363 ; AVX-LABEL: test_multiple_sub_ss:
365 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm1
366 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
368 %1 = extractelement <4 x float> %b, i32 0
369 %2 = extractelement <4 x float> %a, i32 0
370 %sub = fsub float %2, %1
371 %sub2 = fsub float %2, %sub
372 %3 = insertelement <4 x float> %a, float %sub2, i32 0
; Two chained scalar fmuls: %mul = a[0] * b[0], then %mul2 = a[0] * %mul, with
; %mul2 inserted into lane 0 of %a. The check lines match two consecutive
; mulss instructions (two vmulss in the AVX run).
376 define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
377 ; SSE-LABEL: test_multiple_mul_ss:
379 ; SSE-NEXT: mulss %xmm0, %xmm1
380 ; SSE-NEXT: mulss %xmm1, %xmm0
383 ; AVX-LABEL: test_multiple_mul_ss:
385 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1
386 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
388 %1 = extractelement <4 x float> %b, i32 0
389 %2 = extractelement <4 x float> %a, i32 0
390 %mul = fmul float %2, %1
391 %mul2 = fmul float %2, %mul
392 %3 = insertelement <4 x float> %a, float %mul2, i32 0
; Two chained scalar fdivs: %div = a[0] / b[0], then %div2 = a[0] / %div, with
; %div2 inserted into lane 0 of %a. The SSE check lines match a movaps copy of
; xmm0 into xmm2 followed by two divss; the AVX check lines match two vdivss.
396 define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
397 ; SSE-LABEL: test_multiple_div_ss:
399 ; SSE-NEXT: movaps %xmm0, %xmm2
400 ; SSE-NEXT: divss %xmm1, %xmm2
401 ; SSE-NEXT: divss %xmm2, %xmm0
404 ; AVX-LABEL: test_multiple_div_ss:
406 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm1
407 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
409 %1 = extractelement <4 x float> %b, i32 0
410 %2 = extractelement <4 x float> %a, i32 0
411 %div = fdiv float %2, %1
412 %div2 = fdiv float %2, %div
413 %3 = insertelement <4 x float> %a, float %div2, i32 0
417 ; With SSE4.1 or greater, the shuffles in the following tests may
418 ; be lowered to X86Blendi nodes.
; Adds the scalar argument %b to a[0], places the sum in lane 0 of an
; otherwise-undef vector, then shuffles it with %a (mask 0,5,6,7) so lanes 1-3
; come from %a. The check lines match an addss (vaddss in the AVX run).
420 define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
421 ; SSE-LABEL: blend_add_ss:
423 ; SSE-NEXT: addss %xmm1, %xmm0
426 ; AVX-LABEL: blend_add_ss:
428 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
431 %ext = extractelement <4 x float> %a, i32 0
432 %op = fadd float %b, %ext
433 %ins = insertelement <4 x float> undef, float %op, i32 0
434 %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
435 ret <4 x float> %shuf
; Computes a[0] - %b (scalar argument), places the difference in lane 0 of an
; otherwise-undef vector, then shuffles it with %a (mask 0,5,6,7) so lanes 1-3
; come from %a. The check lines match a subss (vsubss in the AVX run).
438 define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
439 ; SSE-LABEL: blend_sub_ss:
441 ; SSE-NEXT: subss %xmm1, %xmm0
444 ; AVX-LABEL: blend_sub_ss:
446 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
449 %ext = extractelement <4 x float> %a, i32 0
450 %op = fsub float %ext, %b
451 %ins = insertelement <4 x float> undef, float %op, i32 0
452 %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
453 ret <4 x float> %shuf
; Multiplies the scalar argument %b by a[0], places the product in lane 0 of
; an otherwise-undef vector, then shuffles it with %a (mask 0,5,6,7) so lanes
; 1-3 come from %a. The check lines match a mulss (vmulss in the AVX run).
456 define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
457 ; SSE-LABEL: blend_mul_ss:
459 ; SSE-NEXT: mulss %xmm1, %xmm0
462 ; AVX-LABEL: blend_mul_ss:
464 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
467 %ext = extractelement <4 x float> %a, i32 0
468 %op = fmul float %b, %ext
469 %ins = insertelement <4 x float> undef, float %op, i32 0
470 %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
471 ret <4 x float> %shuf
; Computes a[0] / %b (scalar argument), places the quotient in lane 0 of an
; otherwise-undef vector, then shuffles it with %a (mask 0,5,6,7) so lanes 1-3
; come from %a. The check lines match a divss (vdivss in the AVX run).
474 define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
475 ; SSE-LABEL: blend_div_ss:
477 ; SSE-NEXT: divss %xmm1, %xmm0
480 ; AVX-LABEL: blend_div_ss:
482 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
485 %ext = extractelement <4 x float> %a, i32 0
486 %op = fdiv float %ext, %b
487 %ins = insertelement <4 x float> undef, float %op, i32 0
488 %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
489 ret <4 x float> %shuf
; Adds the scalar double argument %b to a[0], places the sum in lane 0 of an
; otherwise-undef vector, then shuffles it with %a (mask 0,3) so lane 1 comes
; from %a. The check lines match an addsd (vaddsd in the AVX run).
492 define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
493 ; SSE-LABEL: blend_add_sd:
495 ; SSE-NEXT: addsd %xmm1, %xmm0
498 ; AVX-LABEL: blend_add_sd:
500 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
503 %ext = extractelement <2 x double> %a, i32 0
504 %op = fadd double %b, %ext
505 %ins = insertelement <2 x double> undef, double %op, i32 0
506 %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
507 ret <2 x double> %shuf
; Computes a[0] - %b (scalar double argument), places the difference in lane 0
; of an otherwise-undef vector, then shuffles it with %a (mask 0,3) so lane 1
; comes from %a. The check lines match a subsd (vsubsd in the AVX run).
510 define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
511 ; SSE-LABEL: blend_sub_sd:
513 ; SSE-NEXT: subsd %xmm1, %xmm0
516 ; AVX-LABEL: blend_sub_sd:
518 ; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
521 %ext = extractelement <2 x double> %a, i32 0
522 %op = fsub double %ext, %b
523 %ins = insertelement <2 x double> undef, double %op, i32 0
524 %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
525 ret <2 x double> %shuf
; Multiplies the scalar double argument %b by a[0], places the product in lane
; 0 of an otherwise-undef vector, then shuffles it with %a (mask 0,3) so lane 1
; comes from %a. The check lines match a mulsd (vmulsd in the AVX run).
528 define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
529 ; SSE-LABEL: blend_mul_sd:
531 ; SSE-NEXT: mulsd %xmm1, %xmm0
534 ; AVX-LABEL: blend_mul_sd:
536 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
539 %ext = extractelement <2 x double> %a, i32 0
540 %op = fmul double %b, %ext
541 %ins = insertelement <2 x double> undef, double %op, i32 0
542 %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
543 ret <2 x double> %shuf
; Computes a[0] / %b (scalar double argument), places the quotient in lane 0
; of an otherwise-undef vector, then shuffles it with %a (mask 0,3) so lane 1
; comes from %a. The check lines match a divsd (vdivsd in the AVX run).
546 define <2 x double> @blend_div_sd(<2 x double> %a, double %b) {
547 ; SSE-LABEL: blend_div_sd:
549 ; SSE-NEXT: divsd %xmm1, %xmm0
552 ; AVX-LABEL: blend_div_sd:
554 ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
557 %ext = extractelement <2 x double> %a, i32 0
558 %op = fdiv double %ext, %b
559 %ins = insertelement <2 x double> undef, double %op, i32 0
560 %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
561 ret <2 x double> %shuf
564 ; Ensure that the backend selects SSE/AVX scalar fp instructions
565 ; from a packed fp instruction plus a vector insert.
; Packed fadd of %a and %b followed by a shuffle (mask 0,5,6,7) that keeps
; only lane 0 of the sum and takes lanes 1-3 from %a. The check lines match a
; scalar addss (vaddss in the AVX run).
567 define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
568 ; SSE-LABEL: insert_test_add_ss:
570 ; SSE-NEXT: addss %xmm1, %xmm0
573 ; AVX-LABEL: insert_test_add_ss:
575 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
577 %1 = fadd <4 x float> %a, %b
578 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fsub (%a - %b) followed by a shuffle (mask 0,5,6,7) that keeps only
; lane 0 of the difference and takes lanes 1-3 from %a. The check lines match
; a scalar subss (vsubss in the AVX run).
582 define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) {
583 ; SSE-LABEL: insert_test_sub_ss:
585 ; SSE-NEXT: subss %xmm1, %xmm0
588 ; AVX-LABEL: insert_test_sub_ss:
590 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
592 %1 = fsub <4 x float> %a, %b
593 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fmul of %a and %b followed by a shuffle (mask 0,5,6,7) that keeps
; only lane 0 of the product and takes lanes 1-3 from %a. The check lines
; match a scalar mulss (vmulss in the AVX run).
597 define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) {
598 ; SSE-LABEL: insert_test_mul_ss:
600 ; SSE-NEXT: mulss %xmm1, %xmm0
603 ; AVX-LABEL: insert_test_mul_ss:
605 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
607 %1 = fmul <4 x float> %a, %b
608 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fdiv (%a / %b) followed by a shuffle (mask 0,5,6,7) that keeps only
; lane 0 of the quotient and takes lanes 1-3 from %a. The check lines match a
; scalar divss (vdivss in the AVX run).
612 define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) {
613 ; SSE-LABEL: insert_test_div_ss:
615 ; SSE-NEXT: divss %xmm1, %xmm0
618 ; AVX-LABEL: insert_test_div_ss:
620 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
622 %1 = fdiv <4 x float> %a, %b
623 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed double fadd of %a and %b followed by a shuffle (mask 0,3) that keeps
; only lane 0 of the sum and takes lane 1 from %a. The check lines match a
; scalar addsd (vaddsd in the AVX run).
627 define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) {
628 ; SSE-LABEL: insert_test_add_sd:
630 ; SSE-NEXT: addsd %xmm1, %xmm0
633 ; AVX-LABEL: insert_test_add_sd:
635 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
637 %1 = fadd <2 x double> %a, %b
638 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
; Packed double fsub (%a - %b) followed by a shuffle (mask 0,3) that keeps
; only lane 0 of the difference and takes lane 1 from %a. The check lines
; match a scalar subsd (vsubsd in the AVX run).
642 define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) {
643 ; SSE-LABEL: insert_test_sub_sd:
645 ; SSE-NEXT: subsd %xmm1, %xmm0
648 ; AVX-LABEL: insert_test_sub_sd:
650 ; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
652 %1 = fsub <2 x double> %a, %b
653 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
; Packed double fmul of %a and %b followed by a shuffle (mask 0,3) that keeps
; only lane 0 of the product and takes lane 1 from %a. The check lines match
; a scalar mulsd (vmulsd in the AVX run).
657 define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) {
658 ; SSE-LABEL: insert_test_mul_sd:
660 ; SSE-NEXT: mulsd %xmm1, %xmm0
663 ; AVX-LABEL: insert_test_mul_sd:
665 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
667 %1 = fmul <2 x double> %a, %b
668 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
; Packed double fdiv (%a / %b) followed by a shuffle (mask 0,3) that keeps
; only lane 0 of the quotient and takes lane 1 from %a. The check lines match
; a scalar divsd (vdivsd in the AVX run).
672 define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) {
673 ; SSE-LABEL: insert_test_div_sd:
675 ; SSE-NEXT: divsd %xmm1, %xmm0
678 ; AVX-LABEL: insert_test_div_sd:
680 ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
682 %1 = fdiv <2 x double> %a, %b
683 %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
; Packed fadd (%b + %a) followed by a shuffle (mask 0,5,6,7) that keeps lane 0
; of the sum and takes lanes 1-3 from %b. The SSE check lines match addss into
; xmm1 followed by a movaps into xmm0; the AVX check line matches one vaddss.
687 define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) {
688 ; SSE-LABEL: insert_test2_add_ss:
690 ; SSE-NEXT: addss %xmm0, %xmm1
691 ; SSE-NEXT: movaps %xmm1, %xmm0
694 ; AVX-LABEL: insert_test2_add_ss:
696 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
698 %1 = fadd <4 x float> %b, %a
699 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fsub (%b - %a) followed by a shuffle (mask 0,5,6,7) that keeps lane 0
; of the difference and takes lanes 1-3 from %b. The SSE check lines match
; subss into xmm1 followed by a movaps into xmm0; the AVX check line matches
; one vsubss.
703 define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) {
704 ; SSE-LABEL: insert_test2_sub_ss:
706 ; SSE-NEXT: subss %xmm0, %xmm1
707 ; SSE-NEXT: movaps %xmm1, %xmm0
710 ; AVX-LABEL: insert_test2_sub_ss:
712 ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
714 %1 = fsub <4 x float> %b, %a
715 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fmul (%b * %a) followed by a shuffle (mask 0,5,6,7) that keeps lane 0
; of the product and takes lanes 1-3 from %b. The SSE check lines match mulss
; into xmm1 followed by a movaps into xmm0; the AVX check line matches one
; vmulss.
719 define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) {
720 ; SSE-LABEL: insert_test2_mul_ss:
722 ; SSE-NEXT: mulss %xmm0, %xmm1
723 ; SSE-NEXT: movaps %xmm1, %xmm0
726 ; AVX-LABEL: insert_test2_mul_ss:
728 ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
730 %1 = fmul <4 x float> %b, %a
731 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed fdiv (%b / %a) followed by a shuffle (mask 0,5,6,7) that keeps lane 0
; of the quotient and takes lanes 1-3 from %b. The SSE check lines match divss
; into xmm1 followed by a movaps into xmm0; the AVX check line matches one
; vdivss.
735 define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) {
736 ; SSE-LABEL: insert_test2_div_ss:
738 ; SSE-NEXT: divss %xmm0, %xmm1
739 ; SSE-NEXT: movaps %xmm1, %xmm0
742 ; AVX-LABEL: insert_test2_div_ss:
744 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
746 %1 = fdiv <4 x float> %b, %a
747 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; Packed double fadd (%b + %a) followed by a shuffle (mask 0,3) that keeps
; lane 0 of the sum and takes lane 1 from %b. The SSE check lines match addsd
; into xmm1 followed by a movapd into xmm0; the AVX check line matches one
; vaddsd.
751 define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) {
752 ; SSE-LABEL: insert_test2_add_sd:
754 ; SSE-NEXT: addsd %xmm0, %xmm1
755 ; SSE-NEXT: movapd %xmm1, %xmm0
758 ; AVX-LABEL: insert_test2_add_sd:
760 ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
762 %1 = fadd <2 x double> %b, %a
763 %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
; Packed double fsub (%b - %a) followed by a shuffle (mask 0,3) that keeps
; lane 0 of the difference and takes lane 1 from %b. The SSE check lines match
; subsd into xmm1 followed by a movapd into xmm0; the AVX check line matches
; one vsubsd.
767 define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) {
768 ; SSE-LABEL: insert_test2_sub_sd:
770 ; SSE-NEXT: subsd %xmm0, %xmm1
771 ; SSE-NEXT: movapd %xmm1, %xmm0
774 ; AVX-LABEL: insert_test2_sub_sd:
776 ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
778 %1 = fsub <2 x double> %b, %a
779 %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
; Packed double fmul (%b * %a) followed by a shuffle (mask 0,3) that keeps
; lane 0 of the product and takes lane 1 from %b. The SSE check lines match
; mulsd into xmm1 followed by a movapd into xmm0; the AVX check line matches
; one vmulsd.
783 define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) {
784 ; SSE-LABEL: insert_test2_mul_sd:
786 ; SSE-NEXT: mulsd %xmm0, %xmm1
787 ; SSE-NEXT: movapd %xmm1, %xmm0
790 ; AVX-LABEL: insert_test2_mul_sd:
792 ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
794 %1 = fmul <2 x double> %b, %a
795 %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
; Packed double fdiv (%b / %a) followed by a shuffle (mask 0,3) that keeps
; lane 0 of the quotient and takes lane 1 from %b. The SSE check lines match
; divsd into xmm1 followed by a movapd into xmm0; the AVX check line matches
; one vdivsd.
799 define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) {
800 ; SSE-LABEL: insert_test2_div_sd:
802 ; SSE-NEXT: divsd %xmm0, %xmm1
803 ; SSE-NEXT: movapd %xmm1, %xmm0
806 ; AVX-LABEL: insert_test2_div_sd:
808 ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
810 %1 = fdiv <2 x double> %b, %a
811 %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
; Packed fadd of %a and %b followed by a constant-mask select
; (false,true,true,true) that takes lane 0 from the sum and lanes 1-3 from %a.
; The check lines match a scalar addss (vaddss in the AVX run).
815 define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) {
816 ; SSE-LABEL: insert_test3_add_ss:
818 ; SSE-NEXT: addss %xmm1, %xmm0
821 ; AVX-LABEL: insert_test3_add_ss:
823 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
825 %1 = fadd <4 x float> %a, %b
826 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
; Packed fsub (%a - %b) followed by a constant-mask select
; (false,true,true,true) that takes lane 0 from the difference and lanes 1-3
; from %a. The check lines match a scalar subss (vsubss in the AVX run).
830 define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) {
831 ; SSE-LABEL: insert_test3_sub_ss:
833 ; SSE-NEXT: subss %xmm1, %xmm0
836 ; AVX-LABEL: insert_test3_sub_ss:
838 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
840 %1 = fsub <4 x float> %a, %b
841 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
; Packed fmul of %a and %b followed by a constant-mask select
; (false,true,true,true) that takes lane 0 from the product and lanes 1-3 from
; %a. The check lines match a scalar mulss (vmulss in the AVX run).
845 define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) {
846 ; SSE-LABEL: insert_test3_mul_ss:
848 ; SSE-NEXT: mulss %xmm1, %xmm0
851 ; AVX-LABEL: insert_test3_mul_ss:
853 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
855 %1 = fmul <4 x float> %a, %b
856 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
; Packed fdiv (%a / %b) followed by a constant-mask select
; (false,true,true,true) that takes lane 0 from the quotient and lanes 1-3
; from %a. The check lines match a scalar divss (vdivss in the AVX run).
860 define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) {
861 ; SSE-LABEL: insert_test3_div_ss:
863 ; SSE-NEXT: divss %xmm1, %xmm0
866 ; AVX-LABEL: insert_test3_div_ss:
868 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
870 %1 = fdiv <4 x float> %a, %b
871 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
; Packed double fadd of %a and %b followed by a constant-mask select
; (false,true) that takes lane 0 from the sum and lane 1 from %a. The check
; lines match a scalar addsd (vaddsd in the AVX run).
875 define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) {
876 ; SSE-LABEL: insert_test3_add_sd:
878 ; SSE-NEXT: addsd %xmm1, %xmm0
881 ; AVX-LABEL: insert_test3_add_sd:
883 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
885 %1 = fadd <2 x double> %a, %b
886 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
; Packed double fsub (%a - %b) followed by a constant-mask select (false,true)
; that takes lane 0 from the difference and lane 1 from %a. The check lines
; match a scalar subsd (vsubsd in the AVX run).
890 define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) {
891 ; SSE-LABEL: insert_test3_sub_sd:
893 ; SSE-NEXT: subsd %xmm1, %xmm0
896 ; AVX-LABEL: insert_test3_sub_sd:
898 ; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
900 %1 = fsub <2 x double> %a, %b
901 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
; Packed double fmul of %a and %b followed by a constant-mask select
; (false,true) that takes lane 0 from the product and lane 1 from %a. The
; check lines match a scalar mulsd (vmulsd in the AVX run).
905 define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) {
906 ; SSE-LABEL: insert_test3_mul_sd:
908 ; SSE-NEXT: mulsd %xmm1, %xmm0
911 ; AVX-LABEL: insert_test3_mul_sd:
913 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
915 %1 = fmul <2 x double> %a, %b
916 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
; Packed double fdiv (%a / %b) followed by a constant-mask select (false,true)
; that takes lane 0 from the quotient and lane 1 from %a. The check lines
; match a scalar divsd (vdivsd in the AVX run).
920 define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) {
921 ; SSE-LABEL: insert_test3_div_sd:
923 ; SSE-NEXT: divsd %xmm1, %xmm0
926 ; AVX-LABEL: insert_test3_div_sd:
928 ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
930 %1 = fdiv <2 x double> %a, %b
931 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
; Packed fadd (%b + %a) followed by a constant-mask select
; (false,true,true,true) that takes lane 0 from the sum and lanes 1-3 from %b.
; The SSE check lines match addss into xmm1 followed by a movaps into xmm0;
; the AVX check line matches one vaddss.
935 define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) {
936 ; SSE-LABEL: insert_test4_add_ss:
938 ; SSE-NEXT: addss %xmm0, %xmm1
939 ; SSE-NEXT: movaps %xmm1, %xmm0
942 ; AVX-LABEL: insert_test4_add_ss:
944 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
946 %1 = fadd <4 x float> %b, %a
947 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
; Packed fsub (%b - %a) followed by a constant-mask select
; (false,true,true,true) that takes lane 0 from the difference and lanes 1-3
; from %b. The SSE check lines match subss into xmm1 followed by a movaps into
; xmm0; the AVX check line matches one vsubss.
951 define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) {
952 ; SSE-LABEL: insert_test4_sub_ss:
954 ; SSE-NEXT: subss %xmm0, %xmm1
955 ; SSE-NEXT: movaps %xmm1, %xmm0
958 ; AVX-LABEL: insert_test4_sub_ss:
960 ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
962 %1 = fsub <4 x float> %b, %a
963 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
; Packed fmul (%b * %a) followed by a constant-mask select
; (false,true,true,true) that takes lane 0 from the product and lanes 1-3 from
; %b. The SSE check lines match mulss into xmm1 followed by a movaps into
; xmm0; the AVX check line matches one vmulss.
967 define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) {
968 ; SSE-LABEL: insert_test4_mul_ss:
970 ; SSE-NEXT: mulss %xmm0, %xmm1
971 ; SSE-NEXT: movaps %xmm1, %xmm0
974 ; AVX-LABEL: insert_test4_mul_ss:
976 ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
978 %1 = fmul <4 x float> %b, %a
979 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
; Packed fdiv (%b / %a) followed by a constant-mask select
; (false,true,true,true) that takes lane 0 from the quotient and lanes 1-3
; from %b. The SSE check lines match divss into xmm1 followed by a movaps into
; xmm0; the AVX check line matches one vdivss.
983 define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) {
984 ; SSE-LABEL: insert_test4_div_ss:
986 ; SSE-NEXT: divss %xmm0, %xmm1
987 ; SSE-NEXT: movaps %xmm1, %xmm0
990 ; AVX-LABEL: insert_test4_div_ss:
992 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
994 %1 = fdiv <4 x float> %b, %a
995 %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
; Packed double fadd (%b + %a) followed by a constant-mask select (false,true)
; that takes lane 0 from the sum and lane 1 from %b. The SSE check lines match
; addsd into xmm1 followed by a movapd into xmm0; the AVX check line matches
; one vaddsd.
999 define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) {
1000 ; SSE-LABEL: insert_test4_add_sd:
1002 ; SSE-NEXT: addsd %xmm0, %xmm1
1003 ; SSE-NEXT: movapd %xmm1, %xmm0
1006 ; AVX-LABEL: insert_test4_add_sd:
1008 ; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
1010 %1 = fadd <2 x double> %b, %a
1011 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
; Packed double fsub (%b - %a) followed by a constant-mask select (false,true)
; that takes lane 0 from the difference and lane 1 from %b. The SSE check
; lines match subsd into xmm1 followed by a movapd into xmm0; the AVX check
; line matches one vsubsd.
1015 define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) {
1016 ; SSE-LABEL: insert_test4_sub_sd:
1018 ; SSE-NEXT: subsd %xmm0, %xmm1
1019 ; SSE-NEXT: movapd %xmm1, %xmm0
1022 ; AVX-LABEL: insert_test4_sub_sd:
1024 ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
1026 %1 = fsub <2 x double> %b, %a
1027 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
; Packed double fmul (%b * %a) followed by a constant-mask select (false,true)
; that takes lane 0 from the product and lane 1 from %b. The SSE check lines
; match mulsd into xmm1 followed by a movapd into xmm0; the AVX check line
; matches one vmulsd.
1031 define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) {
1032 ; SSE-LABEL: insert_test4_mul_sd:
1034 ; SSE-NEXT: mulsd %xmm0, %xmm1
1035 ; SSE-NEXT: movapd %xmm1, %xmm0
1038 ; AVX-LABEL: insert_test4_mul_sd:
1040 ; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
1042 %1 = fmul <2 x double> %b, %a
1043 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
; Packed double fdiv (%b / %a) followed by a constant-mask select (false,true)
; that takes lane 0 from the quotient and lane 1 from %b. The SSE check lines
; match divsd into xmm1 followed by a movapd into xmm0; the AVX check line
; matches one vdivsd.
1047 define <2 x double> @insert_test4_div_sd(<2 x double> %a, <2 x double> %b) {
1048 ; SSE-LABEL: insert_test4_div_sd:
1050 ; SSE-NEXT: divsd %xmm0, %xmm1
1051 ; SSE-NEXT: movapd %xmm1, %xmm0
1054 ; AVX-LABEL: insert_test4_div_sd:
1056 ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
1058 %1 = fdiv <2 x double> %b, %a
1059 %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1