1 ; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 -x86-experimental-vector-shuffle-legality | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
2 ; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-legality | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
3 ; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-legality | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
4 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-legality | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-legality | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
7 ; Verify that the DAG combiner correctly folds bitwise operations across
8 ; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
9 ; basic and always-safe patterns. Also test that the DAG combiner will combine
10 ; target-specific shuffle instructions where reasonable.
12 target triple = "x86_64-unknown-unknown"
14 declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
15 declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
16 declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
18 define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
19 ; ALL-LABEL: combine_pshufd1:
20 ; ALL: # BB#0: # %entry
23 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
24 %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
28 define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
29 ; ALL-LABEL: combine_pshufd2:
30 ; ALL: # BB#0: # %entry
33 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
34 %b.cast = bitcast <4 x i32> %b to <8 x i16>
35 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
36 %c.cast = bitcast <8 x i16> %c to <4 x i32>
37 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
41 define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
42 ; ALL-LABEL: combine_pshufd3:
43 ; ALL: # BB#0: # %entry
46 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
47 %b.cast = bitcast <4 x i32> %b to <8 x i16>
48 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
49 %c.cast = bitcast <8 x i16> %c to <4 x i32>
50 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
54 define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
55 ; SSE-LABEL: combine_pshufd4:
56 ; SSE: # BB#0: # %entry
57 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
60 ; AVX-LABEL: combine_pshufd4:
61 ; AVX: # BB#0: # %entry
62 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
65 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
66 %b.cast = bitcast <4 x i32> %b to <8 x i16>
67 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
68 %c.cast = bitcast <8 x i16> %c to <4 x i32>
69 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
73 define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
74 ; SSE-LABEL: combine_pshufd5:
75 ; SSE: # BB#0: # %entry
76 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
79 ; AVX-LABEL: combine_pshufd5:
80 ; AVX: # BB#0: # %entry
81 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
84 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
85 %b.cast = bitcast <4 x i32> %b to <8 x i16>
86 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
87 %c.cast = bitcast <8 x i16> %c to <4 x i32>
88 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
92 define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
93 ; SSE-LABEL: combine_pshufd6:
94 ; SSE: # BB#0: # %entry
95 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
98 ; AVX-LABEL: combine_pshufd6:
99 ; AVX: # BB#0: # %entry
100 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
103 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
104 %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
108 define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
109 ; ALL-LABEL: combine_pshuflw1:
110 ; ALL: # BB#0: # %entry
113 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
114 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
118 define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
119 ; ALL-LABEL: combine_pshuflw2:
120 ; ALL: # BB#0: # %entry
123 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
124 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
125 %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
129 define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
130 ; SSE-LABEL: combine_pshuflw3:
131 ; SSE: # BB#0: # %entry
132 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
135 ; AVX-LABEL: combine_pshuflw3:
136 ; AVX: # BB#0: # %entry
137 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
140 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
141 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
142 %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
146 define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
147 ; SSE-LABEL: combine_pshufhw1:
148 ; SSE: # BB#0: # %entry
149 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
152 ; AVX-LABEL: combine_pshufhw1:
153 ; AVX: # BB#0: # %entry
154 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
157 %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
158 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
159 %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
163 define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
164 ; SSE-LABEL: combine_bitwise_ops_test1:
166 ; SSE-NEXT: pand %xmm1, %xmm0
167 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
170 ; AVX-LABEL: combine_bitwise_ops_test1:
172 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
173 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
175 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
176 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
177 %and = and <4 x i32> %shuf1, %shuf2
181 define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
182 ; SSE-LABEL: combine_bitwise_ops_test2:
184 ; SSE-NEXT: por %xmm1, %xmm0
185 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
188 ; AVX-LABEL: combine_bitwise_ops_test2:
190 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
191 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
193 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
194 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
195 %or = or <4 x i32> %shuf1, %shuf2
199 define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
200 ; SSE-LABEL: combine_bitwise_ops_test3:
202 ; SSE-NEXT: pxor %xmm1, %xmm0
203 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
206 ; AVX-LABEL: combine_bitwise_ops_test3:
208 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
209 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
211 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
212 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
213 %xor = xor <4 x i32> %shuf1, %shuf2
217 define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
218 ; SSE-LABEL: combine_bitwise_ops_test4:
220 ; SSE-NEXT: pand %xmm1, %xmm0
221 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
224 ; AVX-LABEL: combine_bitwise_ops_test4:
226 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
227 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
229 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
230 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
231 %and = and <4 x i32> %shuf1, %shuf2
235 define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
236 ; SSE-LABEL: combine_bitwise_ops_test5:
238 ; SSE-NEXT: por %xmm1, %xmm0
239 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
242 ; AVX-LABEL: combine_bitwise_ops_test5:
244 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
245 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
247 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
248 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
249 %or = or <4 x i32> %shuf1, %shuf2
253 define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
254 ; SSE-LABEL: combine_bitwise_ops_test6:
256 ; SSE-NEXT: pxor %xmm1, %xmm0
257 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
260 ; AVX-LABEL: combine_bitwise_ops_test6:
262 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
263 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
265 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
266 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
267 %xor = xor <4 x i32> %shuf1, %shuf2
272 ; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
273 ; are not performing swizzle operations.
275 define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
276 ; SSE2-LABEL: combine_bitwise_ops_test1b:
278 ; SSE2-NEXT: andps %xmm1, %xmm0
279 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
280 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
283 ; SSSE3-LABEL: combine_bitwise_ops_test1b:
285 ; SSSE3-NEXT: andps %xmm1, %xmm0
286 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
287 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
290 ; SSE41-LABEL: combine_bitwise_ops_test1b:
292 ; SSE41-NEXT: pand %xmm1, %xmm0
293 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
296 ; AVX1-LABEL: combine_bitwise_ops_test1b:
298 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
299 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
302 ; AVX2-LABEL: combine_bitwise_ops_test1b:
304 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
305 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
307 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
308 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
309 %and = and <4 x i32> %shuf1, %shuf2
313 define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
314 ; SSE2-LABEL: combine_bitwise_ops_test2b:
316 ; SSE2-NEXT: orps %xmm1, %xmm0
317 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
318 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
321 ; SSSE3-LABEL: combine_bitwise_ops_test2b:
323 ; SSSE3-NEXT: orps %xmm1, %xmm0
324 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
325 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
328 ; SSE41-LABEL: combine_bitwise_ops_test2b:
330 ; SSE41-NEXT: por %xmm1, %xmm0
331 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
334 ; AVX1-LABEL: combine_bitwise_ops_test2b:
336 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
337 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
340 ; AVX2-LABEL: combine_bitwise_ops_test2b:
342 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
343 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
345 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
346 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
347 %or = or <4 x i32> %shuf1, %shuf2
351 define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
352 ; SSE2-LABEL: combine_bitwise_ops_test3b:
354 ; SSE2-NEXT: xorps %xmm1, %xmm0
355 ; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
358 ; SSSE3-LABEL: combine_bitwise_ops_test3b:
360 ; SSSE3-NEXT: xorps %xmm1, %xmm0
361 ; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
364 ; SSE41-LABEL: combine_bitwise_ops_test3b:
366 ; SSE41-NEXT: pxor %xmm1, %xmm0
367 ; SSE41-NEXT: pxor %xmm1, %xmm1
368 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
371 ; AVX1-LABEL: combine_bitwise_ops_test3b:
373 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
374 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
375 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
378 ; AVX2-LABEL: combine_bitwise_ops_test3b:
380 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
381 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
382 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
384 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
385 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
386 %xor = xor <4 x i32> %shuf1, %shuf2
390 define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
391 ; SSE2-LABEL: combine_bitwise_ops_test4b:
393 ; SSE2-NEXT: andps %xmm1, %xmm0
394 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
395 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
396 ; SSE2-NEXT: movaps %xmm2, %xmm0
399 ; SSSE3-LABEL: combine_bitwise_ops_test4b:
401 ; SSSE3-NEXT: andps %xmm1, %xmm0
402 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
403 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
404 ; SSSE3-NEXT: movaps %xmm2, %xmm0
407 ; SSE41-LABEL: combine_bitwise_ops_test4b:
409 ; SSE41-NEXT: pand %xmm1, %xmm0
410 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
413 ; AVX1-LABEL: combine_bitwise_ops_test4b:
415 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
416 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
419 ; AVX2-LABEL: combine_bitwise_ops_test4b:
421 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
422 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
424 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
425 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
426 %and = and <4 x i32> %shuf1, %shuf2
430 define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
431 ; SSE2-LABEL: combine_bitwise_ops_test5b:
433 ; SSE2-NEXT: orps %xmm1, %xmm0
434 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
435 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
436 ; SSE2-NEXT: movaps %xmm2, %xmm0
439 ; SSSE3-LABEL: combine_bitwise_ops_test5b:
441 ; SSSE3-NEXT: orps %xmm1, %xmm0
442 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
443 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
444 ; SSSE3-NEXT: movaps %xmm2, %xmm0
447 ; SSE41-LABEL: combine_bitwise_ops_test5b:
449 ; SSE41-NEXT: por %xmm1, %xmm0
450 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
453 ; AVX1-LABEL: combine_bitwise_ops_test5b:
455 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
456 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
459 ; AVX2-LABEL: combine_bitwise_ops_test5b:
461 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
462 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
464 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
465 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
466 %or = or <4 x i32> %shuf1, %shuf2
470 define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
471 ; SSE2-LABEL: combine_bitwise_ops_test6b:
473 ; SSE2-NEXT: xorps %xmm1, %xmm0
474 ; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
477 ; SSSE3-LABEL: combine_bitwise_ops_test6b:
479 ; SSSE3-NEXT: xorps %xmm1, %xmm0
480 ; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
483 ; SSE41-LABEL: combine_bitwise_ops_test6b:
485 ; SSE41-NEXT: pxor %xmm1, %xmm0
486 ; SSE41-NEXT: pxor %xmm1, %xmm1
487 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
490 ; AVX1-LABEL: combine_bitwise_ops_test6b:
492 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
493 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
494 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
497 ; AVX2-LABEL: combine_bitwise_ops_test6b:
499 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
500 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
501 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
503 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
504 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
505 %xor = xor <4 x i32> %shuf1, %shuf2
509 define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
510 ; SSE2-LABEL: combine_bitwise_ops_test1c:
512 ; SSE2-NEXT: andps %xmm1, %xmm0
513 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
516 ; SSSE3-LABEL: combine_bitwise_ops_test1c:
518 ; SSSE3-NEXT: andps %xmm1, %xmm0
519 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
522 ; SSE41-LABEL: combine_bitwise_ops_test1c:
524 ; SSE41-NEXT: pand %xmm1, %xmm0
525 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
526 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
529 ; AVX1-LABEL: combine_bitwise_ops_test1c:
531 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
532 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
533 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
536 ; AVX2-LABEL: combine_bitwise_ops_test1c:
538 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
539 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
540 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
542 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
543 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
544 %and = and <4 x i32> %shuf1, %shuf2
548 define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
549 ; SSE2-LABEL: combine_bitwise_ops_test2c:
551 ; SSE2-NEXT: orps %xmm1, %xmm0
552 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
555 ; SSSE3-LABEL: combine_bitwise_ops_test2c:
557 ; SSSE3-NEXT: orps %xmm1, %xmm0
558 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
561 ; SSE41-LABEL: combine_bitwise_ops_test2c:
563 ; SSE41-NEXT: por %xmm1, %xmm0
564 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
565 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
568 ; AVX1-LABEL: combine_bitwise_ops_test2c:
570 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
571 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
572 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
575 ; AVX2-LABEL: combine_bitwise_ops_test2c:
577 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
578 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
579 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
581 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
582 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
583 %or = or <4 x i32> %shuf1, %shuf2
587 define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
588 ; SSE2-LABEL: combine_bitwise_ops_test3c:
590 ; SSE2-NEXT: xorps %xmm1, %xmm0
591 ; SSE2-NEXT: xorps %xmm1, %xmm1
592 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
595 ; SSSE3-LABEL: combine_bitwise_ops_test3c:
597 ; SSSE3-NEXT: xorps %xmm1, %xmm0
598 ; SSSE3-NEXT: xorps %xmm1, %xmm1
599 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
602 ; SSE41-LABEL: combine_bitwise_ops_test3c:
604 ; SSE41-NEXT: pxor %xmm1, %xmm0
605 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
606 ; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
609 ; AVX-LABEL: combine_bitwise_ops_test3c:
611 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
612 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
613 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
615 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
616 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
617 %xor = xor <4 x i32> %shuf1, %shuf2
621 define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
622 ; SSE2-LABEL: combine_bitwise_ops_test4c:
624 ; SSE2-NEXT: andps %xmm1, %xmm0
625 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
626 ; SSE2-NEXT: movaps %xmm2, %xmm0
629 ; SSSE3-LABEL: combine_bitwise_ops_test4c:
631 ; SSSE3-NEXT: andps %xmm1, %xmm0
632 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
633 ; SSSE3-NEXT: movaps %xmm2, %xmm0
636 ; SSE41-LABEL: combine_bitwise_ops_test4c:
638 ; SSE41-NEXT: pand %xmm1, %xmm0
639 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
640 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
643 ; AVX1-LABEL: combine_bitwise_ops_test4c:
645 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
646 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
647 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
650 ; AVX2-LABEL: combine_bitwise_ops_test4c:
652 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
653 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
654 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
656 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
657 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
658 %and = and <4 x i32> %shuf1, %shuf2
662 define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
663 ; SSE2-LABEL: combine_bitwise_ops_test5c:
665 ; SSE2-NEXT: orps %xmm1, %xmm0
666 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
667 ; SSE2-NEXT: movaps %xmm2, %xmm0
670 ; SSSE3-LABEL: combine_bitwise_ops_test5c:
672 ; SSSE3-NEXT: orps %xmm1, %xmm0
673 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
674 ; SSSE3-NEXT: movaps %xmm2, %xmm0
677 ; SSE41-LABEL: combine_bitwise_ops_test5c:
679 ; SSE41-NEXT: por %xmm1, %xmm0
680 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
681 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
684 ; AVX1-LABEL: combine_bitwise_ops_test5c:
686 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
687 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
688 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
691 ; AVX2-LABEL: combine_bitwise_ops_test5c:
693 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
694 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
695 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
697 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
698 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
699 %or = or <4 x i32> %shuf1, %shuf2
703 define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
704 ; SSE2-LABEL: combine_bitwise_ops_test6c:
706 ; SSE2-NEXT: xorps %xmm1, %xmm0
707 ; SSE2-NEXT: xorps %xmm1, %xmm1
708 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
709 ; SSE2-NEXT: movaps %xmm1, %xmm0
712 ; SSSE3-LABEL: combine_bitwise_ops_test6c:
714 ; SSSE3-NEXT: xorps %xmm1, %xmm0
715 ; SSSE3-NEXT: xorps %xmm1, %xmm1
716 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
717 ; SSSE3-NEXT: movaps %xmm1, %xmm0
720 ; SSE41-LABEL: combine_bitwise_ops_test6c:
722 ; SSE41-NEXT: pxor %xmm1, %xmm0
723 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
724 ; SSE41-NEXT: pxor %xmm0, %xmm0
725 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
728 ; AVX1-LABEL: combine_bitwise_ops_test6c:
730 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
731 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
732 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
733 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
736 ; AVX2-LABEL: combine_bitwise_ops_test6c:
738 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
739 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
740 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
741 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
743 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
744 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
745 %xor = xor <4 x i32> %shuf1, %shuf2
749 define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
750 ; SSE-LABEL: combine_nested_undef_test1:
752 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
755 ; AVX-LABEL: combine_nested_undef_test1:
757 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
759 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
760 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
764 define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
765 ; SSE-LABEL: combine_nested_undef_test2:
767 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
770 ; AVX-LABEL: combine_nested_undef_test2:
772 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
774 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
775 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
779 define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
780 ; SSE-LABEL: combine_nested_undef_test3:
782 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
785 ; AVX-LABEL: combine_nested_undef_test3:
787 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
789 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
790 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
794 define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
795 ; SSE-LABEL: combine_nested_undef_test4:
797 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
800 ; AVX1-LABEL: combine_nested_undef_test4:
802 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
805 ; AVX2-LABEL: combine_nested_undef_test4:
807 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
809 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
810 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
814 define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
815 ; SSE-LABEL: combine_nested_undef_test5:
817 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
820 ; AVX-LABEL: combine_nested_undef_test5:
822 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
824 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
825 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
829 define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
830 ; SSE-LABEL: combine_nested_undef_test6:
832 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
835 ; AVX-LABEL: combine_nested_undef_test6:
837 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
839 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
840 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
844 define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
845 ; SSE-LABEL: combine_nested_undef_test7:
847 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
850 ; AVX-LABEL: combine_nested_undef_test7:
852 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
854 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
855 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
859 define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
860 ; SSE-LABEL: combine_nested_undef_test8:
862 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
865 ; AVX-LABEL: combine_nested_undef_test8:
867 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
869 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
870 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
874 define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
875 ; SSE-LABEL: combine_nested_undef_test9:
877 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
880 ; AVX-LABEL: combine_nested_undef_test9:
882 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
884 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
885 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
889 define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
890 ; SSE-LABEL: combine_nested_undef_test10:
892 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
895 ; AVX-LABEL: combine_nested_undef_test10:
897 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
899 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
900 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
904 define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
905 ; SSE-LABEL: combine_nested_undef_test11:
907 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
910 ; AVX-LABEL: combine_nested_undef_test11:
912 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
914 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
915 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
919 define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
920 ; SSE-LABEL: combine_nested_undef_test12:
922 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
925 ; AVX1-LABEL: combine_nested_undef_test12:
927 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
930 ; AVX2-LABEL: combine_nested_undef_test12:
932 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
934 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
935 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
939 ; The following pair of shuffles is folded into vector %A.
940 define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
941 ; ALL-LABEL: combine_nested_undef_test13:
944 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
945 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
949 ; The following pair of shuffles is folded into vector %B.
950 define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
951 ; SSE-LABEL: combine_nested_undef_test14:
953 ; SSE-NEXT: movaps %xmm1, %xmm0
956 ; AVX-LABEL: combine_nested_undef_test14:
958 ; AVX-NEXT: vmovaps %xmm1, %xmm0
960 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
961 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
966 ; Verify that we don't optimize the following cases. We expect more than one shuffle.
968 ; FIXME: Many of these already don't make sense, and the rest should stop
969 ; making sense with the new vector shuffle lowering. Revisit at least testing for
972 define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
973 ; SSE2-LABEL: combine_nested_undef_test15:
975 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
976 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
977 ; SSE2-NEXT: movaps %xmm1, %xmm0
980 ; SSSE3-LABEL: combine_nested_undef_test15:
982 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
983 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
984 ; SSSE3-NEXT: movaps %xmm1, %xmm0
987 ; SSE41-LABEL: combine_nested_undef_test15:
989 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
990 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
991 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
994 ; AVX1-LABEL: combine_nested_undef_test15:
996 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
997 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
998 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1001 ; AVX2-LABEL: combine_nested_undef_test15:
1003 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
1004 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
1005 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1007 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
1008 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
1012 define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
1013 ; SSE2-LABEL: combine_nested_undef_test16:
1015 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3]
1016 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
1019 ; SSSE3-LABEL: combine_nested_undef_test16:
1021 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,3]
1022 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
1025 ; SSE41-LABEL: combine_nested_undef_test16:
1027 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1028 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1031 ; AVX1-LABEL: combine_nested_undef_test16:
1033 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1034 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1037 ; AVX2-LABEL: combine_nested_undef_test16:
1039 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1040 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
1042 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1043 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
1047 define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
1048 ; SSE2-LABEL: combine_nested_undef_test17:
1050 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
1051 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
1054 ; SSSE3-LABEL: combine_nested_undef_test17:
1056 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
1057 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
1060 ; SSE41-LABEL: combine_nested_undef_test17:
1062 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
1063 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
1066 ; AVX1-LABEL: combine_nested_undef_test17:
1068 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
1069 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
1072 ; AVX2-LABEL: combine_nested_undef_test17:
1074 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1075 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
1077 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
1078 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
1082 define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
1083 ; SSE-LABEL: combine_nested_undef_test18:
1085 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
1088 ; AVX-LABEL: combine_nested_undef_test18:
1090 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
1092 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1093 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
1097 define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
1098 ; SSE2-LABEL: combine_nested_undef_test19:
1100 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
1101 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,0]
1102 ; SSE2-NEXT: movaps %xmm1, %xmm0
1105 ; SSSE3-LABEL: combine_nested_undef_test19:
1107 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
1108 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,0]
1109 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1112 ; SSE41-LABEL: combine_nested_undef_test19:
1114 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1115 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
1118 ; AVX1-LABEL: combine_nested_undef_test19:
1120 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1121 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
1124 ; AVX2-LABEL: combine_nested_undef_test19:
1126 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1127 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
1129 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
1130 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
1134 define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
1135 ; SSE2-LABEL: combine_nested_undef_test20:
1137 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
1138 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1139 ; SSE2-NEXT: movaps %xmm1, %xmm0
1142 ; SSSE3-LABEL: combine_nested_undef_test20:
1144 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
1145 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1146 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1149 ; SSE41-LABEL: combine_nested_undef_test20:
1151 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1152 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
1155 ; AVX1-LABEL: combine_nested_undef_test20:
1157 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1158 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
1161 ; AVX2-LABEL: combine_nested_undef_test20:
1163 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1164 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
1166 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
1167 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
1171 define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
1172 ; SSE2-LABEL: combine_nested_undef_test21:
1174 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,1]
1175 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
1176 ; SSE2-NEXT: movaps %xmm1, %xmm0
1179 ; SSSE3-LABEL: combine_nested_undef_test21:
1181 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,1]
1182 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
1183 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1186 ; SSE41-LABEL: combine_nested_undef_test21:
1188 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1189 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1192 ; AVX1-LABEL: combine_nested_undef_test21:
1194 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1195 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1198 ; AVX2-LABEL: combine_nested_undef_test21:
1200 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1201 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
1203 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
1204 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
1209 ; Test that we correctly combine shuffles according to rule
1210 ; shuffle(shuffle(x, y), undef) -> shuffle(y, undef)
1212 define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
1213 ; SSE-LABEL: combine_nested_undef_test22:
1215 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
1218 ; AVX-LABEL: combine_nested_undef_test22:
1220 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
1222 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1223 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
1227 define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
1228 ; SSE-LABEL: combine_nested_undef_test23:
1230 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
1233 ; AVX-LABEL: combine_nested_undef_test23:
1235 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
1237 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1238 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
1242 define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
1243 ; SSE-LABEL: combine_nested_undef_test24:
1245 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
1248 ; AVX-LABEL: combine_nested_undef_test24:
1250 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
1252 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1253 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
1257 define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
1258 ; SSE-LABEL: combine_nested_undef_test25:
1260 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1263 ; AVX1-LABEL: combine_nested_undef_test25:
1265 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1268 ; AVX2-LABEL: combine_nested_undef_test25:
1270 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
1272 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
1273 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
1277 define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
1278 ; SSE-LABEL: combine_nested_undef_test26:
1280 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1283 ; AVX-LABEL: combine_nested_undef_test26:
1285 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1287 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
1288 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
1292 define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
1293 ; SSE-LABEL: combine_nested_undef_test27:
1295 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1298 ; AVX1-LABEL: combine_nested_undef_test27:
1300 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1303 ; AVX2-LABEL: combine_nested_undef_test27:
1305 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
1307 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
1308 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
1312 define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
1313 ; SSE-LABEL: combine_nested_undef_test28:
1315 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
1318 ; AVX-LABEL: combine_nested_undef_test28:
1320 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
1322 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
1323 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
1327 define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
1328 ; SSE-LABEL: combine_test1:
1330 ; SSE-NEXT: movaps %xmm1, %xmm0
1333 ; AVX-LABEL: combine_test1:
1335 ; AVX-NEXT: vmovaps %xmm1, %xmm0
1337 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1338 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1342 define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
1343 ; SSE2-LABEL: combine_test2:
1345 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1346 ; SSE2-NEXT: movaps %xmm1, %xmm0
1349 ; SSSE3-LABEL: combine_test2:
1351 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1352 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1355 ; SSE41-LABEL: combine_test2:
1357 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1360 ; AVX-LABEL: combine_test2:
1362 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1364 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1365 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1369 define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
1370 ; SSE-LABEL: combine_test3:
1372 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1375 ; AVX-LABEL: combine_test3:
1377 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1379 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1380 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1384 define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
1385 ; SSE-LABEL: combine_test4:
1387 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1388 ; SSE-NEXT: movapd %xmm1, %xmm0
1391 ; AVX-LABEL: combine_test4:
1393 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1395 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1396 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1400 define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
1401 ; SSE2-LABEL: combine_test5:
1403 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1404 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1407 ; SSSE3-LABEL: combine_test5:
1409 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1410 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1413 ; SSE41-LABEL: combine_test5:
1415 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1418 ; AVX-LABEL: combine_test5:
1420 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1422 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1423 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1427 define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
1428 ; SSE-LABEL: combine_test6:
1430 ; SSE-NEXT: movaps %xmm1, %xmm0
1433 ; AVX-LABEL: combine_test6:
1435 ; AVX-NEXT: vmovaps %xmm1, %xmm0
1437 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1438 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1442 define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
1443 ; SSE2-LABEL: combine_test7:
1445 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1446 ; SSE2-NEXT: movaps %xmm1, %xmm0
1449 ; SSSE3-LABEL: combine_test7:
1451 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1452 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1455 ; SSE41-LABEL: combine_test7:
1457 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1460 ; AVX1-LABEL: combine_test7:
1462 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1465 ; AVX2-LABEL: combine_test7:
1467 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1469 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1470 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1474 define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
1475 ; SSE-LABEL: combine_test8:
1477 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1480 ; AVX-LABEL: combine_test8:
1482 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1484 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1485 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1489 define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
1490 ; SSE-LABEL: combine_test9:
1492 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1493 ; SSE-NEXT: movdqa %xmm1, %xmm0
1496 ; AVX-LABEL: combine_test9:
1498 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1500 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1501 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1505 define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
1506 ; SSE2-LABEL: combine_test10:
1508 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1509 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1512 ; SSSE3-LABEL: combine_test10:
1514 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1515 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1518 ; SSE41-LABEL: combine_test10:
1520 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1523 ; AVX1-LABEL: combine_test10:
1525 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1528 ; AVX2-LABEL: combine_test10:
1530 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1532 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1533 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1537 define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
1538 ; ALL-LABEL: combine_test11:
1541 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1542 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1546 define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
1547 ; SSE2-LABEL: combine_test12:
1549 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1550 ; SSE2-NEXT: movaps %xmm1, %xmm0
1553 ; SSSE3-LABEL: combine_test12:
1555 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1556 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1559 ; SSE41-LABEL: combine_test12:
1561 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1564 ; AVX-LABEL: combine_test12:
1566 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1568 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1569 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1573 define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
1574 ; SSE-LABEL: combine_test13:
1576 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1579 ; AVX-LABEL: combine_test13:
1581 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1583 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1584 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1588 define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
1589 ; SSE-LABEL: combine_test14:
1591 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1594 ; AVX-LABEL: combine_test14:
1596 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1598 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
1599 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1603 define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
1604 ; SSE2-LABEL: combine_test15:
1606 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1607 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1610 ; SSSE3-LABEL: combine_test15:
1612 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1613 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1616 ; SSE41-LABEL: combine_test15:
1618 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1621 ; AVX-LABEL: combine_test15:
1623 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1625 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1626 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
1630 define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
1631 ; ALL-LABEL: combine_test16:
1634 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1635 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1639 define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
1640 ; SSE2-LABEL: combine_test17:
1642 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1643 ; SSE2-NEXT: movaps %xmm1, %xmm0
1646 ; SSSE3-LABEL: combine_test17:
1648 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1649 ; SSSE3-NEXT: movaps %xmm1, %xmm0
1652 ; SSE41-LABEL: combine_test17:
1654 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1657 ; AVX1-LABEL: combine_test17:
1659 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1662 ; AVX2-LABEL: combine_test17:
1664 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1666 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1667 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1671 define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
1672 ; SSE-LABEL: combine_test18:
1674 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1677 ; AVX-LABEL: combine_test18:
1679 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1681 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1682 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1686 define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
1687 ; SSE-LABEL: combine_test19:
1689 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1692 ; AVX-LABEL: combine_test19:
1694 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1696 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
1697 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1701 define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
1702 ; SSE2-LABEL: combine_test20:
1704 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1705 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1708 ; SSSE3-LABEL: combine_test20:
1710 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1711 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1714 ; SSE41-LABEL: combine_test20:
1716 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1719 ; AVX1-LABEL: combine_test20:
1721 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1724 ; AVX2-LABEL: combine_test20:
1726 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1728 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1729 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
1733 define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
1734 ; SSE-LABEL: combine_test21:
1736 ; SSE-NEXT: movdqa %xmm0, %xmm2
1737 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
1738 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1739 ; SSE-NEXT: movdqa %xmm2, (%rdi)
1742 ; AVX1-LABEL: combine_test21:
1744 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1745 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
1746 ; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1747 ; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
1748 ; AVX1-NEXT: vzeroupper
1751 ; AVX2-LABEL: combine_test21:
1753 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1754 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
1755 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1756 ; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
1757 ; AVX2-NEXT: vzeroupper
1759 %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1760 %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1761 store <4 x i32> %1, <4 x i32>* %ptr, align 16
1765 define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
1766 ; SSE-LABEL: combine_test22:
1768 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1769 ; SSE-NEXT: movhpd (%rsi), %xmm0
1772 ; AVX-LABEL: combine_test22:
1774 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1775 ; AVX-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
1777 ; Current AVX2 lowering of this is still awful, not adding a test case.
1778 %1 = load <2 x float>* %a, align 8
1779 %2 = load <2 x float>* %b, align 8
1780 %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
1784 ; Check some negative cases.
1785 ; FIXME: Do any of these really make sense? Are they redundant with the above tests?
1787 define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
1788 ; SSE-LABEL: combine_test1b:
1790 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
1791 ; SSE-NEXT: movaps %xmm1, %xmm0
1794 ; AVX-LABEL: combine_test1b:
1796 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
1798 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1799 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
1803 define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
1804 ; SSE2-LABEL: combine_test2b:
1806 ; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0,0]
1807 ; SSE2-NEXT: movaps %xmm1, %xmm0
1810 ; SSSE3-LABEL: combine_test2b:
1812 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
1815 ; SSE41-LABEL: combine_test2b:
1817 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
1820 ; AVX-LABEL: combine_test2b:
1822 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0]
1824 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1825 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
1829 define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
1830 ; SSE2-LABEL: combine_test3b:
1832 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1833 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
1836 ; SSSE3-LABEL: combine_test3b:
1838 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1839 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
1842 ; SSE41-LABEL: combine_test3b:
1844 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1845 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
1848 ; AVX-LABEL: combine_test3b:
1850 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1851 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
1853 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
1854 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
1858 define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
1859 ; SSE-LABEL: combine_test4b:
1861 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
1862 ; SSE-NEXT: movaps %xmm1, %xmm0
1865 ; AVX-LABEL: combine_test4b:
1867 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
1869 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1870 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
1875 ; Verify that we correctly fold shuffles even when we use illegal vector types.
1877 define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
1878 ; SSE2-LABEL: combine_test1c:
1880 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1881 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1882 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1883 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1884 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1885 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1886 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1889 ; SSSE3-LABEL: combine_test1c:
1891 ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1892 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1893 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1894 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1895 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1896 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1897 ; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1900 ; SSE41-LABEL: combine_test1c:
1902 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1903 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1904 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
1907 ; AVX1-LABEL: combine_test1c:
1909 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1910 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1911 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1914 ; AVX2-LABEL: combine_test1c:
1916 ; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1917 ; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1918 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1920 %A = load <4 x i8>* %a
1921 %B = load <4 x i8>* %b
1922 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1923 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1927 define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
1928 ; SSE2-LABEL: combine_test2c:
1930 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1931 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1932 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1933 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1934 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1935 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1936 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1939 ; SSSE3-LABEL: combine_test2c:
1941 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1942 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1943 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1944 ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1945 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1946 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1947 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1950 ; SSE41-LABEL: combine_test2c:
1952 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1953 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1954 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1957 ; AVX-LABEL: combine_test2c:
1959 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1960 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1961 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1963 %A = load <4 x i8>* %a
1964 %B = load <4 x i8>* %b
1965 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
1966 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
; NOTE(review): two chained shuffles selecting the upper halves of two loaded
; <4 x i8> vectors; CHECKs expect this to fold to widening loads + one
; punpckhqdq (pmovzxbd + [v]punpckhqdq on SSE41/AVX).
1970 define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
1971 ; SSE2-LABEL: combine_test3c:
1973 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1974 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1975 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1976 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1977 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1978 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1979 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1982 ; SSSE3-LABEL: combine_test3c:
1984 ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1985 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1986 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1987 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1988 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1989 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1990 ; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1993 ; SSE41-LABEL: combine_test3c:
1995 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1996 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1997 ; SSE41-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
2000 ; AVX-LABEL: combine_test3c:
2002 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2003 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2004 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2006 %A = load <4 x i8>* %a
2007 %B = load <4 x i8>* %b
2008 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2009 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
; NOTE(review): shuffle pair that mixes lanes of two loaded <4 x i8> vectors;
; CHECKs expect the fold to widening loads + a single blend (pblendw on SSE41,
; vpblendw on AVX1, vpblendd on AVX2).
2013 define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
2014 ; SSE2-LABEL: combine_test4c:
2016 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2017 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2018 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2019 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2020 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2021 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2022 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
2023 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
2026 ; SSSE3-LABEL: combine_test4c:
2028 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2029 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2030 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2031 ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2032 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2033 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2034 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
2035 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
2038 ; SSE41-LABEL: combine_test4c:
2040 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2041 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2042 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
2045 ; AVX1-LABEL: combine_test4c:
2047 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2048 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2049 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
2052 ; AVX2-LABEL: combine_test4c:
2054 ; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2055 ; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2056 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
2058 %A = load <4 x i8>* %a
2059 %B = load <4 x i8>* %b
2060 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
2061 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
2066 ; The following test cases are generated from this C++ code
2068 ;__m128 blend_01(__m128 a, __m128 b)
2071 ; s = _mm_blend_ps( s, b, 1<<0 );
2072 ; s = _mm_blend_ps( s, b, 1<<1 );
2076 ;__m128 blend_02(__m128 a, __m128 b)
2079 ; s = _mm_blend_ps( s, b, 1<<0 );
2080 ; s = _mm_blend_ps( s, b, 1<<2 );
2084 ;__m128 blend_123(__m128 a, __m128 b)
2087 ; s = _mm_blend_ps( s, b, 1<<1 );
2088 ; s = _mm_blend_ps( s, b, 1<<2 );
2089 ; s = _mm_blend_ps( s, b, 1<<3 );
2093 ; Ideally, we should collapse the following shuffles into a single one.
; NOTE(review): two blends of lanes 0 and 1 (with an undef lane in the first
; mask); should collapse to a single movsd/blendpd taking the low i64 from %b.
2095 define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
2096 ; SSE2-LABEL: combine_blend_01:
2098 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2101 ; SSSE3-LABEL: combine_blend_01:
2103 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2106 ; SSE41-LABEL: combine_blend_01:
2108 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2111 ; AVX-LABEL: combine_blend_01:
2113 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2115 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
2116 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
2117 ret <4 x float> %shuffle6
; NOTE(review): blends of lanes 0 and 2; SSE41/AVX collapse to one blendps,
; while SSE2/SSSE3 still need a two-shufps sequence.
2120 define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
2121 ; SSE2-LABEL: combine_blend_02:
2123 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
2124 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
2125 ; SSE2-NEXT: movaps %xmm1, %xmm0
2128 ; SSSE3-LABEL: combine_blend_02:
2130 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
2131 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
2132 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2135 ; SSE41-LABEL: combine_blend_02:
2137 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
2140 ; AVX-LABEL: combine_blend_02:
2142 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
2144 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
2145 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
2146 ret <4 x float> %shuffle6
; NOTE(review): three chained blends of lanes 1,2,3 (with undef lanes); should
; collapse to one movss/blendps keeping only lane 0 of %a.
2149 define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
2150 ; SSE2-LABEL: combine_blend_123:
2152 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2153 ; SSE2-NEXT: movaps %xmm1, %xmm0
2156 ; SSSE3-LABEL: combine_blend_123:
2158 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2159 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2162 ; SSE41-LABEL: combine_blend_123:
2164 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2167 ; AVX-LABEL: combine_blend_123:
2169 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2171 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
2172 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
2173 %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
2174 ret <4 x float> %shuffle12
; NOTE(review): shuffle pair whose net effect is movhl-style lane selection;
; expected to fold to a single [v]punpckhqdq of %b's and %a's high halves.
2177 define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
2178 ; SSE-LABEL: combine_test_movhl_1:
2180 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2181 ; SSE-NEXT: movdqa %xmm1, %xmm0
2184 ; AVX-LABEL: combine_test_movhl_1:
2186 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2188 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
2189 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
; NOTE(review): variant of the movhl fold with different intermediate masks;
; same expected single punpckhqdq result as combine_test_movhl_1.
2193 define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
2194 ; SSE-LABEL: combine_test_movhl_2:
2196 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2197 ; SSE-NEXT: movdqa %xmm1, %xmm0
2200 ; AVX-LABEL: combine_test_movhl_2:
2202 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2204 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
2205 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
; NOTE(review): third movhl variant; again expected to combine into a single
; punpckhqdq despite the reversed-looking intermediate mask.
2209 define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
2210 ; SSE-LABEL: combine_test_movhl_3:
2212 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2213 ; SSE-NEXT: movdqa %xmm1, %xmm0
2216 ; AVX-LABEL: combine_test_movhl_3:
2218 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2220 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
2221 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
2226 ; Verify that we fold shuffles according to rule:
2227 ; (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)
; NOTE(review): (shuffle (shuffle A, undef), B) fold; net mask is blend of
; b[0:1] with a[2:3] -> single movsd/blendpd.
2229 define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
2230 ; SSE2-LABEL: combine_undef_input_test1:
2232 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2235 ; SSSE3-LABEL: combine_undef_input_test1:
2237 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2240 ; SSE41-LABEL: combine_undef_input_test1:
2242 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2245 ; AVX-LABEL: combine_undef_input_test1:
2247 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2249 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2250 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
; NOTE(review): undef-input shuffle pair folding to a single unpcklpd of the
; two low i64 halves.
2254 define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
2255 ; SSE-LABEL: combine_undef_input_test2:
2257 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2260 ; AVX-LABEL: combine_undef_input_test2:
2262 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2264 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2265 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
; NOTE(review): different intermediate masks, same net unpcklpd fold as
; combine_undef_input_test2.
2269 define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
2270 ; SSE-LABEL: combine_undef_input_test3:
2272 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2275 ; AVX-LABEL: combine_undef_input_test3:
2277 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2279 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2280 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
; NOTE(review): undef-input pair folding to unpckhpd of the two high halves
; (b high, a high).
2284 define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
2285 ; SSE-LABEL: combine_undef_input_test4:
2287 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2288 ; SSE-NEXT: movapd %xmm1, %xmm0
2291 ; AVX-LABEL: combine_undef_input_test4:
2293 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2295 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2296 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
; NOTE(review): undef-input pair folding to movsd/blendpd keeping a's low i64
; and b's high i64.
2300 define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
2301 ; SSE2-LABEL: combine_undef_input_test5:
2303 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2304 ; SSE2-NEXT: movapd %xmm1, %xmm0
2307 ; SSSE3-LABEL: combine_undef_input_test5:
2309 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2310 ; SSSE3-NEXT: movapd %xmm1, %xmm0
2313 ; SSE41-LABEL: combine_undef_input_test5:
2315 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
2318 ; AVX-LABEL: combine_undef_input_test5:
2320 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
2322 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2323 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
2328 ; Verify that we fold shuffles according to rule:
2329 ; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
; NOTE(review): second operand is %a itself, so the pair folds to the identity
; shuffle — the ALL check (no -NEXT instructions shown) expects no shuffle code.
2331 define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
2332 ; ALL-LABEL: combine_undef_input_test6:
2335 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2336 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
; NOTE(review): both operands are %a; net mask duplicates the low i64 ->
; movlhps on SSE2, movddup once SSSE3's movddup is available.
2340 define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
2341 ; SSE2-LABEL: combine_undef_input_test7:
2343 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
2346 ; SSSE3-LABEL: combine_undef_input_test7:
2348 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2351 ; SSE41-LABEL: combine_undef_input_test7:
2353 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2356 ; AVX-LABEL: combine_undef_input_test7:
2358 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2360 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2361 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
; NOTE(review): same low-i64 broadcast fold as test7 reached through different
; intermediate masks.
2365 define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
2366 ; SSE2-LABEL: combine_undef_input_test8:
2368 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
2371 ; SSSE3-LABEL: combine_undef_input_test8:
2373 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2376 ; SSE41-LABEL: combine_undef_input_test8:
2378 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2381 ; AVX-LABEL: combine_undef_input_test8:
2383 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2385 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2386 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
; NOTE(review): both operands %a; net mask duplicates the high i64 -> single
; [v]movhlps.
2390 define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
2391 ; SSE-LABEL: combine_undef_input_test9:
2393 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
2396 ; AVX-LABEL: combine_undef_input_test9:
2398 ; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
2400 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2401 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
; NOTE(review): folds to the identity shuffle of %a — ALL check expects no
; shuffle instructions to be emitted.
2405 define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
2406 ; ALL-LABEL: combine_undef_input_test10:
2409 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2410 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
; NOTE(review): same fold as test1 but with the undef-shuffle as the SECOND
; operand of the outer shuffle; result must be identical (movsd/blendpd).
2414 define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
2415 ; SSE2-LABEL: combine_undef_input_test11:
2417 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2420 ; SSSE3-LABEL: combine_undef_input_test11:
2422 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2425 ; SSE41-LABEL: combine_undef_input_test11:
2427 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2430 ; AVX-LABEL: combine_undef_input_test11:
2432 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2434 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2435 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
; NOTE(review): mirror of test2 with operands commuted in the outer shuffle;
; still folds to one unpcklpd.
2439 define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
2440 ; SSE-LABEL: combine_undef_input_test12:
2442 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2445 ; AVX-LABEL: combine_undef_input_test12:
2447 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2449 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2450 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
; NOTE(review): mirror of test3 (commuted operands); folds to one unpcklpd.
2454 define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
2455 ; SSE-LABEL: combine_undef_input_test13:
2457 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2460 ; AVX-LABEL: combine_undef_input_test13:
2462 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2464 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2465 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
; NOTE(review): mirror of test4 (commuted operands); folds to one unpckhpd.
2469 define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
2470 ; SSE-LABEL: combine_undef_input_test14:
2472 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2473 ; SSE-NEXT: movapd %xmm1, %xmm0
2476 ; AVX-LABEL: combine_undef_input_test14:
2478 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
2480 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2481 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; NOTE(review): mirror of test5 (commuted operands); folds to movsd/blendpd.
2485 define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
2486 ; SSE2-LABEL: combine_undef_input_test15:
2488 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2489 ; SSE2-NEXT: movapd %xmm1, %xmm0
2492 ; SSSE3-LABEL: combine_undef_input_test15:
2494 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2495 ; SSSE3-NEXT: movapd %xmm1, %xmm0
2498 ; SSE41-LABEL: combine_undef_input_test15:
2500 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
2503 ; AVX-LABEL: combine_undef_input_test15:
2505 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
2507 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2508 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
2513 ; Verify that shuffles are canonicalized according to rules:
2514 ; shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
2516 ; This allows to trigger the following combine rule:
2517 ; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2519 ; As a result, all the shuffle pairs in each function below should be
2520 ; combined into a single legal shuffle operation.
; NOTE(review): canonicalization case — outer shuffle has %a first; net effect
; is the identity, so ALL expects no shuffle instructions.
2522 define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
2523 ; ALL-LABEL: combine_undef_input_test16:
2526 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
2527 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
; NOTE(review): canonicalized variant of test7; still collapses to a single
; low-i64 broadcast (movlhps / movddup).
2531 define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
2532 ; SSE2-LABEL: combine_undef_input_test17:
2534 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
2537 ; SSSE3-LABEL: combine_undef_input_test17:
2539 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2542 ; SSE41-LABEL: combine_undef_input_test17:
2544 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2547 ; AVX-LABEL: combine_undef_input_test17:
2549 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2551 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
2552 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
; NOTE(review): canonicalized variant of test8; same single broadcast expected.
2556 define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
2557 ; SSE2-LABEL: combine_undef_input_test18:
2559 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
2562 ; SSSE3-LABEL: combine_undef_input_test18:
2564 ; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2567 ; SSE41-LABEL: combine_undef_input_test18:
2569 ; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
2572 ; AVX-LABEL: combine_undef_input_test18:
2574 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2576 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
2577 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
; NOTE(review): canonicalized variant of test9; collapses to one [v]movhlps.
2581 define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
2582 ; SSE-LABEL: combine_undef_input_test19:
2584 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
2587 ; AVX-LABEL: combine_undef_input_test19:
2589 ; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
2591 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
2592 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; NOTE(review): canonicalized variant of test10; net identity, ALL expects no
; shuffle instructions.
2596 define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
2597 ; ALL-LABEL: combine_undef_input_test20:
2600 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
2601 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
2605 ; These tests are designed to test the ability to combine away unnecessary
2606 ; operations feeding into a shuffle. The AVX cases are the important ones as
2607 ; they leverage operations which cannot be done naturally on the entire vector
2608 ; and thus are decomposed into multiple smaller operations.
; NOTE(review): the shuffle mask reads only elements 4-7, so the add on the low
; half is dead; AVX1 checks show the add narrowed to the extracted high xmm.
2610 define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
2611 ; SSE-LABEL: combine_unneeded_subvector1:
2613 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
2614 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
2615 ; SSE-NEXT: movdqa %xmm0, %xmm1
2618 ; AVX1-LABEL: combine_unneeded_subvector1:
2620 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2621 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
2622 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
2623 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2626 ; AVX2-LABEL: combine_unneeded_subvector1:
2628 ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
2629 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
2630 ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
2632 %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
2633 %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
; NOTE(review): two-input variant — the mask uses %b's high half and %c's high
; half only, so the add on %a's low half is again expected to be narrowed away.
2637 define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
2638 ; SSE-LABEL: combine_unneeded_subvector2:
2640 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
2641 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
2642 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
2645 ; AVX1-LABEL: combine_unneeded_subvector2:
2647 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2648 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
2649 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2650 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
2651 ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2654 ; AVX2-LABEL: combine_unneeded_subvector2:
2656 ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
2657 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
2658 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2660 %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
2661 %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
; NOTE(review): shuffle pair that inserts b[2] into lane 0 of %a; SSE41/AVX
; should recognize it as a single insertps.
2665 define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
2666 ; SSE2-LABEL: combine_insertps1:
2668 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
2669 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
2670 ; SSE2-NEXT: movaps %xmm1, %xmm0
2673 ; SSSE3-LABEL: combine_insertps1:
2675 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
2676 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
2677 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2680 ; SSE41-LABEL: combine_insertps1:
2682 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
2685 ; AVX-LABEL: combine_insertps1:
2687 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
2690 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
2691 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
; NOTE(review): inserts b[2] into lane 1 of %a; single insertps on SSE41/AVX.
2695 define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
2696 ; SSE2-LABEL: combine_insertps2:
2698 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
2699 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
2700 ; SSE2-NEXT: movaps %xmm1, %xmm0
2703 ; SSSE3-LABEL: combine_insertps2:
2705 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
2706 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
2707 ; SSSE3-NEXT: movaps %xmm1, %xmm0
2710 ; SSE41-LABEL: combine_insertps2:
2712 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
2715 ; AVX-LABEL: combine_insertps2:
2717 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
2720 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
2721 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
; NOTE(review): inserts b[0] into lane 2 of %a; single insertps on SSE41/AVX.
2725 define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
2726 ; SSE2-LABEL: combine_insertps3:
2728 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
2729 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
2732 ; SSSE3-LABEL: combine_insertps3:
2734 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
2735 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
2738 ; SSE41-LABEL: combine_insertps3:
2740 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
2743 ; AVX-LABEL: combine_insertps3:
2745 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
2748 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
2749 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
; NOTE(review): inserts b[0] into lane 3 of %a; single insertps on SSE41/AVX.
2753 define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
2754 ; SSE2-LABEL: combine_insertps4:
2756 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
2757 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2760 ; SSSE3-LABEL: combine_insertps4:
2762 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
2763 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2766 ; SSE41-LABEL: combine_insertps4:
2768 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
2771 ; AVX-LABEL: combine_insertps4:
2773 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
2776 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
2777 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
; NOTE(review): regression test for PR22377 — horizontal-style odd/even
; shuffles feeding an fadd, then interleaved with unpcklps; verifies the
; combiner keeps the two permutes + add + unpck shape.
2781 define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
2782 ; SSE-LABEL: PR22377:
2783 ; SSE: # BB#0: # %entry
2784 ; SSE-NEXT: movaps %xmm0, %xmm1
2785 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
2786 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
2787 ; SSE-NEXT: addps %xmm0, %xmm1
2788 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2791 ; AVX-LABEL: PR22377:
2792 ; AVX: # BB#0: # %entry
2793 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
2794 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
2795 ; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm1
2796 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2799 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
2800 %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
2801 %r2 = fadd <4 x float> %s1, %s2
2802 %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
; NOTE(review): regression test for PR22390 — rotate-by-one shuffle whose
; result is blended with %b and added back; the blend must not be miscombined
; with the preceding rotate (shufps/permilps then movss/blendps then addps).
2806 define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
2807 ; SSE2-LABEL: PR22390:
2808 ; SSE2: # BB#0: # %entry
2809 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2810 ; SSE2-NEXT: movaps %xmm0, %xmm2
2811 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
2812 ; SSE2-NEXT: addps %xmm0, %xmm2
2813 ; SSE2-NEXT: movaps %xmm2, %xmm0
2816 ; SSSE3-LABEL: PR22390:
2817 ; SSSE3: # BB#0: # %entry
2818 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2819 ; SSSE3-NEXT: movaps %xmm0, %xmm2
2820 ; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
2821 ; SSSE3-NEXT: addps %xmm0, %xmm2
2822 ; SSSE3-NEXT: movaps %xmm2, %xmm0
2825 ; SSE41-LABEL: PR22390:
2826 ; SSE41: # BB#0: # %entry
2827 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2828 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
2829 ; SSE41-NEXT: addps %xmm1, %xmm0
2832 ; AVX-LABEL: PR22390:
2833 ; AVX: # BB#0: # %entry
2834 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
2835 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
2836 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
2839 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
2840 %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
2841 %r2 = fadd <4 x float> %s1, %s2
2845 define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
2846 ; SSE2-LABEL: PR22412:
2847 ; SSE2: # BB#0: # %entry
2848 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
2849 ; SSE2-NEXT: movapd %xmm2, %xmm0
2850 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
2851 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
2852 ; SSE2-NEXT: movaps %xmm3, %xmm1
2855 ; SSSE3-LABEL: PR22412:
2856 ; SSSE3: # BB#0: # %entry
2857 ; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
2858 ; SSSE3-NEXT: movapd %xmm2, %xmm0
2859 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
2860 ; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
2861 ; SSSE3-NEXT: movaps %xmm3, %xmm1
2864 ; SSE41-LABEL: PR22412:
2865 ; SSE41: # BB#0: # %entry
2866 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
2867 ; SSE41-NEXT: movapd %xmm0, %xmm1
2868 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[3,2]
2869 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[3,2]
2870 ; SSE41-NEXT: movaps %xmm1, %xmm0
2871 ; SSE41-NEXT: movaps %xmm3, %xmm1
2874 ; AVX1-LABEL: PR22412:
2875 ; AVX1: # BB#0: # %entry
2876 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
2877 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
2878 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
2881 ; AVX2-LABEL: PR22412:
2882 ; AVX2: # BB#0: # %entry
2883 ; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
2884 ; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,7,6,5,4,3,2]
2885 ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
2888 %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2889 %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>