1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
; @A: the mask <4,5,6,7,0,1,2,3> swaps the two 128-bit halves of %a
; (%b is not referenced by the mask). The checks expect a single permute:
; vperm2f128 for the +avx run and vpermpd for the +avx2 run.
4 define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
6 ; AVX1: ## BB#0: ## %entry
7 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
11 ; AVX2: ## BB#0: ## %entry
12 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
15 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
16 ret <8 x float> %shuffle
; @B: the mask <0,1,2,3,12,13,14,15> keeps the low 128-bit half of %a and
; takes the high 128-bit half of %b (indices 12-15 select %b elements 4-7).
; Both runs share one expectation: a single vblendpd.
19 define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
21 ; ALL: ## BB#0: ## %entry
22 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
25 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
26 ret <8 x float> %shuffle
; @C: the mask <0,1,2,3,0,1,2,3> duplicates the low 128-bit half of %a into
; both halves of the result (%b is not referenced by the mask).
; Expected: vinsertf128 of xmm0 into the upper half for the +avx run,
; vpermpd [0,1,0,1] for the +avx2 run.
29 define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
31 ; AVX1: ## BB#0: ## %entry
32 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
36 ; AVX2: ## BB#0: ## %entry
37 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
40 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
41 ret <8 x float> %shuffle
; @D: the mask <4,5,6,7,4,5,6,7> duplicates the high 128-bit half of %a into
; both halves of the result (%b is not referenced by the mask).
; Expected: vextractf128 followed by vinsertf128 for the +avx run,
; vpermpd [2,3,2,3] for the +avx2 run.
44 define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
46 ; AVX1: ## BB#0: ## %entry
47 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
48 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
52 ; AVX2: ## BB#0: ## %entry
53 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
56 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
57 ret <8 x float> %shuffle
; @E: byte-vector variant of the "duplicate high half" shuffle. The mask
; <16..31, 16..31> copies the high 128-bit half of %a into both halves of the
; result (%b is not referenced by the mask).
; Expected: vextractf128 + vinsertf128 for the +avx run, vpermq [2,3,2,3]
; for the +avx2 run.
60 define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
62 ; AVX1: ## BB#0: ## %entry
63 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
64 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
68 ; AVX2: ## BB#0: ## %entry
69 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
72 %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
73 ret <32 x i8> %shuffle
; @E2: the mask <6,7,0,1> builds the result from the high 128-bit half of %b
; (indices 6,7 select %b elements 2,3) followed by the low 128-bit half of %a.
; Expected: each source half is moved into position and the two are blended
; (vinsertf128/vextractf128/vblendpd for the +avx run, two vpermq plus
; vpblendd for the +avx2 run).
76 define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
78 ; AVX1: ## BB#0: ## %entry
79 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
80 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
81 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
85 ; AVX2: ## BB#0: ## %entry
86 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
87 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
88 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
91 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
92 ret <4 x i64> %shuffle
; @Ei: adds 1 to every byte of %a, then duplicates the high 128-bit half of
; the sum into both halves of the result (mask <16..31, 16..31>; %b is not
; referenced by the mask).
; Expected for the +avx run: extract the high half, do a 128-bit vpaddb on
; it, then vinsertf128. Expected for the +avx2 run: 256-bit vpaddb, then
; vpermq [2,3,2,3].
95 define <32 x i8> @Ei(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
97 ; AVX1: ## BB#0: ## %entry
98 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
99 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
100 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
104 ; AVX2: ## BB#0: ## %entry
105 ; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
106 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
109 ; add forces execution domain
110 %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
111 %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
112 ret <32 x i8> %shuffle
; @E2i: adds 1 to every i64 element of %a, then applies the mask <6,7,0,1>:
; the high 128-bit half of %b followed by the low 128-bit half of the sum.
; Expected for the +avx run: a 128-bit vpaddq on the low half only, then
; extract/insert/blend. Expected for the +avx2 run: broadcast the constant,
; 256-bit vpaddq, two vpermq and a vpblendd.
115 define <4 x i64> @E2i(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
117 ; AVX1: ## BB#0: ## %entry
118 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
119 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
120 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
121 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
125 ; AVX2: ## BB#0: ## %entry
126 ; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
127 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
128 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
129 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
130 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
133 ; add forces execution domain
134 %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
135 %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
136 ret <4 x i64> %shuffle
; @E3i: adds 1 to every i32 element of %a, then shuffles with a mask that
; contains undef lanes: <undef,5,undef,7,12,13,14,15>. Lanes 1 and 3 take
; elements 5 and 7 of the sum, lanes 0 and 2 are undef, and the upper four
; lanes take %b elements 4-7. Treating the undef lanes as 4 and 6 makes this
; "high half of the sum, then high half of %b".
; Expected for the +avx run: extract the high half, 128-bit vpaddd, vblendpd.
; Expected for the +avx2 run: broadcast + 256-bit vpaddd, vpermq, vpblendd.
139 define <8 x i32> @E3i(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
141 ; AVX1: ## BB#0: ## %entry
142 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
143 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
144 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
148 ; AVX2: ## BB#0: ## %entry
149 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
150 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
151 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
152 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
155 ; add forces execution domain
156 %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
157 %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
158 ret <8 x i32> %shuffle
; @E4i: adds 1 to every i16 element of %a, then applies the mask
; <16..23, 0..7>: the low 128-bit half of %b followed by the low 128-bit half
; of the sum.
; Expected for the +avx run: 128-bit vpaddw on the low half, vinsertf128 to
; move it to the upper half, vblendpd with %b. Expected for the +avx2 run:
; 256-bit vpaddw, vpermq [0,1,0,1], vpblendd.
161 define <16 x i16> @E4i(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
163 ; AVX1: ## BB#0: ## %entry
164 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
165 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
166 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
170 ; AVX2: ## BB#0: ## %entry
171 ; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
172 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
173 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
176 ; add forces execution domain
177 %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
178 %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
179 ret <16 x i16> %shuffle
; @E5i: memory-operand variant of the <16..23, 0..7> shuffle. Both vectors
; are loaded through the pointer arguments, 1 is added to every i16 element
; of the first one, and the result is the low 128-bit half of the second
; vector followed by the low 128-bit half of the sum.
; Expected for the +avx run: load, 128-bit vpaddw, vinsertf128, load of the
; second operand, vblendpd. Expected for the +avx2 run: two loads, 256-bit
; vpaddw, vpermq [0,1,0,1], vpblendd.
;
; The function dereferences %a and %b, so it is marked readonly rather than
; readnone: readnone promises that no pointer argument is dereferenced, which
; the two loads below would violate.
182 define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readonly ssp {
184 ; AVX1: ## BB#0: ## %entry
185 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0
186 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
187 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
188 ; AVX1-NEXT: vmovapd (%rsi), %ymm1
189 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
193 ; AVX2: ## BB#0: ## %entry
194 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
195 ; AVX2-NEXT: vmovdqa (%rsi), %ymm1
196 ; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
197 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
198 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
201 %c = load <16 x i16>* %a
202 %d = load <16 x i16>* %b
203 %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
204 %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
205 ret <16 x i16> %shuffle
208 ;;;; Cases with undef indices mixed in the mask
; @F: shuffle whose mask mixes undef and defined lanes:
; <undef,undef,6,7,undef,9,undef,11>. Lanes 2,3 take %a elements 6,7 and
; lanes 5,7 take %b elements 1,3 (indices 9 and 11); the rest are undef.
; Every defined lane agrees with "high half of %a, then low half of %b", so
; the mask is compatible with a whole-128-bit-lane permute.
; Expected for the +avx run: vextractf128, vinsertf128, vblendpd.
; Expected for the +avx2 run: two vpermpd and a vblendpd.
210 define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
212 ; AVX1: ## BB#0: ## %entry
213 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
214 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
215 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
219 ; AVX2: ## BB#0: ## %entry
220 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
221 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,1]
222 ; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
225 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
226 ret <8 x float> %shuffle
229 ;;;; Cases where we must not select vperm2f128
; @G: negative case for whole-lane permutes. The mask is
; <undef,undef,6,7,undef,12,undef,15>: lanes 2,3 take %a elements 6,7, lane 5
; takes %b element 4 (index 12) and lane 7 takes %b element 7 (index 15).
; Lane 5 would have to hold index 13 (or 9) for the upper half to be a
; straight copy of one 128-bit half of %b, so this is not a pure 128-bit-lane
; permute. The expected output contains an in-lane vpermilps on the second
; operand plus a vblendpd, and no vperm2f128.
231 define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
233 ; AVX1: ## BB#0: ## %entry
234 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
235 ; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7]
236 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
240 ; AVX2: ## BB#0: ## %entry
241 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
242 ; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7]
243 ; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
246 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
247 ret <8 x float> %shuffle