1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
5 ; 32-bit tests to make sure we're not doing anything stupid.
6 ; RUN: llc < %s -mtriple=i686-unknown-unknown
7 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse
8 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2
11 ; Signed Integer to Double
14 define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
15 ; SSE-LABEL: sitofp_2i64_to_2f64:
17 ; SSE-NEXT: movd %xmm0, %rax
18 ; SSE-NEXT: cvtsi2sdq %rax, %xmm1
19 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
20 ; SSE-NEXT: movd %xmm0, %rax
21 ; SSE-NEXT: xorps %xmm0, %xmm0
22 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0
23 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
24 ; SSE-NEXT: movapd %xmm1, %xmm0
27 ; AVX-LABEL: sitofp_2i64_to_2f64:
29 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
30 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
31 ; AVX-NEXT: vmovq %xmm0, %rax
32 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
33 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
34 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
36 %cvt = sitofp <2 x i64> %a to <2 x double>
40 define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) {
41 ; SSE-LABEL: sitofp_2i32_to_2f64:
43 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
46 ; AVX-LABEL: sitofp_2i32_to_2f64:
48 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
50 %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
51 %cvt = sitofp <2 x i32> %shuf to <2 x double>
55 define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
56 ; SSE-LABEL: sitofp_4i32_to_2f64:
58 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
61 ; AVX-LABEL: sitofp_4i32_to_2f64:
63 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
64 ; AVX-NEXT: vzeroupper
66 %cvt = sitofp <4 x i32> %a to <4 x double>
67 %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
68 ret <2 x double> %shuf
71 define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
72 ; SSE-LABEL: sitofp_2i16_to_2f64:
74 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
75 ; SSE-NEXT: psrad $16, %xmm0
76 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
79 ; AVX-LABEL: sitofp_2i16_to_2f64:
81 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
82 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
84 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
85 %cvt = sitofp <2 x i16> %shuf to <2 x double>
89 define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
90 ; SSE-LABEL: sitofp_8i16_to_2f64:
92 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
93 ; SSE-NEXT: psrad $16, %xmm0
94 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
97 ; AVX1-LABEL: sitofp_8i16_to_2f64:
99 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
100 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
101 ; AVX1-NEXT: vzeroupper
104 ; AVX2-LABEL: sitofp_8i16_to_2f64:
106 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
107 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
108 ; AVX2-NEXT: vzeroupper
110 %cvt = sitofp <8 x i16> %a to <8 x double>
111 %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
112 ret <2 x double> %shuf
115 define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
116 ; SSE-LABEL: sitofp_2i8_to_2f64:
118 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
119 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
120 ; SSE-NEXT: psrad $24, %xmm0
121 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
124 ; AVX-LABEL: sitofp_2i8_to_2f64:
126 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
127 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
129 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
130 %cvt = sitofp <2 x i8> %shuf to <2 x double>
131 ret <2 x double> %cvt
134 define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
135 ; SSE-LABEL: sitofp_16i8_to_2f64:
137 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
138 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
139 ; SSE-NEXT: psrad $24, %xmm0
140 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
143 ; AVX1-LABEL: sitofp_16i8_to_2f64:
145 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
146 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
147 ; AVX1-NEXT: vzeroupper
150 ; AVX2-LABEL: sitofp_16i8_to_2f64:
152 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
153 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
154 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
155 ; AVX2-NEXT: vzeroupper
157 %cvt = sitofp <16 x i8> %a to <16 x double>
158 %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
159 ret <2 x double> %shuf
162 define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
163 ; SSE-LABEL: sitofp_4i64_to_4f64:
165 ; SSE-NEXT: movd %xmm0, %rax
166 ; SSE-NEXT: cvtsi2sdq %rax, %xmm2
167 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
168 ; SSE-NEXT: movd %xmm0, %rax
169 ; SSE-NEXT: xorps %xmm0, %xmm0
170 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0
171 ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
172 ; SSE-NEXT: movd %xmm1, %rax
173 ; SSE-NEXT: cvtsi2sdq %rax, %xmm3
174 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
175 ; SSE-NEXT: movd %xmm0, %rax
176 ; SSE-NEXT: xorps %xmm0, %xmm0
177 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0
178 ; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
179 ; SSE-NEXT: movapd %xmm2, %xmm0
180 ; SSE-NEXT: movapd %xmm3, %xmm1
183 ; AVX1-LABEL: sitofp_4i64_to_4f64:
185 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
186 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax
187 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
188 ; AVX1-NEXT: vmovq %xmm1, %rax
189 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
190 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
191 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
192 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
193 ; AVX1-NEXT: vmovq %xmm0, %rax
194 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
195 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
196 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
197 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
200 ; AVX2-LABEL: sitofp_4i64_to_4f64:
202 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
203 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax
204 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
205 ; AVX2-NEXT: vmovq %xmm1, %rax
206 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
207 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
208 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
209 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
210 ; AVX2-NEXT: vmovq %xmm0, %rax
211 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
212 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
213 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
214 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
216 %cvt = sitofp <4 x i64> %a to <4 x double>
217 ret <4 x double> %cvt
220 define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
221 ; SSE-LABEL: sitofp_4i32_to_4f64:
223 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
224 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
225 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
226 ; SSE-NEXT: movaps %xmm2, %xmm0
229 ; AVX-LABEL: sitofp_4i32_to_4f64:
231 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
233 %cvt = sitofp <4 x i32> %a to <4 x double>
234 ret <4 x double> %cvt
237 define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
238 ; SSE-LABEL: sitofp_4i16_to_4f64:
240 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
241 ; SSE-NEXT: psrad $16, %xmm1
242 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
243 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
244 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
247 ; AVX-LABEL: sitofp_4i16_to_4f64:
249 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
250 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
252 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
253 %cvt = sitofp <4 x i16> %shuf to <4 x double>
254 ret <4 x double> %cvt
257 define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
258 ; SSE-LABEL: sitofp_8i16_to_4f64:
260 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
261 ; SSE-NEXT: psrad $16, %xmm1
262 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
263 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
264 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
267 ; AVX1-LABEL: sitofp_8i16_to_4f64:
269 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
270 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
273 ; AVX2-LABEL: sitofp_8i16_to_4f64:
275 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
276 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
278 %cvt = sitofp <8 x i16> %a to <8 x double>
279 %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
280 ret <4 x double> %shuf
283 define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
284 ; SSE-LABEL: sitofp_4i8_to_4f64:
286 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
287 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
288 ; SSE-NEXT: psrad $24, %xmm1
289 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
290 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
291 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
294 ; AVX-LABEL: sitofp_4i8_to_4f64:
296 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
297 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
299 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
300 %cvt = sitofp <4 x i8> %shuf to <4 x double>
301 ret <4 x double> %cvt
304 define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
305 ; SSE-LABEL: sitofp_16i8_to_4f64:
307 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
308 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
309 ; SSE-NEXT: psrad $24, %xmm1
310 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
311 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
312 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
315 ; AVX1-LABEL: sitofp_16i8_to_4f64:
317 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
318 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
321 ; AVX2-LABEL: sitofp_16i8_to_4f64:
323 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
324 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
325 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
327 %cvt = sitofp <16 x i8> %a to <16 x double>
328 %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
329 ret <4 x double> %shuf
333 ; Unsigned Integer to Double
336 define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
337 ; SSE-LABEL: uitofp_2i64_to_2f64:
339 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
340 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
341 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
342 ; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
343 ; SSE-NEXT: subpd %xmm3, %xmm0
344 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
345 ; SSE-NEXT: addpd %xmm4, %xmm0
346 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
347 ; SSE-NEXT: subpd %xmm3, %xmm2
348 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
349 ; SSE-NEXT: addpd %xmm2, %xmm1
350 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
353 ; AVX-LABEL: uitofp_2i64_to_2f64:
355 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
356 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
357 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
358 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
359 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
360 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
361 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
362 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
363 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
364 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
366 %cvt = uitofp <2 x i64> %a to <2 x double>
367 ret <2 x double> %cvt
370 define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
371 ; SSE-LABEL: uitofp_2i32_to_2f64:
373 ; SSE-NEXT: pxor %xmm1, %xmm1
374 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
375 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
376 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
377 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
378 ; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
379 ; SSE-NEXT: subpd %xmm3, %xmm0
380 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
381 ; SSE-NEXT: addpd %xmm4, %xmm0
382 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
383 ; SSE-NEXT: subpd %xmm3, %xmm2
384 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
385 ; SSE-NEXT: addpd %xmm2, %xmm1
386 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
389 ; AVX-LABEL: uitofp_2i32_to_2f64:
391 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
392 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
393 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
394 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
395 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
396 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
397 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
398 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
399 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
400 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
401 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
403 %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
404 %cvt = uitofp <2 x i32> %shuf to <2 x double>
405 ret <2 x double> %cvt
408 define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
409 ; SSE-LABEL: uitofp_4i32_to_2f64:
411 ; SSE-NEXT: pxor %xmm1, %xmm1
412 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
413 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
414 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
415 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
416 ; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
417 ; SSE-NEXT: subpd %xmm3, %xmm0
418 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
419 ; SSE-NEXT: addpd %xmm4, %xmm0
420 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
421 ; SSE-NEXT: subpd %xmm3, %xmm2
422 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
423 ; SSE-NEXT: addpd %xmm2, %xmm1
424 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
427 ; AVX1-LABEL: uitofp_4i32_to_2f64:
429 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
430 ; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
431 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
432 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
433 ; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
434 ; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
435 ; AVX1-NEXT: vzeroupper
438 ; AVX2-LABEL: uitofp_4i32_to_2f64:
440 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
441 ; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
442 ; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
443 ; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
444 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
445 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
446 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
447 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
448 ; AVX2-NEXT: vzeroupper
450 %cvt = uitofp <4 x i32> %a to <4 x double>
451 %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
452 ret <2 x double> %shuf
455 define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
456 ; SSE-LABEL: uitofp_2i16_to_2f64:
458 ; SSE-NEXT: pxor %xmm1, %xmm1
459 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
460 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
463 ; AVX-LABEL: uitofp_2i16_to_2f64:
465 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
466 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
468 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
469 %cvt = uitofp <2 x i16> %shuf to <2 x double>
470 ret <2 x double> %cvt
473 define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
474 ; SSE-LABEL: uitofp_8i16_to_2f64:
476 ; SSE-NEXT: pxor %xmm1, %xmm1
477 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
478 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
481 ; AVX1-LABEL: uitofp_8i16_to_2f64:
483 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
484 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
485 ; AVX1-NEXT: vzeroupper
488 ; AVX2-LABEL: uitofp_8i16_to_2f64:
490 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
491 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
492 ; AVX2-NEXT: vzeroupper
494 %cvt = uitofp <8 x i16> %a to <8 x double>
495 %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
496 ret <2 x double> %shuf
499 define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
500 ; SSE-LABEL: uitofp_2i8_to_2f64:
502 ; SSE-NEXT: pxor %xmm1, %xmm1
503 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
504 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
505 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
508 ; AVX-LABEL: uitofp_2i8_to_2f64:
510 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
511 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
513 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
514 %cvt = uitofp <2 x i8> %shuf to <2 x double>
515 ret <2 x double> %cvt
518 define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
519 ; SSE-LABEL: uitofp_16i8_to_2f64:
521 ; SSE-NEXT: pxor %xmm1, %xmm1
522 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
523 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
524 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
527 ; AVX1-LABEL: uitofp_16i8_to_2f64:
529 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
530 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
531 ; AVX1-NEXT: vzeroupper
534 ; AVX2-LABEL: uitofp_16i8_to_2f64:
536 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
537 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
538 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
539 ; AVX2-NEXT: vzeroupper
541 %cvt = uitofp <16 x i8> %a to <16 x double>
542 %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
543 ret <2 x double> %shuf
546 define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
547 ; SSE-LABEL: uitofp_4i64_to_4f64:
549 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
550 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
551 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
552 ; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
553 ; SSE-NEXT: subpd %xmm4, %xmm0
554 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
555 ; SSE-NEXT: addpd %xmm5, %xmm0
556 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
557 ; SSE-NEXT: subpd %xmm4, %xmm3
558 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
559 ; SSE-NEXT: addpd %xmm3, %xmm5
560 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
561 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
562 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
563 ; SSE-NEXT: subpd %xmm4, %xmm1
564 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
565 ; SSE-NEXT: addpd %xmm5, %xmm1
566 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
567 ; SSE-NEXT: subpd %xmm4, %xmm3
568 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
569 ; SSE-NEXT: addpd %xmm3, %xmm2
570 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
573 ; AVX1-LABEL: uitofp_4i64_to_4f64:
575 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
576 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
577 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
578 ; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
579 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
580 ; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
581 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
582 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
583 ; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1
584 ; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
585 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
586 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
587 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
588 ; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
589 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
590 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
591 ; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0
592 ; AVX1-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
593 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
594 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
597 ; AVX2-LABEL: uitofp_4i64_to_4f64:
599 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
600 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
601 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
602 ; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
603 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
604 ; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
605 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
606 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
607 ; AVX2-NEXT: vsubpd %xmm4, %xmm1, %xmm1
608 ; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
609 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
610 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
611 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
612 ; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
613 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
614 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
615 ; AVX2-NEXT: vsubpd %xmm4, %xmm0, %xmm0
616 ; AVX2-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
617 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
618 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
620 %cvt = uitofp <4 x i64> %a to <4 x double>
621 ret <4 x double> %cvt
624 define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
625 ; SSE-LABEL: uitofp_4i32_to_4f64:
627 ; SSE-NEXT: movdqa %xmm0, %xmm2
628 ; SSE-NEXT: pxor %xmm1, %xmm1
629 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
630 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
631 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
632 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
633 ; SSE-NEXT: movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25]
634 ; SSE-NEXT: subpd %xmm5, %xmm0
635 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
636 ; SSE-NEXT: addpd %xmm6, %xmm0
637 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
638 ; SSE-NEXT: subpd %xmm5, %xmm4
639 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
640 ; SSE-NEXT: addpd %xmm4, %xmm6
641 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm6[0]
642 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
643 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
644 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
645 ; SSE-NEXT: subpd %xmm5, %xmm2
646 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
647 ; SSE-NEXT: addpd %xmm2, %xmm1
648 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
649 ; SSE-NEXT: subpd %xmm5, %xmm4
650 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
651 ; SSE-NEXT: addpd %xmm4, %xmm2
652 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
655 ; AVX1-LABEL: uitofp_4i32_to_4f64:
657 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
658 ; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
659 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
660 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
661 ; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
662 ; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
665 ; AVX2-LABEL: uitofp_4i32_to_4f64:
667 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
668 ; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
669 ; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
670 ; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
671 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
672 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
673 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
674 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
676 %cvt = uitofp <4 x i32> %a to <4 x double>
677 ret <4 x double> %cvt
680 define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
681 ; SSE-LABEL: uitofp_4i16_to_4f64:
683 ; SSE-NEXT: pxor %xmm1, %xmm1
684 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
685 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
686 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
687 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
688 ; SSE-NEXT: movaps %xmm2, %xmm0
691 ; AVX-LABEL: uitofp_4i16_to_4f64:
693 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
694 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
696 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
697 %cvt = uitofp <4 x i16> %shuf to <4 x double>
698 ret <4 x double> %cvt
701 define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
702 ; SSE-LABEL: uitofp_8i16_to_4f64:
704 ; SSE-NEXT: pxor %xmm1, %xmm1
705 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
706 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
707 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
708 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
709 ; SSE-NEXT: movaps %xmm2, %xmm0
712 ; AVX1-LABEL: uitofp_8i16_to_4f64:
714 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
715 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
718 ; AVX2-LABEL: uitofp_8i16_to_4f64:
720 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
721 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
723 %cvt = uitofp <8 x i16> %a to <8 x double>
724 %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
725 ret <4 x double> %shuf
728 define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
729 ; SSE-LABEL: uitofp_4i8_to_4f64:
731 ; SSE-NEXT: pxor %xmm1, %xmm1
732 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
733 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
734 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
735 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
736 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
737 ; SSE-NEXT: movaps %xmm2, %xmm0
740 ; AVX-LABEL: uitofp_4i8_to_4f64:
742 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
743 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
745 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
746 %cvt = uitofp <4 x i8> %shuf to <4 x double>
747 ret <4 x double> %cvt
750 define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
751 ; SSE-LABEL: uitofp_16i8_to_4f64:
753 ; SSE-NEXT: pxor %xmm1, %xmm1
754 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
755 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
756 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
757 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
758 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
759 ; SSE-NEXT: movaps %xmm2, %xmm0
762 ; AVX1-LABEL: uitofp_16i8_to_4f64:
764 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
765 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
768 ; AVX2-LABEL: uitofp_16i8_to_4f64:
770 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
771 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
772 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
774 %cvt = uitofp <16 x i8> %a to <16 x double>
775 %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
776 ret <4 x double> %shuf
780 ; Signed Integer to Float
783 define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
784 ; SSE-LABEL: sitofp_2i64_to_4f32:
786 ; SSE-NEXT: movd %xmm0, %rax
787 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
788 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
789 ; SSE-NEXT: movd %xmm0, %rax
790 ; SSE-NEXT: xorps %xmm0, %xmm0
791 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
792 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
793 ; SSE-NEXT: movaps %xmm1, %xmm0
796 ; AVX-LABEL: sitofp_2i64_to_4f32:
798 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
799 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
800 ; AVX-NEXT: vmovq %xmm0, %rax
801 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
802 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
803 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
804 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
805 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
806 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
808 %cvt = sitofp <2 x i64> %a to <2 x float>
809 %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
813 define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
814 ; SSE-LABEL: sitofp_4i64_to_4f32_undef:
816 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
817 ; SSE-NEXT: movd %xmm0, %rax
818 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
819 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
820 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
821 ; SSE-NEXT: movd %xmm0, %rax
822 ; SSE-NEXT: xorps %xmm0, %xmm0
823 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
824 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
825 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
826 ; SSE-NEXT: movaps %xmm1, %xmm0
829 ; AVX-LABEL: sitofp_4i64_to_4f32_undef:
831 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
832 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
833 ; AVX-NEXT: vmovq %xmm0, %rax
834 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
835 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
836 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
837 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
838 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
839 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
841 %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
842 %cvt = sitofp <4 x i64> %ext to <4 x float>
846 define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) {
847 ; SSE-LABEL: sitofp_4i32_to_4f32:
849 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
852 ; AVX-LABEL: sitofp_4i32_to_4f32:
854 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
856 %cvt = sitofp <4 x i32> %a to <4 x float>
860 define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) {
861 ; SSE-LABEL: sitofp_4i16_to_4f32:
863 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
864 ; SSE-NEXT: psrad $16, %xmm0
865 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
868 ; AVX-LABEL: sitofp_4i16_to_4f32:
870 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
871 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
873 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
874 %cvt = sitofp <4 x i16> %shuf to <4 x float>
878 define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
; Test: full <8 x i16> sitofp followed by a shuffle keeping only the low 4 floats.
; SSE narrows to the needed half up front; AVX1/AVX2 still convert all 8 in a ymm.
879 ; SSE-LABEL: sitofp_8i16_to_4f32:
881 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
882 ; SSE-NEXT: psrad $16, %xmm0
883 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
886 ; AVX1-LABEL: sitofp_8i16_to_4f32:
888 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
889 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
890 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
891 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
892 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
893 ; AVX1-NEXT: vzeroupper
896 ; AVX2-LABEL: sitofp_8i16_to_4f32:
898 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
899 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
900 ; AVX2-NEXT: vzeroupper
902 %cvt = sitofp <8 x i16> %a to <8 x float>
903 %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
904 ret <4 x float> %shuf
907 define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) {
; Test: low 4 x i8 lanes sign-extended to i32 then converted. SSE uses two unpacks
; plus psrad $24; AVX uses a single vpmovsxbd.
908 ; SSE-LABEL: sitofp_4i8_to_4f32:
910 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
911 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
912 ; SSE-NEXT: psrad $24, %xmm0
913 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
916 ; AVX-LABEL: sitofp_4i8_to_4f32:
918 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
919 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
921 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
922 %cvt = sitofp <4 x i8> %shuf to <4 x float>
926 define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
; Test: full <16 x i8> sitofp with only the low 4 floats kept by the trailing shuffle.
; SSE reduces the conversion to the low 4 lanes; AVX1/AVX2 convert 8 lanes in a ymm.
927 ; SSE-LABEL: sitofp_16i8_to_4f32:
929 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
930 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
931 ; SSE-NEXT: psrad $24, %xmm0
932 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
935 ; AVX1-LABEL: sitofp_16i8_to_4f32:
937 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
938 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
939 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
940 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
941 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
942 ; AVX1-NEXT: vzeroupper
945 ; AVX2-LABEL: sitofp_16i8_to_4f32:
947 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
948 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
949 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
950 ; AVX2-NEXT: vzeroupper
952 %cvt = sitofp <16 x i8> %a to <16 x float>
953 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
954 ret <4 x float> %shuf
957 define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; Test: sitofp <4 x i64> -> <4 x float>. No packed i64->f32 conversion exists pre-AVX512,
; so all targets extract each i64 to a GPR and use scalar (v)cvtsi2ssq, then rebuild the
; vector with unpcklps (SSE) or vinsertps (AVX1/AVX2).
958 ; SSE-LABEL: sitofp_4i64_to_4f32:
960 ; SSE-NEXT: movd %xmm1, %rax
961 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3
962 ; SSE-NEXT: movd %xmm0, %rax
963 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
964 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
965 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
966 ; SSE-NEXT: movd %xmm1, %rax
967 ; SSE-NEXT: xorps %xmm1, %xmm1
968 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
969 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
970 ; SSE-NEXT: movd %xmm0, %rax
971 ; SSE-NEXT: xorps %xmm0, %xmm0
972 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
973 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
974 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
975 ; SSE-NEXT: movaps %xmm2, %xmm0
978 ; AVX1-LABEL: sitofp_4i64_to_4f32:
980 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
981 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
982 ; AVX1-NEXT: vmovq %xmm0, %rax
983 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
984 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
985 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
986 ; AVX1-NEXT: vmovq %xmm0, %rax
987 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
988 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
989 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
990 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
991 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
992 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
993 ; AVX1-NEXT: vzeroupper
996 ; AVX2-LABEL: sitofp_4i64_to_4f32:
998 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
999 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1000 ; AVX2-NEXT: vmovq %xmm0, %rax
1001 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1002 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1003 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1004 ; AVX2-NEXT: vmovq %xmm0, %rax
1005 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1006 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1007 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1008 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
1009 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1010 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1011 ; AVX2-NEXT: vzeroupper
1013 %cvt = sitofp <4 x i64> %a to <4 x float>
1014 ret <4 x float> %cvt
1017 define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) {
; Test: sitofp <8 x i32> -> <8 x float>; two cvtdq2ps on SSE, one ymm vcvtdq2ps on AVX.
1018 ; SSE-LABEL: sitofp_8i32_to_8f32:
1020 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1021 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
1024 ; AVX-LABEL: sitofp_8i32_to_8f32:
1026 ; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
1028 %cvt = sitofp <8 x i32> %a to <8 x float>
1029 ret <8 x float> %cvt
1032 define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
; Test: sitofp <8 x i16> -> <8 x float>. SSE sign-extends each half with unpack+psrad;
; AVX1 joins two vpmovsxwd halves; AVX2 sign-extends all 8 lanes in one vpmovsxwd.
1033 ; SSE-LABEL: sitofp_8i16_to_8f32:
1035 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1036 ; SSE-NEXT: psrad $16, %xmm1
1037 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
1038 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1039 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1040 ; SSE-NEXT: psrad $16, %xmm0
1041 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1042 ; SSE-NEXT: movaps %xmm2, %xmm0
1045 ; AVX1-LABEL: sitofp_8i16_to_8f32:
1047 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
1048 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1049 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
1050 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1051 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1054 ; AVX2-LABEL: sitofp_8i16_to_8f32:
1056 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
1057 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1059 %cvt = sitofp <8 x i16> %a to <8 x float>
1060 ret <8 x float> %cvt
1063 define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
; Test: low 8 x i8 lanes sign-extended to i32 and converted to <8 x float>.
; Note the AVX2 form: zero-extend with vpmovzxbd then shift-pair vpslld/vpsrad $24
; to materialize the sign extension across the ymm register.
1064 ; SSE-LABEL: sitofp_8i8_to_8f32:
1066 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1067 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1068 ; SSE-NEXT: psrad $24, %xmm1
1069 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
1070 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1071 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1072 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1073 ; SSE-NEXT: psrad $24, %xmm0
1074 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1075 ; SSE-NEXT: movaps %xmm2, %xmm0
1078 ; AVX1-LABEL: sitofp_8i8_to_8f32:
1080 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
1081 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1082 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
1083 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1084 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1087 ; AVX2-LABEL: sitofp_8i8_to_8f32:
1089 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1090 ; AVX2-NEXT: vpslld $24, %ymm0, %ymm0
1091 ; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
1092 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1094 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1095 %cvt = sitofp <8 x i8> %shuf to <8 x float>
1096 ret <8 x float> %cvt
1099 define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
; Test: full <16 x i8> sitofp with only the low 8 floats kept; expected codegen matches
; the 8i8 case since the upper half of the conversion is dead.
1100 ; SSE-LABEL: sitofp_16i8_to_8f32:
1102 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1103 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1104 ; SSE-NEXT: psrad $24, %xmm1
1105 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
1106 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1107 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1108 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1109 ; SSE-NEXT: psrad $24, %xmm0
1110 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1111 ; SSE-NEXT: movaps %xmm2, %xmm0
1114 ; AVX1-LABEL: sitofp_16i8_to_8f32:
1116 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
1117 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1118 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
1119 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1120 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1123 ; AVX2-LABEL: sitofp_16i8_to_8f32:
1125 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
1126 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
1127 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1129 %cvt = sitofp <16 x i8> %a to <16 x float>
1130 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1131 ret <8 x float> %shuf
1135 ; Unsigned Integer to Float
1138 define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; Test: uitofp <2 x i64> -> <2 x float> (widened to <4 x float> by shuffle).
; x86 has no unsigned i64->f32 convert, so per element the lowering branches on the
; sign bit: non-negative values use cvtsi2ssq directly; negative (i.e. >= 2^63) values
; use the halve-convert-double trick (shr by 1, OR in the low bit, convert, addss to
; double) to stay within signed range while rounding correctly.
1139 ; SSE-LABEL: uitofp_2i64_to_4f32:
1141 ; SSE-NEXT: movdqa %xmm0, %xmm1
1142 ; SSE-NEXT: movd %xmm1, %rax
1143 ; SSE-NEXT: movl %eax, %ecx
1144 ; SSE-NEXT: andl $1, %ecx
1145 ; SSE-NEXT: testq %rax, %rax
1146 ; SSE-NEXT: js .LBB38_1
1148 ; SSE-NEXT: xorps %xmm0, %xmm0
1149 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
1150 ; SSE-NEXT: jmp .LBB38_3
1151 ; SSE-NEXT: .LBB38_1:
1152 ; SSE-NEXT: shrq %rax
1153 ; SSE-NEXT: orq %rax, %rcx
1154 ; SSE-NEXT: xorps %xmm0, %xmm0
1155 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
1156 ; SSE-NEXT: addss %xmm0, %xmm0
1157 ; SSE-NEXT: .LBB38_3:
1158 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1159 ; SSE-NEXT: movd %xmm1, %rax
1160 ; SSE-NEXT: movl %eax, %ecx
1161 ; SSE-NEXT: andl $1, %ecx
1162 ; SSE-NEXT: testq %rax, %rax
1163 ; SSE-NEXT: js .LBB38_4
1165 ; SSE-NEXT: xorps %xmm1, %xmm1
1166 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
1167 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1169 ; SSE-NEXT: .LBB38_4:
1170 ; SSE-NEXT: shrq %rax
1171 ; SSE-NEXT: orq %rax, %rcx
1172 ; SSE-NEXT: xorps %xmm1, %xmm1
1173 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
1174 ; SSE-NEXT: addss %xmm1, %xmm1
1175 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1178 ; AVX-LABEL: uitofp_2i64_to_4f32:
1180 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
1181 ; AVX-NEXT: movl %eax, %ecx
1182 ; AVX-NEXT: andl $1, %ecx
1183 ; AVX-NEXT: testq %rax, %rax
1184 ; AVX-NEXT: js .LBB38_1
1186 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1187 ; AVX-NEXT: jmp .LBB38_3
1188 ; AVX-NEXT: .LBB38_1:
1189 ; AVX-NEXT: shrq %rax
1190 ; AVX-NEXT: orq %rax, %rcx
1191 ; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1192 ; AVX-NEXT: vaddss %xmm1, %xmm1, %xmm1
1193 ; AVX-NEXT: .LBB38_3:
1194 ; AVX-NEXT: vmovq %xmm0, %rax
1195 ; AVX-NEXT: movl %eax, %ecx
1196 ; AVX-NEXT: andl $1, %ecx
1197 ; AVX-NEXT: testq %rax, %rax
1198 ; AVX-NEXT: js .LBB38_4
1200 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1201 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1202 ; AVX-NEXT: jmp .LBB38_6
1203 ; AVX-NEXT: .LBB38_4:
1204 ; AVX-NEXT: shrq %rax
1205 ; AVX-NEXT: orq %rax, %rcx
1206 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1207 ; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1208 ; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0
1209 ; AVX-NEXT: .LBB38_6:
1210 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
1211 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1212 ; AVX-NEXT: testq %rax, %rax
1213 ; AVX-NEXT: js .LBB38_8
1215 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1216 ; AVX-NEXT: .LBB38_8:
1217 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
1218 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
1220 %cvt = uitofp <2 x i64> %a to <2 x float>
1221 %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1222 ret <4 x float> %ext
1225 define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; Test: uitofp of a <2 x i64> widened (upper lanes undef) to <4 x i64>, then to
; <4 x float>. Same per-element signed/unsigned branch lowering as uitofp_2i64_to_4f32:
; js selects between a direct cvtsi2ssq and the shr/or low-bit + addss doubling path.
1226 ; SSE-LABEL: uitofp_4i64_to_4f32_undef:
1228 ; SSE-NEXT: movdqa %xmm0, %xmm1
1229 ; SSE-NEXT: testq %rax, %rax
1230 ; SSE-NEXT: xorps %xmm2, %xmm2
1231 ; SSE-NEXT: js .LBB39_2
1233 ; SSE-NEXT: xorps %xmm2, %xmm2
1234 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
1235 ; SSE-NEXT: .LBB39_2:
1236 ; SSE-NEXT: movd %xmm1, %rax
1237 ; SSE-NEXT: movl %eax, %ecx
1238 ; SSE-NEXT: andl $1, %ecx
1239 ; SSE-NEXT: testq %rax, %rax
1240 ; SSE-NEXT: js .LBB39_3
1242 ; SSE-NEXT: xorps %xmm0, %xmm0
1243 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
1244 ; SSE-NEXT: jmp .LBB39_5
1245 ; SSE-NEXT: .LBB39_3:
1246 ; SSE-NEXT: shrq %rax
1247 ; SSE-NEXT: orq %rax, %rcx
1248 ; SSE-NEXT: xorps %xmm0, %xmm0
1249 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
1250 ; SSE-NEXT: addss %xmm0, %xmm0
1251 ; SSE-NEXT: .LBB39_5:
1252 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1253 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1254 ; SSE-NEXT: movd %xmm1, %rax
1255 ; SSE-NEXT: movl %eax, %ecx
1256 ; SSE-NEXT: andl $1, %ecx
1257 ; SSE-NEXT: testq %rax, %rax
1258 ; SSE-NEXT: js .LBB39_6
1260 ; SSE-NEXT: xorps %xmm1, %xmm1
1261 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
1262 ; SSE-NEXT: jmp .LBB39_8
1263 ; SSE-NEXT: .LBB39_6:
1264 ; SSE-NEXT: shrq %rax
1265 ; SSE-NEXT: orq %rax, %rcx
1266 ; SSE-NEXT: xorps %xmm1, %xmm1
1267 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
1268 ; SSE-NEXT: addss %xmm1, %xmm1
1269 ; SSE-NEXT: .LBB39_8:
1270 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1271 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1274 ; AVX-LABEL: uitofp_4i64_to_4f32_undef:
1276 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
1277 ; AVX-NEXT: movl %eax, %ecx
1278 ; AVX-NEXT: andl $1, %ecx
1279 ; AVX-NEXT: testq %rax, %rax
1280 ; AVX-NEXT: js .LBB39_1
1282 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1283 ; AVX-NEXT: jmp .LBB39_3
1284 ; AVX-NEXT: .LBB39_1:
1285 ; AVX-NEXT: shrq %rax
1286 ; AVX-NEXT: orq %rax, %rcx
1287 ; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1288 ; AVX-NEXT: vaddss %xmm1, %xmm1, %xmm1
1289 ; AVX-NEXT: .LBB39_3:
1290 ; AVX-NEXT: vmovq %xmm0, %rax
1291 ; AVX-NEXT: movl %eax, %ecx
1292 ; AVX-NEXT: andl $1, %ecx
1293 ; AVX-NEXT: testq %rax, %rax
1294 ; AVX-NEXT: js .LBB39_4
1296 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1297 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1298 ; AVX-NEXT: jmp .LBB39_6
1299 ; AVX-NEXT: .LBB39_4:
1300 ; AVX-NEXT: shrq %rax
1301 ; AVX-NEXT: orq %rax, %rcx
1302 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1303 ; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1304 ; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0
1305 ; AVX-NEXT: .LBB39_6:
1306 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
1307 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1308 ; AVX-NEXT: testq %rax, %rax
1309 ; AVX-NEXT: js .LBB39_8
1311 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1312 ; AVX-NEXT: .LBB39_8:
1313 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
1314 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
1316 %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1317 %cvt = uitofp <4 x i64> %ext to <4 x float>
1318 ret <4 x float> %cvt
1321 define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
; Test: uitofp <4 x i32> -> <4 x float>. Lowered branch-free by splitting each u32 into
; 16-bit halves, OR-ing in float exponent-bias constants, and recombining with addps.
1322 ; SSE-LABEL: uitofp_4i32_to_4f32:
1324 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
1325 ; SSE-NEXT: pand %xmm0, %xmm1
1326 ; SSE-NEXT: por {{.*}}(%rip), %xmm1
1327 ; SSE-NEXT: psrld $16, %xmm0
1328 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
1329 ; SSE-NEXT: addps {{.*}}(%rip), %xmm0
1330 ; SSE-NEXT: addps %xmm1, %xmm0
1333 ; AVX1-LABEL: uitofp_4i32_to_4f32:
1335 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
1336 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
1337 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
1338 ; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
1339 ; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
1342 ; AVX2-LABEL: uitofp_4i32_to_4f32:
1344 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
1345 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
1346 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
1347 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
1348 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
1349 ; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
1350 ; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0
1351 ; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
1353 %cvt = uitofp <4 x i32> %a to <4 x float>
1354 ret <4 x float> %cvt
1357 define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) {
; Test: low 4 x i16 lanes zero-extended (punpcklwd with zero / vpmovzxwd) then cvtdq2ps.
1358 ; SSE-LABEL: uitofp_4i16_to_4f32:
1360 ; SSE-NEXT: pxor %xmm1, %xmm1
1361 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1362 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1365 ; AVX-LABEL: uitofp_4i16_to_4f32:
1367 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1368 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
1370 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1371 %cvt = uitofp <4 x i16> %shuf to <4 x float>
1372 ret <4 x float> %cvt
1375 define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
; Test: full <8 x i16> uitofp with only the low 4 floats kept by the trailing shuffle.
; SSE converts only the needed low half; AVX1/AVX2 zero-extend all 8 lanes into a ymm.
1376 ; SSE-LABEL: uitofp_8i16_to_4f32:
1378 ; SSE-NEXT: pxor %xmm1, %xmm1
1379 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1380 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1383 ; AVX1-LABEL: uitofp_8i16_to_4f32:
1385 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1386 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1387 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1388 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1389 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1390 ; AVX1-NEXT: vzeroupper
1393 ; AVX2-LABEL: uitofp_8i16_to_4f32:
1395 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1396 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1397 ; AVX2-NEXT: vzeroupper
1399 %cvt = uitofp <8 x i16> %a to <8 x float>
1400 %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1401 ret <4 x float> %shuf
1404 define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
; Test: low 4 x i8 lanes zero-extended to i32 then converted; SSE uses two zero-unpacks,
; AVX a single vpmovzxbd.
1405 ; SSE-LABEL: uitofp_4i8_to_4f32:
1407 ; SSE-NEXT: pxor %xmm1, %xmm1
1408 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1409 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1410 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1413 ; AVX-LABEL: uitofp_4i8_to_4f32:
1415 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1416 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
1418 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1419 %cvt = uitofp <4 x i8> %shuf to <4 x float>
1420 ret <4 x float> %cvt
1423 define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
; Test: full <16 x i8> uitofp with only the low 4 floats kept by the trailing shuffle.
1424 ; SSE-LABEL: uitofp_16i8_to_4f32:
1426 ; SSE-NEXT: pxor %xmm1, %xmm1
1427 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1428 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1429 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1432 ; AVX1-LABEL: uitofp_16i8_to_4f32:
1434 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1435 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1436 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1437 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1438 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1439 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1440 ; AVX1-NEXT: vzeroupper
1443 ; AVX2-LABEL: uitofp_16i8_to_4f32:
1445 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1446 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1447 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1448 ; AVX2-NEXT: vzeroupper
1450 %cvt = uitofp <16 x i8> %a to <16 x float>
1451 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1452 ret <4 x float> %shuf
1455 define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; Test: uitofp <4 x i64> -> <4 x float>. Each of the four i64 elements is extracted to
; a GPR and converted through the signed/unsigned branch sequence (js picks either a
; direct cvtsi2ssq or the shr/or-low-bit convert-then-addss path for values >= 2^63);
; the scalars are reassembled with unpcklps (SSE) or vinsertps (AVX1/AVX2).
1456 ; SSE-LABEL: uitofp_4i64_to_4f32:
1458 ; SSE-NEXT: movd %xmm1, %rax
1459 ; SSE-NEXT: movl %eax, %ecx
1460 ; SSE-NEXT: andl $1, %ecx
1461 ; SSE-NEXT: testq %rax, %rax
1462 ; SSE-NEXT: js .LBB45_1
1464 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3
1465 ; SSE-NEXT: jmp .LBB45_3
1466 ; SSE-NEXT: .LBB45_1:
1467 ; SSE-NEXT: shrq %rax
1468 ; SSE-NEXT: orq %rax, %rcx
1469 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm3
1470 ; SSE-NEXT: addss %xmm3, %xmm3
1471 ; SSE-NEXT: .LBB45_3:
1472 ; SSE-NEXT: movd %xmm0, %rax
1473 ; SSE-NEXT: movl %eax, %ecx
1474 ; SSE-NEXT: andl $1, %ecx
1475 ; SSE-NEXT: testq %rax, %rax
1476 ; SSE-NEXT: js .LBB45_4
1478 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
1479 ; SSE-NEXT: jmp .LBB45_6
1480 ; SSE-NEXT: .LBB45_4:
1481 ; SSE-NEXT: shrq %rax
1482 ; SSE-NEXT: orq %rax, %rcx
1483 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm2
1484 ; SSE-NEXT: addss %xmm2, %xmm2
1485 ; SSE-NEXT: .LBB45_6:
1486 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1487 ; SSE-NEXT: movd %xmm1, %rax
1488 ; SSE-NEXT: movl %eax, %ecx
1489 ; SSE-NEXT: andl $1, %ecx
1490 ; SSE-NEXT: testq %rax, %rax
1491 ; SSE-NEXT: js .LBB45_7
1493 ; SSE-NEXT: xorps %xmm1, %xmm1
1494 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
1495 ; SSE-NEXT: jmp .LBB45_9
1496 ; SSE-NEXT: .LBB45_7:
1497 ; SSE-NEXT: shrq %rax
1498 ; SSE-NEXT: orq %rax, %rcx
1499 ; SSE-NEXT: xorps %xmm1, %xmm1
1500 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
1501 ; SSE-NEXT: addss %xmm1, %xmm1
1502 ; SSE-NEXT: .LBB45_9:
1503 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1504 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1505 ; SSE-NEXT: movd %xmm0, %rax
1506 ; SSE-NEXT: movl %eax, %ecx
1507 ; SSE-NEXT: andl $1, %ecx
1508 ; SSE-NEXT: testq %rax, %rax
1509 ; SSE-NEXT: js .LBB45_10
1510 ; SSE-NEXT: # BB#11:
1511 ; SSE-NEXT: xorps %xmm0, %xmm0
1512 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
1513 ; SSE-NEXT: jmp .LBB45_12
1514 ; SSE-NEXT: .LBB45_10:
1515 ; SSE-NEXT: shrq %rax
1516 ; SSE-NEXT: orq %rax, %rcx
1517 ; SSE-NEXT: xorps %xmm0, %xmm0
1518 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
1519 ; SSE-NEXT: addss %xmm0, %xmm0
1520 ; SSE-NEXT: .LBB45_12:
1521 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1522 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1523 ; SSE-NEXT: movaps %xmm2, %xmm0
1526 ; AVX1-LABEL: uitofp_4i64_to_4f32:
1528 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
1529 ; AVX1-NEXT: movl %eax, %ecx
1530 ; AVX1-NEXT: andl $1, %ecx
1531 ; AVX1-NEXT: testq %rax, %rax
1532 ; AVX1-NEXT: js .LBB45_1
1533 ; AVX1-NEXT: # BB#2:
1534 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1535 ; AVX1-NEXT: jmp .LBB45_3
1536 ; AVX1-NEXT: .LBB45_1:
1537 ; AVX1-NEXT: shrq %rax
1538 ; AVX1-NEXT: orq %rax, %rcx
1539 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1540 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
1541 ; AVX1-NEXT: .LBB45_3:
1542 ; AVX1-NEXT: vmovq %xmm0, %rax
1543 ; AVX1-NEXT: movl %eax, %ecx
1544 ; AVX1-NEXT: andl $1, %ecx
1545 ; AVX1-NEXT: testq %rax, %rax
1546 ; AVX1-NEXT: js .LBB45_4
1547 ; AVX1-NEXT: # BB#5:
1548 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1549 ; AVX1-NEXT: jmp .LBB45_6
1550 ; AVX1-NEXT: .LBB45_4:
1551 ; AVX1-NEXT: shrq %rax
1552 ; AVX1-NEXT: orq %rax, %rcx
1553 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1554 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
1555 ; AVX1-NEXT: .LBB45_6:
1556 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1557 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1558 ; AVX1-NEXT: vmovq %xmm0, %rax
1559 ; AVX1-NEXT: movl %eax, %ecx
1560 ; AVX1-NEXT: andl $1, %ecx
1561 ; AVX1-NEXT: testq %rax, %rax
1562 ; AVX1-NEXT: js .LBB45_7
1563 ; AVX1-NEXT: # BB#8:
1564 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1565 ; AVX1-NEXT: jmp .LBB45_9
1566 ; AVX1-NEXT: .LBB45_7:
1567 ; AVX1-NEXT: shrq %rax
1568 ; AVX1-NEXT: orq %rax, %rcx
1569 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1570 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
1571 ; AVX1-NEXT: .LBB45_9:
1572 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1573 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
1574 ; AVX1-NEXT: movl %eax, %ecx
1575 ; AVX1-NEXT: andl $1, %ecx
1576 ; AVX1-NEXT: testq %rax, %rax
1577 ; AVX1-NEXT: js .LBB45_10
1578 ; AVX1-NEXT: # BB#11:
1579 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
1580 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1581 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1582 ; AVX1-NEXT: vzeroupper
1584 ; AVX1-NEXT: .LBB45_10:
1585 ; AVX1-NEXT: shrq %rax
1586 ; AVX1-NEXT: orq %rax, %rcx
1587 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1588 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
1589 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1590 ; AVX1-NEXT: vzeroupper
1593 ; AVX2-LABEL: uitofp_4i64_to_4f32:
1595 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1596 ; AVX2-NEXT: movl %eax, %ecx
1597 ; AVX2-NEXT: andl $1, %ecx
1598 ; AVX2-NEXT: testq %rax, %rax
1599 ; AVX2-NEXT: js .LBB45_1
1600 ; AVX2-NEXT: # BB#2:
1601 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1602 ; AVX2-NEXT: jmp .LBB45_3
1603 ; AVX2-NEXT: .LBB45_1:
1604 ; AVX2-NEXT: shrq %rax
1605 ; AVX2-NEXT: orq %rax, %rcx
1606 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1607 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
1608 ; AVX2-NEXT: .LBB45_3:
1609 ; AVX2-NEXT: vmovq %xmm0, %rax
1610 ; AVX2-NEXT: movl %eax, %ecx
1611 ; AVX2-NEXT: andl $1, %ecx
1612 ; AVX2-NEXT: testq %rax, %rax
1613 ; AVX2-NEXT: js .LBB45_4
1614 ; AVX2-NEXT: # BB#5:
1615 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1616 ; AVX2-NEXT: jmp .LBB45_6
1617 ; AVX2-NEXT: .LBB45_4:
1618 ; AVX2-NEXT: shrq %rax
1619 ; AVX2-NEXT: orq %rax, %rcx
1620 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1621 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
1622 ; AVX2-NEXT: .LBB45_6:
1623 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1624 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1625 ; AVX2-NEXT: vmovq %xmm0, %rax
1626 ; AVX2-NEXT: movl %eax, %ecx
1627 ; AVX2-NEXT: andl $1, %ecx
1628 ; AVX2-NEXT: testq %rax, %rax
1629 ; AVX2-NEXT: js .LBB45_7
1630 ; AVX2-NEXT: # BB#8:
1631 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1632 ; AVX2-NEXT: jmp .LBB45_9
1633 ; AVX2-NEXT: .LBB45_7:
1634 ; AVX2-NEXT: shrq %rax
1635 ; AVX2-NEXT: orq %rax, %rcx
1636 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1637 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
1638 ; AVX2-NEXT: .LBB45_9:
1639 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1640 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1641 ; AVX2-NEXT: movl %eax, %ecx
1642 ; AVX2-NEXT: andl $1, %ecx
1643 ; AVX2-NEXT: testq %rax, %rax
1644 ; AVX2-NEXT: js .LBB45_10
1645 ; AVX2-NEXT: # BB#11:
1646 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
1647 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1648 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1649 ; AVX2-NEXT: vzeroupper
1651 ; AVX2-NEXT: .LBB45_10:
1652 ; AVX2-NEXT: shrq %rax
1653 ; AVX2-NEXT: orq %rax, %rcx
1654 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1655 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
1656 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1657 ; AVX2-NEXT: vzeroupper
1659 %cvt = uitofp <4 x i64> %a to <4 x float>
1660 ret <4 x float> %cvt
1663 define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) {
; Test: uitofp <8 x i32> -> <8 x float> using the split-halves + exponent-bias-constant
; trick (pand/por/psrld + addps) per 128-bit lane on SSE, and blended/broadcast
; constants on AVX1/AVX2.
1664 ; SSE-LABEL: uitofp_8i32_to_8f32:
1666 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
1667 ; SSE-NEXT: movdqa %xmm0, %xmm3
1668 ; SSE-NEXT: pand %xmm2, %xmm3
1669 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
1670 ; SSE-NEXT: por %xmm4, %xmm3
1671 ; SSE-NEXT: psrld $16, %xmm0
1672 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
1673 ; SSE-NEXT: por %xmm5, %xmm0
1674 ; SSE-NEXT: movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
1675 ; SSE-NEXT: addps %xmm6, %xmm0
1676 ; SSE-NEXT: addps %xmm3, %xmm0
1677 ; SSE-NEXT: pand %xmm1, %xmm2
1678 ; SSE-NEXT: por %xmm4, %xmm2
1679 ; SSE-NEXT: psrld $16, %xmm1
1680 ; SSE-NEXT: por %xmm5, %xmm1
1681 ; SSE-NEXT: addps %xmm6, %xmm1
1682 ; SSE-NEXT: addps %xmm2, %xmm1
1685 ; AVX1-LABEL: uitofp_8i32_to_8f32:
1687 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
1688 ; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
1689 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
1690 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1691 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
1692 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
1693 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1694 ; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
1695 ; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0
1698 ; AVX2-LABEL: uitofp_8i32_to_8f32:
1700 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
1701 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1702 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
1703 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
1704 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
1705 ; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
1706 ; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
1707 ; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
1709 %cvt = uitofp <8 x i32> %a to <8 x float>
1710 ret <8 x float> %cvt
1713 define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
; Test: uitofp <8 x i16> -> <8 x float>. SSE zero-extends each half by unpacking with
; zero; AVX1 builds the ymm from two extended halves; AVX2 uses one wide vpmovzxwd.
1714 ; SSE-LABEL: uitofp_8i16_to_8f32:
1716 ; SSE-NEXT: pxor %xmm1, %xmm1
1717 ; SSE-NEXT: movdqa %xmm0, %xmm2
1718 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1719 ; SSE-NEXT: cvtdq2ps %xmm2, %xmm2
1720 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1721 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1722 ; SSE-NEXT: movaps %xmm2, %xmm0
1725 ; AVX1-LABEL: uitofp_8i16_to_8f32:
1727 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1728 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1729 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1730 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1731 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1734 ; AVX2-LABEL: uitofp_8i16_to_8f32:
1736 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1737 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1739 %cvt = uitofp <8 x i16> %a to <8 x float>
1740 ret <8 x float> %cvt
1743 define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
; Test: low 8 x i8 lanes zero-extended to i32 and converted to <8 x float>; the pand
; with a 255-per-dword mask clears the garbage lanes introduced by the unpack path.
1744 ; SSE-LABEL: uitofp_8i8_to_8f32:
1746 ; SSE-NEXT: pxor %xmm1, %xmm1
1747 ; SSE-NEXT: movdqa %xmm0, %xmm2
1748 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1749 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1750 ; SSE-NEXT: cvtdq2ps %xmm2, %xmm2
1751 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1752 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1753 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
1754 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1755 ; SSE-NEXT: movaps %xmm2, %xmm0
1758 ; AVX1-LABEL: uitofp_8i8_to_8f32:
1760 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1761 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1762 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1763 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
1764 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1765 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
1766 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1767 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1770 ; AVX2-LABEL: uitofp_8i8_to_8f32:
1772 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1773 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
1774 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1776 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1777 %cvt = uitofp <8 x i8> %shuf to <8 x float>
1778 ret <8 x float> %cvt
; uitofp of a full <16 x i8> to <16 x float>, of which only the low 8 lanes
; are kept by the trailing shufflevector. This checks that the backend only
; materializes the half of the wide conversion that is actually demanded:
; the emitted code zero-extends bytes to dwords and runs a single signed
; cvtdq2ps per half (SSE) or one ymm cvtdq2ps (AVX), never 16 conversions.
1781 define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
1782 ; SSE-LABEL: uitofp_16i8_to_8f32:
1784 ; SSE-NEXT: pxor %xmm1, %xmm1
1785 ; SSE-NEXT: movdqa %xmm0, %xmm2
1786 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1787 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1788 ; SSE-NEXT: cvtdq2ps %xmm2, %xmm2
1789 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1790 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1791 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
1792 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1793 ; SSE-NEXT: movaps %xmm2, %xmm0
1796 ; AVX1-LABEL: uitofp_16i8_to_8f32:
1798 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1799 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1800 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1801 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1802 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1803 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1806 ; AVX2-LABEL: uitofp_16i8_to_8f32:
1808 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1809 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1810 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1812 %cvt = uitofp <16 x i8> %a to <16 x float>
1813 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1814 ret <8 x float> %shuf
; Packed (unpadded, <{...}>) aggregate used by aggregate_sitofp_8i16_to_8f32:
; byte offset 0 holds the <8 x i8> (8 bytes), offset 8 the <8 x i16>
; (16 bytes), and offset 24 the <8 x float>* store destination — matching
; the 8(%rdi) load and 24(%rdi) pointer load checked in that function.
1821 %Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }>
1822 define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
1823 ; SSE-LABEL: aggregate_sitofp_8i16_to_8f32:
1825 ; SSE-NEXT: movq 24(%rdi), %rax
1826 ; SSE-NEXT: movdqu 8(%rdi), %xmm0
1827 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1828 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1829 ; SSE-NEXT: psrad $16, %xmm1
1830 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
1831 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1832 ; SSE-NEXT: psrad $16, %xmm0
1833 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1834 ; SSE-NEXT: movaps %xmm0, (%rax)
1835 ; SSE-NEXT: movaps %xmm1, 16(%rax)
1838 ; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
1840 ; AVX1-NEXT: movq 24(%rdi), %rax
1841 ; AVX1-NEXT: vmovdqu 8(%rdi), %xmm0
1842 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
1843 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1844 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
1845 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1846 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1847 ; AVX1-NEXT: vmovaps %ymm0, (%rax)
1848 ; AVX1-NEXT: vzeroupper
1851 ; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32:
1853 ; AVX2-NEXT: movq 24(%rdi), %rax
1854 ; AVX2-NEXT: vpmovsxwd 8(%rdi), %ymm0
1855 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1856 ; AVX2-NEXT: vmovaps %ymm0, (%rax)
1857 ; AVX2-NEXT: vzeroupper
1859 %1 = load %Arguments, %Arguments* %a0, align 1
1860 %2 = extractvalue %Arguments %1, 1
1861 %3 = extractvalue %Arguments %1, 2
1862 %4 = sitofp <8 x i16> %2 to <8 x float>
1863 store <8 x float> %4, <8 x float>* %3, align 32