1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
5 ; Sanity RUN lines for 32-bit targets: no FileCheck patterns, they only verify that llc compiles these without crashing.
6 ; RUN: llc < %s -mtriple=i686-unknown-unknown
7 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse
8 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2
11 ; Signed Integer to Double
; v2i64 -> v2f64: no packed i64->fp conversion here, so each lane is extracted
; to a GPR and converted with scalar cvtsi2sd, then the results are repacked.
14 define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
15 ; SSE-LABEL: sitofp_2i64_to_2f64:
17 ; SSE-NEXT: movd %xmm0, %rax
18 ; SSE-NEXT: cvtsi2sdq %rax, %xmm1
19 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
20 ; SSE-NEXT: movd %xmm0, %rax
21 ; SSE-NEXT: xorps %xmm0, %xmm0
22 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0
23 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
24 ; SSE-NEXT: movapd %xmm1, %xmm0
27 ; AVX-LABEL: sitofp_2i64_to_2f64:
29 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
30 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
31 ; AVX-NEXT: vmovq %xmm0, %rax
32 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
33 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
34 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
36 %cvt = sitofp <2 x i64> %a to <2 x double>
; v2i32 (low half of a v4i32) -> v2f64: should fold to a single cvtdq2pd,
; which reads only the two low dword lanes.
40 define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) {
41 ; SSE-LABEL: sitofp_2i32_to_2f64:
43 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
46 ; AVX-LABEL: sitofp_2i32_to_2f64:
48 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
50 %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
51 %cvt = sitofp <2 x i32> %shuf to <2 x double>
; Full v4i32 -> v4f64 conversion with only the low two doubles kept: the
; expectation documents that the wide ymm conversion is emitted and then
; the upper half is simply dropped (hence vzeroupper before returning).
55 define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
56 ; SSE-LABEL: sitofp_4i32_to_2f64:
58 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
61 ; AVX-LABEL: sitofp_4i32_to_2f64:
63 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
64 ; AVX-NEXT: vzeroupper
66 %cvt = sitofp <4 x i32> %a to <4 x double>
67 %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
68 ret <2 x double> %shuf
; v2i16 -> v2f64: sign-extend i16 to i32 first (unpack + arithmetic shift
; right by 16 pre-AVX, pmovsxwd with AVX), then convert with cvtdq2pd.
71 define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
72 ; SSE-LABEL: sitofp_2i16_to_2f64:
74 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
75 ; SSE-NEXT: psrad $16, %xmm0
76 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
79 ; AVX-LABEL: sitofp_2i16_to_2f64:
81 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
82 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
84 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
85 %cvt = sitofp <2 x i16> %shuf to <2 x double>
; v8i16 -> v8f64 with only the first two doubles used: checks that the
; conversion is narrowed (only one sign-extend + cvtdq2pd survives).
89 define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
90 ; SSE-LABEL: sitofp_8i16_to_2f64:
92 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
93 ; SSE-NEXT: psrad $16, %xmm0
94 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
97 ; AVX1-LABEL: sitofp_8i16_to_2f64:
99 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
100 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
101 ; AVX1-NEXT: vzeroupper
104 ; AVX2-LABEL: sitofp_8i16_to_2f64:
106 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
107 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
108 ; AVX2-NEXT: vzeroupper
110 %cvt = sitofp <8 x i16> %a to <8 x double>
111 %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
112 ret <2 x double> %shuf
; v2i8 -> v2f64: sign-extend i8 to i32 (two unpacks + psrad $24 pre-AVX,
; pmovsxbd with AVX), then cvtdq2pd.
115 define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
116 ; SSE-LABEL: sitofp_2i8_to_2f64:
118 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
119 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
120 ; SSE-NEXT: psrad $24, %xmm0
121 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
124 ; AVX-LABEL: sitofp_2i8_to_2f64:
126 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
127 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
129 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
130 %cvt = sitofp <2 x i8> %shuf to <2 x double>
131 ret <2 x double> %cvt
; v16i8 -> v16f64 with only the first two doubles used: checks the
; conversion is shrunk to a single lane-group's worth of work.
134 define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
135 ; SSE-LABEL: sitofp_16i8_to_2f64:
137 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
138 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
139 ; SSE-NEXT: psrad $24, %xmm0
140 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
143 ; AVX1-LABEL: sitofp_16i8_to_2f64:
145 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
146 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
147 ; AVX1-NEXT: vzeroupper
150 ; AVX2-LABEL: sitofp_16i8_to_2f64:
152 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
153 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
154 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
155 ; AVX2-NEXT: vzeroupper
157 %cvt = sitofp <16 x i8> %a to <16 x double>
158 %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
159 ret <2 x double> %shuf
; v4i64 -> v4f64: fully scalarized — four cvtsi2sd conversions, pairs merged
; with unpcklpd and (for the AVX variants) the 128-bit halves recombined
; with vinsertf128.
162 define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
163 ; SSE-LABEL: sitofp_4i64_to_4f64:
165 ; SSE-NEXT: movd %xmm0, %rax
166 ; SSE-NEXT: cvtsi2sdq %rax, %xmm2
167 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
168 ; SSE-NEXT: movd %xmm0, %rax
169 ; SSE-NEXT: xorps %xmm0, %xmm0
170 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0
171 ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
172 ; SSE-NEXT: movd %xmm1, %rax
173 ; SSE-NEXT: cvtsi2sdq %rax, %xmm3
174 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
175 ; SSE-NEXT: movd %xmm0, %rax
176 ; SSE-NEXT: xorps %xmm0, %xmm0
177 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0
178 ; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
179 ; SSE-NEXT: movapd %xmm2, %xmm0
180 ; SSE-NEXT: movapd %xmm3, %xmm1
183 ; AVX1-LABEL: sitofp_4i64_to_4f64:
185 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
186 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax
187 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
188 ; AVX1-NEXT: vmovq %xmm1, %rax
189 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
190 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
191 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
192 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
193 ; AVX1-NEXT: vmovq %xmm0, %rax
194 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
195 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
196 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
197 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
200 ; AVX2-LABEL: sitofp_4i64_to_4f64:
202 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
203 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax
204 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
205 ; AVX2-NEXT: vmovq %xmm1, %rax
206 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
207 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
208 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
209 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
210 ; AVX2-NEXT: vmovq %xmm0, %rax
211 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
212 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
213 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
214 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
216 %cvt = sitofp <4 x i64> %a to <4 x double>
217 ret <4 x double> %cvt
; v4i32 -> v4f64: one ymm-wide vcvtdq2pd with AVX; pre-AVX needs two
; 128-bit cvtdq2pd plus a shuffle to reach the high pair of lanes.
220 define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
221 ; SSE-LABEL: sitofp_4i32_to_4f64:
223 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
224 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
225 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
226 ; SSE-NEXT: movaps %xmm2, %xmm0
229 ; AVX-LABEL: sitofp_4i32_to_4f64:
231 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
233 %cvt = sitofp <4 x i32> %a to <4 x double>
234 ret <4 x double> %cvt
; v4i16 -> v4f64: sign-extend to i32 then widen to four doubles; AVX gets
; pmovsxwd + a single ymm conversion.
237 define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
238 ; SSE-LABEL: sitofp_4i16_to_4f64:
240 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
241 ; SSE-NEXT: psrad $16, %xmm1
242 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
243 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
244 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
247 ; AVX-LABEL: sitofp_4i16_to_4f64:
249 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
250 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
252 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
253 %cvt = sitofp <4 x i16> %shuf to <4 x double>
254 ret <4 x double> %cvt
; v8i16 -> v8f64 with only the low four doubles used: checks the unused
; upper half of the conversion is eliminated.
257 define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
258 ; SSE-LABEL: sitofp_8i16_to_4f64:
260 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
261 ; SSE-NEXT: psrad $16, %xmm1
262 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
263 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
264 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
267 ; AVX1-LABEL: sitofp_8i16_to_4f64:
269 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
270 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
273 ; AVX2-LABEL: sitofp_8i16_to_4f64:
275 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
276 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
278 %cvt = sitofp <8 x i16> %a to <8 x double>
279 %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
280 ret <4 x double> %shuf
; v4i8 -> v4f64: sign-extend i8 to i32 (unpacks + psrad $24 pre-AVX,
; pmovsxbd with AVX), then widen to doubles.
283 define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
284 ; SSE-LABEL: sitofp_4i8_to_4f64:
286 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
287 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
288 ; SSE-NEXT: psrad $24, %xmm1
289 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
290 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
291 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
294 ; AVX-LABEL: sitofp_4i8_to_4f64:
296 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
297 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
299 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
300 %cvt = sitofp <4 x i8> %shuf to <4 x double>
301 ret <4 x double> %cvt
; v16i8 -> v16f64 with only the low four doubles used: checks the
; conversion is narrowed to a single 4-lane chunk.
304 define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
305 ; SSE-LABEL: sitofp_16i8_to_4f64:
307 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
308 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
309 ; SSE-NEXT: psrad $24, %xmm1
310 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
311 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
312 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
315 ; AVX1-LABEL: sitofp_16i8_to_4f64:
317 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
318 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
321 ; AVX2-LABEL: sitofp_16i8_to_4f64:
323 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
324 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
325 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
327 %cvt = sitofp <16 x i8> %a to <16 x double>
328 %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
329 ret <4 x double> %shuf
333 ; Unsigned Integer to Double
; v2u64 -> v2f64 via the classic exponent-bias trick: interleave the u64
; halves with the constants 0x43300000/0x45300000 (doubles 2^52 and 2^84),
; subtract the matching bias [4.503600e+15, 1.934281e+25], then add the two
; partial doubles per lane (pshufd+addpd pre-AVX, vhaddpd with AVX).
336 define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
337 ; SSE-LABEL: uitofp_2i64_to_2f64:
339 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
340 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
341 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
342 ; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
343 ; SSE-NEXT: subpd %xmm3, %xmm0
344 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
345 ; SSE-NEXT: addpd %xmm4, %xmm0
346 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
347 ; SSE-NEXT: subpd %xmm3, %xmm2
348 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
349 ; SSE-NEXT: addpd %xmm2, %xmm1
350 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
353 ; AVX-LABEL: uitofp_2i64_to_2f64:
355 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
356 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
357 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
358 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
359 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
360 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
361 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
362 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
363 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
364 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
366 %cvt = uitofp <2 x i64> %a to <2 x double>
367 ret <2 x double> %cvt
; v2u32 -> v2f64: zero-extend the dwords to qwords, then reuse the same
; exponent-bias sequence as the u64 case above.
370 define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
371 ; SSE-LABEL: uitofp_2i32_to_2f64:
373 ; SSE-NEXT: pxor %xmm1, %xmm1
374 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
375 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
376 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
377 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
378 ; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
379 ; SSE-NEXT: subpd %xmm3, %xmm0
380 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
381 ; SSE-NEXT: addpd %xmm4, %xmm0
382 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
383 ; SSE-NEXT: subpd %xmm3, %xmm2
384 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
385 ; SSE-NEXT: addpd %xmm2, %xmm1
386 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
389 ; AVX-LABEL: uitofp_2i32_to_2f64:
391 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
392 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
393 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
394 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
395 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
396 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
397 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
398 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
399 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
400 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
401 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
403 %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
404 %cvt = uitofp <2 x i32> %shuf to <2 x double>
405 ret <2 x double> %cvt
; v4u32 -> v4f64 with only the low two doubles used. The AVX variants split
; each u32 into high/low 16-bit halves, convert both with the signed
; cvtdq2pd, scale the high half by 65536, and add.
408 define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
409 ; SSE-LABEL: uitofp_4i32_to_2f64:
411 ; SSE-NEXT: pxor %xmm1, %xmm1
412 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
413 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
414 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
415 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
416 ; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
417 ; SSE-NEXT: subpd %xmm3, %xmm0
418 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
419 ; SSE-NEXT: addpd %xmm4, %xmm0
420 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
421 ; SSE-NEXT: subpd %xmm3, %xmm2
422 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
423 ; SSE-NEXT: addpd %xmm2, %xmm1
424 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
427 ; AVX1-LABEL: uitofp_4i32_to_2f64:
429 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
430 ; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
431 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
432 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
433 ; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
434 ; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
435 ; AVX1-NEXT: vzeroupper
438 ; AVX2-LABEL: uitofp_4i32_to_2f64:
440 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
441 ; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
442 ; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
443 ; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
444 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
445 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
446 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
447 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
448 ; AVX2-NEXT: vzeroupper
450 %cvt = uitofp <4 x i32> %a to <4 x double>
451 %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
452 ret <2 x double> %shuf
; v2u16 -> v2f64: zero-extend the words to dwords (shuffles + pand mask),
; then the signed cvtdq2pd is exact since the values fit in 16 bits.
455 define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
456 ; SSE-LABEL: uitofp_2i16_to_2f64:
458 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
459 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
460 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
461 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
462 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
465 ; AVX-LABEL: uitofp_2i16_to_2f64:
467 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
468 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
469 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
470 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
472 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
473 %cvt = uitofp <2 x i16> %shuf to <2 x double>
474 ret <2 x double> %cvt
; v8u16 -> v8f64 with only the low two doubles used: zero-extend via
; punpcklwd-with-zero (or pmovzxwd on AVX) then cvtdq2pd.
477 define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
478 ; SSE-LABEL: uitofp_8i16_to_2f64:
480 ; SSE-NEXT: pxor %xmm1, %xmm1
481 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
482 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
485 ; AVX1-LABEL: uitofp_8i16_to_2f64:
487 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
488 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
489 ; AVX1-NEXT: vzeroupper
492 ; AVX2-LABEL: uitofp_8i16_to_2f64:
494 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
495 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
496 ; AVX2-NEXT: vzeroupper
498 %cvt = uitofp <8 x i16> %a to <8 x double>
499 %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
500 ret <2 x double> %shuf
; v2u8 -> v2f64: zero-extend the bytes to dwords (unpacks + mask, or
; pmovzxbq on AVX), then convert with the signed cvtdq2pd.
503 define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
504 ; SSE-LABEL: uitofp_2i8_to_2f64:
506 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
507 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
508 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1]
509 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
510 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
511 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
514 ; AVX-LABEL: uitofp_2i8_to_2f64:
516 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
517 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
518 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
519 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
521 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
522 %cvt = uitofp <2 x i8> %shuf to <2 x double>
523 ret <2 x double> %cvt
; v16u8 -> v16f64 with only the low two doubles used: zero-extend bytes to
; dwords and convert; the unused upper conversions are eliminated.
526 define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
527 ; SSE-LABEL: uitofp_16i8_to_2f64:
529 ; SSE-NEXT: pxor %xmm1, %xmm1
530 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
531 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
532 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
535 ; AVX1-LABEL: uitofp_16i8_to_2f64:
537 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
538 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
539 ; AVX1-NEXT: vzeroupper
542 ; AVX2-LABEL: uitofp_16i8_to_2f64:
544 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
545 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
546 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
547 ; AVX2-NEXT: vzeroupper
549 %cvt = uitofp <16 x i8> %a to <16 x double>
550 %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
551 ret <2 x double> %shuf
; v4u64 -> v4f64: the two-halves exponent-bias trick applied once per
; 128-bit pair; the AVX variants extract the high xmm, convert both halves,
; and recombine with vinsertf128.
554 define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
555 ; SSE-LABEL: uitofp_4i64_to_4f64:
557 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
558 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
559 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
560 ; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
561 ; SSE-NEXT: subpd %xmm4, %xmm0
562 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
563 ; SSE-NEXT: addpd %xmm5, %xmm0
564 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
565 ; SSE-NEXT: subpd %xmm4, %xmm3
566 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
567 ; SSE-NEXT: addpd %xmm3, %xmm5
568 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
569 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
570 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
571 ; SSE-NEXT: subpd %xmm4, %xmm1
572 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
573 ; SSE-NEXT: addpd %xmm5, %xmm1
574 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
575 ; SSE-NEXT: subpd %xmm4, %xmm3
576 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
577 ; SSE-NEXT: addpd %xmm3, %xmm2
578 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
581 ; AVX1-LABEL: uitofp_4i64_to_4f64:
583 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
584 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
585 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
586 ; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
587 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
588 ; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
589 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
590 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
591 ; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1
592 ; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
593 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
594 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
595 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
596 ; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
597 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
598 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
599 ; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0
600 ; AVX1-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
601 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
602 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
605 ; AVX2-LABEL: uitofp_4i64_to_4f64:
607 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
608 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
609 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
610 ; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
611 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
612 ; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
613 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
614 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
615 ; AVX2-NEXT: vsubpd %xmm4, %xmm1, %xmm1
616 ; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
617 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
618 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
619 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
620 ; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
621 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
622 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
623 ; AVX2-NEXT: vsubpd %xmm4, %xmm0, %xmm0
624 ; AVX2-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
625 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
626 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
628 %cvt = uitofp <4 x i64> %a to <4 x double>
629 ret <4 x double> %cvt
; v4u32 -> v4f64: pre-AVX uses the exponent-bias trick on each pair of
; lanes; the AVX variants split high/low 16-bit halves, convert with the
; signed cvtdq2pd, scale the high half by 65536, and add.
632 define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
633 ; SSE-LABEL: uitofp_4i32_to_4f64:
635 ; SSE-NEXT: pxor %xmm1, %xmm1
636 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
637 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
638 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
639 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
640 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
641 ; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
642 ; SSE-NEXT: subpd %xmm4, %xmm0
643 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
644 ; SSE-NEXT: addpd %xmm5, %xmm0
645 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
646 ; SSE-NEXT: subpd %xmm4, %xmm1
647 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
648 ; SSE-NEXT: addpd %xmm1, %xmm5
649 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
650 ; SSE-NEXT: pand {{.*}}(%rip), %xmm2
651 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
652 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
653 ; SSE-NEXT: subpd %xmm4, %xmm2
654 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
655 ; SSE-NEXT: addpd %xmm2, %xmm1
656 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
657 ; SSE-NEXT: subpd %xmm4, %xmm5
658 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
659 ; SSE-NEXT: addpd %xmm5, %xmm2
660 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
663 ; AVX1-LABEL: uitofp_4i32_to_4f64:
665 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
666 ; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
667 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
668 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
669 ; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
670 ; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
673 ; AVX2-LABEL: uitofp_4i32_to_4f64:
675 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
676 ; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
677 ; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
678 ; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
679 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
680 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
681 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
682 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
684 %cvt = uitofp <4 x i32> %a to <4 x double>
685 ret <4 x double> %cvt
; v4u16 -> v4f64: zero-extend words to dwords, then the signed cvtdq2pd is
; exact; AVX gets pmovzxwd plus a single ymm conversion.
688 define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
689 ; SSE-LABEL: uitofp_4i16_to_4f64:
691 ; SSE-NEXT: pxor %xmm1, %xmm1
692 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
693 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
694 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
695 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
696 ; SSE-NEXT: movaps %xmm2, %xmm0
699 ; AVX-LABEL: uitofp_4i16_to_4f64:
701 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
702 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
704 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
705 %cvt = uitofp <4 x i16> %shuf to <4 x double>
706 ret <4 x double> %cvt
; v8u16 -> v8f64 with only the low four doubles used: the upper half of the
; conversion is eliminated; otherwise same zero-extend + cvtdq2pd pattern.
709 define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
710 ; SSE-LABEL: uitofp_8i16_to_4f64:
712 ; SSE-NEXT: pxor %xmm1, %xmm1
713 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
714 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
715 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
716 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
717 ; SSE-NEXT: movaps %xmm2, %xmm0
720 ; AVX1-LABEL: uitofp_8i16_to_4f64:
722 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
723 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
726 ; AVX2-LABEL: uitofp_8i16_to_4f64:
728 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
729 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
731 %cvt = uitofp <8 x i16> %a to <8 x double>
732 %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
733 ret <4 x double> %shuf
; v4u8 -> v4f64: zero-extend bytes to dwords (two unpacks-with-zero, or
; pmovzxbd on AVX), then convert with cvtdq2pd.
736 define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
737 ; SSE-LABEL: uitofp_4i8_to_4f64:
739 ; SSE-NEXT: pxor %xmm1, %xmm1
740 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
741 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
742 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
743 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
744 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
745 ; SSE-NEXT: movaps %xmm2, %xmm0
748 ; AVX-LABEL: uitofp_4i8_to_4f64:
750 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
751 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
753 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
754 %cvt = uitofp <4 x i8> %shuf to <4 x double>
755 ret <4 x double> %cvt
; v16u8 -> v16f64 with only the low four doubles used: checks the
; conversion shrinks to one zero-extend + cvtdq2pd chunk.
758 define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
759 ; SSE-LABEL: uitofp_16i8_to_4f64:
761 ; SSE-NEXT: pxor %xmm1, %xmm1
762 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
763 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
764 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
765 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
766 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
767 ; SSE-NEXT: movaps %xmm2, %xmm0
770 ; AVX1-LABEL: uitofp_16i8_to_4f64:
772 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
773 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
776 ; AVX2-LABEL: uitofp_16i8_to_4f64:
778 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
779 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
780 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
782 %cvt = uitofp <16 x i8> %a to <16 x double>
783 %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
784 ret <4 x double> %shuf
788 ; Signed Integer to Float
; v2i64 -> v2f32 widened to v4f32: per-lane scalar cvtsi2ss, results packed
; back with unpcklps / vinsertps.
791 define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
792 ; SSE-LABEL: sitofp_2i64_to_4f32:
794 ; SSE-NEXT: movd %xmm0, %rax
795 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
796 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
797 ; SSE-NEXT: movd %xmm0, %rax
798 ; SSE-NEXT: xorps %xmm0, %xmm0
799 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
800 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
801 ; SSE-NEXT: movaps %xmm1, %xmm0
804 ; AVX-LABEL: sitofp_2i64_to_4f32:
806 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
807 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
808 ; AVX-NEXT: vmovq %xmm0, %rax
809 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
810 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
811 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
812 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
813 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
814 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
816 %cvt = sitofp <2 x i64> %a to <2 x float>
817 %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; v2i64 widened with undef lanes to v4i64, then converted to v4f32: the
; undef lanes still produce conversions in this expected output.
821 define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
822 ; SSE-LABEL: sitofp_4i64_to_4f32_undef:
824 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
825 ; SSE-NEXT: movd %xmm0, %rax
826 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
827 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
828 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
829 ; SSE-NEXT: movd %xmm0, %rax
830 ; SSE-NEXT: xorps %xmm0, %xmm0
831 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
832 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
833 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
834 ; SSE-NEXT: movaps %xmm1, %xmm0
837 ; AVX-LABEL: sitofp_4i64_to_4f32_undef:
839 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
840 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
841 ; AVX-NEXT: vmovq %xmm0, %rax
842 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
843 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
844 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
845 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
846 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
847 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
849 %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
850 %cvt = sitofp <4 x i64> %ext to <4 x float>
; v4i32 -> v4f32: the directly supported case — a single cvtdq2ps.
854 define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) {
855 ; SSE-LABEL: sitofp_4i32_to_4f32:
857 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
860 ; AVX-LABEL: sitofp_4i32_to_4f32:
862 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
864 %cvt = sitofp <4 x i32> %a to <4 x float>
; v4i16 -> v4f32: sign-extend to i32 (unpack + psrad $16, or pmovsxwd on
; AVX), then cvtdq2ps.
868 define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) {
869 ; SSE-LABEL: sitofp_4i16_to_4f32:
871 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
872 ; SSE-NEXT: psrad $16, %xmm0
873 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
876 ; AVX-LABEL: sitofp_4i16_to_4f32:
878 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
879 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
881 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
882 %cvt = sitofp <4 x i16> %shuf to <4 x float>
886 define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
887 ; SSE-LABEL: sitofp_8i16_to_4f32:
889 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
890 ; SSE-NEXT: psrad $16, %xmm0
891 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
894 ; AVX1-LABEL: sitofp_8i16_to_4f32:
896 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
897 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
898 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
899 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
900 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
901 ; AVX1-NEXT: vzeroupper
904 ; AVX2-LABEL: sitofp_8i16_to_4f32:
906 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
907 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
908 ; AVX2-NEXT: vzeroupper
910 %cvt = sitofp <8 x i16> %a to <8 x float>
911 %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
912 ret <4 x float> %shuf
915 define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) {
916 ; SSE-LABEL: sitofp_4i8_to_4f32:
918 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
919 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
920 ; SSE-NEXT: psrad $24, %xmm0
921 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
924 ; AVX-LABEL: sitofp_4i8_to_4f32:
926 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
927 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
929 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
930 %cvt = sitofp <4 x i8> %shuf to <4 x float>
934 define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
935 ; SSE-LABEL: sitofp_16i8_to_4f32:
937 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
938 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
939 ; SSE-NEXT: psrad $24, %xmm0
940 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
943 ; AVX1-LABEL: sitofp_16i8_to_4f32:
945 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
946 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
947 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
948 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
949 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
950 ; AVX1-NEXT: vzeroupper
953 ; AVX2-LABEL: sitofp_16i8_to_4f32:
955 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
956 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
957 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
958 ; AVX2-NEXT: vzeroupper
960 %cvt = sitofp <16 x i8> %a to <16 x float>
961 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
962 ret <4 x float> %shuf
965 define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
966 ; SSE-LABEL: sitofp_4i64_to_4f32:
968 ; SSE-NEXT: movd %xmm1, %rax
969 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3
970 ; SSE-NEXT: movd %xmm0, %rax
971 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
972 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
973 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
974 ; SSE-NEXT: movd %xmm1, %rax
975 ; SSE-NEXT: xorps %xmm1, %xmm1
976 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
977 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
978 ; SSE-NEXT: movd %xmm0, %rax
979 ; SSE-NEXT: xorps %xmm0, %xmm0
980 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
981 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
982 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
983 ; SSE-NEXT: movaps %xmm2, %xmm0
986 ; AVX1-LABEL: sitofp_4i64_to_4f32:
988 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
989 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
990 ; AVX1-NEXT: vmovq %xmm0, %rax
991 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
992 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
993 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
994 ; AVX1-NEXT: vmovq %xmm0, %rax
995 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
996 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
997 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
998 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
999 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1000 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1001 ; AVX1-NEXT: vzeroupper
1004 ; AVX2-LABEL: sitofp_4i64_to_4f32:
1006 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1007 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1008 ; AVX2-NEXT: vmovq %xmm0, %rax
1009 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1010 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1011 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1012 ; AVX2-NEXT: vmovq %xmm0, %rax
1013 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1014 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1015 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1016 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
1017 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1018 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1019 ; AVX2-NEXT: vzeroupper
1021 %cvt = sitofp <4 x i64> %a to <4 x float>
1022 ret <4 x float> %cvt
1025 define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) {
1026 ; SSE-LABEL: sitofp_8i32_to_8f32:
1028 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1029 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
1032 ; AVX-LABEL: sitofp_8i32_to_8f32:
1034 ; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
1036 %cvt = sitofp <8 x i32> %a to <8 x float>
1037 ret <8 x float> %cvt
1040 define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
1041 ; SSE-LABEL: sitofp_8i16_to_8f32:
1043 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1044 ; SSE-NEXT: psrad $16, %xmm1
1045 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
1046 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1047 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1048 ; SSE-NEXT: psrad $16, %xmm0
1049 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1050 ; SSE-NEXT: movaps %xmm2, %xmm0
1053 ; AVX1-LABEL: sitofp_8i16_to_8f32:
1055 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
1056 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1057 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
1058 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1059 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1062 ; AVX2-LABEL: sitofp_8i16_to_8f32:
1064 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
1065 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1067 %cvt = sitofp <8 x i16> %a to <8 x float>
1068 ret <8 x float> %cvt
1071 define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
1072 ; SSE-LABEL: sitofp_8i8_to_8f32:
1074 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1075 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1076 ; SSE-NEXT: psrad $24, %xmm1
1077 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
1078 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1079 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1080 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1081 ; SSE-NEXT: psrad $24, %xmm0
1082 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1083 ; SSE-NEXT: movaps %xmm2, %xmm0
1086 ; AVX1-LABEL: sitofp_8i8_to_8f32:
1088 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
1089 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1090 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
1091 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1092 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1095 ; AVX2-LABEL: sitofp_8i8_to_8f32:
1097 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1098 ; AVX2-NEXT: vpslld $24, %ymm0, %ymm0
1099 ; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
1100 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1102 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1103 %cvt = sitofp <8 x i8> %shuf to <8 x float>
1104 ret <8 x float> %cvt
1107 define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
1108 ; SSE-LABEL: sitofp_16i8_to_8f32:
1110 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1111 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1112 ; SSE-NEXT: psrad $24, %xmm1
1113 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
1114 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1115 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1116 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1117 ; SSE-NEXT: psrad $24, %xmm0
1118 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1119 ; SSE-NEXT: movaps %xmm2, %xmm0
1122 ; AVX1-LABEL: sitofp_16i8_to_8f32:
1124 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
1125 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1126 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
1127 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1128 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1131 ; AVX2-LABEL: sitofp_16i8_to_8f32:
1133 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
1134 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
1135 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1137 %cvt = sitofp <16 x i8> %a to <16 x float>
1138 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1139 ret <8 x float> %shuf
1143 ; Unsigned Integer to Float
1146 define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
1147 ; SSE-LABEL: uitofp_2i64_to_4f32:
1149 ; SSE-NEXT: movdqa %xmm0, %xmm1
1150 ; SSE-NEXT: movd %xmm1, %rax
1151 ; SSE-NEXT: movl %eax, %ecx
1152 ; SSE-NEXT: andl $1, %ecx
1153 ; SSE-NEXT: testq %rax, %rax
1154 ; SSE-NEXT: js .LBB38_1
1156 ; SSE-NEXT: xorps %xmm0, %xmm0
1157 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
1158 ; SSE-NEXT: jmp .LBB38_3
1159 ; SSE-NEXT: .LBB38_1:
1160 ; SSE-NEXT: shrq %rax
1161 ; SSE-NEXT: orq %rax, %rcx
1162 ; SSE-NEXT: xorps %xmm0, %xmm0
1163 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
1164 ; SSE-NEXT: addss %xmm0, %xmm0
1165 ; SSE-NEXT: .LBB38_3:
1166 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1167 ; SSE-NEXT: movd %xmm1, %rax
1168 ; SSE-NEXT: movl %eax, %ecx
1169 ; SSE-NEXT: andl $1, %ecx
1170 ; SSE-NEXT: testq %rax, %rax
1171 ; SSE-NEXT: js .LBB38_4
1173 ; SSE-NEXT: xorps %xmm1, %xmm1
1174 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
1175 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1177 ; SSE-NEXT: .LBB38_4:
1178 ; SSE-NEXT: shrq %rax
1179 ; SSE-NEXT: orq %rax, %rcx
1180 ; SSE-NEXT: xorps %xmm1, %xmm1
1181 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
1182 ; SSE-NEXT: addss %xmm1, %xmm1
1183 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1186 ; AVX-LABEL: uitofp_2i64_to_4f32:
1188 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
1189 ; AVX-NEXT: movl %eax, %ecx
1190 ; AVX-NEXT: andl $1, %ecx
1191 ; AVX-NEXT: testq %rax, %rax
1192 ; AVX-NEXT: js .LBB38_1
1194 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1195 ; AVX-NEXT: jmp .LBB38_3
1196 ; AVX-NEXT: .LBB38_1:
1197 ; AVX-NEXT: shrq %rax
1198 ; AVX-NEXT: orq %rax, %rcx
1199 ; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1200 ; AVX-NEXT: vaddss %xmm1, %xmm1, %xmm1
1201 ; AVX-NEXT: .LBB38_3:
1202 ; AVX-NEXT: vmovq %xmm0, %rax
1203 ; AVX-NEXT: movl %eax, %ecx
1204 ; AVX-NEXT: andl $1, %ecx
1205 ; AVX-NEXT: testq %rax, %rax
1206 ; AVX-NEXT: js .LBB38_4
1208 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1209 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1210 ; AVX-NEXT: jmp .LBB38_6
1211 ; AVX-NEXT: .LBB38_4:
1212 ; AVX-NEXT: shrq %rax
1213 ; AVX-NEXT: orq %rax, %rcx
1214 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1215 ; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1216 ; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0
1217 ; AVX-NEXT: .LBB38_6:
1218 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
1219 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1220 ; AVX-NEXT: testq %rax, %rax
1221 ; AVX-NEXT: js .LBB38_8
1223 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1224 ; AVX-NEXT: .LBB38_8:
1225 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
1226 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
1228 %cvt = uitofp <2 x i64> %a to <2 x float>
1229 %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1230 ret <4 x float> %ext
1233 define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
1234 ; SSE-LABEL: uitofp_4i64_to_4f32_undef:
1236 ; SSE-NEXT: movdqa %xmm0, %xmm1
1237 ; SSE-NEXT: testq %rax, %rax
1238 ; SSE-NEXT: xorps %xmm2, %xmm2
1239 ; SSE-NEXT: js .LBB39_2
1241 ; SSE-NEXT: xorps %xmm2, %xmm2
1242 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
1243 ; SSE-NEXT: .LBB39_2:
1244 ; SSE-NEXT: movd %xmm1, %rax
1245 ; SSE-NEXT: movl %eax, %ecx
1246 ; SSE-NEXT: andl $1, %ecx
1247 ; SSE-NEXT: testq %rax, %rax
1248 ; SSE-NEXT: js .LBB39_3
1250 ; SSE-NEXT: xorps %xmm0, %xmm0
1251 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
1252 ; SSE-NEXT: jmp .LBB39_5
1253 ; SSE-NEXT: .LBB39_3:
1254 ; SSE-NEXT: shrq %rax
1255 ; SSE-NEXT: orq %rax, %rcx
1256 ; SSE-NEXT: xorps %xmm0, %xmm0
1257 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
1258 ; SSE-NEXT: addss %xmm0, %xmm0
1259 ; SSE-NEXT: .LBB39_5:
1260 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1261 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1262 ; SSE-NEXT: movd %xmm1, %rax
1263 ; SSE-NEXT: movl %eax, %ecx
1264 ; SSE-NEXT: andl $1, %ecx
1265 ; SSE-NEXT: testq %rax, %rax
1266 ; SSE-NEXT: js .LBB39_6
1268 ; SSE-NEXT: xorps %xmm1, %xmm1
1269 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
1270 ; SSE-NEXT: jmp .LBB39_8
1271 ; SSE-NEXT: .LBB39_6:
1272 ; SSE-NEXT: shrq %rax
1273 ; SSE-NEXT: orq %rax, %rcx
1274 ; SSE-NEXT: xorps %xmm1, %xmm1
1275 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
1276 ; SSE-NEXT: addss %xmm1, %xmm1
1277 ; SSE-NEXT: .LBB39_8:
1278 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1279 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1282 ; AVX-LABEL: uitofp_4i64_to_4f32_undef:
1284 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
1285 ; AVX-NEXT: movl %eax, %ecx
1286 ; AVX-NEXT: andl $1, %ecx
1287 ; AVX-NEXT: testq %rax, %rax
1288 ; AVX-NEXT: js .LBB39_1
1290 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1291 ; AVX-NEXT: jmp .LBB39_3
1292 ; AVX-NEXT: .LBB39_1:
1293 ; AVX-NEXT: shrq %rax
1294 ; AVX-NEXT: orq %rax, %rcx
1295 ; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1296 ; AVX-NEXT: vaddss %xmm1, %xmm1, %xmm1
1297 ; AVX-NEXT: .LBB39_3:
1298 ; AVX-NEXT: vmovq %xmm0, %rax
1299 ; AVX-NEXT: movl %eax, %ecx
1300 ; AVX-NEXT: andl $1, %ecx
1301 ; AVX-NEXT: testq %rax, %rax
1302 ; AVX-NEXT: js .LBB39_4
1304 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1305 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1306 ; AVX-NEXT: jmp .LBB39_6
1307 ; AVX-NEXT: .LBB39_4:
1308 ; AVX-NEXT: shrq %rax
1309 ; AVX-NEXT: orq %rax, %rcx
1310 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1311 ; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1312 ; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0
1313 ; AVX-NEXT: .LBB39_6:
1314 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
1315 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1316 ; AVX-NEXT: testq %rax, %rax
1317 ; AVX-NEXT: js .LBB39_8
1319 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1320 ; AVX-NEXT: .LBB39_8:
1321 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
1322 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
1324 %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1325 %cvt = uitofp <4 x i64> %ext to <4 x float>
1326 ret <4 x float> %cvt
1329 define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
1330 ; SSE-LABEL: uitofp_4i32_to_4f32:
1332 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
1333 ; SSE-NEXT: pand %xmm0, %xmm1
1334 ; SSE-NEXT: por {{.*}}(%rip), %xmm1
1335 ; SSE-NEXT: psrld $16, %xmm0
1336 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
1337 ; SSE-NEXT: addps {{.*}}(%rip), %xmm0
1338 ; SSE-NEXT: addps %xmm1, %xmm0
1341 ; AVX1-LABEL: uitofp_4i32_to_4f32:
1343 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
1344 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
1345 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
1346 ; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
1347 ; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
1350 ; AVX2-LABEL: uitofp_4i32_to_4f32:
1352 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
1353 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
1354 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
1355 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
1356 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
1357 ; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
1358 ; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0
1359 ; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
1361 %cvt = uitofp <4 x i32> %a to <4 x float>
1362 ret <4 x float> %cvt
1365 define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) {
1366 ; SSE-LABEL: uitofp_4i16_to_4f32:
1368 ; SSE-NEXT: pxor %xmm1, %xmm1
1369 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1370 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1373 ; AVX-LABEL: uitofp_4i16_to_4f32:
1375 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1376 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
1378 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1379 %cvt = uitofp <4 x i16> %shuf to <4 x float>
1380 ret <4 x float> %cvt
1383 define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
1384 ; SSE-LABEL: uitofp_8i16_to_4f32:
1386 ; SSE-NEXT: pxor %xmm1, %xmm1
1387 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1388 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1391 ; AVX1-LABEL: uitofp_8i16_to_4f32:
1393 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1394 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1395 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1396 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1397 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1398 ; AVX1-NEXT: vzeroupper
1401 ; AVX2-LABEL: uitofp_8i16_to_4f32:
1403 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1404 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1405 ; AVX2-NEXT: vzeroupper
1407 %cvt = uitofp <8 x i16> %a to <8 x float>
1408 %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1409 ret <4 x float> %shuf
1412 define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
1413 ; SSE-LABEL: uitofp_4i8_to_4f32:
1415 ; SSE-NEXT: pxor %xmm1, %xmm1
1416 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1417 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1418 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1421 ; AVX-LABEL: uitofp_4i8_to_4f32:
1423 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1424 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
1426 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1427 %cvt = uitofp <4 x i8> %shuf to <4 x float>
1428 ret <4 x float> %cvt
1431 define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
1432 ; SSE-LABEL: uitofp_16i8_to_4f32:
1434 ; SSE-NEXT: pxor %xmm1, %xmm1
1435 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1436 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1437 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1440 ; AVX1-LABEL: uitofp_16i8_to_4f32:
1442 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1443 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1444 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1445 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1446 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1447 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1448 ; AVX1-NEXT: vzeroupper
1451 ; AVX2-LABEL: uitofp_16i8_to_4f32:
1453 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1454 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1455 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1456 ; AVX2-NEXT: vzeroupper
1458 %cvt = uitofp <16 x i8> %a to <16 x float>
1459 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1460 ret <4 x float> %shuf
1463 define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
1464 ; SSE-LABEL: uitofp_4i64_to_4f32:
1466 ; SSE-NEXT: movd %xmm1, %rax
1467 ; SSE-NEXT: movl %eax, %ecx
1468 ; SSE-NEXT: andl $1, %ecx
1469 ; SSE-NEXT: testq %rax, %rax
1470 ; SSE-NEXT: js .LBB45_1
1472 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3
1473 ; SSE-NEXT: jmp .LBB45_3
1474 ; SSE-NEXT: .LBB45_1:
1475 ; SSE-NEXT: shrq %rax
1476 ; SSE-NEXT: orq %rax, %rcx
1477 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm3
1478 ; SSE-NEXT: addss %xmm3, %xmm3
1479 ; SSE-NEXT: .LBB45_3:
1480 ; SSE-NEXT: movd %xmm0, %rax
1481 ; SSE-NEXT: movl %eax, %ecx
1482 ; SSE-NEXT: andl $1, %ecx
1483 ; SSE-NEXT: testq %rax, %rax
1484 ; SSE-NEXT: js .LBB45_4
1486 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
1487 ; SSE-NEXT: jmp .LBB45_6
1488 ; SSE-NEXT: .LBB45_4:
1489 ; SSE-NEXT: shrq %rax
1490 ; SSE-NEXT: orq %rax, %rcx
1491 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm2
1492 ; SSE-NEXT: addss %xmm2, %xmm2
1493 ; SSE-NEXT: .LBB45_6:
1494 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1495 ; SSE-NEXT: movd %xmm1, %rax
1496 ; SSE-NEXT: movl %eax, %ecx
1497 ; SSE-NEXT: andl $1, %ecx
1498 ; SSE-NEXT: testq %rax, %rax
1499 ; SSE-NEXT: js .LBB45_7
1501 ; SSE-NEXT: xorps %xmm1, %xmm1
1502 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
1503 ; SSE-NEXT: jmp .LBB45_9
1504 ; SSE-NEXT: .LBB45_7:
1505 ; SSE-NEXT: shrq %rax
1506 ; SSE-NEXT: orq %rax, %rcx
1507 ; SSE-NEXT: xorps %xmm1, %xmm1
1508 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
1509 ; SSE-NEXT: addss %xmm1, %xmm1
1510 ; SSE-NEXT: .LBB45_9:
1511 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1512 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1513 ; SSE-NEXT: movd %xmm0, %rax
1514 ; SSE-NEXT: movl %eax, %ecx
1515 ; SSE-NEXT: andl $1, %ecx
1516 ; SSE-NEXT: testq %rax, %rax
1517 ; SSE-NEXT: js .LBB45_10
1518 ; SSE-NEXT: # BB#11:
1519 ; SSE-NEXT: xorps %xmm0, %xmm0
1520 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
1521 ; SSE-NEXT: jmp .LBB45_12
1522 ; SSE-NEXT: .LBB45_10:
1523 ; SSE-NEXT: shrq %rax
1524 ; SSE-NEXT: orq %rax, %rcx
1525 ; SSE-NEXT: xorps %xmm0, %xmm0
1526 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
1527 ; SSE-NEXT: addss %xmm0, %xmm0
1528 ; SSE-NEXT: .LBB45_12:
1529 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1530 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1531 ; SSE-NEXT: movaps %xmm2, %xmm0
1534 ; AVX1-LABEL: uitofp_4i64_to_4f32:
1536 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
1537 ; AVX1-NEXT: movl %eax, %ecx
1538 ; AVX1-NEXT: andl $1, %ecx
1539 ; AVX1-NEXT: testq %rax, %rax
1540 ; AVX1-NEXT: js .LBB45_1
1541 ; AVX1-NEXT: # BB#2:
1542 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1543 ; AVX1-NEXT: jmp .LBB45_3
1544 ; AVX1-NEXT: .LBB45_1:
1545 ; AVX1-NEXT: shrq %rax
1546 ; AVX1-NEXT: orq %rax, %rcx
1547 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1548 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
1549 ; AVX1-NEXT: .LBB45_3:
1550 ; AVX1-NEXT: vmovq %xmm0, %rax
1551 ; AVX1-NEXT: movl %eax, %ecx
1552 ; AVX1-NEXT: andl $1, %ecx
1553 ; AVX1-NEXT: testq %rax, %rax
1554 ; AVX1-NEXT: js .LBB45_4
1555 ; AVX1-NEXT: # BB#5:
1556 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1557 ; AVX1-NEXT: jmp .LBB45_6
1558 ; AVX1-NEXT: .LBB45_4:
1559 ; AVX1-NEXT: shrq %rax
1560 ; AVX1-NEXT: orq %rax, %rcx
1561 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1562 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
1563 ; AVX1-NEXT: .LBB45_6:
1564 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1565 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1566 ; AVX1-NEXT: vmovq %xmm0, %rax
1567 ; AVX1-NEXT: movl %eax, %ecx
1568 ; AVX1-NEXT: andl $1, %ecx
1569 ; AVX1-NEXT: testq %rax, %rax
1570 ; AVX1-NEXT: js .LBB45_7
1571 ; AVX1-NEXT: # BB#8:
1572 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1573 ; AVX1-NEXT: jmp .LBB45_9
1574 ; AVX1-NEXT: .LBB45_7:
1575 ; AVX1-NEXT: shrq %rax
1576 ; AVX1-NEXT: orq %rax, %rcx
1577 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1578 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
1579 ; AVX1-NEXT: .LBB45_9:
1580 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1581 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
1582 ; AVX1-NEXT: movl %eax, %ecx
1583 ; AVX1-NEXT: andl $1, %ecx
1584 ; AVX1-NEXT: testq %rax, %rax
1585 ; AVX1-NEXT: js .LBB45_10
1586 ; AVX1-NEXT: # BB#11:
1587 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
1588 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1589 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1590 ; AVX1-NEXT: vzeroupper
1592 ; AVX1-NEXT: .LBB45_10:
1593 ; AVX1-NEXT: shrq %rax
1594 ; AVX1-NEXT: orq %rax, %rcx
1595 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1596 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
1597 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1598 ; AVX1-NEXT: vzeroupper
1601 ; AVX2-LABEL: uitofp_4i64_to_4f32:
1603 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1604 ; AVX2-NEXT: movl %eax, %ecx
1605 ; AVX2-NEXT: andl $1, %ecx
1606 ; AVX2-NEXT: testq %rax, %rax
1607 ; AVX2-NEXT: js .LBB45_1
1608 ; AVX2-NEXT: # BB#2:
1609 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1610 ; AVX2-NEXT: jmp .LBB45_3
1611 ; AVX2-NEXT: .LBB45_1:
1612 ; AVX2-NEXT: shrq %rax
1613 ; AVX2-NEXT: orq %rax, %rcx
1614 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1615 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
1616 ; AVX2-NEXT: .LBB45_3:
1617 ; AVX2-NEXT: vmovq %xmm0, %rax
1618 ; AVX2-NEXT: movl %eax, %ecx
1619 ; AVX2-NEXT: andl $1, %ecx
1620 ; AVX2-NEXT: testq %rax, %rax
1621 ; AVX2-NEXT: js .LBB45_4
1622 ; AVX2-NEXT: # BB#5:
1623 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1624 ; AVX2-NEXT: jmp .LBB45_6
1625 ; AVX2-NEXT: .LBB45_4:
1626 ; AVX2-NEXT: shrq %rax
1627 ; AVX2-NEXT: orq %rax, %rcx
1628 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1629 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
1630 ; AVX2-NEXT: .LBB45_6:
1631 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1632 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1633 ; AVX2-NEXT: vmovq %xmm0, %rax
1634 ; AVX2-NEXT: movl %eax, %ecx
1635 ; AVX2-NEXT: andl $1, %ecx
1636 ; AVX2-NEXT: testq %rax, %rax
1637 ; AVX2-NEXT: js .LBB45_7
1638 ; AVX2-NEXT: # BB#8:
1639 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1640 ; AVX2-NEXT: jmp .LBB45_9
1641 ; AVX2-NEXT: .LBB45_7:
1642 ; AVX2-NEXT: shrq %rax
1643 ; AVX2-NEXT: orq %rax, %rcx
1644 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1645 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
1646 ; AVX2-NEXT: .LBB45_9:
1647 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1648 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1649 ; AVX2-NEXT: movl %eax, %ecx
1650 ; AVX2-NEXT: andl $1, %ecx
1651 ; AVX2-NEXT: testq %rax, %rax
1652 ; AVX2-NEXT: js .LBB45_10
1653 ; AVX2-NEXT: # BB#11:
1654 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
1655 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1656 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1657 ; AVX2-NEXT: vzeroupper
1659 ; AVX2-NEXT: .LBB45_10:
1660 ; AVX2-NEXT: shrq %rax
1661 ; AVX2-NEXT: orq %rax, %rcx
1662 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1663 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
1664 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1665 ; AVX2-NEXT: vzeroupper
1667 %cvt = uitofp <4 x i64> %a to <4 x float>
1668 ret <4 x float> %cvt
1671 define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) {
1672 ; SSE-LABEL: uitofp_8i32_to_8f32:
1674 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
1675 ; SSE-NEXT: movdqa %xmm0, %xmm3
1676 ; SSE-NEXT: pand %xmm2, %xmm3
1677 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
1678 ; SSE-NEXT: por %xmm4, %xmm3
1679 ; SSE-NEXT: psrld $16, %xmm0
1680 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
1681 ; SSE-NEXT: por %xmm5, %xmm0
1682 ; SSE-NEXT: movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
1683 ; SSE-NEXT: addps %xmm6, %xmm0
1684 ; SSE-NEXT: addps %xmm3, %xmm0
1685 ; SSE-NEXT: pand %xmm1, %xmm2
1686 ; SSE-NEXT: por %xmm4, %xmm2
1687 ; SSE-NEXT: psrld $16, %xmm1
1688 ; SSE-NEXT: por %xmm5, %xmm1
1689 ; SSE-NEXT: addps %xmm6, %xmm1
1690 ; SSE-NEXT: addps %xmm2, %xmm1
1693 ; AVX1-LABEL: uitofp_8i32_to_8f32:
1695 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
1696 ; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
1697 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
1698 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1699 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
1700 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
1701 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1702 ; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
1703 ; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0
1706 ; AVX2-LABEL: uitofp_8i32_to_8f32:
1708 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
1709 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1710 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
1711 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
1712 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
1713 ; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
1714 ; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
1715 ; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
1717 %cvt = uitofp <8 x i32> %a to <8 x float>
1718 ret <8 x float> %cvt
1721 define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
1722 ; SSE-LABEL: uitofp_8i16_to_8f32:
1724 ; SSE-NEXT: pxor %xmm1, %xmm1
1725 ; SSE-NEXT: movdqa %xmm0, %xmm2
1726 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1727 ; SSE-NEXT: cvtdq2ps %xmm2, %xmm2
1728 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1729 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
1730 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1731 ; SSE-NEXT: movaps %xmm2, %xmm0
1734 ; AVX1-LABEL: uitofp_8i16_to_8f32:
1736 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1737 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1738 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1739 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1740 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1743 ; AVX2-LABEL: uitofp_8i16_to_8f32:
1745 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1746 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1748 %cvt = uitofp <8 x i16> %a to <8 x float>
1749 ret <8 x float> %cvt
; uitofp of the low 8 bytes of a <16 x i8> to <8 x float>. As with the i16
; case, zero-extended u8 values fit in a signed i32, so cvtdq2ps is exact.
; SSE2 widens i8->i16->i32 with two rounds of unpacks against zero; the
; AVX paths use vpmovzxb{w,d}. The pand/vandps against a constant-pool mask
; clears the bits brought in by the self-unpack of the high half.
1752 define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
1753 ; SSE-LABEL: uitofp_8i8_to_8f32:
1755 ; SSE-NEXT: pxor %xmm1, %xmm1
1756 ; SSE-NEXT: movdqa %xmm0, %xmm2
1757 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1758 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1759 ; SSE-NEXT: cvtdq2ps %xmm2, %xmm2
1760 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1761 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1762 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
1763 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1764 ; SSE-NEXT: movaps %xmm2, %xmm0
1767 ; AVX1-LABEL: uitofp_8i8_to_8f32:
1769 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1770 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1771 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1772 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1773 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
1774 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1777 ; AVX2-LABEL: uitofp_8i8_to_8f32:
1779 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1780 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
1781 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1782 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; IR: select the low 8 bytes, then convert. The AVX2 vpand after vpmovzxbd
; looks redundant with the zero-extend — presumably a known missed
; optimization this test documents; confirm before "fixing" the checks.
1784 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1785 %cvt = uitofp <8 x i8> %shuf to <8 x float>
1786 ret <8 x float> %cvt
; uitofp of a full <16 x i8> to <16 x float>, of which only the first 8
; results are kept (shufflevector after the convert). Checks that the
; backend only materializes the half of the conversion that is demanded:
; the emitted code is essentially the same as uitofp_8i8_to_8f32 above.
1789 define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
1790 ; SSE-LABEL: uitofp_16i8_to_8f32:
1792 ; SSE-NEXT: pxor %xmm1, %xmm1
1793 ; SSE-NEXT: movdqa %xmm0, %xmm2
1794 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1795 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1796 ; SSE-NEXT: cvtdq2ps %xmm2, %xmm2
1797 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1798 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1799 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
1800 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1801 ; SSE-NEXT: movaps %xmm2, %xmm0
1804 ; AVX1-LABEL: uitofp_16i8_to_8f32:
1806 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1807 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1808 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1809 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1810 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1811 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1814 ; AVX2-LABEL: uitofp_16i8_to_8f32:
1816 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1817 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1818 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1820 %cvt = uitofp <16 x i8> %a to <16 x float>
1821 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1822 ret <8 x float> %shuf
; Packed (<{...}> = no inter-field padding) aggregate used by the test below:
; an <8 x i8> vector, an <8 x i16> vector, and a pointer to the f32 result slot.
1829 %Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }>
1830 define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
1831 ; SSE-LABEL: aggregate_sitofp_8i16_to_8f32:
1833 ; SSE-NEXT: movq 24(%rdi), %rax
1834 ; SSE-NEXT: movdqu 8(%rdi), %xmm0
1835 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1836 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1837 ; SSE-NEXT: psrad $16, %xmm1
1838 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
1839 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1840 ; SSE-NEXT: psrad $16, %xmm0
1841 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1842 ; SSE-NEXT: movaps %xmm0, (%rax)
1843 ; SSE-NEXT: movaps %xmm1, 16(%rax)
1846 ; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
1848 ; AVX1-NEXT: movq 24(%rdi), %rax
1849 ; AVX1-NEXT: vmovdqu 8(%rdi), %xmm0
1850 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
1851 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1852 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
1853 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1854 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1855 ; AVX1-NEXT: vmovaps %ymm0, (%rax)
1856 ; AVX1-NEXT: vzeroupper
1859 ; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32:
1861 ; AVX2-NEXT: movq 24(%rdi), %rax
1862 ; AVX2-NEXT: vpmovsxwd 8(%rdi), %ymm0
1863 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1864 ; AVX2-NEXT: vmovaps %ymm0, (%rax)
1865 ; AVX2-NEXT: vzeroupper
1867 %1 = load %Arguments, %Arguments* %a0, align 1
1868 %2 = extractvalue %Arguments %1, 1
1869 %3 = extractvalue %Arguments %1, 2
1870 %4 = sitofp <8 x i16> %2 to <8 x float>
1871 store <8 x float> %4, <8 x float>* %3, align 32