1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6 ; Signed Integer to Double
; Checks sitofp v2i64 -> v2f64. The CHECK lines below show there is no packed
; i64->f64 conversion, so both SSE2 and AVX extract each lane to a GPR,
; convert with scalar cvtsi2sdq, and repack with unpcklpd.
9 define <2 x double> @sitofp_2vf64(<2 x i64> %a) {
10 ; SSE2-LABEL: sitofp_2vf64:
12 ; SSE2-NEXT: movd %xmm0, %rax
13 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm1
14 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
15 ; SSE2-NEXT: movd %xmm0, %rax
16 ; SSE2-NEXT: xorps %xmm0, %xmm0
17 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm0
18 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
19 ; SSE2-NEXT: movapd %xmm1, %xmm0
22 ; AVX-LABEL: sitofp_2vf64:
24 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
25 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
26 ; AVX-NEXT: vmovq %xmm0, %rax
27 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
28 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
29 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
31 %cvt = sitofp <2 x i64> %a to <2 x double>
; Checks sitofp of the low two i32 lanes to v2f64: the shuffle+convert pair
; folds to a single cvtdq2pd (which reads only the low two dword lanes).
35 define <2 x double> @sitofp_2vf64_i32(<4 x i32> %a) {
36 ; SSE2-LABEL: sitofp_2vf64_i32:
38 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
41 ; AVX-LABEL: sitofp_2vf64_i32:
43 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
45 %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
46 %cvt = sitofp <2 x i32> %shuf to <2 x double>
; Checks sitofp v2i16 -> v2f64: lanes are moved to GPRs, sign-extended with
; movswq, then converted with scalar cvtsi2sdq and repacked.
50 define <2 x double> @sitofp_2vf64_i16(<8 x i16> %a) {
51 ; SSE2-LABEL: sitofp_2vf64_i16:
53 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
54 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
55 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
56 ; SSE2-NEXT: movd %xmm1, %rax
57 ; SSE2-NEXT: movswq %ax, %rax
58 ; SSE2-NEXT: movd %xmm0, %rcx
59 ; SSE2-NEXT: movswq %cx, %rcx
60 ; SSE2-NEXT: xorps %xmm0, %xmm0
61 ; SSE2-NEXT: cvtsi2sdq %rcx, %xmm0
62 ; SSE2-NEXT: xorps %xmm1, %xmm1
63 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm1
64 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
67 ; AVX-LABEL: sitofp_2vf64_i16:
69 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
70 ; AVX-NEXT: vmovq %xmm0, %rax
71 ; AVX-NEXT: movswq %ax, %rax
72 ; AVX-NEXT: vpextrq $1, %xmm0, %rcx
73 ; AVX-NEXT: movswq %cx, %rcx
74 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
75 ; AVX-NEXT: vcvtsi2sdq %rcx, %xmm0, %xmm0
76 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
77 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
79 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
80 %cvt = sitofp <2 x i16> %shuf to <2 x double>
; Checks sitofp v2i8 -> v2f64: same scalar-conversion shape as the i16 case,
; but lanes are sign-extended in GPRs with movsbq.
84 define <2 x double> @sitofp_2vf64_i8(<16 x i8> %a) {
85 ; SSE2-LABEL: sitofp_2vf64_i8:
87 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
88 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
89 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1]
90 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
91 ; SSE2-NEXT: movd %xmm1, %rax
92 ; SSE2-NEXT: movsbq %al, %rax
93 ; SSE2-NEXT: movd %xmm0, %rcx
94 ; SSE2-NEXT: movsbq %cl, %rcx
95 ; SSE2-NEXT: xorps %xmm0, %xmm0
96 ; SSE2-NEXT: cvtsi2sdq %rcx, %xmm0
97 ; SSE2-NEXT: xorps %xmm1, %xmm1
98 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm1
99 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
102 ; AVX-LABEL: sitofp_2vf64_i8:
104 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
105 ; AVX-NEXT: vmovq %xmm0, %rax
106 ; AVX-NEXT: movsbq %al, %rax
107 ; AVX-NEXT: vpextrq $1, %xmm0, %rcx
108 ; AVX-NEXT: movsbq %cl, %rcx
109 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
110 ; AVX-NEXT: vcvtsi2sdq %rcx, %xmm0, %xmm0
111 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
112 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
114 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
115 %cvt = sitofp <2 x i8> %shuf to <2 x double>
116 ret <2 x double> %cvt
; Checks sitofp v4i64 -> v4f64: four scalar cvtsi2sdq conversions. AVX1 and
; AVX2 differ only in the 128-bit extract (vextractf128 vs vextracti128);
; both rebuild the ymm result with vinsertf128.
119 define <4 x double> @sitofp_4vf64(<4 x i64> %a) {
120 ; SSE2-LABEL: sitofp_4vf64:
122 ; SSE2-NEXT: movd %xmm0, %rax
123 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm2
124 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
125 ; SSE2-NEXT: movd %xmm0, %rax
126 ; SSE2-NEXT: xorps %xmm0, %xmm0
127 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm0
128 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
129 ; SSE2-NEXT: movd %xmm1, %rax
130 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm3
131 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
132 ; SSE2-NEXT: movd %xmm0, %rax
133 ; SSE2-NEXT: xorps %xmm0, %xmm0
134 ; SSE2-NEXT: cvtsi2sdq %rax, %xmm0
135 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
136 ; SSE2-NEXT: movapd %xmm2, %xmm0
137 ; SSE2-NEXT: movapd %xmm3, %xmm1
140 ; AVX1-LABEL: sitofp_4vf64:
142 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
143 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax
144 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
145 ; AVX1-NEXT: vmovq %xmm1, %rax
146 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
147 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
148 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
149 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
150 ; AVX1-NEXT: vmovq %xmm0, %rax
151 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
152 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
153 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
154 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
157 ; AVX2-LABEL: sitofp_4vf64:
159 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
160 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax
161 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
162 ; AVX2-NEXT: vmovq %xmm1, %rax
163 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
164 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
165 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
166 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
167 ; AVX2-NEXT: vmovq %xmm0, %rax
168 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
169 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
170 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
171 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
173 %cvt = sitofp <4 x i64> %a to <4 x double>
174 ret <4 x double> %cvt
; Checks sitofp v4i32 -> v4f64: two cvtdq2pd ops on SSE2, a single
; xmm->ymm vcvtdq2pd on AVX.
177 define <4 x double> @sitofp_4vf64_i32(<4 x i32> %a) {
178 ; SSE2-LABEL: sitofp_4vf64_i32:
180 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
181 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
182 ; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
183 ; SSE2-NEXT: movaps %xmm2, %xmm0
186 ; AVX-LABEL: sitofp_4vf64_i32:
188 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
190 %cvt = sitofp <4 x i32> %a to <4 x double>
191 ret <4 x double> %cvt
; Checks sitofp v4i16 -> v4f64: lanes are sign-extended to i32 first
; (punpcklwd+psrad on SSE2, vpmovsxwd on AVX), then converted via cvtdq2pd.
194 define <4 x double> @sitofp_4vf64_i16(<8 x i16> %a) {
195 ; SSE2-LABEL: sitofp_4vf64_i16:
197 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
198 ; SSE2-NEXT: psrad $16, %xmm1
199 ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
200 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
201 ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
204 ; AVX-LABEL: sitofp_4vf64_i16:
206 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
207 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
209 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
210 %cvt = sitofp <4 x i16> %shuf to <4 x double>
211 ret <4 x double> %cvt
; Checks sitofp v4i8 -> v4f64: sign-extend i8 lanes to i32 (shift-by-24 on
; SSE2, vpmovsxbd on AVX) before the packed int->double conversion.
214 define <4 x double> @sitofp_4vf64_i8(<16 x i8> %a) {
215 ; SSE2-LABEL: sitofp_4vf64_i8:
217 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
218 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
219 ; SSE2-NEXT: psrad $24, %xmm1
220 ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
221 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
222 ; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
225 ; AVX-LABEL: sitofp_4vf64_i8:
227 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
228 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
230 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
231 %cvt = sitofp <4 x i8> %shuf to <4 x double>
232 ret <4 x double> %cvt
236 ; Unsigned Integer to Double
; Checks uitofp v2i64 -> v2f64 via the magic-bias trick: interleave the
; halves with exponent constants [1127219200,1160773632], subtract the
; matching double biases, then sum the two partial results per lane
; (addpd on SSE2, vhaddpd on AVX).
239 define <2 x double> @uitofp_2vf64(<2 x i64> %a) {
240 ; SSE2-LABEL: uitofp_2vf64:
242 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
243 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
244 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
245 ; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
246 ; SSE2-NEXT: subpd %xmm3, %xmm0
247 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
248 ; SSE2-NEXT: addpd %xmm4, %xmm0
249 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
250 ; SSE2-NEXT: subpd %xmm3, %xmm2
251 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
252 ; SSE2-NEXT: addpd %xmm2, %xmm1
253 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
256 ; AVX-LABEL: uitofp_2vf64:
258 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
259 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
260 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
261 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
262 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
263 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
264 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
265 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
266 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
267 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
269 %cvt = uitofp <2 x i64> %a to <2 x double>
270 ret <2 x double> %cvt
; Checks uitofp of the low two u32 lanes to v2f64: zero-extend to i64 lanes
; first (punpckldq with zero / vpmovzxdq), then reuse the same magic-bias
; sequence as the v2i64 case above.
273 define <2 x double> @uitofp_2vf64_i32(<4 x i32> %a) {
274 ; SSE2-LABEL: uitofp_2vf64_i32:
276 ; SSE2-NEXT: pxor %xmm1, %xmm1
277 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
278 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
279 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
280 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
281 ; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
282 ; SSE2-NEXT: subpd %xmm3, %xmm0
283 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
284 ; SSE2-NEXT: addpd %xmm4, %xmm0
285 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
286 ; SSE2-NEXT: subpd %xmm3, %xmm2
287 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
288 ; SSE2-NEXT: addpd %xmm2, %xmm1
289 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
292 ; AVX-LABEL: uitofp_2vf64_i32:
294 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
295 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
296 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
297 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
298 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
299 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
300 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
301 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
302 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
303 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
304 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
306 %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
307 %cvt = uitofp <2 x i32> %shuf to <2 x double>
308 ret <2 x double> %cvt
; Checks uitofp v2i16 -> v2f64: zero-extend u16 lanes to 64-bit lanes
; (punpcklwd/punpckldq with zero on SSE2, vpmovzxwq on AVX), then apply the
; magic-bias conversion sequence.
311 define <2 x double> @uitofp_2vf64_i16(<8 x i16> %a) {
312 ; SSE2-LABEL: uitofp_2vf64_i16:
314 ; SSE2-NEXT: pxor %xmm1, %xmm1
315 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
316 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
317 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
318 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
319 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
320 ; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
321 ; SSE2-NEXT: subpd %xmm3, %xmm0
322 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
323 ; SSE2-NEXT: addpd %xmm4, %xmm0
324 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
325 ; SSE2-NEXT: subpd %xmm3, %xmm2
326 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
327 ; SSE2-NEXT: addpd %xmm2, %xmm1
328 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
331 ; AVX-LABEL: uitofp_2vf64_i16:
333 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
334 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
335 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
336 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
337 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
338 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
339 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
340 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
341 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
342 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
343 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
345 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
346 %cvt = uitofp <2 x i16> %shuf to <2 x double>
347 ret <2 x double> %cvt
; Checks uitofp v2i8 -> v2f64: zero-extend u8 lanes to 64-bit lanes
; (byte/word/dword unpacks with zero on SSE2, vpmovzxbq on AVX), then apply
; the magic-bias conversion sequence.
350 define <2 x double> @uitofp_2vf64_i8(<16 x i8> %a) {
351 ; SSE2-LABEL: uitofp_2vf64_i8:
353 ; SSE2-NEXT: pxor %xmm1, %xmm1
354 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
355 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
356 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
357 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
358 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
359 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
360 ; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
361 ; SSE2-NEXT: subpd %xmm3, %xmm0
362 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
363 ; SSE2-NEXT: addpd %xmm4, %xmm0
364 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
365 ; SSE2-NEXT: subpd %xmm3, %xmm2
366 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
367 ; SSE2-NEXT: addpd %xmm2, %xmm1
368 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
371 ; AVX-LABEL: uitofp_2vf64_i8:
373 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
374 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
375 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
376 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
377 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
378 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
379 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
380 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
381 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
382 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
383 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
385 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
386 %cvt = uitofp <2 x i8> %shuf to <2 x double>
387 ret <2 x double> %cvt
; Checks uitofp v4i64 -> v4f64: the magic-bias sequence is applied to all
; four lanes (two 128-bit halves); AVX1 and AVX2 differ only in the 128-bit
; extract instruction used.
390 define <4 x double> @uitofp_4vf64(<4 x i64> %a) {
391 ; SSE2-LABEL: uitofp_4vf64:
393 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
394 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
395 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
396 ; SSE2-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
397 ; SSE2-NEXT: subpd %xmm4, %xmm0
398 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
399 ; SSE2-NEXT: addpd %xmm5, %xmm0
400 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
401 ; SSE2-NEXT: subpd %xmm4, %xmm3
402 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
403 ; SSE2-NEXT: addpd %xmm3, %xmm5
404 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
405 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
406 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
407 ; SSE2-NEXT: subpd %xmm4, %xmm1
408 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
409 ; SSE2-NEXT: addpd %xmm5, %xmm1
410 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
411 ; SSE2-NEXT: subpd %xmm4, %xmm3
412 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
413 ; SSE2-NEXT: addpd %xmm3, %xmm2
414 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
417 ; AVX1-LABEL: uitofp_4vf64:
419 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
420 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
421 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
422 ; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
423 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
424 ; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
425 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
426 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
427 ; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1
428 ; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
429 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
430 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
431 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
432 ; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
433 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
434 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
435 ; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0
436 ; AVX1-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
437 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
438 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
441 ; AVX2-LABEL: uitofp_4vf64:
443 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
444 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
445 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
446 ; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
447 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
448 ; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
449 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
450 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
451 ; AVX2-NEXT: vsubpd %xmm4, %xmm1, %xmm1
452 ; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
453 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
454 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
455 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
456 ; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
457 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
458 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
459 ; AVX2-NEXT: vsubpd %xmm4, %xmm0, %xmm0
460 ; AVX2-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
461 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
462 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
464 %cvt = uitofp <4 x i64> %a to <4 x double>
465 ret <4 x double> %cvt
; Checks uitofp v4i32 -> v4f64. SSE2 zero-extends and runs the magic-bias
; sequence per pair. AVX splits each u32 into low/high 16-bit halves
; (pand / psrld $16), converts each half with vcvtdq2pd, scales the high
; half by 2^16 (vmulpd) and adds the parts.
468 define <4 x double> @uitofp_4vf64_i32(<4 x i32> %a) {
469 ; SSE2-LABEL: uitofp_4vf64_i32:
471 ; SSE2-NEXT: pxor %xmm1, %xmm1
472 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
473 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
474 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
475 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
476 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
477 ; SSE2-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
478 ; SSE2-NEXT: subpd %xmm4, %xmm0
479 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
480 ; SSE2-NEXT: addpd %xmm5, %xmm0
481 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
482 ; SSE2-NEXT: subpd %xmm4, %xmm1
483 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
484 ; SSE2-NEXT: addpd %xmm1, %xmm5
485 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
486 ; SSE2-NEXT: pand .LCPI13_2(%rip), %xmm2
487 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
488 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
489 ; SSE2-NEXT: subpd %xmm4, %xmm2
490 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
491 ; SSE2-NEXT: addpd %xmm2, %xmm1
492 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
493 ; SSE2-NEXT: subpd %xmm4, %xmm5
494 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
495 ; SSE2-NEXT: addpd %xmm5, %xmm2
496 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
499 ; AVX1-LABEL: uitofp_4vf64_i32:
501 ; AVX1-NEXT: vpand .LCPI13_0(%rip), %xmm0, %xmm1
502 ; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
503 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
504 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
505 ; AVX1-NEXT: vmulpd .LCPI13_1(%rip), %ymm0, %ymm0
506 ; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
509 ; AVX2-LABEL: uitofp_4vf64_i32:
511 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
512 ; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
513 ; AVX2-NEXT: vbroadcastsd .LCPI13_0(%rip), %ymm2
514 ; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
515 ; AVX2-NEXT: vpbroadcastd .LCPI13_1(%rip), %xmm2
516 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
517 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
518 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
520 %cvt = uitofp <4 x i32> %a to <4 x double>
521 ret <4 x double> %cvt
; Checks uitofp v4i16 -> v4f64. SSE2 still goes through the magic-bias
; sequence; AVX simply zero-extends with vpmovzxwd and uses vcvtdq2pd,
; since u16 values fit exactly in a signed i32.
524 define <4 x double> @uitofp_4vf64_i16(<8 x i16> %a) {
525 ; SSE2-LABEL: uitofp_4vf64_i16:
527 ; SSE2-NEXT: pxor %xmm1, %xmm1
528 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
529 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
530 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
531 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
532 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
533 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
534 ; SSE2-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
535 ; SSE2-NEXT: subpd %xmm4, %xmm0
536 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
537 ; SSE2-NEXT: addpd %xmm5, %xmm0
538 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
539 ; SSE2-NEXT: subpd %xmm4, %xmm1
540 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
541 ; SSE2-NEXT: addpd %xmm1, %xmm5
542 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
543 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,1,2,3,4,5,6,7]
544 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,7,5,6,7]
545 ; SSE2-NEXT: pand .LCPI14_2(%rip), %xmm2
546 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
547 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
548 ; SSE2-NEXT: subpd %xmm4, %xmm2
549 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
550 ; SSE2-NEXT: addpd %xmm2, %xmm1
551 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
552 ; SSE2-NEXT: subpd %xmm4, %xmm5
553 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
554 ; SSE2-NEXT: addpd %xmm5, %xmm2
555 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
558 ; AVX-LABEL: uitofp_4vf64_i16:
560 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
561 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
563 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
564 %cvt = uitofp <4 x i16> %shuf to <4 x double>
565 ret <4 x double> %cvt
; Checks uitofp v4i8 -> v4f64. SSE2 zero-extends and uses the magic-bias
; sequence; AVX zero-extends with vpmovzxbd and converts with a single
; vcvtdq2pd, since u8 values fit in a signed i32.
568 define <4 x double> @uitofp_4vf64_i8(<16 x i8> %a) {
569 ; SSE2-LABEL: uitofp_4vf64_i8:
571 ; SSE2-NEXT: movdqa %xmm0, %xmm1
572 ; SSE2-NEXT: pxor %xmm2, %xmm2
573 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
574 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
575 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
576 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
577 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
578 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
579 ; SSE2-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
580 ; SSE2-NEXT: subpd %xmm3, %xmm0
581 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
582 ; SSE2-NEXT: addpd %xmm5, %xmm0
583 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
584 ; SSE2-NEXT: subpd %xmm3, %xmm4
585 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
586 ; SSE2-NEXT: addpd %xmm4, %xmm5
587 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
588 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
589 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
590 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
591 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,7,5,6,7]
592 ; SSE2-NEXT: pand .LCPI15_2(%rip), %xmm4
593 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
594 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
595 ; SSE2-NEXT: subpd %xmm3, %xmm4
596 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1]
597 ; SSE2-NEXT: addpd %xmm4, %xmm1
598 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
599 ; SSE2-NEXT: subpd %xmm3, %xmm5
600 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
601 ; SSE2-NEXT: addpd %xmm5, %xmm2
602 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
605 ; AVX-LABEL: uitofp_4vf64_i8:
607 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
608 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
610 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
611 %cvt = uitofp <4 x i8> %shuf to <4 x double>
612 ret <4 x double> %cvt
616 ; Signed Integer to Float
; Checks sitofp v4i32 -> v4f32: maps directly to a single cvtdq2ps.
619 define <4 x float> @sitofp_4vf32(<4 x i32> %a) {
620 ; SSE2-LABEL: sitofp_4vf32:
622 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
625 ; AVX-LABEL: sitofp_4vf32:
627 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
629 %cvt = sitofp <4 x i32> %a to <4 x float>
; Checks sitofp v2i64 -> v2f32 widened to v4f32: per-element scalar
; cvtsi2ssq conversions, with AVX assembling the result via vinsertps.
633 define <4 x float> @sitofp_4vf32_i64(<2 x i64> %a) {
634 ; SSE2-LABEL: sitofp_4vf32_i64:
636 ; SSE2-NEXT: movd %xmm0, %rax
637 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
638 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
639 ; SSE2-NEXT: movd %xmm0, %rax
640 ; SSE2-NEXT: xorps %xmm0, %xmm0
641 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
642 ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
643 ; SSE2-NEXT: movaps %xmm1, %xmm0
646 ; AVX-LABEL: sitofp_4vf32_i64:
648 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
649 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
650 ; AVX-NEXT: vmovq %xmm0, %rax
651 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
652 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
653 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
654 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
655 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
656 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
658 %cvt = sitofp <2 x i64> %a to <2 x float>
659 %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; Checks sitofp v4i16 -> v4f32: sign-extend lanes to i32 (unpack+psrad on
; SSE2, vpmovsxwd on AVX) then a single cvtdq2ps.
663 define <4 x float> @sitofp_4vf32_i16(<8 x i16> %a) {
664 ; SSE2-LABEL: sitofp_4vf32_i16:
666 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
667 ; SSE2-NEXT: psrad $16, %xmm0
668 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
671 ; AVX-LABEL: sitofp_4vf32_i16:
673 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
674 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
676 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
677 %cvt = sitofp <4 x i16> %shuf to <4 x float>
; Checks sitofp v4i8 -> v4f32: sign-extend i8 lanes to i32 (unpack twice
; plus shift-by-24 on SSE2, vpmovsxbd on AVX) then cvtdq2ps.
681 define <4 x float> @sitofp_4vf32_i8(<16 x i8> %a) {
682 ; SSE2-LABEL: sitofp_4vf32_i8:
684 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
685 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
686 ; SSE2-NEXT: psrad $24, %xmm0
687 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
690 ; AVX-LABEL: sitofp_4vf32_i8:
692 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
693 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
695 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
696 %cvt = sitofp <4 x i8> %shuf to <4 x float>
; Checks sitofp v8i32 -> v8f32: two cvtdq2ps ops on SSE2, one ymm
; vcvtdq2ps on AVX.
700 define <8 x float> @sitofp_8vf32(<8 x i32> %a) {
701 ; SSE2-LABEL: sitofp_8vf32:
703 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
704 ; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1
707 ; AVX-LABEL: sitofp_8vf32:
709 ; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
711 %cvt = sitofp <8 x i32> %a to <8 x float>
; Checks sitofp v4i64 -> v4f32: four scalar cvtsi2ssq conversions; the AVX
; paths finish with vzeroupper since the ymm input is consumed but the
; result is an xmm value.
715 define <4 x float> @sitofp_4vf32_4i64(<4 x i64> %a) {
716 ; SSE2-LABEL: sitofp_4vf32_4i64:
718 ; SSE2-NEXT: movd %xmm1, %rax
719 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm3
720 ; SSE2-NEXT: movd %xmm0, %rax
721 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm2
722 ; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
723 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
724 ; SSE2-NEXT: movd %xmm1, %rax
725 ; SSE2-NEXT: xorps %xmm1, %xmm1
726 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
727 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
728 ; SSE2-NEXT: movd %xmm0, %rax
729 ; SSE2-NEXT: xorps %xmm0, %xmm0
730 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
731 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
732 ; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
733 ; SSE2-NEXT: movaps %xmm2, %xmm0
736 ; AVX1-LABEL: sitofp_4vf32_4i64:
738 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
739 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
740 ; AVX1-NEXT: vmovq %xmm0, %rax
741 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
742 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
743 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
744 ; AVX1-NEXT: vmovq %xmm0, %rax
745 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
746 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
747 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
748 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
749 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
750 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
751 ; AVX1-NEXT: vzeroupper
754 ; AVX2-LABEL: sitofp_4vf32_4i64:
756 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
757 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
758 ; AVX2-NEXT: vmovq %xmm0, %rax
759 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
760 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
761 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
762 ; AVX2-NEXT: vmovq %xmm0, %rax
763 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
764 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
765 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
766 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
767 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
768 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
769 ; AVX2-NEXT: vzeroupper
771 %cvt = sitofp <4 x i64> %a to <4 x float>
; Checks sitofp v8i16 -> v8f32: AVX1 sign-extends each half with vpmovsxwd
; and joins them via vinsertf128; AVX2 uses a single xmm->ymm vpmovsxwd.
775 define <8 x float> @sitofp_8vf32_i16(<8 x i16> %a) {
776 ; SSE2-LABEL: sitofp_8vf32_i16:
778 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
779 ; SSE2-NEXT: psrad $16, %xmm1
780 ; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2
781 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
782 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
783 ; SSE2-NEXT: psrad $16, %xmm0
784 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1
785 ; SSE2-NEXT: movaps %xmm2, %xmm0
788 ; AVX1-LABEL: sitofp_8vf32_i16:
790 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
791 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
792 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
793 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
794 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
797 ; AVX2-LABEL: sitofp_8vf32_i16:
799 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
800 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
802 %cvt = sitofp <8 x i16> %a to <8 x float>
; Checks sitofp v8i8 -> v8f32. AVX2 zero-extends then sign-fills with a
; shift-left/arith-shift-right by 24 pair before the ymm cvtdq2ps.
806 define <8 x float> @sitofp_8vf32_i8(<16 x i8> %a) {
807 ; SSE2-LABEL: sitofp_8vf32_i8:
809 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
810 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
811 ; SSE2-NEXT: psrad $24, %xmm1
812 ; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2
813 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
814 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
815 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
816 ; SSE2-NEXT: psrad $24, %xmm0
817 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1
818 ; SSE2-NEXT: movaps %xmm2, %xmm0
821 ; AVX1-LABEL: sitofp_8vf32_i8:
823 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
824 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
825 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
826 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
827 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
830 ; AVX2-LABEL: sitofp_8vf32_i8:
832 ; AVX2-NEXT: vpmovzxbd %xmm0, %ymm0
833 ; AVX2-NEXT: vpslld $24, %ymm0, %ymm0
834 ; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
835 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
837 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
838 %cvt = sitofp <8 x i8> %shuf to <8 x float>
843 ; Unsigned Integer to Float
; uitofp <4 x i32> -> <4 x float> via the magic-constant trick: each u32 is
; split into its low 16 bits (pand 65535) and high 16 bits (psrld $16), each
; half is OR-merged with a float-exponent bit pattern from a constant pool
; (the .LCPI loads), then the biased halves are re-combined with two addps.
; AVX1 uses vpblendw against in-memory constants; AVX2 broadcasts them.
846 define <4 x float> @uitofp_4vf32(<4 x i32> %a) {
847 ; SSE2-LABEL: uitofp_4vf32:
849 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
850 ; SSE2-NEXT: pand %xmm0, %xmm1
851 ; SSE2-NEXT: por .LCPI24_1(%rip), %xmm1
852 ; SSE2-NEXT: psrld $16, %xmm0
853 ; SSE2-NEXT: por .LCPI24_2(%rip), %xmm0
854 ; SSE2-NEXT: addps .LCPI24_3(%rip), %xmm0
855 ; SSE2-NEXT: addps %xmm1, %xmm0
858 ; AVX1-LABEL: uitofp_4vf32:
860 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
861 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
862 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
863 ; AVX1-NEXT: vaddps .LCPI24_2(%rip), %xmm0, %xmm0
864 ; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
867 ; AVX2-LABEL: uitofp_4vf32:
869 ; AVX2-NEXT: vpbroadcastd .LCPI24_0(%rip), %xmm1
870 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
871 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
872 ; AVX2-NEXT: vpbroadcastd .LCPI24_1(%rip), %xmm2
873 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
874 ; AVX2-NEXT: vbroadcastss .LCPI24_2(%rip), %xmm2
875 ; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0
876 ; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
878 %cvt = uitofp <4 x i32> %a to <4 x float>
; uitofp <2 x i64> -> <2 x float> (widened to <4 x float>), fully
; scalarized. Each u64 is extracted to %rax; testq/js branches on the sign
; bit. If the MSB is clear, a plain signed cvtsi2ssq suffices; otherwise
; the value is halved (shrq), its dropped low bit is OR-ed back in (andl
; $1 / orq), converted, and doubled with addss to recover the magnitude.
882 define <4 x float> @uitofp_4vf32_i64(<2 x i64> %a) {
883 ; SSE2-LABEL: uitofp_4vf32_i64:
885 ; SSE2-NEXT: movdqa %xmm0, %xmm1
886 ; SSE2-NEXT: movd %xmm1, %rax
887 ; SSE2-NEXT: movl %eax, %ecx
888 ; SSE2-NEXT: andl $1, %ecx
889 ; SSE2-NEXT: testq %rax, %rax
890 ; SSE2-NEXT: js .LBB25_1
892 ; SSE2-NEXT: xorps %xmm0, %xmm0
893 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
894 ; SSE2-NEXT: jmp .LBB25_3
895 ; SSE2-NEXT: .LBB25_1:
896 ; SSE2-NEXT: shrq %rax
897 ; SSE2-NEXT: orq %rax, %rcx
898 ; SSE2-NEXT: xorps %xmm0, %xmm0
899 ; SSE2-NEXT: cvtsi2ssq %rcx, %xmm0
900 ; SSE2-NEXT: addss %xmm0, %xmm0
901 ; SSE2-NEXT: .LBB25_3:
902 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
903 ; SSE2-NEXT: movd %xmm1, %rax
904 ; SSE2-NEXT: movl %eax, %ecx
905 ; SSE2-NEXT: andl $1, %ecx
906 ; SSE2-NEXT: testq %rax, %rax
907 ; SSE2-NEXT: js .LBB25_4
909 ; SSE2-NEXT: xorps %xmm1, %xmm1
910 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
911 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
913 ; SSE2-NEXT: .LBB25_4:
914 ; SSE2-NEXT: shrq %rax
915 ; SSE2-NEXT: orq %rax, %rcx
916 ; SSE2-NEXT: xorps %xmm1, %xmm1
917 ; SSE2-NEXT: cvtsi2ssq %rcx, %xmm1
918 ; SSE2-NEXT: addss %xmm1, %xmm1
919 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
922 ; AVX-LABEL: uitofp_4vf32_i64:
924 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
925 ; AVX-NEXT: movl %eax, %ecx
926 ; AVX-NEXT: andl $1, %ecx
927 ; AVX-NEXT: testq %rax, %rax
928 ; AVX-NEXT: js .LBB25_1
930 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
931 ; AVX-NEXT: jmp .LBB25_3
932 ; AVX-NEXT: .LBB25_1:
933 ; AVX-NEXT: shrq %rax
934 ; AVX-NEXT: orq %rax, %rcx
935 ; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
936 ; AVX-NEXT: vaddss %xmm1, %xmm1, %xmm1
937 ; AVX-NEXT: .LBB25_3:
938 ; AVX-NEXT: vmovq %xmm0, %rax
939 ; AVX-NEXT: movl %eax, %ecx
940 ; AVX-NEXT: andl $1, %ecx
941 ; AVX-NEXT: testq %rax, %rax
942 ; AVX-NEXT: js .LBB25_4
944 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
945 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
946 ; AVX-NEXT: jmp .LBB25_6
947 ; AVX-NEXT: .LBB25_4:
948 ; AVX-NEXT: shrq %rax
949 ; AVX-NEXT: orq %rax, %rcx
950 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
951 ; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
952 ; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0
953 ; AVX-NEXT: .LBB25_6:
954 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
955 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
956 ; AVX-NEXT: testq %rax, %rax
957 ; AVX-NEXT: js .LBB25_8
959 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
960 ; AVX-NEXT: .LBB25_8:
961 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
962 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
964 %cvt = uitofp <2 x i64> %a to <2 x float>
965 %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; uitofp of the low 4 words of a <8 x i16> -> <4 x float>. Zero-extension
; makes the values non-negative, so a plain signed cvtdq2ps is correct:
; SSE2 interleaves with a zeroed register, AVX uses vpmovzxwd.
969 define <4 x float> @uitofp_4vf32_i16(<8 x i16> %a) {
970 ; SSE2-LABEL: uitofp_4vf32_i16:
972 ; SSE2-NEXT: pxor %xmm1, %xmm1
973 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
974 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
977 ; AVX-LABEL: uitofp_4vf32_i16:
979 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
980 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
982 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
983 %cvt = uitofp <4 x i16> %shuf to <4 x float>
; uitofp of the low 4 bytes of a <16 x i8> -> <4 x float>. As with the i16
; case, zero-extension (two unpacks against zero on SSE2, vpmovzxbd on AVX)
; keeps values in signed i32 range, so cvtdq2ps handles the conversion.
987 define <4 x float> @uitofp_4vf32_i8(<16 x i8> %a) {
988 ; SSE2-LABEL: uitofp_4vf32_i8:
990 ; SSE2-NEXT: pxor %xmm1, %xmm1
991 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
992 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
993 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
996 ; AVX-LABEL: uitofp_4vf32_i8:
998 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
999 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
1001 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1002 %cvt = uitofp <4 x i8> %shuf to <4 x float>
1003 ret <4 x float> %cvt
; uitofp <8 x i32> -> <8 x float>, the 256-bit version of the magic-constant
; split: low/high 16-bit halves of each lane are merged with exponent-bit
; constants and re-added. SSE2 repeats the 128-bit sequence for each half;
; AVX1 mixes an and/convert with a shifted convert plus multiply; AVX2 uses
; broadcast constants and 256-bit vpblendw.
1006 define <8 x float> @uitofp_8vf32(<8 x i32> %a) {
1007 ; SSE2-LABEL: uitofp_8vf32:
1009 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
1010 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1011 ; SSE2-NEXT: pand %xmm2, %xmm3
1012 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
1013 ; SSE2-NEXT: por %xmm4, %xmm3
1014 ; SSE2-NEXT: psrld $16, %xmm0
1015 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
1016 ; SSE2-NEXT: por %xmm5, %xmm0
1017 ; SSE2-NEXT: movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
1018 ; SSE2-NEXT: addps %xmm6, %xmm0
1019 ; SSE2-NEXT: addps %xmm3, %xmm0
1020 ; SSE2-NEXT: pand %xmm1, %xmm2
1021 ; SSE2-NEXT: por %xmm4, %xmm2
1022 ; SSE2-NEXT: psrld $16, %xmm1
1023 ; SSE2-NEXT: por %xmm5, %xmm1
1024 ; SSE2-NEXT: addps %xmm6, %xmm1
1025 ; SSE2-NEXT: addps %xmm2, %xmm1
1028 ; AVX1-LABEL: uitofp_8vf32:
1030 ; AVX1-NEXT: vandps .LCPI28_0(%rip), %ymm0, %ymm1
1031 ; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
1032 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
1033 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1034 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
1035 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
1036 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1037 ; AVX1-NEXT: vmulps .LCPI28_1(%rip), %ymm0, %ymm0
1038 ; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0
1041 ; AVX2-LABEL: uitofp_8vf32:
1043 ; AVX2-NEXT: vpbroadcastd .LCPI28_0(%rip), %ymm1
1044 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1045 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
1046 ; AVX2-NEXT: vpbroadcastd .LCPI28_1(%rip), %ymm2
1047 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
1048 ; AVX2-NEXT: vbroadcastss .LCPI28_2(%rip), %ymm2
1049 ; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
1050 ; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
1052 %cvt = uitofp <8 x i32> %a to <8 x float>
1053 ret <8 x float> %cvt
; uitofp <4 x i64> -> <4 x float>, fully scalarized: four copies of the
; u64->f32 pattern (testq/js on the sign bit; MSB-set values use the
; shrq + low-bit-or + convert + addss round-to-odd/double sequence), then
; the four scalars are recombined with unpcklps (SSE2) or vinsertps (AVX).
; The AVX paths end with vzeroupper since ymm state was used.
1056 define <4 x float> @uitofp_4vf32_4i64(<4 x i64> %a) {
1057 ; SSE2-LABEL: uitofp_4vf32_4i64:
1059 ; SSE2-NEXT: movd %xmm1, %rax
1060 ; SSE2-NEXT: movl %eax, %ecx
1061 ; SSE2-NEXT: andl $1, %ecx
1062 ; SSE2-NEXT: testq %rax, %rax
1063 ; SSE2-NEXT: js .LBB29_1
1064 ; SSE2-NEXT: # BB#2:
1065 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm3
1066 ; SSE2-NEXT: jmp .LBB29_3
1067 ; SSE2-NEXT: .LBB29_1:
1068 ; SSE2-NEXT: shrq %rax
1069 ; SSE2-NEXT: orq %rax, %rcx
1070 ; SSE2-NEXT: cvtsi2ssq %rcx, %xmm3
1071 ; SSE2-NEXT: addss %xmm3, %xmm3
1072 ; SSE2-NEXT: .LBB29_3:
1073 ; SSE2-NEXT: movd %xmm0, %rax
1074 ; SSE2-NEXT: movl %eax, %ecx
1075 ; SSE2-NEXT: andl $1, %ecx
1076 ; SSE2-NEXT: testq %rax, %rax
1077 ; SSE2-NEXT: js .LBB29_4
1078 ; SSE2-NEXT: # BB#5:
1079 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm2
1080 ; SSE2-NEXT: jmp .LBB29_6
1081 ; SSE2-NEXT: .LBB29_4:
1082 ; SSE2-NEXT: shrq %rax
1083 ; SSE2-NEXT: orq %rax, %rcx
1084 ; SSE2-NEXT: cvtsi2ssq %rcx, %xmm2
1085 ; SSE2-NEXT: addss %xmm2, %xmm2
1086 ; SSE2-NEXT: .LBB29_6:
1087 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1088 ; SSE2-NEXT: movd %xmm1, %rax
1089 ; SSE2-NEXT: movl %eax, %ecx
1090 ; SSE2-NEXT: andl $1, %ecx
1091 ; SSE2-NEXT: testq %rax, %rax
1092 ; SSE2-NEXT: js .LBB29_7
1093 ; SSE2-NEXT: # BB#8:
1094 ; SSE2-NEXT: xorps %xmm1, %xmm1
1095 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
1096 ; SSE2-NEXT: jmp .LBB29_9
1097 ; SSE2-NEXT: .LBB29_7:
1098 ; SSE2-NEXT: shrq %rax
1099 ; SSE2-NEXT: orq %rax, %rcx
1100 ; SSE2-NEXT: xorps %xmm1, %xmm1
1101 ; SSE2-NEXT: cvtsi2ssq %rcx, %xmm1
1102 ; SSE2-NEXT: addss %xmm1, %xmm1
1103 ; SSE2-NEXT: .LBB29_9:
1104 ; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1105 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1106 ; SSE2-NEXT: movd %xmm0, %rax
1107 ; SSE2-NEXT: movl %eax, %ecx
1108 ; SSE2-NEXT: andl $1, %ecx
1109 ; SSE2-NEXT: testq %rax, %rax
1110 ; SSE2-NEXT: js .LBB29_10
1111 ; SSE2-NEXT: # BB#11:
1112 ; SSE2-NEXT: xorps %xmm0, %xmm0
1113 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
1114 ; SSE2-NEXT: jmp .LBB29_12
1115 ; SSE2-NEXT: .LBB29_10:
1116 ; SSE2-NEXT: shrq %rax
1117 ; SSE2-NEXT: orq %rax, %rcx
1118 ; SSE2-NEXT: xorps %xmm0, %xmm0
1119 ; SSE2-NEXT: cvtsi2ssq %rcx, %xmm0
1120 ; SSE2-NEXT: addss %xmm0, %xmm0
1121 ; SSE2-NEXT: .LBB29_12:
1122 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1123 ; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1124 ; SSE2-NEXT: movaps %xmm2, %xmm0
1127 ; AVX1-LABEL: uitofp_4vf32_4i64:
1129 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
1130 ; AVX1-NEXT: movl %eax, %ecx
1131 ; AVX1-NEXT: andl $1, %ecx
1132 ; AVX1-NEXT: testq %rax, %rax
1133 ; AVX1-NEXT: js .LBB29_1
1134 ; AVX1-NEXT: # BB#2:
1135 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1136 ; AVX1-NEXT: jmp .LBB29_3
1137 ; AVX1-NEXT: .LBB29_1:
1138 ; AVX1-NEXT: shrq %rax
1139 ; AVX1-NEXT: orq %rax, %rcx
1140 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1141 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
1142 ; AVX1-NEXT: .LBB29_3:
1143 ; AVX1-NEXT: vmovq %xmm0, %rax
1144 ; AVX1-NEXT: movl %eax, %ecx
1145 ; AVX1-NEXT: andl $1, %ecx
1146 ; AVX1-NEXT: testq %rax, %rax
1147 ; AVX1-NEXT: js .LBB29_4
1148 ; AVX1-NEXT: # BB#5:
1149 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1150 ; AVX1-NEXT: jmp .LBB29_6
1151 ; AVX1-NEXT: .LBB29_4:
1152 ; AVX1-NEXT: shrq %rax
1153 ; AVX1-NEXT: orq %rax, %rcx
1154 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1155 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
1156 ; AVX1-NEXT: .LBB29_6:
1157 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1158 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1159 ; AVX1-NEXT: vmovq %xmm0, %rax
1160 ; AVX1-NEXT: movl %eax, %ecx
1161 ; AVX1-NEXT: andl $1, %ecx
1162 ; AVX1-NEXT: testq %rax, %rax
1163 ; AVX1-NEXT: js .LBB29_7
1164 ; AVX1-NEXT: # BB#8:
1165 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1166 ; AVX1-NEXT: jmp .LBB29_9
1167 ; AVX1-NEXT: .LBB29_7:
1168 ; AVX1-NEXT: shrq %rax
1169 ; AVX1-NEXT: orq %rax, %rcx
1170 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1171 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
1172 ; AVX1-NEXT: .LBB29_9:
1173 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1174 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
1175 ; AVX1-NEXT: movl %eax, %ecx
1176 ; AVX1-NEXT: andl $1, %ecx
1177 ; AVX1-NEXT: testq %rax, %rax
1178 ; AVX1-NEXT: js .LBB29_10
1179 ; AVX1-NEXT: # BB#11:
1180 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
1181 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1182 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1183 ; AVX1-NEXT: vzeroupper
1185 ; AVX1-NEXT: .LBB29_10:
1186 ; AVX1-NEXT: shrq %rax
1187 ; AVX1-NEXT: orq %rax, %rcx
1188 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1189 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
1190 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1191 ; AVX1-NEXT: vzeroupper
1194 ; AVX2-LABEL: uitofp_4vf32_4i64:
1196 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1197 ; AVX2-NEXT: movl %eax, %ecx
1198 ; AVX2-NEXT: andl $1, %ecx
1199 ; AVX2-NEXT: testq %rax, %rax
1200 ; AVX2-NEXT: js .LBB29_1
1201 ; AVX2-NEXT: # BB#2:
1202 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1203 ; AVX2-NEXT: jmp .LBB29_3
1204 ; AVX2-NEXT: .LBB29_1:
1205 ; AVX2-NEXT: shrq %rax
1206 ; AVX2-NEXT: orq %rax, %rcx
1207 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1208 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
1209 ; AVX2-NEXT: .LBB29_3:
1210 ; AVX2-NEXT: vmovq %xmm0, %rax
1211 ; AVX2-NEXT: movl %eax, %ecx
1212 ; AVX2-NEXT: andl $1, %ecx
1213 ; AVX2-NEXT: testq %rax, %rax
1214 ; AVX2-NEXT: js .LBB29_4
1215 ; AVX2-NEXT: # BB#5:
1216 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1217 ; AVX2-NEXT: jmp .LBB29_6
1218 ; AVX2-NEXT: .LBB29_4:
1219 ; AVX2-NEXT: shrq %rax
1220 ; AVX2-NEXT: orq %rax, %rcx
1221 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1222 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
1223 ; AVX2-NEXT: .LBB29_6:
1224 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1225 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1226 ; AVX2-NEXT: vmovq %xmm0, %rax
1227 ; AVX2-NEXT: movl %eax, %ecx
1228 ; AVX2-NEXT: andl $1, %ecx
1229 ; AVX2-NEXT: testq %rax, %rax
1230 ; AVX2-NEXT: js .LBB29_7
1231 ; AVX2-NEXT: # BB#8:
1232 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1233 ; AVX2-NEXT: jmp .LBB29_9
1234 ; AVX2-NEXT: .LBB29_7:
1235 ; AVX2-NEXT: shrq %rax
1236 ; AVX2-NEXT: orq %rax, %rcx
1237 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1238 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
1239 ; AVX2-NEXT: .LBB29_9:
1240 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1241 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1242 ; AVX2-NEXT: movl %eax, %ecx
1243 ; AVX2-NEXT: andl $1, %ecx
1244 ; AVX2-NEXT: testq %rax, %rax
1245 ; AVX2-NEXT: js .LBB29_10
1246 ; AVX2-NEXT: # BB#11:
1247 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
1248 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1249 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1250 ; AVX2-NEXT: vzeroupper
1252 ; AVX2-NEXT: .LBB29_10:
1253 ; AVX2-NEXT: shrq %rax
1254 ; AVX2-NEXT: orq %rax, %rcx
1255 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1256 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
1257 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1258 ; AVX2-NEXT: vzeroupper
1260 %cvt = uitofp <4 x i64> %a to <4 x float>
1261 ret <4 x float> %cvt
; uitofp <8 x i16> -> <8 x float>. Zero-extension keeps all values within
; signed i32 range, so cvtdq2ps is used directly: SSE2 unpacks low/high
; halves against zero (plus a pand mask for the high half), AVX1 builds a
; ymm from vpmovzxwd + vpunpckhwd, AVX2 uses one 256-bit vpmovzxwd.
1264 define <8 x float> @uitofp_8vf32_i16(<8 x i16> %a) {
1265 ; SSE2-LABEL: uitofp_8vf32_i16:
1267 ; SSE2-NEXT: pxor %xmm1, %xmm1
1268 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1269 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1270 ; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2
1271 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1272 ; SSE2-NEXT: pand .LCPI30_0(%rip), %xmm0
1273 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1
1274 ; SSE2-NEXT: movaps %xmm2, %xmm0
1277 ; AVX1-LABEL: uitofp_8vf32_i16:
1279 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1280 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1281 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1282 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1283 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1286 ; AVX2-LABEL: uitofp_8vf32_i16:
1288 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1289 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1291 %cvt = uitofp <8 x i16> %a to <8 x float>
1292 ret <8 x float> %cvt
; uitofp of the low 8 bytes of a <16 x i8> -> <8 x float>. All paths
; zero-extend i8 to i32 (unpacks + pand mask on SSE2, vpmovzx + and on
; AVX1/AVX2) and then convert with cvtdq2ps.
1295 define <8 x float> @uitofp_8vf32_i8(<16 x i8> %a) {
1296 ; SSE2-LABEL: uitofp_8vf32_i8:
1298 ; SSE2-NEXT: pxor %xmm1, %xmm1
1299 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1300 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1301 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1302 ; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2
1303 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1304 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1305 ; SSE2-NEXT: pand .LCPI31_0(%rip), %xmm0
1306 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1
1307 ; SSE2-NEXT: movaps %xmm2, %xmm0
1310 ; AVX1-LABEL: uitofp_8vf32_i8:
1312 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1313 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1314 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1315 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1316 ; AVX1-NEXT: vandps .LCPI31_0(%rip), %ymm0, %ymm0
1317 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1320 ; AVX2-LABEL: uitofp_8vf32_i8:
1322 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1323 ; AVX2-NEXT: vpbroadcastd .LCPI31_0(%rip), %ymm1
1324 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1325 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1327 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1328 %cvt = uitofp <8 x i8> %shuf to <8 x float>
1329 ret <8 x float> %cvt
; Packed (unpadded, per the <{ }> syntax) aggregate: an <8 x i8>, an
; <8 x i16>, and a destination pointer for an <8 x float> result.
1336 %Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }>
1337 define void @aggregate_sitofp_8f32_i16(%Arguments* nocapture readonly %a0) {
1338 ; SSE2-LABEL: aggregate_sitofp_8f32_i16:
1340 ; SSE2-NEXT: movq 24(%rdi), %rax
1341 ; SSE2-NEXT: movdqu 8(%rdi), %xmm0
1342 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1343 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1344 ; SSE2-NEXT: psrad $16, %xmm1
1345 ; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1
1346 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1347 ; SSE2-NEXT: psrad $16, %xmm0
1348 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
1349 ; SSE2-NEXT: movaps %xmm0, (%rax)
1350 ; SSE2-NEXT: movaps %xmm1, 16(%rax)
1353 ; AVX1-LABEL: aggregate_sitofp_8f32_i16:
1355 ; AVX1-NEXT: movq 24(%rdi), %rax
1356 ; AVX1-NEXT: vmovdqu 8(%rdi), %xmm0
1357 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
1358 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1359 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
1360 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1361 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1362 ; AVX1-NEXT: vmovaps %ymm0, (%rax)
1363 ; AVX1-NEXT: vzeroupper
1366 ; AVX2-LABEL: aggregate_sitofp_8f32_i16:
1368 ; AVX2-NEXT: movq 24(%rdi), %rax
1369 ; AVX2-NEXT: vpmovsxwd 8(%rdi), %ymm0
1370 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1371 ; AVX2-NEXT: vmovaps %ymm0, (%rax)
1372 ; AVX2-NEXT: vzeroupper
1374 %1 = load %Arguments, %Arguments* %a0, align 1
1375 %2 = extractvalue %Arguments %1, 1
1376 %3 = extractvalue %Arguments %1, 2
1377 %4 = sitofp <8 x i16> %2 to <8 x float>
1378 store <8 x float> %4, <8 x float>* %3, align 32