1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
7 ; Just one 32-bit run to make sure we do reasonable things there.
8 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=i686 -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41
10 define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
11 ; SSE2-LABEL: sext_16i8_to_8i16:
12 ; SSE2: # BB#0: # %entry
13 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
14 ; SSE2-NEXT: psraw $8, %xmm0
17 ; SSSE3-LABEL: sext_16i8_to_8i16:
18 ; SSSE3: # BB#0: # %entry
19 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
20 ; SSSE3-NEXT: psraw $8, %xmm0
23 ; SSE41-LABEL: sext_16i8_to_8i16:
24 ; SSE41: # BB#0: # %entry
25 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
28 ; AVX-LABEL: sext_16i8_to_8i16:
29 ; AVX: # BB#0: # %entry
30 ; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
33 ; X32-SSE41-LABEL: sext_16i8_to_8i16:
34 ; X32-SSE41: # BB#0: # %entry
35 ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0
36 ; X32-SSE41-NEXT: retl
38 %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
39 %C = sext <8 x i8> %B to <8 x i16>
43 define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ssp {
44 ; SSE2-LABEL: sext_16i8_to_16i16:
45 ; SSE2: # BB#0: # %entry
46 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
47 ; SSE2-NEXT: psraw $8, %xmm2
48 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
49 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
50 ; SSE2-NEXT: psraw $8, %xmm1
51 ; SSE2-NEXT: movdqa %xmm2, %xmm0
54 ; SSSE3-LABEL: sext_16i8_to_16i16:
55 ; SSSE3: # BB#0: # %entry
56 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
57 ; SSSE3-NEXT: psraw $8, %xmm2
58 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
59 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
60 ; SSSE3-NEXT: psraw $8, %xmm1
61 ; SSSE3-NEXT: movdqa %xmm2, %xmm0
64 ; SSE41-LABEL: sext_16i8_to_16i16:
65 ; SSE41: # BB#0: # %entry
66 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
67 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
68 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
69 ; SSE41-NEXT: movdqa %xmm2, %xmm0
72 ; AVX1-LABEL: sext_16i8_to_16i16:
73 ; AVX1: # BB#0: # %entry
74 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
75 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
76 ; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
77 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
80 ; AVX2-LABEL: sext_16i8_to_16i16:
81 ; AVX2: # BB#0: # %entry
82 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
85 ; X32-SSE41-LABEL: sext_16i8_to_16i16:
86 ; X32-SSE41: # BB#0: # %entry
87 ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm2
88 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
89 ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm1
90 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
91 ; X32-SSE41-NEXT: retl
93 %B = sext <16 x i8> %A to <16 x i16>
97 define <4 x i32> @sext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp {
98 ; SSE2-LABEL: sext_16i8_to_4i32:
99 ; SSE2: # BB#0: # %entry
100 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
101 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
102 ; SSE2-NEXT: psrad $24, %xmm0
105 ; SSSE3-LABEL: sext_16i8_to_4i32:
106 ; SSSE3: # BB#0: # %entry
107 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
108 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
109 ; SSSE3-NEXT: psrad $24, %xmm0
112 ; SSE41-LABEL: sext_16i8_to_4i32:
113 ; SSE41: # BB#0: # %entry
114 ; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
117 ; AVX-LABEL: sext_16i8_to_4i32:
118 ; AVX: # BB#0: # %entry
119 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
122 ; X32-SSE41-LABEL: sext_16i8_to_4i32:
123 ; X32-SSE41: # BB#0: # %entry
124 ; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm0
125 ; X32-SSE41-NEXT: retl
127 %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
128 %C = sext <4 x i8> %B to <4 x i32>
132 define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
133 ; SSE2-LABEL: sext_16i8_to_8i32:
134 ; SSE2: # BB#0: # %entry
135 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
136 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
137 ; SSE2-NEXT: psrad $24, %xmm2
138 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
139 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
140 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
141 ; SSE2-NEXT: psrad $24, %xmm1
142 ; SSE2-NEXT: movdqa %xmm2, %xmm0
145 ; SSSE3-LABEL: sext_16i8_to_8i32:
146 ; SSSE3: # BB#0: # %entry
147 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
148 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
149 ; SSSE3-NEXT: psrad $24, %xmm2
150 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
151 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
152 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
153 ; SSSE3-NEXT: psrad $24, %xmm1
154 ; SSSE3-NEXT: movdqa %xmm2, %xmm0
157 ; SSE41-LABEL: sext_16i8_to_8i32:
158 ; SSE41: # BB#0: # %entry
159 ; SSE41-NEXT: pmovsxbd %xmm0, %xmm2
160 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
161 ; SSE41-NEXT: pmovsxbd %xmm0, %xmm1
162 ; SSE41-NEXT: movdqa %xmm2, %xmm0
165 ; AVX1-LABEL: sext_16i8_to_8i32:
166 ; AVX1: # BB#0: # %entry
167 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
168 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
169 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
170 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
173 ; AVX2-LABEL: sext_16i8_to_8i32:
174 ; AVX2: # BB#0: # %entry
175 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
176 ; AVX2-NEXT: vpslld $24, %ymm0, %ymm0
177 ; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
180 ; X32-SSE41-LABEL: sext_16i8_to_8i32:
181 ; X32-SSE41: # BB#0: # %entry
182 ; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm2
183 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
184 ; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm1
185 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
186 ; X32-SSE41-NEXT: retl
188 %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
189 %C = sext <8 x i8> %B to <8 x i32>
193 define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp {
194 ; SSE2-LABEL: sext_16i8_to_2i64:
195 ; SSE2: # BB#0: # %entry
196 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
197 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
198 ; SSE2-NEXT: movdqa %xmm0, %xmm1
199 ; SSE2-NEXT: psrad $31, %xmm1
200 ; SSE2-NEXT: psrad $24, %xmm0
201 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
204 ; SSSE3-LABEL: sext_16i8_to_2i64:
205 ; SSSE3: # BB#0: # %entry
206 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
207 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
208 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
209 ; SSSE3-NEXT: psrad $31, %xmm1
210 ; SSSE3-NEXT: psrad $24, %xmm0
211 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
214 ; SSE41-LABEL: sext_16i8_to_2i64:
215 ; SSE41: # BB#0: # %entry
216 ; SSE41-NEXT: pmovsxbq %xmm0, %xmm0
219 ; AVX-LABEL: sext_16i8_to_2i64:
220 ; AVX: # BB#0: # %entry
221 ; AVX-NEXT: vpmovsxbq %xmm0, %xmm0
224 ; X32-SSE41-LABEL: sext_16i8_to_2i64:
225 ; X32-SSE41: # BB#0: # %entry
226 ; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm0
227 ; X32-SSE41-NEXT: retl
229 %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
230 %C = sext <2 x i8> %B to <2 x i64>
234 define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
235 ; SSE2-LABEL: sext_16i8_to_4i64:
236 ; SSE2: # BB#0: # %entry
237 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
238 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
239 ; SSE2-NEXT: movdqa %xmm2, %xmm1
240 ; SSE2-NEXT: psrad $31, %xmm1
241 ; SSE2-NEXT: psrad $24, %xmm2
242 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
243 ; SSE2-NEXT: psrld $16, %xmm0
244 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
245 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
246 ; SSE2-NEXT: movdqa %xmm1, %xmm0
247 ; SSE2-NEXT: psrad $31, %xmm0
248 ; SSE2-NEXT: psrad $24, %xmm1
249 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
250 ; SSE2-NEXT: movdqa %xmm2, %xmm0
253 ; SSSE3-LABEL: sext_16i8_to_4i64:
254 ; SSSE3: # BB#0: # %entry
255 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
256 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
257 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
258 ; SSSE3-NEXT: psrad $31, %xmm1
259 ; SSSE3-NEXT: psrad $24, %xmm2
260 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
261 ; SSSE3-NEXT: psrld $16, %xmm0
262 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
263 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
264 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
265 ; SSSE3-NEXT: psrad $31, %xmm0
266 ; SSSE3-NEXT: psrad $24, %xmm1
267 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
268 ; SSSE3-NEXT: movdqa %xmm2, %xmm0
271 ; SSE41-LABEL: sext_16i8_to_4i64:
272 ; SSE41: # BB#0: # %entry
273 ; SSE41-NEXT: pmovsxbq %xmm0, %xmm2
274 ; SSE41-NEXT: psrld $16, %xmm0
275 ; SSE41-NEXT: pmovsxbq %xmm0, %xmm1
276 ; SSE41-NEXT: movdqa %xmm2, %xmm0
279 ; AVX1-LABEL: sext_16i8_to_4i64:
280 ; AVX1: # BB#0: # %entry
281 ; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1
282 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
283 ; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0
284 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
287 ; AVX2-LABEL: sext_16i8_to_4i64:
288 ; AVX2: # BB#0: # %entry
289 ; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
290 ; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
291 ; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
292 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
295 ; X32-SSE41-LABEL: sext_16i8_to_4i64:
296 ; X32-SSE41: # BB#0: # %entry
297 ; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2
298 ; X32-SSE41-NEXT: psrld $16, %xmm0
299 ; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm1
300 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
301 ; X32-SSE41-NEXT: retl
303 %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
304 %C = sext <4 x i8> %B to <4 x i64>
308 define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
309 ; SSE2-LABEL: sext_8i16_to_4i32:
310 ; SSE2: # BB#0: # %entry
311 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
312 ; SSE2-NEXT: psrad $16, %xmm0
315 ; SSSE3-LABEL: sext_8i16_to_4i32:
316 ; SSSE3: # BB#0: # %entry
317 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
318 ; SSSE3-NEXT: psrad $16, %xmm0
321 ; SSE41-LABEL: sext_8i16_to_4i32:
322 ; SSE41: # BB#0: # %entry
323 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
326 ; AVX-LABEL: sext_8i16_to_4i32:
327 ; AVX: # BB#0: # %entry
328 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
331 ; X32-SSE41-LABEL: sext_8i16_to_4i32:
332 ; X32-SSE41: # BB#0: # %entry
333 ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm0
334 ; X32-SSE41-NEXT: retl
336 %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
337 %C = sext <4 x i16> %B to <4 x i32>
341 define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
342 ; SSE2-LABEL: sext_8i16_to_8i32:
343 ; SSE2: # BB#0: # %entry
344 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
345 ; SSE2-NEXT: psrad $16, %xmm2
346 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
347 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
348 ; SSE2-NEXT: psrad $16, %xmm1
349 ; SSE2-NEXT: movdqa %xmm2, %xmm0
352 ; SSSE3-LABEL: sext_8i16_to_8i32:
353 ; SSSE3: # BB#0: # %entry
354 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
355 ; SSSE3-NEXT: psrad $16, %xmm2
356 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
357 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
358 ; SSSE3-NEXT: psrad $16, %xmm1
359 ; SSSE3-NEXT: movdqa %xmm2, %xmm0
362 ; SSE41-LABEL: sext_8i16_to_8i32:
363 ; SSE41: # BB#0: # %entry
364 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm2
365 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
366 ; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
367 ; SSE41-NEXT: movdqa %xmm2, %xmm0
370 ; AVX1-LABEL: sext_8i16_to_8i32:
371 ; AVX1: # BB#0: # %entry
372 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
373 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
374 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
375 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
378 ; AVX2-LABEL: sext_8i16_to_8i32:
379 ; AVX2: # BB#0: # %entry
380 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
383 ; X32-SSE41-LABEL: sext_8i16_to_8i32:
384 ; X32-SSE41: # BB#0: # %entry
385 ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm2
386 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
387 ; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm1
388 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
389 ; X32-SSE41-NEXT: retl
391 %B = sext <8 x i16> %A to <8 x i32>
395 define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp {
396 ; SSE2-LABEL: sext_8i16_to_2i64:
397 ; SSE2: # BB#0: # %entry
398 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
399 ; SSE2-NEXT: movdqa %xmm0, %xmm1
400 ; SSE2-NEXT: psrad $31, %xmm1
401 ; SSE2-NEXT: psrad $16, %xmm0
402 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
405 ; SSSE3-LABEL: sext_8i16_to_2i64:
406 ; SSSE3: # BB#0: # %entry
407 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
408 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
409 ; SSSE3-NEXT: psrad $31, %xmm1
410 ; SSSE3-NEXT: psrad $16, %xmm0
411 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
414 ; SSE41-LABEL: sext_8i16_to_2i64:
415 ; SSE41: # BB#0: # %entry
416 ; SSE41-NEXT: pmovsxwq %xmm0, %xmm0
419 ; AVX-LABEL: sext_8i16_to_2i64:
420 ; AVX: # BB#0: # %entry
421 ; AVX-NEXT: vpmovsxwq %xmm0, %xmm0
424 ; X32-SSE41-LABEL: sext_8i16_to_2i64:
425 ; X32-SSE41: # BB#0: # %entry
426 ; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm0
427 ; X32-SSE41-NEXT: retl
429 %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
430 %C = sext <2 x i16> %B to <2 x i64>
434 define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
435 ; SSE2-LABEL: sext_8i16_to_4i64:
436 ; SSE2: # BB#0: # %entry
437 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
438 ; SSE2-NEXT: movdqa %xmm2, %xmm1
439 ; SSE2-NEXT: psrad $31, %xmm1
440 ; SSE2-NEXT: psrad $16, %xmm2
441 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
442 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
443 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
444 ; SSE2-NEXT: movdqa %xmm1, %xmm0
445 ; SSE2-NEXT: psrad $31, %xmm0
446 ; SSE2-NEXT: psrad $16, %xmm1
447 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
448 ; SSE2-NEXT: movdqa %xmm2, %xmm0
451 ; SSSE3-LABEL: sext_8i16_to_4i64:
452 ; SSSE3: # BB#0: # %entry
453 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
454 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
455 ; SSSE3-NEXT: psrad $31, %xmm1
456 ; SSSE3-NEXT: psrad $16, %xmm2
457 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
458 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
459 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
460 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
461 ; SSSE3-NEXT: psrad $31, %xmm0
462 ; SSSE3-NEXT: psrad $16, %xmm1
463 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
464 ; SSSE3-NEXT: movdqa %xmm2, %xmm0
467 ; SSE41-LABEL: sext_8i16_to_4i64:
468 ; SSE41: # BB#0: # %entry
469 ; SSE41-NEXT: pmovsxwq %xmm0, %xmm2
470 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
471 ; SSE41-NEXT: pmovsxwq %xmm0, %xmm1
472 ; SSE41-NEXT: movdqa %xmm2, %xmm0
475 ; AVX1-LABEL: sext_8i16_to_4i64:
476 ; AVX1: # BB#0: # %entry
477 ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm1
478 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
479 ; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
480 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
483 ; AVX2-LABEL: sext_8i16_to_4i64:
484 ; AVX2: # BB#0: # %entry
485 ; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
486 ; AVX2-NEXT: vpslld $16, %xmm0, %xmm0
487 ; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
488 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
491 ; X32-SSE41-LABEL: sext_8i16_to_4i64:
492 ; X32-SSE41: # BB#0: # %entry
493 ; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm2
494 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
495 ; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm1
496 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
497 ; X32-SSE41-NEXT: retl
499 %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
500 %C = sext <4 x i16> %B to <4 x i64>
504 define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
505 ; SSE2-LABEL: sext_4i32_to_2i64:
506 ; SSE2: # BB#0: # %entry
507 ; SSE2-NEXT: movdqa %xmm0, %xmm1
508 ; SSE2-NEXT: psrad $31, %xmm1
509 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
512 ; SSSE3-LABEL: sext_4i32_to_2i64:
513 ; SSSE3: # BB#0: # %entry
514 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
515 ; SSSE3-NEXT: psrad $31, %xmm1
516 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
519 ; SSE41-LABEL: sext_4i32_to_2i64:
520 ; SSE41: # BB#0: # %entry
521 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm0
524 ; AVX-LABEL: sext_4i32_to_2i64:
525 ; AVX: # BB#0: # %entry
526 ; AVX-NEXT: vpmovsxdq %xmm0, %xmm0
529 ; X32-SSE41-LABEL: sext_4i32_to_2i64:
530 ; X32-SSE41: # BB#0: # %entry
531 ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm0
532 ; X32-SSE41-NEXT: retl
534 %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
535 %C = sext <2 x i32> %B to <2 x i64>
539 define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
540 ; SSE2-LABEL: sext_4i32_to_4i64:
541 ; SSE2: # BB#0: # %entry
542 ; SSE2-NEXT: movdqa %xmm0, %xmm2
543 ; SSE2-NEXT: psrad $31, %xmm2
544 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
545 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
546 ; SSE2-NEXT: movdqa %xmm1, %xmm2
547 ; SSE2-NEXT: psrad $31, %xmm2
548 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
551 ; SSSE3-LABEL: sext_4i32_to_4i64:
552 ; SSSE3: # BB#0: # %entry
553 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
554 ; SSSE3-NEXT: psrad $31, %xmm2
555 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
556 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
557 ; SSSE3-NEXT: movdqa %xmm1, %xmm2
558 ; SSSE3-NEXT: psrad $31, %xmm2
559 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
562 ; SSE41-LABEL: sext_4i32_to_4i64:
563 ; SSE41: # BB#0: # %entry
564 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
565 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
566 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
567 ; SSE41-NEXT: movdqa %xmm2, %xmm0
570 ; AVX1-LABEL: sext_4i32_to_4i64:
571 ; AVX1: # BB#0: # %entry
572 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
573 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
574 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
575 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
578 ; AVX2-LABEL: sext_4i32_to_4i64:
579 ; AVX2: # BB#0: # %entry
580 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
583 ; X32-SSE41-LABEL: sext_4i32_to_4i64:
584 ; X32-SSE41: # BB#0: # %entry
585 ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
586 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
587 ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
588 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
589 ; X32-SSE41-NEXT: retl
591 %B = sext <4 x i32> %A to <4 x i64>
595 define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) {
596 ; SSE2-LABEL: load_sext_2i8_to_2i64:
597 ; SSE2: # BB#0: # %entry
598 ; SSE2-NEXT: movzwl (%rdi), %eax
599 ; SSE2-NEXT: movd %eax, %xmm0
600 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
601 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
602 ; SSE2-NEXT: movdqa %xmm0, %xmm1
603 ; SSE2-NEXT: psrad $31, %xmm1
604 ; SSE2-NEXT: psrad $24, %xmm0
605 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
608 ; SSSE3-LABEL: load_sext_2i8_to_2i64:
609 ; SSSE3: # BB#0: # %entry
610 ; SSSE3-NEXT: movzwl (%rdi), %eax
611 ; SSSE3-NEXT: movd %eax, %xmm0
612 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
613 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
614 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
615 ; SSSE3-NEXT: psrad $31, %xmm1
616 ; SSSE3-NEXT: psrad $24, %xmm0
617 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
620 ; SSE41-LABEL: load_sext_2i8_to_2i64:
621 ; SSE41: # BB#0: # %entry
622 ; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
625 ; AVX-LABEL: load_sext_2i8_to_2i64:
626 ; AVX: # BB#0: # %entry
627 ; AVX-NEXT: vpmovsxbq (%rdi), %xmm0
630 ; X32-SSE41-LABEL: load_sext_2i8_to_2i64:
631 ; X32-SSE41: # BB#0: # %entry
632 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
633 ; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
634 ; X32-SSE41-NEXT: retl
636 %X = load <2 x i8>, <2 x i8>* %ptr
637 %Y = sext <2 x i8> %X to <2 x i64>
641 define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) {
642 ; SSE2-LABEL: load_sext_4i8_to_4i32:
643 ; SSE2: # BB#0: # %entry
644 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
645 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
646 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
647 ; SSE2-NEXT: psrad $24, %xmm0
650 ; SSSE3-LABEL: load_sext_4i8_to_4i32:
651 ; SSSE3: # BB#0: # %entry
652 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
653 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
654 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
655 ; SSSE3-NEXT: psrad $24, %xmm0
658 ; SSE41-LABEL: load_sext_4i8_to_4i32:
659 ; SSE41: # BB#0: # %entry
660 ; SSE41-NEXT: pmovsxbd (%rdi), %xmm0
663 ; AVX-LABEL: load_sext_4i8_to_4i32:
664 ; AVX: # BB#0: # %entry
665 ; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
668 ; X32-SSE41-LABEL: load_sext_4i8_to_4i32:
669 ; X32-SSE41: # BB#0: # %entry
670 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
671 ; X32-SSE41-NEXT: pmovsxbd (%eax), %xmm0
672 ; X32-SSE41-NEXT: retl
674 %X = load <4 x i8>, <4 x i8>* %ptr
675 %Y = sext <4 x i8> %X to <4 x i32>
679 define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
680 ; SSE2-LABEL: load_sext_4i8_to_4i64:
681 ; SSE2: # BB#0: # %entry
682 ; SSE2-NEXT: movsbq 1(%rdi), %rax
683 ; SSE2-NEXT: movd %rax, %xmm1
684 ; SSE2-NEXT: movsbq (%rdi), %rax
685 ; SSE2-NEXT: movd %rax, %xmm0
686 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
687 ; SSE2-NEXT: movsbq 3(%rdi), %rax
688 ; SSE2-NEXT: movd %rax, %xmm2
689 ; SSE2-NEXT: movsbq 2(%rdi), %rax
690 ; SSE2-NEXT: movd %rax, %xmm1
691 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
694 ; SSSE3-LABEL: load_sext_4i8_to_4i64:
695 ; SSSE3: # BB#0: # %entry
696 ; SSSE3-NEXT: movsbq 1(%rdi), %rax
697 ; SSSE3-NEXT: movd %rax, %xmm1
698 ; SSSE3-NEXT: movsbq (%rdi), %rax
699 ; SSSE3-NEXT: movd %rax, %xmm0
700 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
701 ; SSSE3-NEXT: movsbq 3(%rdi), %rax
702 ; SSSE3-NEXT: movd %rax, %xmm2
703 ; SSSE3-NEXT: movsbq 2(%rdi), %rax
704 ; SSSE3-NEXT: movd %rax, %xmm1
705 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
708 ; SSE41-LABEL: load_sext_4i8_to_4i64:
709 ; SSE41: # BB#0: # %entry
710 ; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
711 ; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1
714 ; AVX1-LABEL: load_sext_4i8_to_4i64:
715 ; AVX1: # BB#0: # %entry
716 ; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0
717 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
718 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
719 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
720 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
723 ; AVX2-LABEL: load_sext_4i8_to_4i64:
724 ; AVX2: # BB#0: # %entry
725 ; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0
728 ; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
729 ; X32-SSE41: # BB#0: # %entry
730 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
731 ; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
732 ; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1
733 ; X32-SSE41-NEXT: retl
735 %X = load <4 x i8>, <4 x i8>* %ptr
736 %Y = sext <4 x i8> %X to <4 x i64>
740 define <8 x i16> @load_sext_8i8_to_8i16(<8 x i8> *%ptr) {
741 ; SSE2-LABEL: load_sext_8i8_to_8i16:
742 ; SSE2: # BB#0: # %entry
743 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
744 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
745 ; SSE2-NEXT: psraw $8, %xmm0
748 ; SSSE3-LABEL: load_sext_8i8_to_8i16:
749 ; SSSE3: # BB#0: # %entry
750 ; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
751 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
752 ; SSSE3-NEXT: psraw $8, %xmm0
755 ; SSE41-LABEL: load_sext_8i8_to_8i16:
756 ; SSE41: # BB#0: # %entry
757 ; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
760 ; AVX-LABEL: load_sext_8i8_to_8i16:
761 ; AVX: # BB#0: # %entry
762 ; AVX-NEXT: vpmovsxbw (%rdi), %xmm0
765 ; X32-SSE41-LABEL: load_sext_8i8_to_8i16:
766 ; X32-SSE41: # BB#0: # %entry
767 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
768 ; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0
769 ; X32-SSE41-NEXT: retl
771 %X = load <8 x i8>, <8 x i8>* %ptr
772 %Y = sext <8 x i8> %X to <8 x i16>
776 define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) {
777 ; SSE2-LABEL: load_sext_8i8_to_8i32:
778 ; SSE2: # BB#0: # %entry
779 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
780 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
781 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
782 ; SSE2-NEXT: psrad $24, %xmm0
783 ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
784 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
785 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
786 ; SSE2-NEXT: psrad $24, %xmm1
789 ; SSSE3-LABEL: load_sext_8i8_to_8i32:
790 ; SSSE3: # BB#0: # %entry
791 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
792 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
793 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
794 ; SSSE3-NEXT: psrad $24, %xmm0
795 ; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
796 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
797 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
798 ; SSSE3-NEXT: psrad $24, %xmm1
801 ; SSE41-LABEL: load_sext_8i8_to_8i32:
802 ; SSE41: # BB#0: # %entry
803 ; SSE41-NEXT: pmovsxbd (%rdi), %xmm0
804 ; SSE41-NEXT: pmovsxbd 4(%rdi), %xmm1
807 ; AVX1-LABEL: load_sext_8i8_to_8i32:
808 ; AVX1: # BB#0: # %entry
809 ; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
810 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
811 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
812 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
813 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
816 ; AVX2-LABEL: load_sext_8i8_to_8i32:
817 ; AVX2: # BB#0: # %entry
818 ; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0
821 ; X32-SSE41-LABEL: load_sext_8i8_to_8i32:
822 ; X32-SSE41: # BB#0: # %entry
823 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
824 ; X32-SSE41-NEXT: pmovsxbd (%eax), %xmm0
825 ; X32-SSE41-NEXT: pmovsxbd 4(%eax), %xmm1
826 ; X32-SSE41-NEXT: retl
828 %X = load <8 x i8>, <8 x i8>* %ptr
829 %Y = sext <8 x i8> %X to <8 x i32>
833 define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) {
834 ; SSE2-LABEL: load_sext_16i8_to_16i16:
835 ; SSE2: # BB#0: # %entry
836 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
837 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
838 ; SSE2-NEXT: psraw $8, %xmm0
839 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
840 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
841 ; SSE2-NEXT: psraw $8, %xmm1
844 ; SSSE3-LABEL: load_sext_16i8_to_16i16:
845 ; SSSE3: # BB#0: # %entry
846 ; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
847 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
848 ; SSSE3-NEXT: psraw $8, %xmm0
849 ; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
850 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
851 ; SSSE3-NEXT: psraw $8, %xmm1
854 ; SSE41-LABEL: load_sext_16i8_to_16i16:
855 ; SSE41: # BB#0: # %entry
856 ; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
857 ; SSE41-NEXT: pmovsxbw 8(%rdi), %xmm1
860 ; AVX1-LABEL: load_sext_16i8_to_16i16:
861 ; AVX1: # BB#0: # %entry
862 ; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
863 ; AVX1-NEXT: vpmovsxbw 8(%rdi), %xmm1
864 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
867 ; AVX2-LABEL: load_sext_16i8_to_16i16:
868 ; AVX2: # BB#0: # %entry
869 ; AVX2-NEXT: vpmovsxbw (%rdi), %ymm0
872 ; X32-SSE41-LABEL: load_sext_16i8_to_16i16:
873 ; X32-SSE41: # BB#0: # %entry
874 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
875 ; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0
876 ; X32-SSE41-NEXT: pmovsxbw 8(%eax), %xmm1
877 ; X32-SSE41-NEXT: retl
879 %X = load <16 x i8>, <16 x i8>* %ptr
880 %Y = sext <16 x i8> %X to <16 x i16>
; Sign-extending load of <2 x i16> to <2 x i64>.
; SSE4.1+ / AVX collapse this to a single pmovsxwq from memory; pre-SSE4.1
; targets widen via punpcklwd and emulate the 64-bit sign bits with a
; psrad $31 copy interleaved by punpckldq.
884 define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) {
885 ; SSE2-LABEL: load_sext_2i16_to_2i64:
886 ; SSE2: # BB#0: # %entry
887 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
888 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
889 ; SSE2-NEXT: movdqa %xmm0, %xmm1
890 ; SSE2-NEXT: psrad $31, %xmm1
891 ; SSE2-NEXT: psrad $16, %xmm0
892 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
895 ; SSSE3-LABEL: load_sext_2i16_to_2i64:
896 ; SSSE3: # BB#0: # %entry
897 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
898 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
899 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
900 ; SSSE3-NEXT: psrad $31, %xmm1
901 ; SSSE3-NEXT: psrad $16, %xmm0
902 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
905 ; SSE41-LABEL: load_sext_2i16_to_2i64:
906 ; SSE41: # BB#0: # %entry
907 ; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
910 ; AVX-LABEL: load_sext_2i16_to_2i64:
911 ; AVX: # BB#0: # %entry
912 ; AVX-NEXT: vpmovsxwq (%rdi), %xmm0
915 ; X32-SSE41-LABEL: load_sext_2i16_to_2i64:
916 ; X32-SSE41: # BB#0: # %entry
917 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
918 ; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0
919 ; X32-SSE41-NEXT: retl
921 %X = load <2 x i16>, <2 x i16>* %ptr
922 %Y = sext <2 x i16> %X to <2 x i64>
; Sign-extending load of <4 x i16> to <4 x i32>.
; SSE4.1+ / AVX use a single pmovsxwd from memory; pre-SSE4.1 targets load
; the 64 bits with movq, widen with punpcklwd, and shift the replicated
; halves back down with psrad $16 to restore the sign.
926 define <4 x i32> @load_sext_4i16_to_4i32(<4 x i16> *%ptr) {
927 ; SSE2-LABEL: load_sext_4i16_to_4i32:
928 ; SSE2: # BB#0: # %entry
929 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
930 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
931 ; SSE2-NEXT: psrad $16, %xmm0
934 ; SSSE3-LABEL: load_sext_4i16_to_4i32:
935 ; SSSE3: # BB#0: # %entry
936 ; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
937 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
938 ; SSSE3-NEXT: psrad $16, %xmm0
941 ; SSE41-LABEL: load_sext_4i16_to_4i32:
942 ; SSE41: # BB#0: # %entry
943 ; SSE41-NEXT: pmovsxwd (%rdi), %xmm0
946 ; AVX-LABEL: load_sext_4i16_to_4i32:
947 ; AVX: # BB#0: # %entry
948 ; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
951 ; X32-SSE41-LABEL: load_sext_4i16_to_4i32:
952 ; X32-SSE41: # BB#0: # %entry
953 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
954 ; X32-SSE41-NEXT: pmovsxwd (%eax), %xmm0
955 ; X32-SSE41-NEXT: retl
957 %X = load <4 x i16>, <4 x i16>* %ptr
958 %Y = sext <4 x i16> %X to <4 x i32>
; Sign-extending load of <4 x i16> to <4 x i64> (two 128-bit result regs on
; SSE, one 256-bit reg on AVX). Pre-SSE4.1 targets fall back to four scalar
; movswq loads rebuilt with punpcklqdq; SSE4.1 uses two pmovsxwq loads;
; AVX1 widens in two steps (wd then dq) and joins halves with vinsertf128;
; AVX2 is a single vpmovsxwq to a ymm register.
962 define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
963 ; SSE2-LABEL: load_sext_4i16_to_4i64:
964 ; SSE2: # BB#0: # %entry
965 ; SSE2-NEXT: movswq 2(%rdi), %rax
966 ; SSE2-NEXT: movd %rax, %xmm1
967 ; SSE2-NEXT: movswq (%rdi), %rax
968 ; SSE2-NEXT: movd %rax, %xmm0
969 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
970 ; SSE2-NEXT: movswq 6(%rdi), %rax
971 ; SSE2-NEXT: movd %rax, %xmm2
972 ; SSE2-NEXT: movswq 4(%rdi), %rax
973 ; SSE2-NEXT: movd %rax, %xmm1
974 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
977 ; SSSE3-LABEL: load_sext_4i16_to_4i64:
978 ; SSSE3: # BB#0: # %entry
979 ; SSSE3-NEXT: movswq 2(%rdi), %rax
980 ; SSSE3-NEXT: movd %rax, %xmm1
981 ; SSSE3-NEXT: movswq (%rdi), %rax
982 ; SSSE3-NEXT: movd %rax, %xmm0
983 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
984 ; SSSE3-NEXT: movswq 6(%rdi), %rax
985 ; SSSE3-NEXT: movd %rax, %xmm2
986 ; SSSE3-NEXT: movswq 4(%rdi), %rax
987 ; SSSE3-NEXT: movd %rax, %xmm1
988 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
991 ; SSE41-LABEL: load_sext_4i16_to_4i64:
992 ; SSE41: # BB#0: # %entry
993 ; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
994 ; SSE41-NEXT: pmovsxwq 4(%rdi), %xmm1
997 ; AVX1-LABEL: load_sext_4i16_to_4i64:
998 ; AVX1: # BB#0: # %entry
999 ; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0
1000 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
1001 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1002 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
1003 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1006 ; AVX2-LABEL: load_sext_4i16_to_4i64:
1007 ; AVX2: # BB#0: # %entry
1008 ; AVX2-NEXT: vpmovsxwq (%rdi), %ymm0
1011 ; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
1012 ; X32-SSE41: # BB#0: # %entry
1013 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1014 ; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0
1015 ; X32-SSE41-NEXT: pmovsxwq 4(%eax), %xmm1
1016 ; X32-SSE41-NEXT: retl
1018 %X = load <4 x i16>, <4 x i16>* %ptr
1019 %Y = sext <4 x i16> %X to <4 x i64>
; Sign-extending load of <8 x i16> to <8 x i32>.
; SSE targets produce two 128-bit halves (pre-SSE4.1 via punpcklwd + psrad
; $16, SSE4.1 via two pmovsxwd loads); AVX1 joins two xmm extends with
; vinsertf128; AVX2 does it in one vpmovsxwd to a ymm register.
1023 define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) {
1024 ; SSE2-LABEL: load_sext_8i16_to_8i32:
1025 ; SSE2: # BB#0: # %entry
1026 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1027 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1028 ; SSE2-NEXT: psrad $16, %xmm0
1029 ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
1030 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1031 ; SSE2-NEXT: psrad $16, %xmm1
1034 ; SSSE3-LABEL: load_sext_8i16_to_8i32:
1035 ; SSSE3: # BB#0: # %entry
1036 ; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1037 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1038 ; SSSE3-NEXT: psrad $16, %xmm0
1039 ; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
1040 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1041 ; SSSE3-NEXT: psrad $16, %xmm1
1044 ; SSE41-LABEL: load_sext_8i16_to_8i32:
1045 ; SSE41: # BB#0: # %entry
1046 ; SSE41-NEXT: pmovsxwd (%rdi), %xmm0
1047 ; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1
1050 ; AVX1-LABEL: load_sext_8i16_to_8i32:
1051 ; AVX1: # BB#0: # %entry
1052 ; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0
1053 ; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1
1054 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1057 ; AVX2-LABEL: load_sext_8i16_to_8i32:
1058 ; AVX2: # BB#0: # %entry
1059 ; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0
1062 ; X32-SSE41-LABEL: load_sext_8i16_to_8i32:
1063 ; X32-SSE41: # BB#0: # %entry
1064 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1065 ; X32-SSE41-NEXT: pmovsxwd (%eax), %xmm0
1066 ; X32-SSE41-NEXT: pmovsxwd 8(%eax), %xmm1
1067 ; X32-SSE41-NEXT: retl
1069 %X = load <8 x i16>, <8 x i16>* %ptr
1070 %Y = sext <8 x i16> %X to <8 x i32>
; Sign-extending load of <2 x i32> to <2 x i64>.
; SSE4.1+ / AVX use a single pmovsxdq from memory; pre-SSE4.1 targets build
; the high 32 bits of each lane from a psrad $31 copy and interleave it
; with punpckldq.
1074 define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) {
1075 ; SSE2-LABEL: load_sext_2i32_to_2i64:
1076 ; SSE2: # BB#0: # %entry
1077 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1078 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1079 ; SSE2-NEXT: psrad $31, %xmm1
1080 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1083 ; SSSE3-LABEL: load_sext_2i32_to_2i64:
1084 ; SSSE3: # BB#0: # %entry
1085 ; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1086 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
1087 ; SSSE3-NEXT: psrad $31, %xmm1
1088 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1091 ; SSE41-LABEL: load_sext_2i32_to_2i64:
1092 ; SSE41: # BB#0: # %entry
1093 ; SSE41-NEXT: pmovsxdq (%rdi), %xmm0
1096 ; AVX-LABEL: load_sext_2i32_to_2i64:
1097 ; AVX: # BB#0: # %entry
1098 ; AVX-NEXT: vpmovsxdq (%rdi), %xmm0
1101 ; X32-SSE41-LABEL: load_sext_2i32_to_2i64:
1102 ; X32-SSE41: # BB#0: # %entry
1103 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1104 ; X32-SSE41-NEXT: pmovsxdq (%eax), %xmm0
1105 ; X32-SSE41-NEXT: retl
1107 %X = load <2 x i32>, <2 x i32>* %ptr
1108 %Y = sext <2 x i32> %X to <2 x i64>
; Sign-extending load of <4 x i32> to <4 x i64>.
; Pre-SSE4.1 targets load the full vector once and expand both halves with
; psrad $31 + punpckldq; SSE4.1 uses two pmovsxdq loads; AVX1 joins two xmm
; extends with vinsertf128; AVX2 is a single vpmovsxdq to a ymm register.
1112 define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
1113 ; SSE2-LABEL: load_sext_4i32_to_4i64:
1114 ; SSE2: # BB#0: # %entry
1115 ; SSE2-NEXT: movdqa (%rdi), %xmm0
1116 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1117 ; SSE2-NEXT: psrad $31, %xmm2
1118 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1119 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1120 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1121 ; SSE2-NEXT: psrad $31, %xmm2
1122 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1125 ; SSSE3-LABEL: load_sext_4i32_to_4i64:
1126 ; SSSE3: # BB#0: # %entry
1127 ; SSSE3-NEXT: movdqa (%rdi), %xmm0
1128 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
1129 ; SSSE3-NEXT: psrad $31, %xmm2
1130 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1131 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1132 ; SSSE3-NEXT: movdqa %xmm1, %xmm2
1133 ; SSSE3-NEXT: psrad $31, %xmm2
1134 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1137 ; SSE41-LABEL: load_sext_4i32_to_4i64:
1138 ; SSE41: # BB#0: # %entry
1139 ; SSE41-NEXT: pmovsxdq (%rdi), %xmm0
1140 ; SSE41-NEXT: pmovsxdq 8(%rdi), %xmm1
1143 ; AVX1-LABEL: load_sext_4i32_to_4i64:
1144 ; AVX1: # BB#0: # %entry
1145 ; AVX1-NEXT: vpmovsxdq (%rdi), %xmm0
1146 ; AVX1-NEXT: vpmovsxdq 8(%rdi), %xmm1
1147 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1150 ; AVX2-LABEL: load_sext_4i32_to_4i64:
1151 ; AVX2: # BB#0: # %entry
1152 ; AVX2-NEXT: vpmovsxdq (%rdi), %ymm0
1155 ; X32-SSE41-LABEL: load_sext_4i32_to_4i64:
1156 ; X32-SSE41: # BB#0: # %entry
1157 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
1158 ; X32-SSE41-NEXT: pmovsxdq (%eax), %xmm0
1159 ; X32-SSE41-NEXT: pmovsxdq 8(%eax), %xmm1
1160 ; X32-SSE41-NEXT: retl
1162 %X = load <4 x i32>, <4 x i32>* %ptr
1163 %Y = sext <4 x i32> %X to <4 x i64>
; Sign-extend the low two i8 lanes to <2 x i16> and bitcast the result to
; i32: the whole computation stays in vector registers (punpcklbw + psraw
; on pre-SSE4.1, pmovsxbw on SSE4.1+) followed by a movd to eax.
; NOTE(review): the X32 variant's pushl %eax / popl %edx pair looks like a
; 4-byte stack adjustment around the call frame — confirm against the
; i686 calling-convention lowering if this ever changes.
1167 define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp {
1168 ; SSE2-LABEL: sext_2i8_to_i32:
1169 ; SSE2: # BB#0: # %entry
1170 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1171 ; SSE2-NEXT: psraw $8, %xmm0
1172 ; SSE2-NEXT: movd %xmm0, %eax
1175 ; SSSE3-LABEL: sext_2i8_to_i32:
1176 ; SSSE3: # BB#0: # %entry
1177 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1178 ; SSSE3-NEXT: psraw $8, %xmm0
1179 ; SSSE3-NEXT: movd %xmm0, %eax
1182 ; SSE41-LABEL: sext_2i8_to_i32:
1183 ; SSE41: # BB#0: # %entry
1184 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
1185 ; SSE41-NEXT: movd %xmm0, %eax
1188 ; AVX-LABEL: sext_2i8_to_i32:
1189 ; AVX: # BB#0: # %entry
1190 ; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
1191 ; AVX-NEXT: vmovd %xmm0, %eax
1194 ; X32-SSE41-LABEL: sext_2i8_to_i32:
1195 ; X32-SSE41: # BB#0: # %entry
1196 ; X32-SSE41-NEXT: pushl %eax
1197 ; X32-SSE41-NEXT: .Ltmp0:
1198 ; X32-SSE41-NEXT: .cfi_def_cfa_offset 8
1199 ; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0
1200 ; X32-SSE41-NEXT: movd %xmm0, %eax
1201 ; X32-SSE41-NEXT: popl %edx
1202 ; X32-SSE41-NEXT: retl
1204 %Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
1205 %Ex = sext <2 x i8> %Shuf to <2 x i16>
1206 %Bc = bitcast <2 x i16> %Ex to i32
; Sign-extend a <4 x i1> mask to <4 x i64>. All targets first materialize
; the mask as full 32-bit lanes with pslld $31 + psrad $31 (replicating
; bit 0 across each lane), then widen 32 -> 64: pre-SSE4.1 with psrad $31
; + punpckldq, SSE4.1/AVX1 with pmovsxdq per half, AVX2 with a single
; vpmovsxdq to a ymm register.
1210 define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
1211 ; SSE2-LABEL: sext_4i1_to_4i64:
1213 ; SSE2-NEXT: pslld $31, %xmm0
1214 ; SSE2-NEXT: psrad $31, %xmm0
1215 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1216 ; SSE2-NEXT: psrad $31, %xmm2
1217 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1218 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1219 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1220 ; SSE2-NEXT: psrad $31, %xmm2
1221 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1224 ; SSSE3-LABEL: sext_4i1_to_4i64:
1226 ; SSSE3-NEXT: pslld $31, %xmm0
1227 ; SSSE3-NEXT: psrad $31, %xmm0
1228 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
1229 ; SSSE3-NEXT: psrad $31, %xmm2
1230 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1231 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1232 ; SSSE3-NEXT: movdqa %xmm1, %xmm2
1233 ; SSSE3-NEXT: psrad $31, %xmm2
1234 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1237 ; SSE41-LABEL: sext_4i1_to_4i64:
1239 ; SSE41-NEXT: pslld $31, %xmm0
1240 ; SSE41-NEXT: psrad $31, %xmm0
1241 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
1242 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1243 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
1244 ; SSE41-NEXT: movdqa %xmm2, %xmm0
1247 ; AVX1-LABEL: sext_4i1_to_4i64:
1249 ; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
1250 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
1251 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
1252 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1253 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
1254 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1257 ; AVX2-LABEL: sext_4i1_to_4i64:
1259 ; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
1260 ; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
1261 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
1264 ; X32-SSE41-LABEL: sext_4i1_to_4i64:
1265 ; X32-SSE41: # BB#0:
1266 ; X32-SSE41-NEXT: pslld $31, %xmm0
1267 ; X32-SSE41-NEXT: psrad $31, %xmm0
1268 ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
1269 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1270 ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
1271 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
1272 ; X32-SSE41-NEXT: retl
1273 %extmask = sext <4 x i1> %mask to <4 x i64>
1274 ret <4 x i64> %extmask
1277 define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
1278 ; SSE2-LABEL: sext_4i8_to_4i64:
1280 ; SSE2-NEXT: pslld $24, %xmm0
1281 ; SSE2-NEXT: psrad $24, %xmm0
1282 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1283 ; SSE2-NEXT: psrad $31, %xmm2
1284 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1285 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1286 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1287 ; SSE2-NEXT: psrad $31, %xmm2
1288 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1291 ; SSSE3-LABEL: sext_4i8_to_4i64:
1293 ; SSSE3-NEXT: pslld $24, %xmm0
1294 ; SSSE3-NEXT: psrad $24, %xmm0
1295 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
1296 ; SSSE3-NEXT: psrad $31, %xmm2
1297 ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1298 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1299 ; SSSE3-NEXT: movdqa %xmm1, %xmm2
1300 ; SSSE3-NEXT: psrad $31, %xmm2
1301 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1304 ; SSE41-LABEL: sext_4i8_to_4i64:
1306 ; SSE41-NEXT: pslld $24, %xmm0
1307 ; SSE41-NEXT: psrad $24, %xmm0
1308 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
1309 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1310 ; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
1311 ; SSE41-NEXT: movdqa %xmm2, %xmm0
1314 ; AVX1-LABEL: sext_4i8_to_4i64:
1316 ; AVX1-NEXT: vpslld $24, %xmm0, %xmm0
1317 ; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
1318 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
1319 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1320 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
1321 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1324 ; AVX2-LABEL: sext_4i8_to_4i64:
1326 ; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
1327 ; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
1328 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
1331 ; X32-SSE41-LABEL: sext_4i8_to_4i64:
1332 ; X32-SSE41: # BB#0:
1333 ; X32-SSE41-NEXT: pslld $24, %xmm0
1334 ; X32-SSE41-NEXT: psrad $24, %xmm0
1335 ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
1336 ; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1337 ; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
1338 ; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
1339 ; X32-SSE41-NEXT: retl
1340 %extmask = sext <4 x i8> %mask to <4 x i64>
1341 ret <4 x i64> %extmask