1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
8 define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
9 ; AVX1-LABEL: var_shift_v4i64:
11 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
12 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
13 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
14 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
15 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
16 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
17 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3
18 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
19 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
20 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
21 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
24 ; AVX2-LABEL: var_shift_v4i64:
26 ; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
28 %shift = lshr <4 x i64> %a, %b
32 define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
33 ; AVX1-LABEL: var_shift_v8i32:
35 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
36 ; AVX1-NEXT: vpextrd $1, %xmm2, %eax
37 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
38 ; AVX1-NEXT: vpextrd $1, %xmm3, %ecx
39 ; AVX1-NEXT: shrl %cl, %eax
40 ; AVX1-NEXT: vmovd %xmm2, %edx
41 ; AVX1-NEXT: vmovd %xmm3, %ecx
42 ; AVX1-NEXT: shrl %cl, %edx
43 ; AVX1-NEXT: vmovd %edx, %xmm4
44 ; AVX1-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
45 ; AVX1-NEXT: vpextrd $2, %xmm2, %eax
46 ; AVX1-NEXT: vpextrd $2, %xmm3, %ecx
47 ; AVX1-NEXT: shrl %cl, %eax
48 ; AVX1-NEXT: vpinsrd $2, %eax, %xmm4, %xmm4
49 ; AVX1-NEXT: vpextrd $3, %xmm2, %eax
50 ; AVX1-NEXT: vpextrd $3, %xmm3, %ecx
51 ; AVX1-NEXT: shrl %cl, %eax
52 ; AVX1-NEXT: vpinsrd $3, %eax, %xmm4, %xmm2
53 ; AVX1-NEXT: vpextrd $1, %xmm0, %eax
54 ; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
55 ; AVX1-NEXT: shrl %cl, %eax
56 ; AVX1-NEXT: vmovd %xmm0, %edx
57 ; AVX1-NEXT: vmovd %xmm1, %ecx
58 ; AVX1-NEXT: shrl %cl, %edx
59 ; AVX1-NEXT: vmovd %edx, %xmm3
60 ; AVX1-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
61 ; AVX1-NEXT: vpextrd $2, %xmm0, %eax
62 ; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
63 ; AVX1-NEXT: shrl %cl, %eax
64 ; AVX1-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
65 ; AVX1-NEXT: vpextrd $3, %xmm0, %eax
66 ; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
67 ; AVX1-NEXT: shrl %cl, %eax
68 ; AVX1-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
69 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
72 ; AVX2-LABEL: var_shift_v8i32:
74 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
76 %shift = lshr <8 x i32> %a, %b
80 define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
81 ; AVX1-LABEL: var_shift_v16i16:
83 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
84 ; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
85 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
86 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
87 ; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
88 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
89 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm5
90 ; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
91 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm4
92 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
93 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm4
94 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
95 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
96 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm4
97 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
98 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
99 ; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
100 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
101 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
102 ; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
103 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4
104 ; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
105 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
106 ; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
107 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
108 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
109 ; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
110 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
111 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
112 ; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
113 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
116 ; AVX2-LABEL: var_shift_v16i16:
118 ; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
119 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
120 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
121 ; AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
122 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
123 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
124 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
125 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
126 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
127 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
129 %shift = lshr <16 x i16> %a, %b
130 ret <16 x i16> %shift
133 define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
134 ; AVX1-LABEL: var_shift_v32i8:
136 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
137 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
138 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
139 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
140 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
141 ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
142 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
143 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
144 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
145 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
146 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
147 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
148 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
149 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
150 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
151 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
152 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
153 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
154 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
155 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
156 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
157 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3
158 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
159 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
160 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
161 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3
162 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
163 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
164 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
165 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
168 ; AVX2-LABEL: var_shift_v32i8:
170 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
171 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
172 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
173 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
174 ; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
175 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
176 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
177 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
178 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
179 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
180 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
181 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
183 %shift = lshr <32 x i8> %a, %b
188 ; Uniform Variable Shifts
191 define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
192 ; AVX1-LABEL: splatvar_shift_v4i64:
194 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
195 ; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
196 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
197 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
200 ; AVX2-LABEL: splatvar_shift_v4i64:
202 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
204 %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
205 %shift = lshr <4 x i64> %a, %splat
209 define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
210 ; AVX1-LABEL: splatvar_shift_v8i32:
212 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
213 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
214 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
215 ; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
216 ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
217 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
220 ; AVX2-LABEL: splatvar_shift_v8i32:
222 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
223 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
224 ; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
226 %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
227 %shift = lshr <8 x i32> %a, %splat
231 define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
232 ; AVX1-LABEL: splatvar_shift_v16i16:
234 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
235 ; AVX1-NEXT: vmovd %xmm1, %eax
236 ; AVX1-NEXT: movzwl %ax, %eax
237 ; AVX1-NEXT: vmovd %eax, %xmm1
238 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
239 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
240 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
243 ; AVX2-LABEL: splatvar_shift_v16i16:
245 ; AVX2-NEXT: vmovd %xmm1, %eax
246 ; AVX2-NEXT: movzwl %ax, %eax
247 ; AVX2-NEXT: vmovd %eax, %xmm1
248 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
250 %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
251 %shift = lshr <16 x i16> %a, %splat
252 ret <16 x i16> %shift
255 define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
256 ; AVX1-LABEL: splatvar_shift_v32i8:
258 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
259 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
260 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
261 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
262 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
263 ; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
264 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
265 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm2
266 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
267 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
268 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
269 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm6
270 ; AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm2, %xmm2
271 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
272 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
273 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
274 ; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm4
275 ; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
276 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
277 ; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
278 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
279 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
280 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
281 ; AVX1-NEXT: vpblendvb %xmm6, %xmm1, %xmm0, %xmm0
282 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
283 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
284 ; AVX1-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
285 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
288 ; AVX2-LABEL: splatvar_shift_v32i8:
290 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
291 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
292 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
293 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
294 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
295 ; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
296 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
297 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
298 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
299 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
300 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
301 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
302 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
304 %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
305 %shift = lshr <32 x i8> %a, %splat
313 define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) {
314 ; AVX1-LABEL: constant_shift_v4i64:
316 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
317 ; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
318 ; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
319 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
320 ; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2
321 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
322 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
323 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
326 ; AVX2-LABEL: constant_shift_v4i64:
328 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
330 %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
334 define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
335 ; AVX1-LABEL: constant_shift_v8i32:
337 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
338 ; AVX1-NEXT: vpextrd $1, %xmm1, %eax
339 ; AVX1-NEXT: shrl $9, %eax
340 ; AVX1-NEXT: vmovd %xmm1, %ecx
341 ; AVX1-NEXT: shrl $8, %ecx
342 ; AVX1-NEXT: vmovd %ecx, %xmm2
343 ; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
344 ; AVX1-NEXT: vpextrd $2, %xmm1, %eax
345 ; AVX1-NEXT: shrl $8, %eax
346 ; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
347 ; AVX1-NEXT: vpextrd $3, %xmm1, %eax
348 ; AVX1-NEXT: shrl $7, %eax
349 ; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
350 ; AVX1-NEXT: vpextrd $1, %xmm0, %eax
351 ; AVX1-NEXT: shrl $5, %eax
352 ; AVX1-NEXT: vmovd %xmm0, %ecx
353 ; AVX1-NEXT: shrl $4, %ecx
354 ; AVX1-NEXT: vmovd %ecx, %xmm2
355 ; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
356 ; AVX1-NEXT: vpextrd $2, %xmm0, %eax
357 ; AVX1-NEXT: shrl $6, %eax
358 ; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
359 ; AVX1-NEXT: vpextrd $3, %xmm0, %eax
360 ; AVX1-NEXT: shrl $7, %eax
361 ; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0
362 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
365 ; AVX2-LABEL: constant_shift_v8i32:
367 ; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
369 %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
373 define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
374 ; AVX1-LABEL: constant_shift_v16i16:
376 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
377 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm2
378 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,37008,41120,45232,49344,53456,57568,61680]
379 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
380 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
381 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,8480,16704,24928,33152,41376,49600,57824]
382 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
383 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
384 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [512,16960,33408,49856,768,17216,33664,50112]
385 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
386 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
387 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1024,33920,1280,34176,1536,34432,1792,34688]
388 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
389 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
390 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4112,8224,12336,16448,20560,24672,28784]
391 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
392 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
393 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,8224,16448,24672,32896,41120,49344,57568]
394 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
395 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
396 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,16448,32896,49344,256,16704,33152,49600]
397 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
398 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
399 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,32896,256,33152,512,33408,768,33664]
400 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
401 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
404 ; AVX2-LABEL: constant_shift_v16i16:
406 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
407 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
408 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
409 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
410 ; AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
411 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
412 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
413 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
414 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
415 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
416 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
418 %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
419 ret <16 x i16> %shift
422 define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
423 ; AVX1-LABEL: constant_shift_v32i8:
425 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
426 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
427 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
428 ; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2
429 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
430 ; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
431 ; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1
432 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
433 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
434 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
435 ; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm6
436 ; AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm1, %xmm1
437 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
438 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
439 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
440 ; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm3
441 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
442 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
443 ; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2
444 ; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
445 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
446 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
447 ; AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm0, %xmm0
448 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
449 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
450 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
451 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
454 ; AVX2-LABEL: constant_shift_v32i8:
456 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
457 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
458 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
459 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
460 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
461 ; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
462 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
463 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
464 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
465 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
466 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
467 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
468 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
470 %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
475 ; Uniform Constant Shifts
478 define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) {
479 ; AVX1-LABEL: splatconstant_shift_v4i64:
481 ; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
482 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
483 ; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
484 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
487 ; AVX2-LABEL: splatconstant_shift_v4i64:
489 ; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
491 %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
495 define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) {
496 ; AVX1-LABEL: splatconstant_shift_v8i32:
498 ; AVX1-NEXT: vpsrld $5, %xmm0, %xmm1
499 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
500 ; AVX1-NEXT: vpsrld $5, %xmm0, %xmm0
501 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
504 ; AVX2-LABEL: splatconstant_shift_v8i32:
506 ; AVX2-NEXT: vpsrld $5, %ymm0, %ymm0
508 %shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
512 define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
513 ; AVX1-LABEL: splatconstant_shift_v16i16:
515 ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
516 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
517 ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
518 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
521 ; AVX2-LABEL: splatconstant_shift_v16i16:
523 ; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
525 %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
526 ret <16 x i16> %shift
529 define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
530 ; AVX1-LABEL: splatconstant_shift_v32i8:
532 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
533 ; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
534 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
535 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
536 ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
537 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
538 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
541 ; AVX2-LABEL: splatconstant_shift_v32i8:
543 ; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
544 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
546 %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>