1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; Variable (per-element) lshr of <4 x i64>.
; AVX1 has no per-element 64-bit variable shift, so each 128-bit half is
; shifted twice with vpsrlq -- which reads only the low qword of its count
; operand -- once with the count vector as-is and once with its high qword
; shuffled down (vpshufd), then the two results are blended per qword.
; AVX2 does the whole thing with one vpsrlvq.
; NOTE(review): gaps in the embedded line numbers show some lines of this
; dump (e.g. the trailing ret/}) were elided.
8 define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
9 ; AVX1-LABEL: var_shift_v4i64:
11 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
12 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
13 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
14 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
15 ; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2
16 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
17 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3
18 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
19 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
20 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
21 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
24 ; AVX2-LABEL: var_shift_v4i64:
26 ; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
28 %shift = lshr <4 x i64> %a, %b
; Variable lshr of <8 x i32>. AVX1 has no vpsrlvd, so for each 128-bit
; half all four dword counts are isolated into the low bits of a register
; (vpsrldq / vpsrlq $32 / vpunpckhdq-with-zero / vpmovzxdq), the whole
; vector is shifted by each count with vpsrld, and the four results are
; merged with vpblendw. AVX2 is a single vpsrlvd.
32 define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
33 ; AVX1-LABEL: var_shift_v8i32:
35 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
36 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
37 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
38 ; AVX1-NEXT: vpsrld %xmm4, %xmm2, %xmm4
39 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
40 ; AVX1-NEXT: vpsrld %xmm5, %xmm2, %xmm5
41 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
42 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
43 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
44 ; AVX1-NEXT: vpsrld %xmm6, %xmm2, %xmm6
45 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
46 ; AVX1-NEXT: vpsrld %xmm3, %xmm2, %xmm2
47 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
48 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
49 ; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
50 ; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
51 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
52 ; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
53 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
54 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
55 ; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4
56 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
57 ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
58 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
59 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
60 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
63 ; AVX2-LABEL: var_shift_v8i32:
65 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
67 %shift = lshr <8 x i32> %a, %b
; Variable lshr of <16 x i16>. There is no variable word shift here, so
; AVX1 moves the 4-bit shift amounts up toward the sign bits (vpsllw $12
; and $4, or'd together) and runs a vpblendvb ladder that conditionally
; shifts by 8, 4, 2, then 1, doubling the select mask (vpaddw x,x) at
; each step. AVX2 zero-extends words to dwords (unpck lo/hi with a zero
; vector), applies vpsrlvd, shifts results back down by 16 and repacks
; with vpackusdw.
71 define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
72 ; AVX1-LABEL: var_shift_v16i16:
74 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
75 ; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
76 ; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
77 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
78 ; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
79 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
80 ; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm5
81 ; AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
82 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm4
83 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
84 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm4
85 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
86 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
87 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm4
88 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
89 ; AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
90 ; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
91 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
92 ; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
93 ; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3
94 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4
95 ; AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
96 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
97 ; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
98 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
99 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
100 ; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
101 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
102 ; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
103 ; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
104 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
107 ; AVX2-LABEL: var_shift_v16i16:
109 ; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
110 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
111 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
112 ; AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
113 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
114 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
115 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
116 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
117 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
118 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
120 %shift = lshr <16 x i16> %a, %b
121 ret <16 x i16> %shift
; Variable lshr of <32 x i8>. x86 has no byte shift, so each step does a
; word shift (vpsrlw $4/$2/$1) followed by a mask (0x0F / 0x3F / 0x7F)
; that clears bits pulled in from the neighboring byte; vpsllw $5 moves
; each 3-bit count into the byte sign-bit position that vpblendvb keys
; on, and the count doubles (vpaddb) between ladder steps. AVX1 runs the
; ladder per 128-bit half; AVX2 runs it once on ymm registers with the
; masks loaded from the constant pool.
124 define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
125 ; AVX1-LABEL: var_shift_v32i8:
127 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
128 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
129 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
130 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
131 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
132 ; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5
133 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
134 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
135 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
136 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
137 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
138 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
139 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
140 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
141 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
142 ; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
143 ; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
144 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
145 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
146 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
147 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
148 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3
149 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
150 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
151 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
152 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3
153 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
154 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
155 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
156 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
159 ; AVX2-LABEL: var_shift_v32i8:
161 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
162 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
163 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
164 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
165 ; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
166 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
167 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
168 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
169 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
170 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
171 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
172 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
174 %shift = lshr <32 x i8> %a, %b
179 ; Uniform Variable Shifts
; Uniform-variable lshr of <4 x i64>: the count is element 0 of %b
; splatted to all lanes. Since vpsrlq-by-register applies one count to
; every qword anyway, AVX1 just shifts each half by %xmm1 and AVX2
; shifts the whole ymm -- no splat is ever materialized.
182 define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
183 ; AVX1-LABEL: splatvar_shift_v4i64:
185 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
186 ; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
187 ; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
188 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
191 ; AVX2-LABEL: splatvar_shift_v4i64:
193 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
195 %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
196 %shift = lshr <4 x i64> %a, %splat
; Uniform-variable lshr of <8 x i32>. vpsrld-by-register consumes the
; low 64 bits of the count register, so elements above the first are
; zeroed first (vpblendw against a zeroed vector keeps only element 0);
; then one vpsrld per half (AVX1) or per ymm (AVX2) does the shift.
200 define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
201 ; AVX1-LABEL: splatvar_shift_v8i32:
203 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
204 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
205 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
206 ; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
207 ; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
208 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
211 ; AVX2-LABEL: splatvar_shift_v8i32:
213 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
214 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
215 ; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
217 %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
218 %shift = lshr <8 x i32> %a, %splat
; Uniform-variable lshr of <16 x i16>. The word count is bounced through
; a GPR and zero-extended (vmovd + movzwl + vmovd) so that the garbage
; above bit 15 of the count register cannot leak into vpsrlw, which
; reads its count as a 64-bit value.
222 define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
223 ; AVX1-LABEL: splatvar_shift_v16i16:
225 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
226 ; AVX1-NEXT: vmovd %xmm1, %eax
227 ; AVX1-NEXT: movzwl %ax, %eax
228 ; AVX1-NEXT: vmovd %eax, %xmm1
229 ; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
230 ; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
231 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
234 ; AVX2-LABEL: splatvar_shift_v16i16:
236 ; AVX2-NEXT: vmovd %xmm1, %eax
237 ; AVX2-NEXT: movzwl %ax, %eax
238 ; AVX2-NEXT: vmovd %eax, %xmm1
239 ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
241 %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
242 %shift = lshr <16 x i16> %a, %splat
243 ret <16 x i16> %shift
; Uniform-variable lshr of <32 x i8>. No byte shift exists even for a
; uniform count, so the count byte is splatted (AVX1: vpshufb with a
; zero index vector; AVX2: vpbroadcastb) and then the same word-shift +
; mask + vpblendvb ladder as var_shift_v32i8 runs, per 128-bit half on
; AVX1 and on full ymm registers on AVX2.
246 define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
247 ; AVX1-LABEL: splatvar_shift_v32i8:
249 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
250 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
251 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
252 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
253 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
254 ; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
255 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
256 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm2
257 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
258 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
259 ; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3
260 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm6
261 ; AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm2, %xmm2
262 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
263 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
264 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
265 ; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm4
266 ; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
267 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
268 ; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3
269 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
270 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
271 ; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
272 ; AVX1-NEXT: vpblendvb %xmm6, %xmm1, %xmm0, %xmm0
273 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
274 ; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
275 ; AVX1-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
276 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
279 ; AVX2-LABEL: splatvar_shift_v32i8:
281 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
282 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
283 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
284 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
285 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
286 ; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
287 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
288 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
289 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
290 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
291 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
292 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
293 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
295 %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
296 %shift = lshr <32 x i8> %a, %splat
; Constant per-element lshr of <4 x i64> by <1,7,31,62>. AVX1 uses
; immediate vpsrlq twice per 128-bit half and blends the two qword
; results; AVX2 folds the counts into a constant-pool operand for a
; single vpsrlvq.
304 define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) {
305 ; AVX1-LABEL: constant_shift_v4i64:
307 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
308 ; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
309 ; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
310 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
311 ; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm2
312 ; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
313 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
314 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
317 ; AVX2-LABEL: constant_shift_v4i64:
319 ; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
321 %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
; Constant per-element lshr of <8 x i32> by <4,5,6,7,8,9,8,7>. AVX1
; performs each distinct immediate vpsrld and stitches the per-element
; results together with vpblendw (fewer shifts where counts repeat,
; e.g. only $7/$8/$9 are needed for the high half); AVX2 uses vpsrlvd
; with the counts in a constant-pool vector.
325 define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
326 ; AVX1-LABEL: constant_shift_v8i32:
328 ; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1
329 ; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2
330 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
331 ; AVX1-NEXT: vpsrld $6, %xmm0, %xmm2
332 ; AVX1-NEXT: vpsrld $4, %xmm0, %xmm3
333 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
334 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
335 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
336 ; AVX1-NEXT: vpsrld $7, %xmm0, %xmm2
337 ; AVX1-NEXT: vpsrld $9, %xmm0, %xmm3
338 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
339 ; AVX1-NEXT: vpsrld $8, %xmm0, %xmm0
340 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
341 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
344 ; AVX2-LABEL: constant_shift_v8i32:
346 ; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
348 %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
; Constant per-element lshr of <16 x i16> by 0..15. AVX1 runs the same
; vpblendvb shift-by-8/4/2/1 ladder as var_shift_v16i16, but because the
; counts are constants the select masks are precomputed constant-pool
; vectors instead of being built with vpsllw at run time. AVX2 still
; widens words to dwords, uses vpsrlvd against a constant count vector,
; and repacks with vpackusdw.
352 define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
353 ; AVX1-LABEL: constant_shift_v16i16:
355 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
356 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm2
357 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,37008,41120,45232,49344,53456,57568,61680]
358 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
359 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
360 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,8480,16704,24928,33152,41376,49600,57824]
361 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
362 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
363 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [512,16960,33408,49856,768,17216,33664,50112]
364 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
365 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
366 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1024,33920,1280,34176,1536,34432,1792,34688]
367 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
368 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
369 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4112,8224,12336,16448,20560,24672,28784]
370 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
371 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
372 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,8224,16448,24672,32896,41120,49344,57568]
373 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
374 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
375 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,16448,32896,49344,256,16704,33152,49600]
376 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
377 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
378 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,32896,256,33152,512,33408,768,33664]
379 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
380 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
383 ; AVX2-LABEL: constant_shift_v16i16:
385 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
386 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
387 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
388 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
389 ; AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
390 ; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3
391 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
392 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
393 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
394 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
395 ; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0
397 %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
398 ret <16 x i16> %shift
; Constant per-element lshr of <32 x i8>. The byte counts come from a
; constant vector and are moved into sign-bit position with vpsllw $5 at
; run time; then the standard word-shift (vpsrlw $4/$2/$1) + mask
; (0x0F/0x3F/0x7F) + vpblendvb ladder executes per 128-bit half on AVX1
; and once on ymm registers on AVX2.
401 define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
402 ; AVX1-LABEL: constant_shift_v32i8:
404 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
405 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
406 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
407 ; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2
408 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
409 ; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4
410 ; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1
411 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
412 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
413 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
414 ; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm6
415 ; AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm1, %xmm1
416 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
417 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
418 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
419 ; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm3
420 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
421 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
422 ; AVX1-NEXT: vpand %xmm8, %xmm2, %xmm2
423 ; AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
424 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
425 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
426 ; AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm0, %xmm0
427 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
428 ; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
429 ; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
430 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
433 ; AVX2-LABEL: constant_shift_v32i8:
435 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
436 ; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
437 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
438 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
439 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
440 ; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2
441 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
442 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
443 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
444 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2
445 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
446 ; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
447 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
449 %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
454 ; Uniform Constant Shifts
; Uniform constant lshr of <4 x i64> by 7: the simplest case -- plain
; immediate vpsrlq, applied to each 128-bit half on AVX1 and to the
; whole ymm register on AVX2.
457 define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) {
458 ; AVX1-LABEL: splatconstant_shift_v4i64:
460 ; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
461 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
462 ; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
463 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
466 ; AVX2-LABEL: splatconstant_shift_v4i64:
468 ; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
470 %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
; Uniform constant lshr of <8 x i32> by 5: immediate vpsrld, per half
; on AVX1, single ymm op on AVX2.
474 define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) {
475 ; AVX1-LABEL: splatconstant_shift_v8i32:
477 ; AVX1-NEXT: vpsrld $5, %xmm0, %xmm1
478 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
479 ; AVX1-NEXT: vpsrld $5, %xmm0, %xmm0
480 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
483 ; AVX2-LABEL: splatconstant_shift_v8i32:
485 ; AVX2-NEXT: vpsrld $5, %ymm0, %ymm0
487 %shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
; Uniform constant lshr of <16 x i16> by 3: immediate vpsrlw, per half
; on AVX1, single ymm op on AVX2.
491 define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
492 ; AVX1-LABEL: splatconstant_shift_v16i16:
494 ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
495 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
496 ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
497 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
500 ; AVX2-LABEL: splatconstant_shift_v16i16:
502 ; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
504 %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
505 ret <16 x i16> %shift
508 define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
509 ; AVX1-LABEL: splatconstant_shift_v32i8:
511 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
512 ; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
513 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
514 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
515 ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
516 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
517 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
520 ; AVX2-LABEL: splatconstant_shift_v32i8:
522 ; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
523 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
525 %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>