1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; TODO: Add AVX512BW shift support
3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; Variable per-lane lshr of <8 x i64>: AVX-512F has a native 64-bit variable
; shift, so this should lower to a single vpsrlvq on zmm registers.
9 define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
10 ; ALL-LABEL: var_shift_v8i64:
12 ; ALL-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0
14 %shift = lshr <8 x i64> %a, %b
; Variable per-lane lshr of <16 x i32>: native 32-bit variable shift,
; a single vpsrlvd on zmm registers.
18 define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
19 ; ALL-LABEL: var_shift_v16i32:
21 ; ALL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
23 %shift = lshr <16 x i32> %a, %b
; Variable per-lane lshr of <32 x i16>: without AVX512BW (see TODO at top of
; file) there is no 16-bit variable shift and the v32i16 is split into two ymm
; halves. Each half is widened to 32-bit lanes (vpunpcklwd/vpunpckhwd against
; a zero register for the counts), shifted with vpsrlvd, then narrowed back
; via vpsrld $16 + vpackusdw.
27 define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
28 ; ALL-LABEL: var_shift_v32i16:
30 ; ALL-NEXT: vpxor %ymm4, %ymm4, %ymm4
31 ; ALL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
32 ; ALL-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
33 ; ALL-NEXT: vpsrlvd %ymm5, %ymm6, %ymm5
34 ; ALL-NEXT: vpsrld $16, %ymm5, %ymm5
35 ; ALL-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
36 ; ALL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
37 ; ALL-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
38 ; ALL-NEXT: vpsrld $16, %ymm0, %ymm0
39 ; ALL-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
40 ; ALL-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
41 ; ALL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
42 ; ALL-NEXT: vpsrlvd %ymm2, %ymm5, %ymm2
43 ; ALL-NEXT: vpsrld $16, %ymm2, %ymm2
44 ; ALL-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
45 ; ALL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
46 ; ALL-NEXT: vpsrlvd %ymm3, %ymm1, %ymm1
47 ; ALL-NEXT: vpsrld $16, %ymm1, %ymm1
48 ; ALL-NEXT: vpackusdw %ymm2, %ymm1, %ymm1
50 %shift = lshr <32 x i16> %a, %b
; Variable per-lane lshr of <64 x i8>: x86 has no byte-granular shifts, so
; each ymm half uses the vpblendvb ladder — shift by 4, 2 and 1 with vpsrlw,
; vpand masks (15/63/127) clear the bits shifted across byte boundaries, and
; vpblendvb selects per byte. vpsllw $5 moves the shift-count bits up so each
; blend is keyed off the byte MSBs (vpblendvb selects by sign bit), with
; vpaddb doubling to expose the next count bit each round. The mask constants
; in ymm5/6/7 are loaded once and reused for the second half.
54 define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
55 ; ALL-LABEL: var_shift_v64i8:
57 ; ALL-NEXT: vpsrlw $4, %ymm0, %ymm4
58 ; ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
59 ; ALL-NEXT: vpand %ymm5, %ymm4, %ymm4
60 ; ALL-NEXT: vpsllw $5, %ymm2, %ymm2
61 ; ALL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
62 ; ALL-NEXT: vpsrlw $2, %ymm0, %ymm4
63 ; ALL-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
64 ; ALL-NEXT: vpand %ymm6, %ymm4, %ymm4
65 ; ALL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
66 ; ALL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
67 ; ALL-NEXT: vpsrlw $1, %ymm0, %ymm4
68 ; ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
69 ; ALL-NEXT: vpand %ymm7, %ymm4, %ymm4
70 ; ALL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
71 ; ALL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
72 ; ALL-NEXT: vpsrlw $4, %ymm1, %ymm2
73 ; ALL-NEXT: vpand %ymm5, %ymm2, %ymm2
74 ; ALL-NEXT: vpsllw $5, %ymm3, %ymm3
75 ; ALL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
76 ; ALL-NEXT: vpsrlw $2, %ymm1, %ymm2
77 ; ALL-NEXT: vpand %ymm6, %ymm2, %ymm2
78 ; ALL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
79 ; ALL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
80 ; ALL-NEXT: vpsrlw $1, %ymm1, %ymm2
81 ; ALL-NEXT: vpand %ymm7, %ymm2, %ymm2
82 ; ALL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
83 ; ALL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
85 %shift = lshr <64 x i8> %a, %b
90 ; Uniform Variable Shifts
; Uniform (splatted) variable shift of <8 x i64>: vpsrlq takes a scalar count
; from the low qword of an xmm, so the splat collapses to one instruction
; with no broadcast needed.
93 define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
94 ; ALL-LABEL: splatvar_shift_v8i64:
96 ; ALL-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
98 %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
99 %shift = lshr <8 x i64> %a, %splat
; Uniform variable shift of <16 x i32>: the bottom i32 count is isolated
; (vxorps + vmovss zero the upper elements of the count register) and then
; used as the scalar count for a single vpsrld on the zmm.
103 define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
104 ; ALL-LABEL: splatvar_shift_v16i32:
106 ; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2
107 ; ALL-NEXT: vmovss %xmm1, %xmm2, %xmm1
108 ; ALL-NEXT: vpsrld %xmm1, %zmm0, %zmm0
110 %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
111 %shift = lshr <16 x i32> %a, %splat
112 ret <16 x i32> %shift
; Uniform variable shift of <32 x i16>: the low word of the count is
; zero-extended through a GPR (vmovd / movzwl / vmovd) and used as the scalar
; count for vpsrlw on each of the two ymm halves (no 512-bit word shift
; without AVX512BW).
115 define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
116 ; ALL-LABEL: splatvar_shift_v32i16:
118 ; ALL-NEXT: vmovd %xmm2, %eax
119 ; ALL-NEXT: movzwl %ax, %eax
120 ; ALL-NEXT: vmovd %eax, %xmm2
121 ; ALL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
122 ; ALL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
124 %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
125 %shift = lshr <32 x i16> %a, %splat
126 ret <32 x i16> %shift
; Uniform variable shift of <64 x i8>: the count byte is broadcast
; (vpbroadcastb) and the same vpsrlw / vpand / vpblendvb ladder as the
; general variable case is applied. Because the count vector is uniform, the
; blend selectors (ymm2/ymm6/ymm8) and the bit masks (ymm4/ymm5/ymm7) are
; computed once and reused verbatim for the second ymm half.
129 define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
130 ; ALL-LABEL: splatvar_shift_v64i8:
132 ; ALL-NEXT: vpbroadcastb %xmm2, %ymm2
133 ; ALL-NEXT: vpsrlw $4, %ymm0, %ymm3
134 ; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
135 ; ALL-NEXT: vpand %ymm4, %ymm3, %ymm3
136 ; ALL-NEXT: vpsllw $5, %ymm2, %ymm2
137 ; ALL-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0
138 ; ALL-NEXT: vpsrlw $2, %ymm0, %ymm3
139 ; ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
140 ; ALL-NEXT: vpand %ymm5, %ymm3, %ymm3
141 ; ALL-NEXT: vpaddb %ymm2, %ymm2, %ymm6
142 ; ALL-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0
143 ; ALL-NEXT: vpsrlw $1, %ymm0, %ymm3
144 ; ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
145 ; ALL-NEXT: vpand %ymm7, %ymm3, %ymm3
146 ; ALL-NEXT: vpaddb %ymm6, %ymm6, %ymm8
147 ; ALL-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0
148 ; ALL-NEXT: vpsrlw $4, %ymm1, %ymm3
149 ; ALL-NEXT: vpand %ymm4, %ymm3, %ymm3
150 ; ALL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
151 ; ALL-NEXT: vpsrlw $2, %ymm1, %ymm2
152 ; ALL-NEXT: vpand %ymm5, %ymm2, %ymm2
153 ; ALL-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
154 ; ALL-NEXT: vpsrlw $1, %ymm1, %ymm2
155 ; ALL-NEXT: vpand %ymm7, %ymm2, %ymm2
156 ; ALL-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1
158 %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
159 %shift = lshr <64 x i8> %a, %splat
; Constant per-lane shift amounts: the amount vector folds into a
; constant-pool memory operand of a single vpsrlvq.
167 define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
168 ; ALL-LABEL: constant_shift_v8i64:
170 ; ALL-NEXT: vpsrlvq {{.*}}(%rip), %zmm0, %zmm0
172 %shift = lshr <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62>
; Constant per-lane <16 x i32> shifts: one vpsrlvd with a constant-pool
; memory operand.
176 define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
177 ; ALL-LABEL: constant_shift_v16i32:
179 ; ALL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
181 %shift = lshr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
182 ret <16 x i32> %shift
; Constant per-lane <32 x i16> shifts (amounts 0..15, same for both halves):
; the amounts are materialized once (vmovdqa) and the widen-to-i32 / vpsrlvd
; / vpsrld $16 + vpackusdw sequence from the variable case is applied to each
; ymm half. The widened count vectors (ymm4/ymm2) are reused for the second
; half since the amounts repeat.
185 define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
186 ; ALL-LABEL: constant_shift_v32i16:
188 ; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2
189 ; ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
190 ; ALL-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
191 ; ALL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
192 ; ALL-NEXT: vpsrlvd %ymm4, %ymm5, %ymm5
193 ; ALL-NEXT: vpsrld $16, %ymm5, %ymm5
194 ; ALL-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
195 ; ALL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
196 ; ALL-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0
197 ; ALL-NEXT: vpsrld $16, %ymm0, %ymm0
198 ; ALL-NEXT: vpackusdw %ymm5, %ymm0, %ymm0
199 ; ALL-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
200 ; ALL-NEXT: vpsrlvd %ymm4, %ymm3, %ymm3
201 ; ALL-NEXT: vpsrld $16, %ymm3, %ymm3
202 ; ALL-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
203 ; ALL-NEXT: vpsrlvd %ymm2, %ymm1, %ymm1
204 ; ALL-NEXT: vpsrld $16, %ymm1, %ymm1
205 ; ALL-NEXT: vpackusdw %ymm3, %ymm1, %ymm1
207 %shift = lshr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
208 ret <32 x i16> %shift
; Constant per-lane <64 x i8> shifts: the amount vector is materialized
; (vmovdqa) and fed through vpsllw $5 into the same vpsrlw / vpand /
; vpblendvb ladder as the variable byte-shift case. Since both ymm halves use
; identical amounts, the blend selectors (ymm4/ymm6/ymm8) and bit masks
; (ymm3/ymm5/ymm7) are computed once and reused for the second half.
211 define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
212 ; ALL-LABEL: constant_shift_v64i8:
214 ; ALL-NEXT: vpsrlw $4, %ymm0, %ymm2
215 ; ALL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
216 ; ALL-NEXT: vpand %ymm3, %ymm2, %ymm2
217 ; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
218 ; ALL-NEXT: vpsllw $5, %ymm4, %ymm4
219 ; ALL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
220 ; ALL-NEXT: vpsrlw $2, %ymm0, %ymm2
221 ; ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
222 ; ALL-NEXT: vpand %ymm5, %ymm2, %ymm2
223 ; ALL-NEXT: vpaddb %ymm4, %ymm4, %ymm6
224 ; ALL-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
225 ; ALL-NEXT: vpsrlw $1, %ymm0, %ymm2
226 ; ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
227 ; ALL-NEXT: vpand %ymm7, %ymm2, %ymm2
228 ; ALL-NEXT: vpaddb %ymm6, %ymm6, %ymm8
229 ; ALL-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0
230 ; ALL-NEXT: vpsrlw $4, %ymm1, %ymm2
231 ; ALL-NEXT: vpand %ymm3, %ymm2, %ymm2
232 ; ALL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
233 ; ALL-NEXT: vpsrlw $2, %ymm1, %ymm2
234 ; ALL-NEXT: vpand %ymm5, %ymm2, %ymm2
235 ; ALL-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
236 ; ALL-NEXT: vpsrlw $1, %ymm1, %ymm2
237 ; ALL-NEXT: vpand %ymm7, %ymm2, %ymm2
238 ; ALL-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1
240 %shift = lshr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
245 ; Uniform Constant Shifts
; Uniform constant shift: a single immediate-count vpsrlq on the full zmm.
248 define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
249 ; ALL-LABEL: splatconstant_shift_v8i64:
251 ; ALL-NEXT: vpsrlq $7, %zmm0, %zmm0
253 %shift = lshr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
; Uniform constant shift of <16 x i32>: a single immediate-count vpsrld.
257 define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
258 ; ALL-LABEL: splatconstant_shift_v16i32:
260 ; ALL-NEXT: vpsrld $5, %zmm0, %zmm0
262 %shift = lshr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
263 ret <16 x i32> %shift
; Uniform constant shift of <32 x i16>: immediate vpsrlw on each ymm half,
; since 512-bit word shifts need AVX512BW (see TODO at top of file).
266 define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
267 ; ALL-LABEL: splatconstant_shift_v32i16:
269 ; ALL-NEXT: vpsrlw $3, %ymm0, %ymm0
270 ; ALL-NEXT: vpsrlw $3, %ymm1, %ymm1
272 %shift = lshr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
273 ret <32 x i16> %shift
; Uniform constant byte shift: per half, a word-granular vpsrlw $3 followed
; by vpand with 31 (0x1F = 0xFF >> 3) to clear the bits that crossed byte
; boundaries; the mask constant is loaded once and shared by both halves.
276 define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
277 ; ALL-LABEL: splatconstant_shift_v64i8:
279 ; ALL-NEXT: vpsrlw $3, %ymm0, %ymm0
280 ; ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
281 ; ALL-NEXT: vpand %ymm2, %ymm0, %ymm0
282 ; ALL-NEXT: vpsrlw $3, %ymm1, %ymm1
283 ; ALL-NEXT: vpand %ymm2, %ymm1, %ymm1
285 %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>