1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
3 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq| FileCheck %s --check-prefix=VL_BW_DQ
5 target triple = "x86_64-unknown-unknown"
; Swaps the two lanes of a <2 x i1> vector: the shuffle mask <1, 0> reads only
; %a (the second shuffle operand is undef).
; The AVX512F checks expect a single vpshufd; the VL_BW_DQ checks expect the
; input to be moved into a mask register and back (vpsllq / vpmovq2m /
; vpmovm2q) both before and after the vpshufd.
7 define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
8 ; AVX512F-LABEL: shuf2i1_1_0:
10 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
13 ; VL_BW_DQ-LABEL: shuf2i1_1_0:
15 ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
16 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
17 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
18 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
19 ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
20 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
21 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
23 %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
; Two-source <2 x i1> shuffle with mask <1, 2>: result lane 0 is lane 1 of %a,
; result lane 1 is lane 0 of the constant second operand <i1 1, i1 0>.
; Both check prefixes expect the constant to be materialized from the
; immediate 1 and combined with the input via vpalignr.
27 define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
28 ; AVX512F-LABEL: shuf2i1_1_2:
30 ; AVX512F-NEXT: movl $1, %eax
31 ; AVX512F-NEXT: vmovq %rax, %xmm1
32 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
35 ; VL_BW_DQ-LABEL: shuf2i1_1_2:
37 ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
38 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
39 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
40 ; VL_BW_DQ-NEXT: movb $1, %al
41 ; VL_BW_DQ-NEXT: kmovb %eax, %k0
42 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm1
43 ; VL_BW_DQ-NEXT: vpalignr $8, %xmm0, %xmm1, %xmm0
44 ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
45 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0
46 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
48 %b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
; Reverses the four lanes of a <4 x i1> vector: mask <3, 2, 1, 0>, single
; source (the second shuffle operand is undef).
; NOTE(review): the function name reads "3_2_10" while the mask is 3, 2, 1, 0;
; an underscore between 1 and 0 appears to be missing. Confirm before
; renaming, because the LABEL checks below match on the function name.
53 define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
54 ; AVX512F-LABEL: shuf4i1_3_2_10:
56 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
59 ; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
61 ; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
62 ; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
63 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
64 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
65 ; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
66 ; VL_BW_DQ-NEXT: vpmovd2m %xmm0, %k0
67 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
69 %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; Shuffles the <8 x i1> results of two <8 x i64> equality compares
; (%a2 = %a == %a1, %b2 = %b == %b1) with mask <3, 6, 1, 0, 3, 7, 7, 0>.
; Every mask index is below 8, so only lanes of %a2 are selected and %b2 does
; not contribute to the result; the checks for both prefixes accordingly
; expect a single vpcmpeqq followed by a one-source vpermq.
73 define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
74 ; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
76 ; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
77 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0
78 ; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z}
79 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0]
80 ; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
81 ; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
82 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
83 ; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
84 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
87 ; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
89 ; VL_BW_DQ-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
90 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
91 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
92 ; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
93 ; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
94 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
95 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
97 %a2 = icmp eq <8 x i64> %a, %a1
98 %b2 = icmp eq <8 x i64> %b, %b1
99 %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
; Two-source shuffle of the <16 x i1> results of two <16 x i32> equality
; compares (%a2 = %a == %a1, %b2 = %b == %b1).
; Mask indices 22 and 21 are >= 16 and therefore select lanes 6 and 5 of %b2;
; all other indices select lanes of %a2. The checks for both prefixes expect
; two vpcmpeqd compares feeding a two-source vpermt2d.
103 define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
104 ; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
106 ; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
107 ; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2
108 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0
109 ; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm1 {%k2} {z}
110 ; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm2 {%k1} {z}
111 ; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
112 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2
113 ; AVX512F-NEXT: vpslld $31, %zmm2, %zmm1
114 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1
115 ; AVX512F-NEXT: vmovdqu32 %zmm0, %zmm0 {%k1} {z}
116 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
119 ; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
121 ; VL_BW_DQ-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
122 ; VL_BW_DQ-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
123 ; VL_BW_DQ-NEXT: vpmovm2d %k1, %zmm0
124 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm1
125 ; VL_BW_DQ-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
126 ; VL_BW_DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm1
127 ; VL_BW_DQ-NEXT: vpslld $31, %zmm1, %zmm0
128 ; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0
129 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0
130 ; VL_BW_DQ-NEXT: retq
131 %a2 = icmp eq <16 x i32> %a, %a1
132 %b2 = icmp eq <16 x i32> %b, %b1
133 %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
; Single-source shuffle of a <32 x i1> argument (the second shuffle operand
; is undef). The mask is the 16-entry pattern
; 3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0 repeated twice; every index is below 32,
; so all lanes come from %a.
; The AVX512F checks expect a byte-level vperm2i128 / vpshufb / vpblendvb
; sequence on ymm registers; the VL_BW_DQ checks expect the mask to be widened
; to words and permuted with vpermw.
137 define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
138 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
140 ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
141 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
142 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
143 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
144 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
147 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
149 ; VL_BW_DQ-NEXT: vpsllw $7, %ymm0, %ymm0
150 ; VL_BW_DQ-NEXT: vpmovb2m %ymm0, %k0
151 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
152 ; VL_BW_DQ-NEXT: vmovdqu16 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
153 ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
154 ; VL_BW_DQ-NEXT: vpsllw $15, %zmm0, %zmm0
155 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k0
156 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %ymm0
157 ; VL_BW_DQ-NEXT: retq
158 %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
; Bitcasts an i8 argument to <8 x i1> and shuffles it with a mask whose only
; defined entries are index 2 (at result positions 1, 4 and 6); every other
; mask entry is undef, and the second shuffle operand is undef.
; The checks for both prefixes expect the permute index vector to be loaded
; with a vpbroadcastq from a RIP-relative constant and applied with vpermq.
162 define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
163 ; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
165 ; AVX512F-NEXT: movzbl %dil, %eax
166 ; AVX512F-NEXT: kmovw %eax, %k1
167 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0
168 ; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k1} {z}
169 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2
170 ; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
171 ; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
172 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
173 ; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
174 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
177 ; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
179 ; VL_BW_DQ-NEXT: kmovb %edi, %k0
180 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
181 ; VL_BW_DQ-NEXT: vpbroadcastq {{.*}}(%rip), %zmm1
182 ; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
183 ; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
184 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
185 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
186 ; VL_BW_DQ-NEXT: retq
187 %b = bitcast i8 %a to <8 x i1>
188 %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
; Bitcasts an i8 argument to <8 x i1>, shuffles it against an all-zero second
; operand with mask <10, 2, 9, u, 3, u, 2, u>, and bitcasts the shuffled
; vector back to i8 (%d).
; Indices 10 and 9 are >= 8 and therefore select lanes of the zeroinitializer
; operand; indices 2 and 3 select lanes of %b. The checks for both prefixes
; expect a zeroed register (vpxord) as the second vpermt2q source.
192 define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
193 ; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
195 ; AVX512F-NEXT: movzbl %dil, %eax
196 ; AVX512F-NEXT: kmovw %eax, %k1
197 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
198 ; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
199 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
200 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
201 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
202 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
203 ; AVX512F-NEXT: kmovw %k0, %eax
206 ; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
208 ; VL_BW_DQ-NEXT: kmovb %edi, %k0
209 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
210 ; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
211 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
212 ; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
213 ; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
214 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
215 ; VL_BW_DQ-NEXT: kmovb %k0, %eax
216 ; VL_BW_DQ-NEXT: retq
217 %b = bitcast i8 %a to <8 x i1>
218 %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
219 %d = bitcast <8 x i1> %c to i8
; Bitcasts an i8 argument to <8 x i1>, performs a single-source shuffle with
; mask <0, 1, 4, 5, u, u, u, u> (upper four result lanes undef), and bitcasts
; the shuffled vector back to i8 (%d).
; The checks for both prefixes expect the shuffle to be emitted as
; vshufi64x2 $8 rather than as a permute with a loaded index vector.
223 define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
224 ; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
226 ; AVX512F-NEXT: movzbl %dil, %eax
227 ; AVX512F-NEXT: kmovw %eax, %k1
228 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
229 ; AVX512F-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1]
230 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
231 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
232 ; AVX512F-NEXT: kmovw %k0, %eax
235 ; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
237 ; VL_BW_DQ-NEXT: kmovb %edi, %k0
238 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
239 ; VL_BW_DQ-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1]
240 ; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
241 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
242 ; VL_BW_DQ-NEXT: kmovb %k0, %eax
243 ; VL_BW_DQ-NEXT: retq
244 %b = bitcast i8 %a to <8 x i1>
245 %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
246 %d = bitcast <8 x i1> %c to i8
; Bitcasts an i8 argument to <8 x i1>, shuffles it against an all-zero second
; operand with mask <9, 6, 1, 0, 3, 7, 7, 0>, and bitcasts the shuffled vector
; back to i8 (%d).
; Only index 9 is >= 8, so result lane 0 comes from the zeroinitializer
; operand and all other lanes come from %b. The expected permute index vector
; in the checks has 8 in that first position.
250 define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
251 ; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
253 ; AVX512F-NEXT: movzbl %dil, %eax
254 ; AVX512F-NEXT: kmovw %eax, %k1
255 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
256 ; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
257 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
258 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
259 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
260 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
261 ; AVX512F-NEXT: kmovw %k0, %eax
264 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
266 ; VL_BW_DQ-NEXT: kmovb %edi, %k0
267 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
268 ; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
269 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
270 ; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
271 ; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
272 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
273 ; VL_BW_DQ-NEXT: kmovb %k0, %eax
274 ; VL_BW_DQ-NEXT: retq
275 %b = bitcast i8 %a to <8 x i1>
276 %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
277 %d = bitcast <8 x i1>%c to i8
; Same shape as the preceding tests but with the operands swapped: the FIRST
; shuffle operand is zeroinitializer and the SECOND is %b (the i8 argument
; bitcast to <8 x i1>). Mask <9, 6, 1, 10, 3, 7, 7, 0>.
; Indices 9 and 10 are >= 8 and select lanes 1 and 2 of %b; every other index
; selects a lane of the all-zero operand. The shuffled vector is bitcast back
; to i8 (%d).
281 define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
282 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
284 ; AVX512F-NEXT: movzbl %dil, %eax
285 ; AVX512F-NEXT: kmovw %eax, %k1
286 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
287 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
288 ; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
289 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
290 ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
291 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
292 ; AVX512F-NEXT: kmovw %k0, %eax
295 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
297 ; VL_BW_DQ-NEXT: kmovb %edi, %k0
298 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
299 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
300 ; VL_BW_DQ-NEXT: vpxord %zmm2, %zmm2, %zmm2
301 ; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
302 ; VL_BW_DQ-NEXT: vpsllq $63, %zmm2, %zmm0
303 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
304 ; VL_BW_DQ-NEXT: kmovb %k0, %eax
305 ; VL_BW_DQ-NEXT: retq
306 %b = bitcast i8 %a to <8 x i1>
307 %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
308 %d = bitcast <8 x i1>%c to i8
; Shuffles a non-trivial constant first operand <1,1,0,0,1,1,0,0> with %b (the
; i8 argument bitcast to <8 x i1>) using mask <9, 6, 1, 0, 3, 7, 7, 1>, then
; bitcasts the shuffled vector to i8 (%c1).
; Only index 9 is >= 8 (lane 1 of %b); all other lanes come from the constant.
; The checks expect the constant to be materialized from the immediate 51
; (0b00110011, i.e. bits 0, 1, 4 and 5 set, matching the set lanes of the
; constant operand).
; NOTE(review): the function name contains a double underscore and encodes
; "10" in the fourth position, whereas the mask on the shufflevector has 0
; there. Confirm before renaming, because the LABEL checks match on the name.
312 define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
313 ; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
315 ; AVX512F-NEXT: movzbl %dil, %eax
316 ; AVX512F-NEXT: kmovw %eax, %k1
317 ; AVX512F-NEXT: movb $51, %al
318 ; AVX512F-NEXT: movzbl %al, %eax
319 ; AVX512F-NEXT: kmovw %eax, %k2
320 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0
321 ; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm1 {%k2} {z}
322 ; AVX512F-NEXT: vmovdqu64 %zmm0, %zmm0 {%k1} {z}
323 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
324 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
325 ; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm0
326 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
327 ; AVX512F-NEXT: kmovw %k0, %eax
330 ; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
332 ; VL_BW_DQ-NEXT: kmovb %edi, %k0
333 ; VL_BW_DQ-NEXT: movb $51, %al
334 ; VL_BW_DQ-NEXT: kmovb %eax, %k1
335 ; VL_BW_DQ-NEXT: vpmovm2q %k1, %zmm0
336 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm1
337 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
338 ; VL_BW_DQ-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
339 ; VL_BW_DQ-NEXT: vpsllq $63, %zmm0, %zmm0
340 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
341 ; VL_BW_DQ-NEXT: kmovb %k0, %eax
342 ; VL_BW_DQ-NEXT: retq
343 %b = bitcast i8 %a to <8 x i1>
344 %c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
345 %c1 = bitcast <8 x i1>%c to i8
; Shuffles an all-ones constant first operand with the <8 x i1> argument %a
; using mask <9, 6, 1, 0, 3, 7, 7, 0>, then bitcasts the shuffled vector to i8
; (%c1). Unlike the neighbouring tests, the argument is already a vector, so
; no i8-to-vector bitcast is involved.
; Only index 9 is >= 8 (lane 1 of %a); all other lanes come from the all-ones
; constant. The checks expect the constant to be loaded with a vpbroadcastd
; and lane 1 to be inserted with vpermt2q using indices [9,1,2,3,4,5,6,7].
; NOTE(review): the function name encodes "10" in the fourth position, whereas
; the mask on the shufflevector has 0 there. Confirm before renaming, because
; the LABEL checks match on the name.
349 define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
350 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
352 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
353 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
354 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
355 ; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
356 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
357 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
358 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
359 ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
360 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
361 ; AVX512F-NEXT: kmovw %k0, %eax
364 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
366 ; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0
367 ; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0
368 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
369 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
370 ; VL_BW_DQ-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
371 ; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
372 ; VL_BW_DQ-NEXT: vpsllq $63, %zmm2, %zmm0
373 ; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
374 ; VL_BW_DQ-NEXT: kmovb %k0, %eax
375 ; VL_BW_DQ-NEXT: retq
376 %c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
377 %c1 = bitcast <8 x i1>%c to i8
; Splat test: bitcasts an i16 argument to <16 x i1>, shuffles it with an
; all-zero mask (every result lane is lane 0 of %b; the second operand is
; undef), and bitcasts the shuffled vector back to i16 (%d).
; The checks for both prefixes expect the splat to be emitted as a register
; vpbroadcastd.
382 define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
383 ; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
385 ; AVX512F-NEXT: kmovw %edi, %k1
386 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
387 ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
388 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
389 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
390 ; AVX512F-NEXT: kmovw %k0, %eax
393 ; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
395 ; VL_BW_DQ-NEXT: kmovw %edi, %k0
396 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm0
397 ; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0
398 ; VL_BW_DQ-NEXT: vpslld $31, %zmm0, %zmm0
399 ; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0
400 ; VL_BW_DQ-NEXT: kmovw %k0, %eax
401 ; VL_BW_DQ-NEXT: retq
402 %b = bitcast i16 %a to <16 x i1>
403 %c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
404 %d = bitcast <16 x i1> %c to i16
; Shuffle of a <64 x i1> zeroinitializer (second operand undef) with an
; all-zero mask, bitcast to i64 (%d).
; The VL_BW_DQ checks expect the result to be produced directly in a mask
; register (kxorq + kmovq). The AVX512F checks expect a sequence that sets up
; a 32-byte-aligned stack frame, stores a zero byte, reloads 32 bits and
; combines them with shlq/orq.
; NOTE(review): %b (the bitcast of the argument %a) is not referenced by the
; shufflevector below, whose operands are both constants; whether that is
; intentional cannot be determined from this file alone.
408 define i64 @shuf64i1_zero(i64 %a) {
409 ; AVX512F-LABEL: shuf64i1_zero:
411 ; AVX512F-NEXT: pushq %rbp
412 ; AVX512F-NEXT: .Ltmp0:
413 ; AVX512F-NEXT: .cfi_def_cfa_offset 16
414 ; AVX512F-NEXT: .Ltmp1:
415 ; AVX512F-NEXT: .cfi_offset %rbp, -16
416 ; AVX512F-NEXT: movq %rsp, %rbp
417 ; AVX512F-NEXT: .Ltmp2:
418 ; AVX512F-NEXT: .cfi_def_cfa_register %rbp
419 ; AVX512F-NEXT: andq $-32, %rsp
420 ; AVX512F-NEXT: subq $32, %rsp
421 ; AVX512F-NEXT: movb $0, (%rsp)
422 ; AVX512F-NEXT: movl (%rsp), %ecx
423 ; AVX512F-NEXT: movq %rcx, %rax
424 ; AVX512F-NEXT: shlq $32, %rax
425 ; AVX512F-NEXT: orq %rcx, %rax
426 ; AVX512F-NEXT: movq %rbp, %rsp
427 ; AVX512F-NEXT: popq %rbp
430 ; VL_BW_DQ-LABEL: shuf64i1_zero:
432 ; VL_BW_DQ-NEXT: kxorq %k0, %k0, %k0
433 ; VL_BW_DQ-NEXT: kmovq %k0, %rax
434 ; VL_BW_DQ-NEXT: retq
435 %b = bitcast i64 %a to <64 x i1>
436 %c = shufflevector < 64 x i1> zeroinitializer, <64 x i1> undef, <64 x i32> zeroinitializer
437 %d = bitcast <64 x i1> %c to i64