; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
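
; Vector CTTZ has no single AVX instruction, so it is lowered via the
; identity cttz(x) = popcount((x & -x) - 1): x & -x isolates the lowest set
; bit, subtracting 1 turns the trailing zeros into ones, and the bytewise
; popcount is computed with a PSHUFB lookup of per-nibble bit counts
; ([0,1,1,2,...]), adding the low- and high-nibble counts with PADDB.
; For example, x = 12 (0b1100): x & -x = 0b0100, minus 1 = 0b0011,
; popcount = 2 = cttz(12).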
define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsadbw %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsadbw %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
  ret <4 x i64> %out
}

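; The 'u' variants pass zero-is-undef = true (i1 -1) to @llvm.cttz; the
; expected codegen is currently identical to the defined-at-zero variant.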
define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsadbw %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i64u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsadbw %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
  ret <4 x i64> %out
}

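; PSADBW sums absolute byte differences into one 64-bit count per quadword,
; so for i32 elements the byte counts are first zero-extended to 64-bit
; lanes (VPUNPCKLDQ/VPUNPCKHDQ against zero), summed with two PSADBWs, and
; then recombined with VPACKUSWB.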
define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm5, %xmm1, %xmm5
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpackuswb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vpsadbw %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT:    vpsadbw %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm5, %xmm1, %xmm5
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpackuswb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i32u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vpsadbw %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT:    vpsadbw %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1)
  ret <8 x i32> %out
}

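; For i16 elements the two byte counts of each word are combined in-register:
; VPSLLW $8 + VPADDB adds the low byte's count into the high byte, then
; VPSRLW $8 moves the 16-bit total back down.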
define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 0)
  ret <16 x i16> %out
}

define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i16u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 -1)
  ret <16 x i16> %out
}

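; For i8 elements the per-byte nibble-LUT popcount is already the final
; result, so no horizontal reduction step is needed.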
define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 0)
  ret <32 x i8> %out
}

define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv32i8u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 -1)
  ret <32 x i8> %out
}

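; With constant operands the whole computation constant-folds away; only a
; load of the precomputed cttz results should remain.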
define <4 x i64> @foldv4i64() nounwind {
; ALL-LABEL: foldv4i64:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; ALL-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
  ret <4 x i64> %out
}

define <4 x i64> @foldv4i64u() nounwind {
; ALL-LABEL: foldv4i64u:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; ALL-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
  ret <4 x i64> %out
}

define <8 x i32> @foldv8i32() nounwind {
; ALL-LABEL: foldv8i32:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; ALL-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @foldv8i32u() nounwind {
; ALL-LABEL: foldv8i32u:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; ALL-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
  ret <8 x i32> %out
}

define <16 x i16> @foldv16i16() nounwind {
; ALL-LABEL: foldv16i16:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; ALL-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
  ret <16 x i16> %out
}

define <16 x i16> @foldv16i16u() nounwind {
; ALL-LABEL: foldv16i16u:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; ALL-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
  ret <16 x i16> %out
}

define <32 x i8> @foldv32i8() nounwind {
; ALL-LABEL: foldv32i8:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; ALL-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
  ret <32 x i8> %out
}

define <32 x i8> @foldv32i8u() nounwind {
; ALL-LABEL: foldv32i8u:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; ALL-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
  ret <32 x i8> %out
}

declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1)
declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1)
declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1)