; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512BW
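
; These tests check that the rounding-average idiom (zero-extend, add, add 1,
; lshr by 1, truncate) is matched to the PAVGB/PAVGW instructions for the
; vector widths supported by SSE2, AVX2, and AVX-512BW.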
define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: avg_v4i8
; SSE2-NEXT: movd (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd (%rsi), %xmm1 # xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, (%rax)
; AVX2-LABEL: avg_v4i8
; AVX2-NEXT: vmovd (%rdi), %xmm0
; AVX2-NEXT: vmovd (%rsi), %xmm1
; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovd %xmm0, (%rax)
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = load <4 x i8>, <4 x i8>* %b
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <4 x i32> %5, %4
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i8>
  store <4 x i8> %8, <4 x i8>* undef, align 4
  ret void
}

define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) {
; SSE2-LABEL: avg_v8i8
; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero
; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; AVX2-LABEL: avg_v8i8
; AVX2-NEXT: vmovq (%rdi), %xmm0
; AVX2-NEXT: vmovq (%rsi), %xmm1
; AVX2-NEXT: vpavgb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = load <8 x i8>, <8 x i8>* %b
  %3 = zext <8 x i8> %1 to <8 x i32>
  %4 = zext <8 x i8> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <8 x i32> %5, %4
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i8>
  store <8 x i8> %8, <8 x i8>* undef, align 4
  ret void
}

define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
; SSE2-LABEL: avg_v16i8
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: pavgb (%rdi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; AVX2-LABEL: avg_v16i8
; AVX2-NEXT: vmovdqa (%rsi), %xmm0
; AVX2-NEXT: vpavgb (%rdi), %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = load <16 x i8>, <16 x i8>* %b
  %3 = zext <16 x i8> %1 to <16 x i32>
  %4 = zext <16 x i8> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <16 x i32> %5, %4
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i8>
  store <16 x i8> %8, <16 x i8>* undef, align 4
  ret void
}

define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
; AVX2-LABEL: avg_v32i8
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = load <32 x i8>, <32 x i8>* %b
  %3 = zext <32 x i8> %1 to <32 x i32>
  %4 = zext <32 x i8> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <32 x i32> %5, %4
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i8>
  store <32 x i8> %8, <32 x i8>* undef, align 4
  ret void
}

define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; AVX512BW-LABEL: avg_v64i8
; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <64 x i32> %5, %4
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}

define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
; SSE2-LABEL: avg_v4i16
; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero
; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero
; SSE2-NEXT: pavgw %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; AVX2-LABEL: avg_v4i16
; AVX2-NEXT: vmovq (%rdi), %xmm0
; AVX2-NEXT: vmovq (%rsi), %xmm1
; AVX2-NEXT: vpavgw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = load <4 x i16>, <4 x i16>* %b
  %3 = zext <4 x i16> %1 to <4 x i32>
  %4 = zext <4 x i16> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <4 x i32> %5, %4
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i16>
  store <4 x i16> %8, <4 x i16>* undef, align 4
  ret void
}

define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
; SSE2-LABEL: avg_v8i16
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: pavgw (%rdi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; AVX2-LABEL: avg_v8i16
; AVX2-NEXT: vmovdqa (%rsi), %xmm0
; AVX2-NEXT: vpavgw (%rdi), %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = load <8 x i16>, <8 x i16>* %b
  %3 = zext <8 x i16> %1 to <8 x i32>
  %4 = zext <8 x i16> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <8 x i32> %5, %4
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  store <8 x i16> %8, <8 x i16>* undef, align 4
  ret void
}

define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
; AVX2-LABEL: avg_v16i16
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = load <16 x i16>, <16 x i16>* %b
  %3 = zext <16 x i16> %1 to <16 x i32>
  %4 = zext <16 x i16> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <16 x i32> %5, %4
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i16>
  store <16 x i16> %8, <16 x i16>* undef, align 4
  ret void
}

define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512BW-LABEL: avg_v32i16
; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <32 x i16>, <32 x i16>* %a
  %2 = load <32 x i16>, <32 x i16>* %b
  %3 = zext <32 x i16> %1 to <32 x i32>
  %4 = zext <32 x i16> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = add nuw nsw <32 x i32> %5, %4
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i16>
  store <32 x i16> %8, <32 x i16>* undef, align 4
  ret void
}

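; The *_2 variants below use the commuted form of the idiom: the two
; zero-extended operands are added first and the rounding constant 1 is added
; afterwards. The same PAVG instructions should still be selected.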
define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: avg_v4i8_2
; SSE2-NEXT: movd (%rdi), %xmm0
; SSE2-NEXT: movd (%rsi), %xmm1
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, (%rax)
; AVX2-LABEL: avg_v4i8_2
; AVX2-NEXT: vmovd (%rdi), %xmm0
; AVX2-NEXT: vmovd (%rsi), %xmm1
; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, (%rax)
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = load <4 x i8>, <4 x i8>* %b
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, %4
  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i8>
  store <4 x i8> %8, <4 x i8>* undef, align 4
  ret void
}

define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
; SSE2-LABEL: avg_v8i8_2
; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero
; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; AVX2-LABEL: avg_v8i8_2
; AVX2-NEXT: vmovq (%rdi), %xmm0
; AVX2-NEXT: vmovq (%rsi), %xmm1
; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = load <8 x i8>, <8 x i8>* %b
  %3 = zext <8 x i8> %1 to <8 x i32>
  %4 = zext <8 x i8> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, %4
  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i8>
  store <8 x i8> %8, <8 x i8>* undef, align 4
  ret void
}

define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
; SSE2-LABEL: avg_v16i8_2
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgb (%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; AVX2-LABEL: avg_v16i8_2
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpavgb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = load <16 x i8>, <16 x i8>* %b
  %3 = zext <16 x i8> %1 to <16 x i32>
  %4 = zext <16 x i8> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, %4
  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i8>
  store <16 x i8> %8, <16 x i8>* undef, align 4
  ret void
}

define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
; AVX2-LABEL: avg_v32i8_2
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = load <32 x i8>, <32 x i8>* %b
  %3 = zext <32 x i8> %1 to <32 x i32>
  %4 = zext <32 x i8> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, %4
  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i8>
  store <32 x i8> %8, <32 x i8>* undef, align 4
  ret void
}

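; Note that avg_v64i8_2 adds %4 to itself rather than %3 to %4, so both PAVG
; operands come from the second load; the expected code averages %zmm0 with
; itself.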
define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
; AVX512BW-LABEL: avg_v64i8_2
; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = load <64 x i8>, <64 x i8>* %b
  %3 = zext <64 x i8> %1 to <64 x i32>
  %4 = zext <64 x i8> %2 to <64 x i32>
  %5 = add nuw nsw <64 x i32> %4, %4
  %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <64 x i32> %7 to <64 x i8>
  store <64 x i8> %8, <64 x i8>* undef, align 4
  ret void
}

define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) {
; SSE2-LABEL: avg_v4i16_2
; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero
; SSE2-NEXT: movq (%rsi), %xmm1 # xmm1 = mem[0],zero
; SSE2-NEXT: pavgw %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rax)
; AVX2-LABEL: avg_v4i16_2
; AVX2-NEXT: vmovq (%rdi), %xmm0
; AVX2-NEXT: vmovq (%rsi), %xmm1
; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = load <4 x i16>, <4 x i16>* %b
  %3 = zext <4 x i16> %1 to <4 x i32>
  %4 = zext <4 x i16> %2 to <4 x i32>
  %5 = add nuw nsw <4 x i32> %3, %4
  %6 = add nuw nsw <4 x i32> %5, <i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <4 x i32> %6, <i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <4 x i32> %7 to <4 x i16>
  store <4 x i16> %8, <4 x i16>* undef, align 4
  ret void
}

define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
; SSE2-LABEL: avg_v8i16_2
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgw (%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; AVX2-LABEL: avg_v8i16_2
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpavgw (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = load <8 x i16>, <8 x i16>* %b
  %3 = zext <8 x i16> %1 to <8 x i32>
  %4 = zext <8 x i16> %2 to <8 x i32>
  %5 = add nuw nsw <8 x i32> %3, %4
  %6 = add nuw nsw <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  store <8 x i16> %8, <8 x i16>* undef, align 4
  ret void
}

define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
; AVX2-LABEL: avg_v16i16_2
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = load <16 x i16>, <16 x i16>* %b
  %3 = zext <16 x i16> %1 to <16 x i32>
  %4 = zext <16 x i16> %2 to <16 x i32>
  %5 = add nuw nsw <16 x i32> %3, %4
  %6 = add nuw nsw <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <16 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <16 x i32> %7 to <16 x i16>
  store <16 x i16> %8, <16 x i16>* undef, align 4
  ret void
}

define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512BW-LABEL: avg_v32i16_2
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <32 x i16>, <32 x i16>* %a
  %2 = load <32 x i16>, <32 x i16>* %b
  %3 = zext <32 x i16> %1 to <32 x i32>
  %4 = zext <32 x i16> %2 to <32 x i32>
  %5 = add nuw nsw <32 x i32> %3, %4
  %6 = add nuw nsw <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %8 = trunc <32 x i32> %7 to <32 x i16>
  store <32 x i16> %8, <32 x i16>* undef, align 4
  ret void
}

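; The *_const variants below average a loaded vector against a constant
; vector, so the second PAVG operand is expected to come from a constant-pool
; load (matched loosely here with {{.*}}).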
define void @avg_v4i8_const(<4 x i8>* %a) {
; SSE2-LABEL: avg_v4i8_const
; SSE2-NEXT: movd (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: pavgb {{.*}}, %xmm0
; SSE2-NEXT: movd %xmm0, (%rax)
; AVX2-LABEL: avg_v4i8_const
; AVX2-NEXT: vmovd (%rdi), %xmm0
; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, (%rax)
  %1 = load <4 x i8>, <4 x i8>* %a
  %2 = zext <4 x i8> %1 to <4 x i32>
  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <4 x i32> %4 to <4 x i8>
  store <4 x i8> %5, <4 x i8>* undef, align 4
  ret void
}

define void @avg_v8i8_const(<8 x i8>* %a) {
; SSE2-LABEL: avg_v8i8_const
; SSE2-NEXT: movq (%rdi), %xmm0 # xmm0 = mem[0],zero
; SSE2-NEXT: pavgb {{.*}}, %xmm0
; SSE2-NEXT: movq %xmm0, (%rax)
; AVX2-LABEL: avg_v8i8_const
; AVX2-NEXT: vmovq (%rdi), %xmm0
; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
  %1 = load <8 x i8>, <8 x i8>* %a
  %2 = zext <8 x i8> %1 to <8 x i32>
  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <8 x i32> %4 to <8 x i8>
  store <8 x i8> %5, <8 x i8>* undef, align 4
  ret void
}

define void @avg_v16i8_const(<16 x i8>* %a) {
; SSE2-LABEL: avg_v16i8_const
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgb {{.*}}, %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; AVX2-LABEL: avg_v16i8_const
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpavgb {{.*}}, %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
  %1 = load <16 x i8>, <16 x i8>* %a
  %2 = zext <16 x i8> %1 to <16 x i32>
  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <16 x i32> %4 to <16 x i8>
  store <16 x i8> %5, <16 x i8>* undef, align 4
  ret void
}

define void @avg_v32i8_const(<32 x i8>* %a) {
; AVX2-LABEL: avg_v32i8_const
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgb {{.*}}, %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
  %1 = load <32 x i8>, <32 x i8>* %a
  %2 = zext <32 x i8> %1 to <32 x i32>
  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <32 x i32> %4 to <32 x i8>
  store <32 x i8> %5, <32 x i8>* undef, align 4
  ret void
}

define void @avg_v64i8_const(<64 x i8>* %a) {
; AVX512BW-LABEL: avg_v64i8_const
; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgb {{.*}}, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
; AVX512BW-NEXT: retq
  %1 = load <64 x i8>, <64 x i8>* %a
  %2 = zext <64 x i8> %1 to <64 x i32>
  %3 = add nuw nsw <64 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <64 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <64 x i32> %4 to <64 x i8>
  store <64 x i8> %5, <64 x i8>* undef, align 4
  ret void
}

define void @avg_v4i16_const(<4 x i16>* %a) {
; SSE2-LABEL: avg_v4i16_const
; SSE2-NEXT: movq (%rdi), %xmm0
; SSE2-NEXT: pavgw {{.*}}, %xmm0
; SSE2-NEXT: movq %xmm0, (%rax)
; AVX2-LABEL: avg_v4i16_const
; AVX2-NEXT: vmovq (%rdi), %xmm0
; AVX2-NEXT: vpavgw {{.*}}, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
  %1 = load <4 x i16>, <4 x i16>* %a
  %2 = zext <4 x i16> %1 to <4 x i32>
  %3 = add nuw nsw <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
  %4 = lshr <4 x i32> %3, <i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <4 x i32> %4 to <4 x i16>
  store <4 x i16> %5, <4 x i16>* undef, align 4
  ret void
}

define void @avg_v8i16_const(<8 x i16>* %a) {
; SSE2-LABEL: avg_v8i16_const
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgw {{.*}}, %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; AVX2-LABEL: avg_v8i16_const
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpavgw {{.*}}, %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
  %1 = load <8 x i16>, <8 x i16>* %a
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = add nuw nsw <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <8 x i32> %4 to <8 x i16>
  store <8 x i16> %5, <8 x i16>* undef, align 4
  ret void
}

define void @avg_v16i16_const(<16 x i16>* %a) {
; AVX2-LABEL: avg_v16i16_const
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgw {{.*}}, %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
  %1 = load <16 x i16>, <16 x i16>* %a
  %2 = zext <16 x i16> %1 to <16 x i32>
  %3 = add nuw nsw <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <16 x i32> %4 to <16 x i16>
  store <16 x i16> %5, <16 x i16>* undef, align 4
  ret void
}

define void @avg_v32i16_const(<32 x i16>* %a) {
; AVX512BW-LABEL: avg_v32i16_const
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgw {{.*}}, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
  %1 = load <32 x i16>, <32 x i16>* %a
  %2 = zext <32 x i16> %1 to <32 x i32>
  %3 = add nuw nsw <32 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %4 = lshr <32 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %5 = trunc <32 x i32> %4 to <32 x i16>
  store <32 x i16> %5, <32 x i16>* undef, align 4
  ret void
}