; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl8h:
;CHECK: sabdl.8h
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}
define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl4s:
;CHECK: sabdl.4s
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}
define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2d:
;CHECK: sabdl.2d
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}
define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabdl2_8h:
;CHECK: sabdl2.8h
  %load1 = load <16 x i8>* %A
  %load2 = load <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}
define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabdl2_4s:
;CHECK: sabdl2.4s
  %load1 = load <8 x i16>* %A
  %load2 = load <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}
define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabdl2_2d:
;CHECK: sabdl2.2d
  %load1 = load <4 x i32>* %A
  %load2 = load <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}
define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl8h:
;CHECK: uabdl.8h
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}
define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl4s:
;CHECK: uabdl.4s
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}
define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2d:
;CHECK: uabdl.2d
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}
define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabdl2_8h:
;CHECK: uabdl2.8h
  %load1 = load <16 x i8>* %A
  %load2 = load <16 x i8>* %B
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  ret <8 x i16> %tmp4
}
define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabdl2_4s:
;CHECK: uabdl2.4s
  %load1 = load <8 x i16>* %A
  %load2 = load <8 x i16>* %B
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  ret <4 x i32> %tmp4
}
define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabdl2_2d:
;CHECK: uabdl2.2d
  %load1 = load <4 x i32>* %A
  %load2 = load <4 x i32>* %B
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  ret <2 x i64> %tmp4
}
define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
;CHECK-LABEL: fabd_2s:
;CHECK: fabd.2s
  %tmp1 = load <2 x float>* %A
  %tmp2 = load <2 x float>* %B
  %tmp3 = call <2 x float> @llvm.arm64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
  ret <2 x float> %tmp3
}
define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
;CHECK-LABEL: fabd_4s:
;CHECK: fabd.4s
  %tmp1 = load <4 x float>* %A
  %tmp2 = load <4 x float>* %B
  %tmp3 = call <4 x float> @llvm.arm64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
  ret <4 x float> %tmp3
}
define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
;CHECK-LABEL: fabd_2d:
;CHECK: fabd.2d
  %tmp1 = load <2 x double>* %A
  %tmp2 = load <2 x double>* %B
  %tmp3 = call <2 x double> @llvm.arm64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
  ret <2 x double> %tmp3
}
declare <2 x float> @llvm.arm64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.arm64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.arm64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_8b:
;CHECK: sabd.8b
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}
define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sabd_16b:
;CHECK: sabd.16b
  %tmp1 = load <16 x i8>* %A
  %tmp2 = load <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}
define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_4h:
;CHECK: sabd.4h
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}
define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sabd_8h:
;CHECK: sabd.8h
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}
define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_2s:
;CHECK: sabd.2s
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}
define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sabd_4s:
;CHECK: sabd.4s
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}
declare <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_8b:
;CHECK: uabd.8b
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  ret <8 x i8> %tmp3
}
define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: uabd_16b:
;CHECK: uabd.16b
  %tmp1 = load <16 x i8>* %A
  %tmp2 = load <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  ret <16 x i8> %tmp3
}
define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_4h:
;CHECK: uabd.4h
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  ret <4 x i16> %tmp3
}
define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: uabd_8h:
;CHECK: uabd.8h
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  ret <8 x i16> %tmp3
}
define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_2s:
;CHECK: uabd.2s
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  ret <2 x i32> %tmp3
}
define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: uabd_4s:
;CHECK: uabd.4s
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  ret <4 x i32> %tmp3
}
declare <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_8b:
;CHECK: sqabs.8b
  %tmp1 = load <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.arm64.neon.sqabs.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}
define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqabs_16b:
;CHECK: sqabs.16b
  %tmp1 = load <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.arm64.neon.sqabs.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}
define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_4h:
;CHECK: sqabs.4h
  %tmp1 = load <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.arm64.neon.sqabs.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}
define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqabs_8h:
;CHECK: sqabs.8h
  %tmp1 = load <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.arm64.neon.sqabs.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}
define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_2s:
;CHECK: sqabs.2s
  %tmp1 = load <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.arm64.neon.sqabs.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}
define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqabs_4s:
;CHECK: sqabs.4s
  %tmp1 = load <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.arm64.neon.sqabs.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}
declare <8 x i8> @llvm.arm64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.arm64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.arm64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_8b:
;CHECK: sqneg.8b
  %tmp1 = load <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.arm64.neon.sqneg.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}
define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqneg_16b:
;CHECK: sqneg.16b
  %tmp1 = load <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.arm64.neon.sqneg.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}
define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_4h:
;CHECK: sqneg.4h
  %tmp1 = load <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.arm64.neon.sqneg.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}
define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqneg_8h:
;CHECK: sqneg.8h
  %tmp1 = load <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.arm64.neon.sqneg.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}
define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_2s:
;CHECK: sqneg.2s
  %tmp1 = load <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.arm64.neon.sqneg.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}
define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqneg_4s:
;CHECK: sqneg.4s
  %tmp1 = load <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.arm64.neon.sqneg.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}
declare <8 x i8> @llvm.arm64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.arm64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.arm64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: abs_8b:
;CHECK: abs.8b
  %tmp1 = load <8 x i8>* %A
  %tmp3 = call <8 x i8> @llvm.arm64.neon.abs.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp3
}
define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: abs_16b:
;CHECK: abs.16b
  %tmp1 = load <16 x i8>* %A
  %tmp3 = call <16 x i8> @llvm.arm64.neon.abs.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp3
}
define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: abs_4h:
;CHECK: abs.4h
  %tmp1 = load <4 x i16>* %A
  %tmp3 = call <4 x i16> @llvm.arm64.neon.abs.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp3
}
define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: abs_8h:
;CHECK: abs.8h
  %tmp1 = load <8 x i16>* %A
  %tmp3 = call <8 x i16> @llvm.arm64.neon.abs.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp3
}
define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: abs_2s:
;CHECK: abs.2s
  %tmp1 = load <2 x i32>* %A
  %tmp3 = call <2 x i32> @llvm.arm64.neon.abs.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp3
}
define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: abs_4s:
;CHECK: abs.4s
  %tmp1 = load <4 x i32>* %A
  %tmp3 = call <4 x i32> @llvm.arm64.neon.abs.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp3
}
define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
; CHECK-LABEL: abs_1d:
; CHECK: abs d0, d0
  %abs = call <1 x i64> @llvm.arm64.neon.abs.v1i64(<1 x i64> %A)
  ret <1 x i64> %abs
}
declare <8 x i8> @llvm.arm64.neon.abs.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.arm64.neon.abs.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm64.neon.abs.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.arm64.neon.abs.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm64.neon.abs.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.arm64.neon.abs.v4i32(<4 x i32>) nounwind readnone
declare <1 x i64> @llvm.arm64.neon.abs.v1i64(<1 x i64>) nounwind readnone
define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal8h:
;CHECK: sabal.8h
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = load <8 x i16>* %C
  %tmp4 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}
define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal4s:
;CHECK: sabal.4s
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = load <4 x i32>* %C
  %tmp4 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}
define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2d:
;CHECK: sabal.2d
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = load <2 x i64>* %C
  %tmp4 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  ; NOTE(review): second, otherwise-unused zext of the same value kept as-is --
  ; presumably exercises selection with a duplicated extend; confirm before removing.
  %tmp4.1.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}
define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: sabal2_8h:
;CHECK: sabal2.8h
  %load1 = load <16 x i8>* %A
  %load2 = load <16 x i8>* %B
  %tmp3 = load <8 x i16>* %C
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp4 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}
define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: sabal2_4s:
;CHECK: sabal2.4s
  %load1 = load <8 x i16>* %A
  %load2 = load <8 x i16>* %B
  %tmp3 = load <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}
define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: sabal2_2d:
;CHECK: sabal2.2d
  %load1 = load <4 x i32>* %A
  %load2 = load <4 x i32>* %B
  %tmp3 = load <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}
define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal8h:
;CHECK: uabal.8h
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = load <8 x i16>* %C
  %tmp4 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}
define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal4s:
;CHECK: uabal.4s
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = load <4 x i32>* %C
  %tmp4 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}
define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2d:
;CHECK: uabal.2d
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = load <2 x i64>* %C
  %tmp4 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}
define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uabal2_8h:
;CHECK: uabal2.8h
  %load1 = load <16 x i8>* %A
  %load2 = load <16 x i8>* %B
  %tmp3 = load <8 x i16>* %C
  %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp4 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
  %tmp5 = add <8 x i16> %tmp3, %tmp4.1
  ret <8 x i16> %tmp5
}
define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uabal2_4s:
;CHECK: uabal2.4s
  %load1 = load <8 x i16>* %A
  %load2 = load <8 x i16>* %B
  %tmp3 = load <4 x i32>* %C
  %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %tmp4 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
  %tmp5 = add <4 x i32> %tmp3, %tmp4.1
  ret <4 x i32> %tmp5
}
define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
;CHECK-LABEL: uabal2_2d:
;CHECK: uabal2.2d
  %load1 = load <4 x i32>* %A
  %load2 = load <4 x i32>* %B
  %tmp3 = load <2 x i64>* %C
  %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %tmp4 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
  %tmp5 = add <2 x i64> %tmp3, %tmp4.1
  ret <2 x i64> %tmp5
}
define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: saba_8b:
;CHECK: saba.8b
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = load <8 x i8>* %C
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}
define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: saba_16b:
;CHECK: saba.16b
  %tmp1 = load <16 x i8>* %A
  %tmp2 = load <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  %tmp4 = load <16 x i8>* %C
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}
define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: saba_4h:
;CHECK: saba.4h
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = load <4 x i16>* %C
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}
define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: saba_8h:
;CHECK: saba.8h
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  %tmp4 = load <8 x i16>* %C
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}
define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: saba_2s:
;CHECK: saba.2s
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = load <2 x i32>* %C
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}
define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: saba_4s:
;CHECK: saba.4s
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  %tmp4 = load <4 x i32>* %C
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}
define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_8b:
;CHECK: uaba.8b
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
  %tmp4 = load <8 x i8>* %C
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}
define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK-LABEL: uaba_16b:
;CHECK: uaba.16b
  %tmp1 = load <16 x i8>* %A
  %tmp2 = load <16 x i8>* %B
  %tmp3 = call <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
  %tmp4 = load <16 x i8>* %C
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}
define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_4h:
;CHECK: uaba.4h
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
  %tmp4 = load <4 x i16>* %C
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}
define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK-LABEL: uaba_8h:
;CHECK: uaba.8h
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i16>* %B
  %tmp3 = call <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
  %tmp4 = load <8 x i16>* %C
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}
define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_2s:
;CHECK: uaba.2s
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
  %tmp4 = load <2 x i32>* %C
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}
define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK-LABEL: uaba_4s:
;CHECK: uaba.4s
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i32>* %B
  %tmp3 = call <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
  %tmp4 = load <4 x i32>* %C
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}
define float @fabds(float %a, float %b) nounwind {
; CHECK-LABEL: fabds:
; CHECK: fabd s0, s0, s1
  %vabd.i = tail call float @llvm.arm64.sisd.fabd.f32(float %a, float %b) nounwind
  ret float %vabd.i
}
define double @fabdd(double %a, double %b) nounwind {
; CHECK-LABEL: fabdd:
; CHECK: fabd d0, d0, d1
  %vabd.i = tail call double @llvm.arm64.sisd.fabd.f64(double %a, double %b) nounwind
  ret double %vabd.i
}
declare double @llvm.arm64.sisd.fabd.f64(double, double) nounwind readnone
declare float @llvm.arm64.sisd.fabd.f32(float, float) nounwind readnone
; The high-half extract plus splatted scalar should select uabdl2 directly,
; without a separate ext of the input vector.
define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: uabdl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %res = tail call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}
; Signed counterpart of uabdl_from_extract_dup: expect sabdl2 with no ext.
define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: sabdl_from_extract_dup:
; CHECK-NOT: ext.16b
; CHECK: sabdl2.2d
  %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
  %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %res = tail call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
  %res1 = zext <2 x i32> %res to <2 x i64>
  ret <2 x i64> %res1
}