1 ; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
4 define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
7 %tmp1 = load <8 x i8>, <8 x i8>* %A
8 %tmp2 = load <8 x i8>, <8 x i8>* %B
9 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
10 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
14 define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
15 ;CHECK-LABEL: sabdl4s:
17 %tmp1 = load <4 x i16>, <4 x i16>* %A
18 %tmp2 = load <4 x i16>, <4 x i16>* %B
19 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
20 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
24 define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
25 ;CHECK-LABEL: sabdl2d:
27 %tmp1 = load <2 x i32>, <2 x i32>* %A
28 %tmp2 = load <2 x i32>, <2 x i32>* %B
29 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
30 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
34 define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
35 ;CHECK-LABEL: sabdl2_8h:
37 %load1 = load <16 x i8>, <16 x i8>* %A
38 %load2 = load <16 x i8>, <16 x i8>* %B
39 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
40 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
41 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
42 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
46 define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
47 ;CHECK-LABEL: sabdl2_4s:
49 %load1 = load <8 x i16>, <8 x i16>* %A
50 %load2 = load <8 x i16>, <8 x i16>* %B
51 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
52 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
53 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
54 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
58 define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
59 ;CHECK-LABEL: sabdl2_2d:
61 %load1 = load <4 x i32>, <4 x i32>* %A
62 %load2 = load <4 x i32>, <4 x i32>* %B
63 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
64 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
65 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
66 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
70 define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
71 ;CHECK-LABEL: uabdl8h:
73 %tmp1 = load <8 x i8>, <8 x i8>* %A
74 %tmp2 = load <8 x i8>, <8 x i8>* %B
75 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
76 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
80 define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
81 ;CHECK-LABEL: uabdl4s:
83 %tmp1 = load <4 x i16>, <4 x i16>* %A
84 %tmp2 = load <4 x i16>, <4 x i16>* %B
85 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
86 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
90 define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
91 ;CHECK-LABEL: uabdl2d:
93 %tmp1 = load <2 x i32>, <2 x i32>* %A
94 %tmp2 = load <2 x i32>, <2 x i32>* %B
95 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
96 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
100 define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
101 ;CHECK-LABEL: uabdl2_8h:
103 %load1 = load <16 x i8>, <16 x i8>* %A
104 %load2 = load <16 x i8>, <16 x i8>* %B
105 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
106 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
108 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
109 %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
113 define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
114 ;CHECK-LABEL: uabdl2_4s:
116 %load1 = load <8 x i16>, <8 x i16>* %A
117 %load2 = load <8 x i16>, <8 x i16>* %B
118 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
119 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
120 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
121 %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
125 define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
126 ;CHECK-LABEL: uabdl2_2d:
128 %load1 = load <4 x i32>, <4 x i32>* %A
129 %load2 = load <4 x i32>, <4 x i32>* %B
130 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
131 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
132 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
133 %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
137 define i16 @uabdl8h_log2_shuffle(<16 x i8>* %a, <16 x i8>* %b) {
138 ; CHECK-LABEL: uabdl8h_log2_shuffle
141 %aload = load <16 x i8>, <16 x i8>* %a, align 1
142 %bload = load <16 x i8>, <16 x i8>* %b, align 1
143 %aext = zext <16 x i8> %aload to <16 x i16>
144 %bext = zext <16 x i8> %bload to <16 x i16>
145 %abdiff = sub nsw <16 x i16> %aext, %bext
146 %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
147 %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
148 %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
149 %rdx.shuf = shufflevector <16 x i16> %absel, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
150 %bin1.rdx = add <16 x i16> %absel, %rdx.shuf
151 %rdx.shufx = shufflevector <16 x i16> %bin1.rdx, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
152 %bin.rdx = add <16 x i16> %bin1.rdx, %rdx.shufx
153 %rdx.shuf136 = shufflevector <16 x i16> %bin.rdx, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
154 %bin.rdx137 = add <16 x i16> %bin.rdx, %rdx.shuf136
155 %rdx.shuf138 = shufflevector <16 x i16> %bin.rdx137, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
156 %bin.rdx139 = add <16 x i16> %bin.rdx137, %rdx.shuf138
157 %reduced_v = extractelement <16 x i16> %bin.rdx139, i16 0
161 define i32 @uabdl4s_log2_shuffle(<8 x i16>* %a, <8 x i16>* %b) {
162 ; CHECK-LABEL: uabdl4s_log2_shuffle
165 %aload = load <8 x i16>, <8 x i16>* %a, align 1
166 %bload = load <8 x i16>, <8 x i16>* %b, align 1
167 %aext = zext <8 x i16> %aload to <8 x i32>
168 %bext = zext <8 x i16> %bload to <8 x i32>
169 %abdiff = sub nsw <8 x i32> %aext, %bext
170 %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
171 %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
172 %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
173 %rdx.shuf = shufflevector <8 x i32> %absel, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
174 %bin.rdx = add <8 x i32> %absel, %rdx.shuf
175 %rdx.shuf136 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
176 %bin.rdx137 = add <8 x i32> %bin.rdx, %rdx.shuf136
177 %rdx.shuf138 = shufflevector <8 x i32> %bin.rdx137, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
178 %bin.rdx139 = add <8 x i32> %bin.rdx137, %rdx.shuf138
179 %reduced_v = extractelement <8 x i32> %bin.rdx139, i32 0
183 define i64 @uabdl2d_log2_shuffle(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
184 ; CHECK: uabdl2d_log2_shuffle
187 %aload = load <4 x i32>, <4 x i32>* %a, align 1
188 %bload = load <4 x i32>, <4 x i32>* %b, align 1
189 %aext = zext <4 x i32> %aload to <4 x i64>
190 %bext = zext <4 x i32> %bload to <4 x i64>
191 %abdiff = sub nsw <4 x i64> %aext, %bext
192 %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
193 %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
194 %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
195 %rdx.shuf136 = shufflevector <4 x i64> %absel, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
196 %bin.rdx137 = add <4 x i64> %absel, %rdx.shuf136
197 %rdx.shuf138 = shufflevector <4 x i64> %bin.rdx137, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
198 %bin.rdx139 = add <4 x i64> %bin.rdx137, %rdx.shuf138
199 %reduced_v = extractelement <4 x i64> %bin.rdx139, i16 0
203 define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
204 ;CHECK-LABEL: fabd_2s:
206 %tmp1 = load <2 x float>, <2 x float>* %A
207 %tmp2 = load <2 x float>, <2 x float>* %B
208 %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
209 ret <2 x float> %tmp3
212 define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
213 ;CHECK-LABEL: fabd_4s:
215 %tmp1 = load <4 x float>, <4 x float>* %A
216 %tmp2 = load <4 x float>, <4 x float>* %B
217 %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
218 ret <4 x float> %tmp3
221 define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
222 ;CHECK-LABEL: fabd_2d:
224 %tmp1 = load <2 x double>, <2 x double>* %A
225 %tmp2 = load <2 x double>, <2 x double>* %B
226 %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
227 ret <2 x double> %tmp3
; Declarations for the NEON floating-point absolute-difference (fabd)
; intrinsics exercised by the fabd_2s/fabd_4s/fabd_2d tests above.
230 declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
231 declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
232 declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
234 define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
235 ;CHECK-LABEL: sabd_8b:
237 %tmp1 = load <8 x i8>, <8 x i8>* %A
238 %tmp2 = load <8 x i8>, <8 x i8>* %B
239 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
243 define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
244 ;CHECK-LABEL: sabd_16b:
246 %tmp1 = load <16 x i8>, <16 x i8>* %A
247 %tmp2 = load <16 x i8>, <16 x i8>* %B
248 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
252 define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
253 ;CHECK-LABEL: sabd_4h:
255 %tmp1 = load <4 x i16>, <4 x i16>* %A
256 %tmp2 = load <4 x i16>, <4 x i16>* %B
257 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
261 define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
262 ;CHECK-LABEL: sabd_8h:
264 %tmp1 = load <8 x i16>, <8 x i16>* %A
265 %tmp2 = load <8 x i16>, <8 x i16>* %B
266 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
270 define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
271 ;CHECK-LABEL: sabd_2s:
273 %tmp1 = load <2 x i32>, <2 x i32>* %A
274 %tmp2 = load <2 x i32>, <2 x i32>* %B
275 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
279 define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
280 ;CHECK-LABEL: sabd_4s:
282 %tmp1 = load <4 x i32>, <4 x i32>* %A
283 %tmp2 = load <4 x i32>, <4 x i32>* %B
284 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
; Declarations for the NEON signed absolute-difference (sabd) intrinsics
; used by the sabd_*, sabdl* and sabal* tests in this file.
288 declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
289 declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
290 declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
291 declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
292 declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
293 declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
295 define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
296 ;CHECK-LABEL: uabd_8b:
298 %tmp1 = load <8 x i8>, <8 x i8>* %A
299 %tmp2 = load <8 x i8>, <8 x i8>* %B
300 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
304 define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
305 ;CHECK-LABEL: uabd_16b:
307 %tmp1 = load <16 x i8>, <16 x i8>* %A
308 %tmp2 = load <16 x i8>, <16 x i8>* %B
309 %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
313 define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
314 ;CHECK-LABEL: uabd_4h:
316 %tmp1 = load <4 x i16>, <4 x i16>* %A
317 %tmp2 = load <4 x i16>, <4 x i16>* %B
318 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
322 define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
323 ;CHECK-LABEL: uabd_8h:
325 %tmp1 = load <8 x i16>, <8 x i16>* %A
326 %tmp2 = load <8 x i16>, <8 x i16>* %B
327 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
331 define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
332 ;CHECK-LABEL: uabd_2s:
334 %tmp1 = load <2 x i32>, <2 x i32>* %A
335 %tmp2 = load <2 x i32>, <2 x i32>* %B
336 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
340 define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
341 ;CHECK-LABEL: uabd_4s:
343 %tmp1 = load <4 x i32>, <4 x i32>* %A
344 %tmp2 = load <4 x i32>, <4 x i32>* %B
345 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
; Declarations for the NEON unsigned absolute-difference (uabd) intrinsics
; used by the uabd_*, uabdl* and uabal* tests in this file.
349 declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
350 declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
351 declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
352 declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
353 declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
354 declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
356 define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
357 ;CHECK-LABEL: sqabs_8b:
359 %tmp1 = load <8 x i8>, <8 x i8>* %A
360 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
364 define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
365 ;CHECK-LABEL: sqabs_16b:
367 %tmp1 = load <16 x i8>, <16 x i8>* %A
368 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
372 define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
373 ;CHECK-LABEL: sqabs_4h:
375 %tmp1 = load <4 x i16>, <4 x i16>* %A
376 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
380 define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
381 ;CHECK-LABEL: sqabs_8h:
383 %tmp1 = load <8 x i16>, <8 x i16>* %A
384 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
388 define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
389 ;CHECK-LABEL: sqabs_2s:
391 %tmp1 = load <2 x i32>, <2 x i32>* %A
392 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
396 define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
397 ;CHECK-LABEL: sqabs_4s:
399 %tmp1 = load <4 x i32>, <4 x i32>* %A
400 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
; Declarations for the NEON saturating absolute-value (sqabs) intrinsics
; used by the sqabs_* tests above.
404 declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
405 declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
406 declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
407 declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
408 declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
409 declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
411 define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
412 ;CHECK-LABEL: sqneg_8b:
414 %tmp1 = load <8 x i8>, <8 x i8>* %A
415 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
419 define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
420 ;CHECK-LABEL: sqneg_16b:
422 %tmp1 = load <16 x i8>, <16 x i8>* %A
423 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
427 define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
428 ;CHECK-LABEL: sqneg_4h:
430 %tmp1 = load <4 x i16>, <4 x i16>* %A
431 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
435 define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
436 ;CHECK-LABEL: sqneg_8h:
438 %tmp1 = load <8 x i16>, <8 x i16>* %A
439 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
443 define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
444 ;CHECK-LABEL: sqneg_2s:
446 %tmp1 = load <2 x i32>, <2 x i32>* %A
447 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
451 define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
452 ;CHECK-LABEL: sqneg_4s:
454 %tmp1 = load <4 x i32>, <4 x i32>* %A
455 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
; Declarations for the NEON saturating negate (sqneg) intrinsics
; used by the sqneg_* tests above.
459 declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
460 declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
461 declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
462 declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
463 declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
464 declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
466 define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
467 ;CHECK-LABEL: abs_8b:
469 %tmp1 = load <8 x i8>, <8 x i8>* %A
470 %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
474 define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
475 ;CHECK-LABEL: abs_16b:
477 %tmp1 = load <16 x i8>, <16 x i8>* %A
478 %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
482 define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
483 ;CHECK-LABEL: abs_4h:
485 %tmp1 = load <4 x i16>, <4 x i16>* %A
486 %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
490 define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
491 ;CHECK-LABEL: abs_8h:
493 %tmp1 = load <8 x i16>, <8 x i16>* %A
494 %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
498 define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
499 ;CHECK-LABEL: abs_2s:
501 %tmp1 = load <2 x i32>, <2 x i32>* %A
502 %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
506 define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
507 ;CHECK-LABEL: abs_4s:
509 %tmp1 = load <4 x i32>, <4 x i32>* %A
510 %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
514 define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
515 ; CHECK-LABEL: abs_1d:
517 %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
521 define i64 @abs_1d_honestly(i64 %A) nounwind {
522 ; CHECK-LABEL: abs_1d_honestly:
524 %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
; Declarations for the NEON integer absolute-value (abs) intrinsics used by
; the abs_* tests above, including the <1 x i64> and plain i64 scalar forms.
528 declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
529 declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
530 declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
531 declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
532 declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
533 declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
534 declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
535 declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
537 define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
538 ;CHECK-LABEL: sabal8h:
540 %tmp1 = load <8 x i8>, <8 x i8>* %A
541 %tmp2 = load <8 x i8>, <8 x i8>* %B
542 %tmp3 = load <8 x i16>, <8 x i16>* %C
543 %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
544 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
545 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
549 define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
550 ;CHECK-LABEL: sabal4s:
552 %tmp1 = load <4 x i16>, <4 x i16>* %A
553 %tmp2 = load <4 x i16>, <4 x i16>* %B
554 %tmp3 = load <4 x i32>, <4 x i32>* %C
555 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
556 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
557 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
561 define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
562 ;CHECK-LABEL: sabal2d:
564 %tmp1 = load <2 x i32>, <2 x i32>* %A
565 %tmp2 = load <2 x i32>, <2 x i32>* %B
566 %tmp3 = load <2 x i64>, <2 x i64>* %C
567 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
568 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
569 %tmp4.1.1 = zext <2 x i32> %tmp4 to <2 x i64>
570 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
574 define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
575 ;CHECK-LABEL: sabal2_8h:
577 %load1 = load <16 x i8>, <16 x i8>* %A
578 %load2 = load <16 x i8>, <16 x i8>* %B
579 %tmp3 = load <8 x i16>, <8 x i16>* %C
580 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
581 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
582 %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
583 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
584 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
588 define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
589 ;CHECK-LABEL: sabal2_4s:
591 %load1 = load <8 x i16>, <8 x i16>* %A
592 %load2 = load <8 x i16>, <8 x i16>* %B
593 %tmp3 = load <4 x i32>, <4 x i32>* %C
594 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
595 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
596 %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
597 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
598 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
602 define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
603 ;CHECK-LABEL: sabal2_2d:
605 %load1 = load <4 x i32>, <4 x i32>* %A
606 %load2 = load <4 x i32>, <4 x i32>* %B
607 %tmp3 = load <2 x i64>, <2 x i64>* %C
608 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
609 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
610 %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
611 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
612 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
616 define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
617 ;CHECK-LABEL: uabal8h:
619 %tmp1 = load <8 x i8>, <8 x i8>* %A
620 %tmp2 = load <8 x i8>, <8 x i8>* %B
621 %tmp3 = load <8 x i16>, <8 x i16>* %C
622 %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
623 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
624 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
628 define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
629 ;CHECK-LABEL: uabal4s:
631 %tmp1 = load <4 x i16>, <4 x i16>* %A
632 %tmp2 = load <4 x i16>, <4 x i16>* %B
633 %tmp3 = load <4 x i32>, <4 x i32>* %C
634 %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
635 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
636 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
640 define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
641 ;CHECK-LABEL: uabal2d:
643 %tmp1 = load <2 x i32>, <2 x i32>* %A
644 %tmp2 = load <2 x i32>, <2 x i32>* %B
645 %tmp3 = load <2 x i64>, <2 x i64>* %C
646 %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
647 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
648 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
652 define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
653 ;CHECK-LABEL: uabal2_8h:
655 %load1 = load <16 x i8>, <16 x i8>* %A
656 %load2 = load <16 x i8>, <16 x i8>* %B
657 %tmp3 = load <8 x i16>, <8 x i16>* %C
658 %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
659 %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
660 %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
661 %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
662 %tmp5 = add <8 x i16> %tmp3, %tmp4.1
666 define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
667 ;CHECK-LABEL: uabal2_4s:
669 %load1 = load <8 x i16>, <8 x i16>* %A
670 %load2 = load <8 x i16>, <8 x i16>* %B
671 %tmp3 = load <4 x i32>, <4 x i32>* %C
672 %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
673 %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
674 %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
675 %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
676 %tmp5 = add <4 x i32> %tmp3, %tmp4.1
680 define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
681 ;CHECK-LABEL: uabal2_2d:
683 %load1 = load <4 x i32>, <4 x i32>* %A
684 %load2 = load <4 x i32>, <4 x i32>* %B
685 %tmp3 = load <2 x i64>, <2 x i64>* %C
686 %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
687 %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
688 %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
689 %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
690 %tmp5 = add <2 x i64> %tmp3, %tmp4.1
694 define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
695 ;CHECK-LABEL: saba_8b:
697 %tmp1 = load <8 x i8>, <8 x i8>* %A
698 %tmp2 = load <8 x i8>, <8 x i8>* %B
699 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
700 %tmp4 = load <8 x i8>, <8 x i8>* %C
701 %tmp5 = add <8 x i8> %tmp3, %tmp4
705 define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
706 ;CHECK-LABEL: saba_16b:
708 %tmp1 = load <16 x i8>, <16 x i8>* %A
709 %tmp2 = load <16 x i8>, <16 x i8>* %B
710 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
711 %tmp4 = load <16 x i8>, <16 x i8>* %C
712 %tmp5 = add <16 x i8> %tmp3, %tmp4
716 define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
717 ;CHECK-LABEL: saba_4h:
719 %tmp1 = load <4 x i16>, <4 x i16>* %A
720 %tmp2 = load <4 x i16>, <4 x i16>* %B
721 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
722 %tmp4 = load <4 x i16>, <4 x i16>* %C
723 %tmp5 = add <4 x i16> %tmp3, %tmp4
727 define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
728 ;CHECK-LABEL: saba_8h:
730 %tmp1 = load <8 x i16>, <8 x i16>* %A
731 %tmp2 = load <8 x i16>, <8 x i16>* %B
732 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
733 %tmp4 = load <8 x i16>, <8 x i16>* %C
734 %tmp5 = add <8 x i16> %tmp3, %tmp4
738 define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
739 ;CHECK-LABEL: saba_2s:
741 %tmp1 = load <2 x i32>, <2 x i32>* %A
742 %tmp2 = load <2 x i32>, <2 x i32>* %B
743 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
744 %tmp4 = load <2 x i32>, <2 x i32>* %C
745 %tmp5 = add <2 x i32> %tmp3, %tmp4
749 define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
750 ;CHECK-LABEL: saba_4s:
752 %tmp1 = load <4 x i32>, <4 x i32>* %A
753 %tmp2 = load <4 x i32>, <4 x i32>* %B
754 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
755 %tmp4 = load <4 x i32>, <4 x i32>* %C
756 %tmp5 = add <4 x i32> %tmp3, %tmp4
760 define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
761 ;CHECK-LABEL: uaba_8b:
763 %tmp1 = load <8 x i8>, <8 x i8>* %A
764 %tmp2 = load <8 x i8>, <8 x i8>* %B
765 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
766 %tmp4 = load <8 x i8>, <8 x i8>* %C
767 %tmp5 = add <8 x i8> %tmp3, %tmp4
771 define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
772 ;CHECK-LABEL: uaba_16b:
774 %tmp1 = load <16 x i8>, <16 x i8>* %A
775 %tmp2 = load <16 x i8>, <16 x i8>* %B
776 %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
777 %tmp4 = load <16 x i8>, <16 x i8>* %C
778 %tmp5 = add <16 x i8> %tmp3, %tmp4
782 define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
783 ;CHECK-LABEL: uaba_4h:
785 %tmp1 = load <4 x i16>, <4 x i16>* %A
786 %tmp2 = load <4 x i16>, <4 x i16>* %B
787 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
788 %tmp4 = load <4 x i16>, <4 x i16>* %C
789 %tmp5 = add <4 x i16> %tmp3, %tmp4
793 define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
794 ;CHECK-LABEL: uaba_8h:
796 %tmp1 = load <8 x i16>, <8 x i16>* %A
797 %tmp2 = load <8 x i16>, <8 x i16>* %B
798 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
799 %tmp4 = load <8 x i16>, <8 x i16>* %C
800 %tmp5 = add <8 x i16> %tmp3, %tmp4
804 define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
805 ;CHECK-LABEL: uaba_2s:
807 %tmp1 = load <2 x i32>, <2 x i32>* %A
808 %tmp2 = load <2 x i32>, <2 x i32>* %B
809 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
810 %tmp4 = load <2 x i32>, <2 x i32>* %C
811 %tmp5 = add <2 x i32> %tmp3, %tmp4
815 define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
816 ;CHECK-LABEL: uaba_4s:
818 %tmp1 = load <4 x i32>, <4 x i32>* %A
819 %tmp2 = load <4 x i32>, <4 x i32>* %B
820 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
821 %tmp4 = load <4 x i32>, <4 x i32>* %C
822 %tmp5 = add <4 x i32> %tmp3, %tmp4
827 define float @fabds(float %a, float %b) nounwind {
828 ; CHECK-LABEL: fabds:
829 ; CHECK: fabd s0, s0, s1
830 %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
834 define double @fabdd(double %a, double %b) nounwind {
835 ; CHECK-LABEL: fabdd:
836 ; CHECK: fabd d0, d0, d1
837 %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
; Declarations for the scalar SIMD floating-point absolute-difference (fabd)
; intrinsics used by the fabds/fabdd tests above.
841 declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
842 declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone
844 define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
845 ; CHECK-LABEL: uabdl_from_extract_dup:
848 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
849 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
851 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
853 %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
854 %res1 = zext <2 x i32> %res to <2 x i64>
858 define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
859 ; CHECK-LABEL: sabdl_from_extract_dup:
862 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
863 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
865 %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
867 %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
868 %res1 = zext <2 x i32> %res to <2 x i64>
872 define <2 x i32> @abspattern1(<2 x i32> %a) nounwind {
873 ; CHECK-LABEL: abspattern1:
876 %tmp1neg = sub <2 x i32> zeroinitializer, %a
877 %b = icmp sge <2 x i32> %a, zeroinitializer
878 %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
882 define <4 x i16> @abspattern2(<4 x i16> %a) nounwind {
883 ; CHECK-LABEL: abspattern2:
886 %tmp1neg = sub <4 x i16> zeroinitializer, %a
887 %b = icmp sgt <4 x i16> %a, zeroinitializer
888 %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
892 define <8 x i8> @abspattern3(<8 x i8> %a) nounwind {
893 ; CHECK-LABEL: abspattern3:
896 %tmp1neg = sub <8 x i8> zeroinitializer, %a
897 %b = icmp slt <8 x i8> %a, zeroinitializer
898 %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
902 define <4 x i32> @abspattern4(<4 x i32> %a) nounwind {
903 ; CHECK-LABEL: abspattern4:
906 %tmp1neg = sub <4 x i32> zeroinitializer, %a
907 %b = icmp sge <4 x i32> %a, zeroinitializer
908 %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
912 define <8 x i16> @abspattern5(<8 x i16> %a) nounwind {
913 ; CHECK-LABEL: abspattern5:
916 %tmp1neg = sub <8 x i16> zeroinitializer, %a
917 %b = icmp sgt <8 x i16> %a, zeroinitializer
918 %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
922 define <16 x i8> @abspattern6(<16 x i8> %a) nounwind {
923 ; CHECK-LABEL: abspattern6:
926 %tmp1neg = sub <16 x i8> zeroinitializer, %a
927 %b = icmp slt <16 x i8> %a, zeroinitializer
928 %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
932 define <2 x i64> @abspattern7(<2 x i64> %a) nounwind {
933 ; CHECK-LABEL: abspattern7:
936 %tmp1neg = sub <2 x i64> zeroinitializer, %a
937 %b = icmp sle <2 x i64> %a, zeroinitializer
938 %abs = select <2 x i1> %b, <2 x i64> %tmp1neg, <2 x i64> %a