1 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
2 ; arm64 has its own copy of this test in its directory.
4 declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>)
6 declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>)
8 declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>)
10 declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>)
12 declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
14 declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
16 declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
18 declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>)
20 declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>)
22 declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>)
24 declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>)
26 declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>)
28 declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>)
30 declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>)
32 declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>)
34 declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>)
36 declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>)
38 declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>)
40 declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>)
42 declare <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64>, <2 x i64>)
44 declare <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32>, <4 x i32>)
46 declare <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16>, <8 x i16>)
48 declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>)
50 declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>)
52 declare <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>)
; Widening add-long: each operand is sign/zero-extended before the add, which
; must select the single saddl/uaddl instruction.
define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vaddl_s8:
; CHECK: saddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %vmovl.i.i = sext <8 x i8> %a to <8 x i16>
  %vmovl.i2.i = sext <8 x i8> %b to <8 x i16>
  %add.i = add <8 x i16> %vmovl.i.i, %vmovl.i2.i
  ret <8 x i16> %add.i
}

define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vaddl_s16:
; CHECK: saddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %vmovl.i.i = sext <4 x i16> %a to <4 x i32>
  %vmovl.i2.i = sext <4 x i16> %b to <4 x i32>
  %add.i = add <4 x i32> %vmovl.i.i, %vmovl.i2.i
  ret <4 x i32> %add.i
}

define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vaddl_s32:
; CHECK: saddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %vmovl.i.i = sext <2 x i32> %a to <2 x i64>
  %vmovl.i2.i = sext <2 x i32> %b to <2 x i64>
  %add.i = add <2 x i64> %vmovl.i.i, %vmovl.i2.i
  ret <2 x i64> %add.i
}

define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vaddl_u8:
; CHECK: uaddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %vmovl.i.i = zext <8 x i8> %a to <8 x i16>
  %vmovl.i2.i = zext <8 x i8> %b to <8 x i16>
  %add.i = add <8 x i16> %vmovl.i.i, %vmovl.i2.i
  ret <8 x i16> %add.i
}

define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vaddl_u16:
; CHECK: uaddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %vmovl.i.i = zext <4 x i16> %a to <4 x i32>
  %vmovl.i2.i = zext <4 x i16> %b to <4 x i32>
  %add.i = add <4 x i32> %vmovl.i.i, %vmovl.i2.i
  ret <4 x i32> %add.i
}

define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vaddl_u32:
; CHECK: uaddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
; CHECK: uaddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %vmovl.i.i = zext <2 x i32> %a to <2 x i64>
  %vmovl.i2.i = zext <2 x i32> %b to <2 x i64>
  %add.i = add <2 x i64> %vmovl.i.i, %vmovl.i2.i
  ret <2 x i64> %add.i
}
; Widening add-long on the high halves: a high-half shuffle feeding an
; extend-then-add must select the "2" (high) variant saddl2/uaddl2.
define <8 x i16> @test_vaddl_high_s8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vaddl_high_s8:
; CHECK: saddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
  %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
  %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %1 = sext <8 x i8> %shuffle.i.i2.i to <8 x i16>
  %add.i = add <8 x i16> %0, %1
  ret <8 x i16> %add.i
}

define <4 x i32> @test_vaddl_high_s16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddl_high_s16:
; CHECK: saddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
  %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %1 = sext <4 x i16> %shuffle.i.i2.i to <4 x i32>
  %add.i = add <4 x i32> %0, %1
  ret <4 x i32> %add.i
}

define <2 x i64> @test_vaddl_high_s32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddl_high_s32:
; CHECK: saddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
  %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %1 = sext <2 x i32> %shuffle.i.i2.i to <2 x i64>
  %add.i = add <2 x i64> %0, %1
  ret <2 x i64> %add.i
}

define <8 x i16> @test_vaddl_high_u8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vaddl_high_u8:
; CHECK: uaddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
  %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
  %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %1 = zext <8 x i8> %shuffle.i.i2.i to <8 x i16>
  %add.i = add <8 x i16> %0, %1
  ret <8 x i16> %add.i
}

define <4 x i32> @test_vaddl_high_u16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddl_high_u16:
; CHECK: uaddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
  %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %1 = zext <4 x i16> %shuffle.i.i2.i to <4 x i32>
  %add.i = add <4 x i32> %0, %1
  ret <4 x i32> %add.i
}

define <2 x i64> @test_vaddl_high_u32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddl_high_u32:
; CHECK: uaddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
  %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %1 = zext <2 x i32> %shuffle.i.i2.i to <2 x i64>
  %add.i = add <2 x i64> %0, %1
  ret <2 x i64> %add.i
}
; Widening add: only the second operand is extended, selecting saddw/uaddw.
define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vaddw_s8:
; CHECK: saddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
  %vmovl.i.i = sext <8 x i8> %b to <8 x i16>
  %add.i = add <8 x i16> %vmovl.i.i, %a
  ret <8 x i16> %add.i
}

define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vaddw_s16:
; CHECK: saddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
  %vmovl.i.i = sext <4 x i16> %b to <4 x i32>
  %add.i = add <4 x i32> %vmovl.i.i, %a
  ret <4 x i32> %add.i
}

define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vaddw_s32:
; CHECK: saddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
  %vmovl.i.i = sext <2 x i32> %b to <2 x i64>
  %add.i = add <2 x i64> %vmovl.i.i, %a
  ret <2 x i64> %add.i
}

define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vaddw_u8:
; CHECK: uaddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
  %vmovl.i.i = zext <8 x i8> %b to <8 x i16>
  %add.i = add <8 x i16> %vmovl.i.i, %a
  ret <8 x i16> %add.i
}

define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vaddw_u16:
; CHECK: uaddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
  %vmovl.i.i = zext <4 x i16> %b to <4 x i32>
  %add.i = add <4 x i32> %vmovl.i.i, %a
  ret <4 x i32> %add.i
}

define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vaddw_u32:
; CHECK: uaddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
  %vmovl.i.i = zext <2 x i32> %b to <2 x i64>
  %add.i = add <2 x i64> %vmovl.i.i, %a
  ret <2 x i64> %add.i
}
; Widening add on the high half of the narrow operand: saddw2/uaddw2.
define <8 x i16> @test_vaddw_high_s8(<8 x i16> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vaddw_high_s8:
; CHECK: saddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
  %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
  %add.i = add <8 x i16> %0, %a
  ret <8 x i16> %add.i
}

define <4 x i32> @test_vaddw_high_s16(<4 x i32> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddw_high_s16:
; CHECK: saddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
  %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
  %add.i = add <4 x i32> %0, %a
  ret <4 x i32> %add.i
}

define <2 x i64> @test_vaddw_high_s32(<2 x i64> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddw_high_s32:
; CHECK: saddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
  %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
  %add.i = add <2 x i64> %0, %a
  ret <2 x i64> %add.i
}

define <8 x i16> @test_vaddw_high_u8(<8 x i16> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vaddw_high_u8:
; CHECK: uaddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
  %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
  %add.i = add <8 x i16> %0, %a
  ret <8 x i16> %add.i
}

define <4 x i32> @test_vaddw_high_u16(<4 x i32> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddw_high_u16:
; CHECK: uaddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
  %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
  %add.i = add <4 x i32> %0, %a
  ret <4 x i32> %add.i
}

define <2 x i64> @test_vaddw_high_u32(<2 x i64> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddw_high_u32:
; CHECK: uaddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
  %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
  %add.i = add <2 x i64> %0, %a
  ret <2 x i64> %add.i
}
; Widening subtract-long: both operands extended before the sub, selecting
; ssubl/usubl.
define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vsubl_s8:
; CHECK: ssubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %vmovl.i.i = sext <8 x i8> %a to <8 x i16>
  %vmovl.i2.i = sext <8 x i8> %b to <8 x i16>
  %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i2.i
  ret <8 x i16> %sub.i
}

define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vsubl_s16:
; CHECK: ssubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %vmovl.i.i = sext <4 x i16> %a to <4 x i32>
  %vmovl.i2.i = sext <4 x i16> %b to <4 x i32>
  %sub.i = sub <4 x i32> %vmovl.i.i, %vmovl.i2.i
  ret <4 x i32> %sub.i
}

define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vsubl_s32:
; CHECK: ssubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %vmovl.i.i = sext <2 x i32> %a to <2 x i64>
  %vmovl.i2.i = sext <2 x i32> %b to <2 x i64>
  %sub.i = sub <2 x i64> %vmovl.i.i, %vmovl.i2.i
  ret <2 x i64> %sub.i
}

define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vsubl_u8:
; CHECK: usubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %vmovl.i.i = zext <8 x i8> %a to <8 x i16>
  %vmovl.i2.i = zext <8 x i8> %b to <8 x i16>
  %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i2.i
  ret <8 x i16> %sub.i
}

define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vsubl_u16:
; CHECK: usubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %vmovl.i.i = zext <4 x i16> %a to <4 x i32>
  %vmovl.i2.i = zext <4 x i16> %b to <4 x i32>
  %sub.i = sub <4 x i32> %vmovl.i.i, %vmovl.i2.i
  ret <4 x i32> %sub.i
}

define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vsubl_u32:
; CHECK: usubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %vmovl.i.i = zext <2 x i32> %a to <2 x i64>
  %vmovl.i2.i = zext <2 x i32> %b to <2 x i64>
  %sub.i = sub <2 x i64> %vmovl.i.i, %vmovl.i2.i
  ret <2 x i64> %sub.i
}
; Widening subtract-long on the high halves: ssubl2/usubl2.
define <8 x i16> @test_vsubl_high_s8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vsubl_high_s8:
; CHECK: ssubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
  %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
  %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %1 = sext <8 x i8> %shuffle.i.i2.i to <8 x i16>
  %sub.i = sub <8 x i16> %0, %1
  ret <8 x i16> %sub.i
}

define <4 x i32> @test_vsubl_high_s16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubl_high_s16:
; CHECK: ssubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
  %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %1 = sext <4 x i16> %shuffle.i.i2.i to <4 x i32>
  %sub.i = sub <4 x i32> %0, %1
  ret <4 x i32> %sub.i
}

define <2 x i64> @test_vsubl_high_s32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubl_high_s32:
; CHECK: ssubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
  %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %1 = sext <2 x i32> %shuffle.i.i2.i to <2 x i64>
  %sub.i = sub <2 x i64> %0, %1
  ret <2 x i64> %sub.i
}

define <8 x i16> @test_vsubl_high_u8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vsubl_high_u8:
; CHECK: usubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
  %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
  %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %1 = zext <8 x i8> %shuffle.i.i2.i to <8 x i16>
  %sub.i = sub <8 x i16> %0, %1
  ret <8 x i16> %sub.i
}

define <4 x i32> @test_vsubl_high_u16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubl_high_u16:
; CHECK: usubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
  %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %1 = zext <4 x i16> %shuffle.i.i2.i to <4 x i32>
  %sub.i = sub <4 x i32> %0, %1
  ret <4 x i32> %sub.i
}

define <2 x i64> @test_vsubl_high_u32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubl_high_u32:
; CHECK: usubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
  %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %1 = zext <2 x i32> %shuffle.i.i2.i to <2 x i64>
  %sub.i = sub <2 x i64> %0, %1
  ret <2 x i64> %sub.i
}
; Widening subtract: only the subtrahend is extended, selecting ssubw/usubw.
define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vsubw_s8:
; CHECK: ssubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
  %vmovl.i.i = sext <8 x i8> %b to <8 x i16>
  %sub.i = sub <8 x i16> %a, %vmovl.i.i
  ret <8 x i16> %sub.i
}

define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vsubw_s16:
; CHECK: ssubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
  %vmovl.i.i = sext <4 x i16> %b to <4 x i32>
  %sub.i = sub <4 x i32> %a, %vmovl.i.i
  ret <4 x i32> %sub.i
}

define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vsubw_s32:
; CHECK: ssubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
  %vmovl.i.i = sext <2 x i32> %b to <2 x i64>
  %sub.i = sub <2 x i64> %a, %vmovl.i.i
  ret <2 x i64> %sub.i
}

define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vsubw_u8:
; CHECK: usubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
  %vmovl.i.i = zext <8 x i8> %b to <8 x i16>
  %sub.i = sub <8 x i16> %a, %vmovl.i.i
  ret <8 x i16> %sub.i
}

define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vsubw_u16:
; CHECK: usubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
  %vmovl.i.i = zext <4 x i16> %b to <4 x i32>
  %sub.i = sub <4 x i32> %a, %vmovl.i.i
  ret <4 x i32> %sub.i
}

define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vsubw_u32:
; CHECK: usubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
  %vmovl.i.i = zext <2 x i32> %b to <2 x i64>
  %sub.i = sub <2 x i64> %a, %vmovl.i.i
  ret <2 x i64> %sub.i
}
; Widening subtract on the high half of the narrow operand: ssubw2/usubw2.
define <8 x i16> @test_vsubw_high_s8(<8 x i16> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vsubw_high_s8:
; CHECK: ssubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
  %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
  %sub.i = sub <8 x i16> %a, %0
  ret <8 x i16> %sub.i
}

define <4 x i32> @test_vsubw_high_s16(<4 x i32> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubw_high_s16:
; CHECK: ssubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
  %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
  %sub.i = sub <4 x i32> %a, %0
  ret <4 x i32> %sub.i
}

define <2 x i64> @test_vsubw_high_s32(<2 x i64> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubw_high_s32:
; CHECK: ssubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
  %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
  %sub.i = sub <2 x i64> %a, %0
  ret <2 x i64> %sub.i
}

define <8 x i16> @test_vsubw_high_u8(<8 x i16> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vsubw_high_u8:
; CHECK: usubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
  %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
  %sub.i = sub <8 x i16> %a, %0
  ret <8 x i16> %sub.i
}

define <4 x i32> @test_vsubw_high_u16(<4 x i32> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubw_high_u16:
; CHECK: usubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
  %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
  %sub.i = sub <4 x i32> %a, %0
  ret <4 x i32> %sub.i
}

define <2 x i64> @test_vsubw_high_u32(<2 x i64> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubw_high_u32:
; CHECK: usubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
  %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
  %sub.i = sub <2 x i64> %a, %0
  ret <2 x i64> %sub.i
}
; Narrowing add-high-half: add, logical shift right by half the element
; width, then truncate — must fold to a single addhn.
define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddhn_s16:
; CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %vaddhn.i = add <8 x i16> %a, %b
  %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %vaddhn2.i = trunc <8 x i16> %vaddhn1.i to <8 x i8>
  ret <8 x i8> %vaddhn2.i
}

define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddhn_s32:
; CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  %vaddhn.i = add <4 x i32> %a, %b
  %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, i32 16>
  %vaddhn2.i = trunc <4 x i32> %vaddhn1.i to <4 x i16>
  ret <4 x i16> %vaddhn2.i
}

define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vaddhn_s64:
; CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
  %vaddhn.i = add <2 x i64> %a, %b
  %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32>
  %vaddhn2.i = trunc <2 x i64> %vaddhn1.i to <2 x i32>
  ret <2 x i32> %vaddhn2.i
}

define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddhn_u16:
; CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %vaddhn.i = add <8 x i16> %a, %b
  %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %vaddhn2.i = trunc <8 x i16> %vaddhn1.i to <8 x i8>
  ret <8 x i8> %vaddhn2.i
}

define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddhn_u32:
; CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  %vaddhn.i = add <4 x i32> %a, %b
  %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, i32 16>
  %vaddhn2.i = trunc <4 x i32> %vaddhn1.i to <4 x i16>
  ret <4 x i16> %vaddhn2.i
}

define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vaddhn_u64:
; CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
  %vaddhn.i = add <2 x i64> %a, %b
  %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32>
  %vaddhn2.i = trunc <2 x i64> %vaddhn1.i to <2 x i32>
  ret <2 x i32> %vaddhn2.i
}
; Narrowing add-high-half into the top half of an existing vector (the
; bitcast/shuffle pattern concatenates %r with the narrowed result): addhn2.
define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddhn_high_s16:
; CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %vaddhn.i.i = add <8 x i16> %a, %b
  %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %vaddhn2.i.i = trunc <8 x i16> %vaddhn1.i.i to <8 x i8>
  %0 = bitcast <8 x i8> %r to <1 x i64>
  %1 = bitcast <8 x i8> %vaddhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
  ret <16 x i8> %2
}

define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddhn_high_s32:
; CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  %vaddhn.i.i = add <4 x i32> %a, %b
  %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16>
  %vaddhn2.i.i = trunc <4 x i32> %vaddhn1.i.i to <4 x i16>
  %0 = bitcast <4 x i16> %r to <1 x i64>
  %1 = bitcast <4 x i16> %vaddhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
  ret <8 x i16> %2
}

define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vaddhn_high_s64:
; CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
  %vaddhn.i.i = add <2 x i64> %a, %b
  %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32>
  %vaddhn2.i.i = trunc <2 x i64> %vaddhn1.i.i to <2 x i32>
  %0 = bitcast <2 x i32> %r to <1 x i64>
  %1 = bitcast <2 x i32> %vaddhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
  ret <4 x i32> %2
}

define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddhn_high_u16:
; CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %vaddhn.i.i = add <8 x i16> %a, %b
  %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %vaddhn2.i.i = trunc <8 x i16> %vaddhn1.i.i to <8 x i8>
  %0 = bitcast <8 x i8> %r to <1 x i64>
  %1 = bitcast <8 x i8> %vaddhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
  ret <16 x i8> %2
}

define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddhn_high_u32:
; CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  %vaddhn.i.i = add <4 x i32> %a, %b
  %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16>
  %vaddhn2.i.i = trunc <4 x i32> %vaddhn1.i.i to <4 x i16>
  %0 = bitcast <4 x i16> %r to <1 x i64>
  %1 = bitcast <4 x i16> %vaddhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
  ret <8 x i16> %2
}

define <4 x i32> @test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vaddhn_high_u64:
; CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
  %vaddhn.i.i = add <2 x i64> %a, %b
  %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32>
  %vaddhn2.i.i = trunc <2 x i64> %vaddhn1.i.i to <2 x i32>
  %0 = bitcast <2 x i32> %r to <1 x i64>
  %1 = bitcast <2 x i32> %vaddhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
  ret <4 x i32> %2
}
; Rounding narrowing add-high-half via the vraddhn intrinsic: raddhn.
define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vraddhn_s16:
; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %vraddhn2.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
  ret <8 x i8> %vraddhn2.i
}

define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vraddhn_s32:
; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  %vraddhn2.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
  ret <4 x i16> %vraddhn2.i
}

define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vraddhn_s64:
; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
  %vraddhn2.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
  ret <2 x i32> %vraddhn2.i
}

define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vraddhn_u16:
; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %vraddhn2.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
  ret <8 x i8> %vraddhn2.i
}

define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vraddhn_u32:
; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  %vraddhn2.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
  ret <4 x i16> %vraddhn2.i
}

define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vraddhn_u64:
; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
  %vraddhn2.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
  ret <2 x i32> %vraddhn2.i
}
; Rounding narrowing add-high-half into the top half of %r: raddhn2.
define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vraddhn_high_s16:
; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %vraddhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
  %0 = bitcast <8 x i8> %r to <1 x i64>
  %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
  ret <16 x i8> %2
}

define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vraddhn_high_s32:
; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  %vraddhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
  %0 = bitcast <4 x i16> %r to <1 x i64>
  %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
  ret <8 x i16> %2
}

define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vraddhn_high_s64:
; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
  %vraddhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
  %0 = bitcast <2 x i32> %r to <1 x i64>
  %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
  ret <4 x i32> %2
}

define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vraddhn_high_u16:
; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %vraddhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
  %0 = bitcast <8 x i8> %r to <1 x i64>
  %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
  ret <16 x i8> %2
}

define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vraddhn_high_u32:
; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  %vraddhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
  %0 = bitcast <4 x i16> %r to <1 x i64>
  %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
  ret <8 x i16> %2
}

define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vraddhn_high_u64:
; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
  %vraddhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
  %0 = bitcast <2 x i32> %r to <1 x i64>
  %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
  ret <4 x i32> %2
}
; Narrowing subtract-high-half: sub, lshr by half the element width, then
; truncate — must fold to a single subhn.
define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubhn_s16:
; CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %vsubhn.i = sub <8 x i16> %a, %b
  %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %vsubhn2.i = trunc <8 x i16> %vsubhn1.i to <8 x i8>
  ret <8 x i8> %vsubhn2.i
}

define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubhn_s32:
; CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  %vsubhn.i = sub <4 x i32> %a, %b
  %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16>
  %vsubhn2.i = trunc <4 x i32> %vsubhn1.i to <4 x i16>
  ret <4 x i16> %vsubhn2.i
}

define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vsubhn_s64:
; CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
  %vsubhn.i = sub <2 x i64> %a, %b
  %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32>
  %vsubhn2.i = trunc <2 x i64> %vsubhn1.i to <2 x i32>
  ret <2 x i32> %vsubhn2.i
}

define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubhn_u16:
; CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %vsubhn.i = sub <8 x i16> %a, %b
  %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %vsubhn2.i = trunc <8 x i16> %vsubhn1.i to <8 x i8>
  ret <8 x i8> %vsubhn2.i
}

define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubhn_u32:
; CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  %vsubhn.i = sub <4 x i32> %a, %b
  %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16>
  %vsubhn2.i = trunc <4 x i32> %vsubhn1.i to <4 x i16>
  ret <4 x i16> %vsubhn2.i
}

define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vsubhn_u64:
; CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
  %vsubhn.i = sub <2 x i64> %a, %b
  %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32>
  %vsubhn2.i = trunc <2 x i64> %vsubhn1.i to <2 x i32>
  ret <2 x i32> %vsubhn2.i
}
; subhn2: narrowing subtract written into the high half of the destination.
; The low half (%r) is carried through via the bitcast-to-<1 x i64> and
; shufflevector concatenation idiom; the whole sequence must select to a
; single SUBHN2 with no extra moves.
define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubhn_high_s16:
; CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
  %vsubhn.i.i = sub <8 x i16> %a, %b
  %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %vsubhn2.i.i = trunc <8 x i16> %vsubhn1.i.i to <8 x i8>
  %0 = bitcast <8 x i8> %r to <1 x i64>
  %1 = bitcast <8 x i8> %vsubhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
  ret <16 x i8> %2
}

define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubhn_high_s32:
; CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
  %vsubhn.i.i = sub <4 x i32> %a, %b
  %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16>
  %vsubhn2.i.i = trunc <4 x i32> %vsubhn1.i.i to <4 x i16>
  %0 = bitcast <4 x i16> %r to <1 x i64>
  %1 = bitcast <4 x i16> %vsubhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
  ret <8 x i16> %2
}

define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vsubhn_high_s64:
; CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
  %vsubhn.i.i = sub <2 x i64> %a, %b
  %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32>
  %vsubhn2.i.i = trunc <2 x i64> %vsubhn1.i.i to <2 x i32>
  %0 = bitcast <2 x i32> %r to <1 x i64>
  %1 = bitcast <2 x i32> %vsubhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
  ret <4 x i32> %2
}

define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubhn_high_u16:
; CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
  %vsubhn.i.i = sub <8 x i16> %a, %b
  %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %vsubhn2.i.i = trunc <8 x i16> %vsubhn1.i.i to <8 x i8>
  %0 = bitcast <8 x i8> %r to <1 x i64>
  %1 = bitcast <8 x i8> %vsubhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
  ret <16 x i8> %2
}

define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubhn_high_u32:
; CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
  %vsubhn.i.i = sub <4 x i32> %a, %b
  %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16>
  %vsubhn2.i.i = trunc <4 x i32> %vsubhn1.i.i to <4 x i16>
  %0 = bitcast <4 x i16> %r to <1 x i64>
  %1 = bitcast <4 x i16> %vsubhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
  ret <8 x i16> %2
}

define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vsubhn_high_u64:
; CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
  %vsubhn.i.i = sub <2 x i64> %a, %b
  %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32>
  %vsubhn2.i.i = trunc <2 x i64> %vsubhn1.i.i to <2 x i32>
  %0 = bitcast <2 x i32> %r to <1 x i64>
  %1 = bitcast <2 x i32> %vsubhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
  ret <4 x i32> %2
}
; rsubhn: rounding subtract-and-narrow-high. Modeled via the target
; intrinsic (declared at the top of the file), so each test is a direct
; one-to-one lowering check.
define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vrsubhn_s16:
; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
  %vrsubhn2.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
  ret <8 x i8> %vrsubhn2.i
}

define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vrsubhn_s32:
; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
  %vrsubhn2.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
  ret <4 x i16> %vrsubhn2.i
}

define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vrsubhn_s64:
; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
  %vrsubhn2.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
  ret <2 x i32> %vrsubhn2.i
}

define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vrsubhn_u16:
; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
  %vrsubhn2.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
  ret <8 x i8> %vrsubhn2.i
}

define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vrsubhn_u32:
; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
  %vrsubhn2.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
  ret <4 x i16> %vrsubhn2.i
}

define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vrsubhn_u64:
; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
  %vrsubhn2.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
  ret <2 x i32> %vrsubhn2.i
}
; rsubhn2: rounding subtract-and-narrow into the high half. The narrowed
; intrinsic result is concatenated with the live low half (%r) through the
; <1 x i64> bitcast/shuffle idiom and must select to a single RSUBHN2.
define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vrsubhn_high_s16:
; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
  %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
  %0 = bitcast <8 x i8> %r to <1 x i64>
  %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
  ret <16 x i8> %2
}

define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vrsubhn_high_s32:
; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
  %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
  %0 = bitcast <4 x i16> %r to <1 x i64>
  %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
  ret <8 x i16> %2
}

define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vrsubhn_high_s64:
; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
  %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
  %0 = bitcast <2 x i32> %r to <1 x i64>
  %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
  ret <4 x i32> %2
}

define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vrsubhn_high_u16:
; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
  %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
  %0 = bitcast <8 x i8> %r to <1 x i64>
  %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
  ret <16 x i8> %2
}

define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vrsubhn_high_u32:
; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
  %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
  %0 = bitcast <4 x i16> %r to <1 x i64>
  %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
  ret <8 x i16> %2
}

define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vrsubhn_high_u64:
; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
  %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
  %0 = bitcast <2 x i32> %r to <1 x i64>
  %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64>
  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
  %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
  ret <4 x i32> %2
}
; sabdl/uabdl: widening absolute difference. The absolute difference is
; always non-negative, so the widening step is a zext even in the signed
; tests; selection must fold vabds/vabdu + zext into one instruction.
define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vabdl_s8:
; CHECK: sabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
  %vabd.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b)
  %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16>
  ret <8 x i16> %vmovl.i.i
}

define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vabdl_s16:
; CHECK: sabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
  %vabd2.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b)
  %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32>
  ret <4 x i32> %vmovl.i.i
}

define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vabdl_s32:
; CHECK: sabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
  %vabd2.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b)
  %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64>
  ret <2 x i64> %vmovl.i.i
}

define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vabdl_u8:
; CHECK: uabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
  %vabd.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b)
  %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16>
  ret <8 x i16> %vmovl.i.i
}

define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vabdl_u16:
; CHECK: uabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
  %vabd2.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b)
  %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32>
  ret <4 x i32> %vmovl.i.i
}

define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vabdl_u32:
; CHECK: uabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
  %vabd2.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b)
  %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64>
  ret <2 x i64> %vmovl.i.i
}
; sabal/uabal: widening absolute-difference-and-accumulate. The abd + zext
; + add chain must fold into one accumulating instruction.
define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vabal_s8:
; CHECK: sabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
  %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c)
  %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
  %add.i = add <8 x i16> %vmovl.i.i.i, %a
  ret <8 x i16> %add.i
}

define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
; CHECK-LABEL: test_vabal_s16:
; CHECK: sabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
  %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c)
  %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
  %add.i = add <4 x i32> %vmovl.i.i.i, %a
  ret <4 x i32> %add.i
}

define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
; CHECK-LABEL: test_vabal_s32:
; CHECK: sabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
  %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c)
  %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
  %add.i = add <2 x i64> %vmovl.i.i.i, %a
  ret <2 x i64> %add.i
}

define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vabal_u8:
; CHECK: uabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
  %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c)
  %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
  %add.i = add <8 x i16> %vmovl.i.i.i, %a
  ret <8 x i16> %add.i
}

define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
; CHECK-LABEL: test_vabal_u16:
; CHECK: uabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
  %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c)
  %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
  %add.i = add <4 x i32> %vmovl.i.i.i, %a
  ret <4 x i32> %add.i
}

define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
; CHECK-LABEL: test_vabal_u32:
; CHECK: uabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
  %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c)
  %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
  %add.i = add <2 x i64> %vmovl.i.i.i, %a
  ret <2 x i64> %add.i
}
; sabdl2/uabdl2: widening absolute difference of the high halves. The
; high halves are extracted with shufflevector (upper lane indices); the
; extract + abd + zext chain must select to a single *abdl2.
define <8 x i16> @test_vabdl_high_s8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vabdl_high_s8:
; CHECK: sabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
  %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
  %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
  ret <8 x i16> %vmovl.i.i.i
}

define <4 x i32> @test_vabdl_high_s16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vabdl_high_s16:
; CHECK: sabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
  %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
  ret <4 x i32> %vmovl.i.i.i
}

define <2 x i64> @test_vabdl_high_s32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vabdl_high_s32:
; CHECK: sabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
  %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
  ret <2 x i64> %vmovl.i.i.i
}

define <8 x i16> @test_vabdl_high_u8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vabdl_high_u8:
; CHECK: uabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
  %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
  %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
  ret <8 x i16> %vmovl.i.i.i
}

define <4 x i32> @test_vabdl_high_u16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vabdl_high_u16:
; CHECK: uabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
  %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
  ret <4 x i32> %vmovl.i.i.i
}

define <2 x i64> @test_vabdl_high_u32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vabdl_high_u32:
; CHECK: uabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
  %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
  ret <2 x i64> %vmovl.i.i.i
}
; sabal2/uabal2: absolute-difference-and-accumulate on the high halves.
; high-half extract + abd + zext + add must select to a single *abal2.
define <8 x i16> @test_vabal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vabal_high_s8:
; CHECK: sabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
  %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
  %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16>
  %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a
  ret <8 x i16> %add.i.i
}

define <4 x i32> @test_vabal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
; CHECK-LABEL: test_vabal_high_s16:
; CHECK: sabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
  %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32>
  %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a
  ret <4 x i32> %add.i.i
}

define <2 x i64> @test_vabal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: test_vabal_high_s32:
; CHECK: sabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
  %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64>
  %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a
  ret <2 x i64> %add.i.i
}

define <8 x i16> @test_vabal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vabal_high_u8:
; CHECK: uabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
  %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
  %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16>
  %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a
  ret <8 x i16> %add.i.i
}

define <4 x i32> @test_vabal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
; CHECK-LABEL: test_vabal_high_u16:
; CHECK: uabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
  %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32>
  %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a
  ret <4 x i32> %add.i.i
}

define <2 x i64> @test_vabal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: test_vabal_high_u32:
; CHECK: uabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
  %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64>
  %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a
  ret <2 x i64> %add.i.i
}
; smull/umull: widening multiply via the target intrinsics; direct
; one-to-one lowering checks.
define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vmull_s8:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
  %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b)
  ret <8 x i16> %vmull.i
}

define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vmull_s16:
; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %b)
  ret <4 x i32> %vmull2.i
}

define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vmull_s32:
; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %b)
  ret <2 x i64> %vmull2.i
}

define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vmull_u8:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
  %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b)
  ret <8 x i16> %vmull.i
}

define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vmull_u16:
; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %b)
  ret <4 x i32> %vmull2.i
}

define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vmull_u32:
; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %b)
  ret <2 x i64> %vmull2.i
}
; smull2/umull2: widening multiply of the high halves; the high-half
; shufflevector extracts must fold into the *mull2 instruction.
define <8 x i16> @test_vmull_high_s8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vmull_high_s8:
; CHECK: smull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
  %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
  ret <8 x i16> %vmull.i.i
}

define <4 x i32> @test_vmull_high_s16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vmull_high_s16:
; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
  ret <4 x i32> %vmull2.i.i
}

define <2 x i64> @test_vmull_high_s32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vmull_high_s32:
; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
  ret <2 x i64> %vmull2.i.i
}

define <8 x i16> @test_vmull_high_u8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vmull_high_u8:
; CHECK: umull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
  %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
  ret <8 x i16> %vmull.i.i
}

define <4 x i32> @test_vmull_high_u16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vmull_high_u16:
; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
  ret <4 x i32> %vmull2.i.i
}

define <2 x i64> @test_vmull_high_u32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vmull_high_u32:
; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
  ret <2 x i64> %vmull2.i.i
}
; smlal/umlal: widening multiply-accumulate; the vmull intrinsic feeding
; an add must fold into one accumulating instruction.
define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vmlal_s8:
; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
  %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c)
  %add.i = add <8 x i16> %vmull.i.i, %a
  ret <8 x i16> %add.i
}

define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
; CHECK-LABEL: test_vmlal_s16:
; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
  %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c)
  %add.i = add <4 x i32> %vmull2.i.i, %a
  ret <4 x i32> %add.i
}

define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
; CHECK-LABEL: test_vmlal_s32:
; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
  %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c)
  %add.i = add <2 x i64> %vmull2.i.i, %a
  ret <2 x i64> %add.i
}

define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vmlal_u8:
; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
  %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c)
  %add.i = add <8 x i16> %vmull.i.i, %a
  ret <8 x i16> %add.i
}

define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
; CHECK-LABEL: test_vmlal_u16:
; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
  %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c)
  %add.i = add <4 x i32> %vmull2.i.i, %a
  ret <4 x i32> %add.i
}

define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
; CHECK-LABEL: test_vmlal_u32:
; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
  %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c)
  %add.i = add <2 x i64> %vmull2.i.i, %a
  ret <2 x i64> %add.i
}
; smlal2/umlal2: widening multiply-accumulate on the high halves; the
; high-half extracts + vmull + add must select to a single *mlal2.
define <8 x i16> @test_vmlal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vmlal_high_s8:
; CHECK: smlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
  %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
  %add.i.i = add <8 x i16> %vmull.i.i.i, %a
  ret <8 x i16> %add.i.i
}

define <4 x i32> @test_vmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
; CHECK-LABEL: test_vmlal_high_s16:
; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
  %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
  ret <4 x i32> %add.i.i
}

define <2 x i64> @test_vmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: test_vmlal_high_s32:
; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
  %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
  ret <2 x i64> %add.i.i
}

define <8 x i16> @test_vmlal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vmlal_high_u8:
; CHECK: umlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
  %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
  %add.i.i = add <8 x i16> %vmull.i.i.i, %a
  ret <8 x i16> %add.i.i
}

define <4 x i32> @test_vmlal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
; CHECK-LABEL: test_vmlal_high_u16:
; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
  %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
  ret <4 x i32> %add.i.i
}

define <2 x i64> @test_vmlal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: test_vmlal_high_u32:
; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
  %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
  ret <2 x i64> %add.i.i
}
1554 define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
1555 ; CHECK: test_vmlsl_s8:
1556 ; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
1558 %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c)
1559 %sub.i = sub <8 x i16> %a, %vmull.i.i
1560 ret <8 x i16> %sub.i
; vmlsl_s16: sub(%a, smull(%b, %c)) folds to smlsl (.4s form).
1563 define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
1564 ; CHECK: test_vmlsl_s16:
1565 ; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
1567 %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c)
1568 %sub.i = sub <4 x i32> %a, %vmull2.i.i
1569 ret <4 x i32> %sub.i
; vmlsl_s32: sub(%a, smull(%b, %c)) folds to smlsl (.2d form).
1572 define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
1573 ; CHECK: test_vmlsl_s32:
1574 ; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
1576 %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c)
1577 %sub.i = sub <2 x i64> %a, %vmull2.i.i
1578 ret <2 x i64> %sub.i
; vmlsl_u8: unsigned variant — sub(%a, umull(%b, %c)) folds to umlsl.
1581 define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
1582 ; CHECK: test_vmlsl_u8:
1583 ; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
1585 %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c)
1586 %sub.i = sub <8 x i16> %a, %vmull.i.i
1587 ret <8 x i16> %sub.i
; vmlsl_u16: sub(%a, umull(%b, %c)) folds to umlsl (.4s form).
1590 define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
1591 ; CHECK: test_vmlsl_u16:
1592 ; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
1594 %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c)
1595 %sub.i = sub <4 x i32> %a, %vmull2.i.i
1596 ret <4 x i32> %sub.i
; vmlsl_u32: sub(%a, umull(%b, %c)) folds to umlsl (.2d form).
1599 define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
1600 ; CHECK: test_vmlsl_u32:
1601 ; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
1603 %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c)
1604 %sub.i = sub <2 x i64> %a, %vmull2.i.i
1605 ret <2 x i64> %sub.i
; vmlsl_high_s8: high-half extract + smull + sub must select a single smlsl2.
1608 define <8 x i16> @test_vmlsl_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
1609 ; CHECK: test_vmlsl_high_s8:
1610 ; CHECK: smlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
1612 %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1613 %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1614 %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
1615 %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i
1616 ret <8 x i16> %sub.i.i
; vmlsl_high_s16: high-half extract + smull + sub folds to smlsl2 (.4s form).
1619 define <4 x i32> @test_vmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
1620 ; CHECK: test_vmlsl_high_s16:
1621 ; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
1623 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1624 %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1625 %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
1626 %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
1627 ret <4 x i32> %sub.i.i
; vmlsl_high_s32: high-half extract + smull + sub folds to smlsl2 (.2d form).
1630 define <2 x i64> @test_vmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
1631 ; CHECK: test_vmlsl_high_s32:
1632 ; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
1634 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1635 %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1636 %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
1637 %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
1638 ret <2 x i64> %sub.i.i
; vmlsl_high_u8: unsigned variant — high-half extract + umull + sub folds to umlsl2.
1641 define <8 x i16> @test_vmlsl_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
1642 ; CHECK: test_vmlsl_high_u8:
1643 ; CHECK: umlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
1645 %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1646 %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1647 %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
1648 %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i
1649 ret <8 x i16> %sub.i.i
; vmlsl_high_u16: high-half extract + umull + sub folds to umlsl2 (.4s form).
1652 define <4 x i32> @test_vmlsl_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
1653 ; CHECK: test_vmlsl_high_u16:
1654 ; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
1656 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1657 %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1658 %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
1659 %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
1660 ret <4 x i32> %sub.i.i
; vmlsl_high_u32: high-half extract + umull + sub folds to umlsl2 (.2d form).
1663 define <2 x i64> @test_vmlsl_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
1664 ; CHECK: test_vmlsl_high_u32:
1665 ; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
1667 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1668 %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1669 %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
1670 %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
1671 ret <2 x i64> %sub.i.i
; vqdmull_s16: saturating doubling multiply-long intrinsic selects sqdmull.
1674 define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) {
1675 ; CHECK: test_vqdmull_s16:
1676 ; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
1678 %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
1679 ret <4 x i32> %vqdmull2.i
; vqdmull_s32: saturating doubling multiply-long intrinsic selects sqdmull (.2d).
1682 define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) {
1683 ; CHECK: test_vqdmull_s32:
1684 ; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
1686 %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
1687 ret <2 x i64> %vqdmull2.i
; vqdmlal_s16: vqdmull followed by saturating add (vqadds) of the accumulator
; must fuse into a single sqdmlal.
1690 define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
1691 ; CHECK: test_vqdmlal_s16:
1692 ; CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
1694 %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
1695 %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
1696 ret <4 x i32> %vqdmlal4.i
; vqdmlal_s32: vqdmull + saturating add fuses into sqdmlal (.2d form).
1699 define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
1700 ; CHECK: test_vqdmlal_s32:
1701 ; CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
1703 %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
1704 %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
1705 ret <2 x i64> %vqdmlal4.i
; vqdmlsl_s16: vqdmull + saturating subtract (vqsubs) fuses into sqdmlsl.
1708 define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
1709 ; CHECK: test_vqdmlsl_s16:
1710 ; CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
1712 %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
1713 %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
1714 ret <4 x i32> %vqdmlsl4.i
; vqdmlsl_s32: vqdmull + saturating subtract fuses into sqdmlsl (.2d form).
1717 define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
1718 ; CHECK: test_vqdmlsl_s32:
1719 ; CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
1721 %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
1722 %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
1723 ret <2 x i64> %vqdmlsl4.i
; vqdmull_high_s16: high-half extract of both operands + vqdmull selects sqdmull2.
1726 define <4 x i32> @test_vqdmull_high_s16(<8 x i16> %a, <8 x i16> %b) {
1727 ; CHECK: test_vqdmull_high_s16:
1728 ; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
1730 %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1731 %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1732 %vqdmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
1733 ret <4 x i32> %vqdmull2.i.i
; vqdmull_high_s32: high-half extract + vqdmull selects sqdmull2 (.2d form).
1736 define <2 x i64> @test_vqdmull_high_s32(<4 x i32> %a, <4 x i32> %b) {
1737 ; CHECK: test_vqdmull_high_s32:
1738 ; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
1740 %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1741 %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1742 %vqdmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
1743 ret <2 x i64> %vqdmull2.i.i
; vqdmlal_high_s16: high-half vqdmull + saturating add fuses into sqdmlal2.
1746 define <4 x i32> @test_vqdmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
1747 ; CHECK: test_vqdmlal_high_s16:
1748 ; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
1750 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1751 %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1752 %vqdmlal2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
1753 %vqdmlal4.i.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i)
1754 ret <4 x i32> %vqdmlal4.i.i
; vqdmlal_high_s32: high-half vqdmull + saturating add fuses into sqdmlal2 (.2d).
1757 define <2 x i64> @test_vqdmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
1758 ; CHECK: test_vqdmlal_high_s32:
1759 ; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
1761 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1762 %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1763 %vqdmlal2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
1764 %vqdmlal4.i.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i.i)
1765 ret <2 x i64> %vqdmlal4.i.i
; vqdmlsl_high_s16: high-half vqdmull + saturating subtract fuses into sqdmlsl2.
1768 define <4 x i32> @test_vqdmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
1769 ; CHECK: test_vqdmlsl_high_s16:
1770 ; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
1772 %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1773 %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1774 %vqdmlsl2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
1775 %vqdmlsl4.i.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i.i)
1776 ret <4 x i32> %vqdmlsl4.i.i
; vqdmlsl_high_s32: high-half vqdmull + saturating subtract fuses into sqdmlsl2 (.2d).
1779 define <2 x i64> @test_vqdmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
1780 ; CHECK: test_vqdmlsl_high_s32:
1781 ; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
1783 %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1784 %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1785 %vqdmlsl2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
1786 %vqdmlsl4.i.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i.i)
1787 ret <2 x i64> %vqdmlsl4.i.i
; vmull_p8: polynomial multiply-long intrinsic selects pmull (.8h form).
1790 define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) {
1791 ; CHECK: test_vmull_p8:
1792 ; CHECK: pmull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
1794 %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b)
1795 ret <8 x i16> %vmull.i
; vmull_high_p8: high-half extract + polynomial multiply-long selects pmull2.
1798 define <8 x i16> @test_vmull_high_p8(<16 x i8> %a, <16 x i8> %b) {
1799 ; CHECK: test_vmull_high_p8:
1800 ; CHECK: pmull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
1802 %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1803 %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1804 %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
1805 ret <8 x i16> %vmull.i.i
; vmull_p64: 64x64 -> 128-bit polynomial multiply. Scalar i64 operands are
; packed into <1 x i64> vectors for the vmull.p64 intrinsic; expect pmull .1q.
1808 define i128 @test_vmull_p64(i64 %a, i64 %b) #4 {
1809 ; CHECK: test_vmull_p64
1810 ; CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d
1812 %vmull.i = insertelement <1 x i64> undef, i64 %a, i32 0
1813 %vmull1.i = insertelement <1 x i64> undef, i64 %b, i32 0
1814 %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64> %vmull.i, <1 x i64> %vmull1.i) #1
1815 %vmull3.i = bitcast <16 x i8> %vmull2.i to i128
; Fix: the function was missing its terminator — every LLVM IR basic block must
; end in a terminator instruction, and the declared i128 result must be returned
; (mirrors the ret in test_vmull_high_p64 below).
ret i128 %vmull3.i
; vmull_high_p64: extract lane 1 of each <2 x i64> operand, pack into <1 x i64>
; vectors, and call vmull.p64 — expect a single pmull2 .1q selecting the high lanes.
1819 define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 {
1820 ; CHECK: test_vmull_high_p64
1821 ; CHECK: pmull2 {{v[0-9]+}}.1q, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
1823 %0 = extractelement <2 x i64> %a, i32 1
1824 %1 = extractelement <2 x i64> %b, i32 1
1825 %vmull.i.i = insertelement <1 x i64> undef, i64 %0, i32 0
1826 %vmull1.i.i = insertelement <1 x i64> undef, i64 %1, i32 0
1827 %vmull2.i.i = tail call <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64> %vmull.i.i, <1 x i64> %vmull1.i.i) #1
1828 %vmull3.i.i = bitcast <16 x i8> %vmull2.i.i to i128
1829 ret i128 %vmull3.i.i
1832 declare <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64>, <1 x i64>) #5