1 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
; Named aggregate types: each struct wraps a single array of 2, 3 or 4 vectors
; of one vector type (64-bit and 128-bit wide element layouts). The multi-vector
; load tests visible in this file use them as return types.
3 %struct.int8x16x2_t = type { [2 x <16 x i8>] }
4 %struct.int16x8x2_t = type { [2 x <8 x i16>] }
5 %struct.int32x4x2_t = type { [2 x <4 x i32>] }
6 %struct.int64x2x2_t = type { [2 x <2 x i64>] }
7 %struct.float32x4x2_t = type { [2 x <4 x float>] }
8 %struct.float64x2x2_t = type { [2 x <2 x double>] }
9 %struct.int8x8x2_t = type { [2 x <8 x i8>] }
10 %struct.int16x4x2_t = type { [2 x <4 x i16>] }
11 %struct.int32x2x2_t = type { [2 x <2 x i32>] }
12 %struct.int64x1x2_t = type { [2 x <1 x i64>] }
13 %struct.float32x2x2_t = type { [2 x <2 x float>] }
14 %struct.float64x1x2_t = type { [2 x <1 x double>] }
15 %struct.int8x16x3_t = type { [3 x <16 x i8>] }
16 %struct.int16x8x3_t = type { [3 x <8 x i16>] }
17 %struct.int32x4x3_t = type { [3 x <4 x i32>] }
18 %struct.int64x2x3_t = type { [3 x <2 x i64>] }
19 %struct.float32x4x3_t = type { [3 x <4 x float>] }
20 %struct.float64x2x3_t = type { [3 x <2 x double>] }
21 %struct.int8x8x3_t = type { [3 x <8 x i8>] }
22 %struct.int16x4x3_t = type { [3 x <4 x i16>] }
23 %struct.int32x2x3_t = type { [3 x <2 x i32>] }
24 %struct.int64x1x3_t = type { [3 x <1 x i64>] }
25 %struct.float32x2x3_t = type { [3 x <2 x float>] }
26 %struct.float64x1x3_t = type { [3 x <1 x double>] }
27 %struct.int8x16x4_t = type { [4 x <16 x i8>] }
28 %struct.int16x8x4_t = type { [4 x <8 x i16>] }
29 %struct.int32x4x4_t = type { [4 x <4 x i32>] }
30 %struct.int64x2x4_t = type { [4 x <2 x i64>] }
31 %struct.float32x4x4_t = type { [4 x <4 x float>] }
32 %struct.float64x2x4_t = type { [4 x <2 x double>] }
33 %struct.int8x8x4_t = type { [4 x <8 x i8>] }
34 %struct.int16x4x4_t = type { [4 x <4 x i16>] }
35 %struct.int32x2x4_t = type { [4 x <2 x i32>] }
36 %struct.int64x1x4_t = type { [4 x <1 x i64>] }
37 %struct.float32x2x4_t = type { [4 x <2 x float>] }
38 %struct.float64x1x4_t = type { [4 x <1 x double>] }
40 define <16 x i8> @test_ld_from_poll_v16i8(<16 x i8> %a) {
; Adds a constant <16 x i8> vector with differing lanes to %a. The expected
; code is an adrp of a local label immediately followed by a q-register ldr
; through that label's :lo12: offset.
; NOTE(review): "poll" in the name is presumably a typo for (constant) "pool".
; NOTE(review): lane 11 of the constant is 2 where the sequence suggests 12;
; the patterns below do not depend on the lane values -- confirm it is harmless.
41 ; CHECK-LABEL: test_ld_from_poll_v16i8
42 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
43 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
45 %b = add <16 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 2, i8 13, i8 14, i8 15, i8 16>
49 define <8 x i16> @test_ld_from_poll_v8i16(<8 x i16> %a) {
; Adds the constant vector <1..8> of i16 to %a; expects an adrp of a local
; label immediately followed by a q-register ldr via its :lo12: offset.
50 ; CHECK-LABEL: test_ld_from_poll_v8i16
51 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
52 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
54 %b = add <8 x i16> %a, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
58 define <4 x i32> @test_ld_from_poll_v4i32(<4 x i32> %a) {
; Adds the constant vector <1, 2, 3, 4> of i32 to %a; expects an adrp of a
; local label immediately followed by a q-register ldr via its :lo12: offset.
59 ; CHECK-LABEL: test_ld_from_poll_v4i32
60 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
61 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
63 %b = add <4 x i32> %a, <i32 1, i32 2, i32 3, i32 4>
67 define <2 x i64> @test_ld_from_poll_v2i64(<2 x i64> %a) {
; Adds the constant vector <1, 2> of i64 to %a; expects an adrp of a local
; label immediately followed by a q-register ldr via its :lo12: offset.
68 ; CHECK-LABEL: test_ld_from_poll_v2i64
69 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
70 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
72 %b = add <2 x i64> %a, <i64 1, i64 2>
76 define <4 x float> @test_ld_from_poll_v4f32(<4 x float> %a) {
; Floating-point variant: fadd of the constant <1.0, 2.0, 3.0, 4.0> to %a;
; expects an adrp of a local label immediately followed by a q-register ldr
; via its :lo12: offset.
77 ; CHECK-LABEL: test_ld_from_poll_v4f32
78 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
79 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
81 %b = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
85 define <2 x double> @test_ld_from_poll_v2f64(<2 x double> %a) {
; Floating-point variant: fadd of the constant <1.0, 2.0> of double to %a;
; expects an adrp of a local label immediately followed by a q-register ldr
; via its :lo12: offset.
86 ; CHECK-LABEL: test_ld_from_poll_v2f64
87 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
88 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
90 %b = fadd <2 x double> %a, <double 1.0, double 2.0>
94 define <8 x i8> @test_ld_from_poll_v8i8(<8 x i8> %a) {
; 64-bit vector variant: adds the constant <1..8> of i8 to %a; expects an adrp
; of a local label immediately followed by a d-register ldr via its :lo12:
; offset.
95 ; CHECK-LABEL: test_ld_from_poll_v8i8
96 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
97 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
99 %b = add <8 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
103 define <4 x i16> @test_ld_from_poll_v4i16(<4 x i16> %a) {
; 64-bit vector variant: adds the constant <1, 2, 3, 4> of i16 to %a; expects
; an adrp of a local label immediately followed by a d-register ldr via its
; :lo12: offset.
104 ; CHECK-LABEL: test_ld_from_poll_v4i16
105 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
106 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
108 %b = add <4 x i16> %a, <i16 1, i16 2, i16 3, i16 4>
112 define <2 x i32> @test_ld_from_poll_v2i32(<2 x i32> %a) {
; 64-bit vector variant: adds the constant <1, 2> of i32 to %a; expects an
; adrp of a local label immediately followed by a d-register ldr via its
; :lo12: offset.
113 ; CHECK-LABEL: test_ld_from_poll_v2i32
114 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
115 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
117 %b = add <2 x i32> %a, <i32 1, i32 2>
121 define <16 x i8> @test_vld1q_dup_s8(i8* %a) {
; Loads one i8 from %a, inserts it into lane 0 of an undef vector and
; broadcasts lane 0 with an all-zero shuffle mask. Expected to be selected as
; a single ld1r into a .16b register addressed by x0.
122 ; CHECK-LABEL: test_vld1q_dup_s8
123 ; CHECK: ld1r {{{v[0-9]+}}.16b}, [x0]
125 %0 = load i8* %a, align 1
126 %1 = insertelement <16 x i8> undef, i8 %0, i32 0
127 %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
131 define <8 x i16> @test_vld1q_dup_s16(i16* %a) {
; Loads one i16 from %a, inserts it into lane 0 of an undef vector and
; broadcasts lane 0 with an all-zero shuffle mask. Expected to be selected as
; a single ld1r into a .8h register addressed by x0.
132 ; CHECK-LABEL: test_vld1q_dup_s16
133 ; CHECK: ld1r {{{v[0-9]+}}.8h}, [x0]
135 %0 = load i16* %a, align 2
136 %1 = insertelement <8 x i16> undef, i16 %0, i32 0
137 %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
141 define <4 x i32> @test_vld1q_dup_s32(i32* %a) {
; Loads one i32 from %a, inserts it into lane 0 of an undef vector and
; broadcasts lane 0 with an all-zero shuffle mask. Expected to be selected as
; a single ld1r into a .4s register addressed by x0.
142 ; CHECK-LABEL: test_vld1q_dup_s32
143 ; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
145 %0 = load i32* %a, align 4
146 %1 = insertelement <4 x i32> undef, i32 %0, i32 0
147 %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
151 define <2 x i64> @test_vld1q_dup_s64(i64* %a) {
; Loads one i64 from %a, inserts it into lane 0 of an undef vector and
; broadcasts lane 0 with an all-zero shuffle mask. Expected to be selected as
; a single ld1r into a .2d register addressed by x0.
152 ; CHECK-LABEL: test_vld1q_dup_s64
153 ; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
155 %0 = load i64* %a, align 8
156 %1 = insertelement <2 x i64> undef, i64 %0, i32 0
157 %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
161 define <4 x float> @test_vld1q_dup_f32(float* %a) {
; Loads one float from %a, inserts it into lane 0 of an undef vector and
; broadcasts lane 0 with an all-zero shuffle mask. Expected to be selected as
; a single ld1r into a .4s register addressed by x0.
162 ; CHECK-LABEL: test_vld1q_dup_f32
163 ; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
165 %0 = load float* %a, align 4
166 %1 = insertelement <4 x float> undef, float %0, i32 0
167 %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
168 ret <4 x float> %lane
171 define <2 x double> @test_vld1q_dup_f64(double* %a) {
; Loads one double from %a, inserts it into lane 0 of an undef vector and
; broadcasts lane 0 with an all-zero shuffle mask. Expected to be selected as
; a single ld1r into a .2d register addressed by x0.
172 ; CHECK-LABEL: test_vld1q_dup_f64
173 ; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
175 %0 = load double* %a, align 8
176 %1 = insertelement <2 x double> undef, double %0, i32 0
177 %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
178 ret <2 x double> %lane
181 define <8 x i8> @test_vld1_dup_s8(i8* %a) {
; 64-bit vector variant: loads one i8 from %a, inserts it into lane 0 of an
; undef vector and broadcasts lane 0 with an all-zero shuffle mask. Expected
; to be selected as a single ld1r into a .8b register addressed by x0.
182 ; CHECK-LABEL: test_vld1_dup_s8
183 ; CHECK: ld1r {{{v[0-9]+}}.8b}, [x0]
185 %0 = load i8* %a, align 1
186 %1 = insertelement <8 x i8> undef, i8 %0, i32 0
187 %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
191 define <4 x i16> @test_vld1_dup_s16(i16* %a) {
; 64-bit vector variant: loads one i16 from %a, inserts it into lane 0 of an
; undef vector and broadcasts lane 0 with an all-zero shuffle mask. Expected
; to be selected as a single ld1r into a .4h register addressed by x0.
192 ; CHECK-LABEL: test_vld1_dup_s16
193 ; CHECK: ld1r {{{v[0-9]+}}.4h}, [x0]
195 %0 = load i16* %a, align 2
196 %1 = insertelement <4 x i16> undef, i16 %0, i32 0
197 %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
201 define <2 x i32> @test_vld1_dup_s32(i32* %a) {
; 64-bit vector variant: loads one i32 from %a, inserts it into lane 0 of an
; undef vector and broadcasts lane 0 with an all-zero shuffle mask. Expected
; to be selected as a single ld1r into a .2s register addressed by x0.
202 ; CHECK-LABEL: test_vld1_dup_s32
203 ; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
205 %0 = load i32* %a, align 4
206 %1 = insertelement <2 x i32> undef, i32 %0, i32 0
207 %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
211 define <1 x i64> @test_vld1_dup_s64(i64* %a) {
; Single-lane variant: loads one i64 from %a and inserts it into lane 0 of a
; <1 x i64>; no shuffle is involved in the visible lines. Expected to be
; selected as a single ld1r into a .1d register addressed by x0.
212 ; CHECK-LABEL: test_vld1_dup_s64
213 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
215 %0 = load i64* %a, align 8
216 %1 = insertelement <1 x i64> undef, i64 %0, i32 0
220 define <2 x float> @test_vld1_dup_f32(float* %a) {
; 64-bit vector variant: loads one float from %a, inserts it into lane 0 of an
; undef vector and broadcasts lane 0 with an all-zero shuffle mask. Expected
; to be selected as a single ld1r into a .2s register addressed by x0.
221 ; CHECK-LABEL: test_vld1_dup_f32
222 ; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
224 %0 = load float* %a, align 4
225 %1 = insertelement <2 x float> undef, float %0, i32 0
226 %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
227 ret <2 x float> %lane
230 define <1 x double> @test_vld1_dup_f64(double* %a) {
; Single-lane variant: loads one double from %a and inserts it into lane 0 of
; a <1 x double>; no shuffle is involved in the visible lines. Expected to be
; selected as a single ld1r into a .1d register addressed by x0.
231 ; CHECK-LABEL: test_vld1_dup_f64
232 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
234 %0 = load double* %a, align 8
235 %1 = insertelement <1 x double> undef, double %0, i32 0
239 define <1 x i64> @testDUP.v1i64(i64* %a, i64* %b) #0 {
; Loads an i64 from %a, stores the same value to %b, and also returns it as a
; <1 x i64>, so the loaded value has two users.
240 ; Because a store also uses the loaded value %1, the LD1R pattern cannot be selected.
241 ; Instead an x-register LDR, an FMOV into a d register, and an x-register STR are expected.
242 ; CHECK-LABEL: testDUP.v1i64
243 ; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}]
244 ; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
245 ; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}]
246 %1 = load i64* %a, align 8
247 store i64 %1, i64* %b, align 8
248 %vecinit.i = insertelement <1 x i64> undef, i64 %1, i32 0
249 ret <1 x i64> %vecinit.i
252 define <1 x double> @testDUP.v1f64(double* %a, double* %b) #0 {
; Loads a double from %a, stores the same value to %b, and also returns it as
; a <1 x double>, so the loaded value has two users.
253 ; Because a store also uses the loaded value %1, the LD1R pattern cannot be selected.
254 ; Instead a d-register LDR and a d-register STR are expected; the patterns below list no FMOV.
255 ; CHECK-LABEL: testDUP.v1f64
256 ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}]
257 ; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}]
258 %1 = load double* %a, align 8
259 store double %1, double* %b, align 8
260 %vecinit.i = insertelement <1 x double> undef, double %1, i32 0
261 ret <1 x double> %vecinit.i
264 define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) {
; Calls the vld2lane intrinsic on lane 0 with undef input vectors, broadcasts
; lane 0 of each of the two results with an all-zero shuffle mask, and packs
; them into %struct.int8x16x2_t. Expected to be selected as one ld2r of two
; .16b registers addressed by x0.
; NOTE(review): the trailing i32 operand (1) is presumably the alignment -- confirm.
265 ; CHECK-LABEL: test_vld2q_dup_s8
266 ; CHECK: ld2r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
268 %vld_dup = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
269 %0 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 0
270 %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
271 %1 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 1
272 %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
273 %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %lane, 0, 0
274 %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
275 ret %struct.int8x16x2_t %.fca.0.1.insert
278 define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) {
; Bitcasts %a to i8*, calls the vld2lane intrinsic on lane 0 with undef input
; vectors, broadcasts lane 0 of both results, and packs them into
; %struct.int16x8x2_t. Expected: one ld2r of two .8h registers from [x0].
279 ; CHECK-LABEL: test_vld2q_dup_s16
280 ; CHECK: ld2r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
282 %0 = bitcast i16* %a to i8*
283 %vld_dup = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
284 %1 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 0
285 %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
286 %2 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 1
287 %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
288 %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %lane, 0, 0
289 %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
290 ret %struct.int16x8x2_t %.fca.0.1.insert
293 define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) {
; Bitcasts %a to i8*, calls the vld2lane intrinsic on lane 0 with undef input
; vectors, broadcasts lane 0 of both results, and packs them into
; %struct.int32x4x2_t. Expected: one ld2r of two .4s registers from [x0].
294 ; CHECK-LABEL: test_vld2q_dup_s32
295 ; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
297 %0 = bitcast i32* %a to i8*
298 %vld_dup = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
299 %1 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 0
300 %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
301 %2 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 1
302 %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
303 %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %lane, 0, 0
304 %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
305 ret %struct.int32x4x2_t %.fca.0.1.insert
308 define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) {
; Bitcasts %a to i8*, calls the vld2lane intrinsic on lane 0 with undef input
; vectors, broadcasts lane 0 of both results, and packs them into
; %struct.int64x2x2_t. Expected: one ld2r of two .2d registers from [x0].
309 ; CHECK-LABEL: test_vld2q_dup_s64
310 ; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
312 %0 = bitcast i64* %a to i8*
313 %vld_dup = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
314 %1 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 0
315 %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
316 %2 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 1
317 %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
318 %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %lane, 0, 0
319 %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
320 ret %struct.int64x2x2_t %.fca.0.1.insert
323 define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) {
; Bitcasts %a to i8*, calls the vld2lane intrinsic on lane 0 with undef input
; vectors, broadcasts lane 0 of both results, and packs them into
; %struct.float32x4x2_t. Expected: one ld2r of two .4s registers from [x0].
324 ; CHECK-LABEL: test_vld2q_dup_f32
325 ; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
327 %0 = bitcast float* %a to i8*
328 %vld_dup = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
329 %1 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 0
330 %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
331 %2 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 1
332 %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
333 %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %lane, 0, 0
334 %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
335 ret %struct.float32x4x2_t %.fca.0.1.insert
338 define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) {
; Bitcasts %a to i8*, calls the vld2lane intrinsic on lane 0 with undef input
; vectors, broadcasts lane 0 of both results, and packs them into
; %struct.float64x2x2_t. Expected: one ld2r of two .2d registers from [x0].
339 ; CHECK-LABEL: test_vld2q_dup_f64
340 ; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
342 %0 = bitcast double* %a to i8*
343 %vld_dup = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
344 %1 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 0
345 %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
346 %2 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 1
347 %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
348 %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %lane, 0, 0
349 %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
350 ret %struct.float64x2x2_t %.fca.0.1.insert
353 define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) {
; 64-bit vector variant: calls the vld2lane intrinsic on lane 0 with undef
; input vectors, broadcasts lane 0 of both results, and packs them into
; %struct.int8x8x2_t. Expected: one ld2r of two .8b registers from [x0].
354 ; CHECK-LABEL: test_vld2_dup_s8
355 ; CHECK: ld2r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
357 %vld_dup = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
358 %0 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 0
359 %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
360 %1 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 1
361 %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
362 %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %lane, 0, 0
363 %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
364 ret %struct.int8x8x2_t %.fca.0.1.insert
367 define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) {
; 64-bit vector variant: bitcasts %a to i8*, calls the vld2lane intrinsic on
; lane 0 with undef input vectors, broadcasts lane 0 of both results, and packs
; them into %struct.int16x4x2_t. Expected: one ld2r of two .4h registers from [x0].
368 ; CHECK-LABEL: test_vld2_dup_s16
369 ; CHECK: ld2r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
371 %0 = bitcast i16* %a to i8*
372 %vld_dup = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
373 %1 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 0
374 %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
375 %2 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 1
376 %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
377 %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %lane, 0, 0
378 %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
379 ret %struct.int16x4x2_t %.fca.0.1.insert
382 define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) {
; 64-bit vector variant: bitcasts %a to i8*, calls the vld2lane intrinsic on
; lane 0 with undef input vectors, broadcasts lane 0 of both results, and packs
; them into %struct.int32x2x2_t. Expected: one ld2r of two .2s registers from [x0].
383 ; CHECK-LABEL: test_vld2_dup_s32
384 ; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
386 %0 = bitcast i32* %a to i8*
387 %vld_dup = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
388 %1 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 0
389 %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
390 %2 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 1
391 %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
392 %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %lane, 0, 0
393 %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
394 ret %struct.int32x2x2_t %.fca.0.1.insert
397 define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) {
; Single-lane variant: unlike the multi-lane dup tests this calls the plain
; vld2 intrinsic (not vld2lane) on <1 x i64> and uses the two results directly,
; with no shuffles. Expected: an ld1 of two .1d registers from [x0], not ld2r.
398 ; CHECK-LABEL: test_vld2_dup_s64
399 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
401 %0 = bitcast i64* %a to i8*
402 %vld_dup = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %0, i32 8)
403 %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 0
404 %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 1
405 %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
406 %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
407 ret %struct.int64x1x2_t %.fca.0.1.insert
410 define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) {
; 64-bit vector variant: bitcasts %a to i8*, calls the vld2lane intrinsic on
; lane 0 with undef input vectors, broadcasts lane 0 of both results, and packs
; them into %struct.float32x2x2_t. Expected: one ld2r of two .2s registers from [x0].
411 ; CHECK-LABEL: test_vld2_dup_f32
412 ; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
414 %0 = bitcast float* %a to i8*
415 %vld_dup = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
416 %1 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 0
417 %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
418 %2 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 1
419 %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
420 %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %lane, 0, 0
421 %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
422 ret %struct.float32x2x2_t %.fca.0.1.insert
425 define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) {
; Single-lane variant: unlike the multi-lane dup tests this calls the plain
; vld2 intrinsic (not vld2lane) on <1 x double> and uses the two results
; directly, with no shuffles. Expected: an ld1 of two .1d registers from [x0],
; not ld2r.
426 ; CHECK-LABEL: test_vld2_dup_f64
427 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
429 %0 = bitcast double* %a to i8*
430 %vld_dup = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %0, i32 8)
431 %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 0
432 %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 1
433 %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
434 %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
435 ret %struct.float64x1x2_t %.fca.0.1.insert
438 define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) {
; Calls the vld3lane intrinsic on lane 0 with undef input vectors, broadcasts
; lane 0 of each of the three results with an all-zero shuffle mask, and packs
; them into %struct.int8x16x3_t. Expected: one ld3r of three .16b registers
; from [x0].
439 ; CHECK-LABEL: test_vld3q_dup_s8
440 ; CHECK: ld3r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
442 %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
443 %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
444 %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
445 %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
446 %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
447 %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
448 %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
449 %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %lane, 0, 0
450 %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
451 %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
452 ret %struct.int8x16x3_t %.fca.0.2.insert
455 define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) {
; Bitcasts %a to i8*, calls the vld3lane intrinsic on lane 0 with undef input
; vectors, broadcasts lane 0 of all three results, and packs them into
; %struct.int16x8x3_t. Expected: one ld3r of three .8h registers from [x0].
456 ; CHECK-LABEL: test_vld3q_dup_s16
457 ; CHECK: ld3r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
459 %0 = bitcast i16* %a to i8*
460 %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
461 %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
462 %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
463 %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
464 %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
465 %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
466 %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
467 %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %lane, 0, 0
468 %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
469 %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
470 ret %struct.int16x8x3_t %.fca.0.2.insert
473 define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) {
; Bitcasts %a to i8*, calls the vld3lane intrinsic on lane 0 with undef input
; vectors, broadcasts lane 0 of all three results, and packs them into
; %struct.int32x4x3_t. Expected: one ld3r of three .4s registers from [x0].
474 ; CHECK-LABEL: test_vld3q_dup_s32
475 ; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
477 %0 = bitcast i32* %a to i8*
478 %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
479 %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
480 %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
481 %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
482 %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
483 %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
484 %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
485 %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %lane, 0, 0
486 %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
487 %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
488 ret %struct.int32x4x3_t %.fca.0.2.insert
491 define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) {
; Bitcasts %a to i8*, calls the vld3lane intrinsic on lane 0 with undef input
; vectors, broadcasts lane 0 of all three results, and packs them into
; %struct.int64x2x3_t. Expected: one ld3r of three .2d registers from [x0].
492 ; CHECK-LABEL: test_vld3q_dup_s64
493 ; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
495 %0 = bitcast i64* %a to i8*
496 %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
497 %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
498 %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
499 %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
500 %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
501 %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
502 %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
503 %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %lane, 0, 0
504 %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
505 %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
506 ret %struct.int64x2x3_t %.fca.0.2.insert
509 define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) {
; Bitcasts %a to i8*, calls the vld3lane intrinsic on lane 0 with undef input
; vectors, broadcasts lane 0 of all three results, and packs them into
; %struct.float32x4x3_t. Expected: one ld3r of three .4s registers from [x0].
510 ; CHECK-LABEL: test_vld3q_dup_f32
511 ; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
513 %0 = bitcast float* %a to i8*
514 %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
515 %1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
516 %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
517 %2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
518 %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
519 %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
520 %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
521 %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %lane, 0, 0
522 %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
523 %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
524 ret %struct.float32x4x3_t %.fca.0.2.insert
527 define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) {
; Bitcasts %a to i8*, calls the vld3lane intrinsic on lane 0 with undef input
; vectors, broadcasts lane 0 of all three results, and packs them into
; %struct.float64x2x3_t. Expected: one ld3r of three .2d registers from [x0].
528 ; CHECK-LABEL: test_vld3q_dup_f64
529 ; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
531 %0 = bitcast double* %a to i8*
532 %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
533 %1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
534 %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
535 %2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
536 %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
537 %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
538 %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
539 %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %lane, 0, 0
540 %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
541 %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
542 ret %struct.float64x2x3_t %.fca.0.2.insert
545 define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) {
; 64-bit vector variant: calls the vld3lane intrinsic on lane 0 with undef
; input vectors, broadcasts lane 0 of all three results, and packs them into
; %struct.int8x8x3_t. Expected: one ld3r of three .8b registers from [x0].
546 ; CHECK-LABEL: test_vld3_dup_s8
547 ; CHECK: ld3r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
549 %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
550 %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
551 %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
552 %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
553 %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
554 %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
555 %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
556 %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %lane, 0, 0
557 %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
558 %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
559 ret %struct.int8x8x3_t %.fca.0.2.insert
562 define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) {
; 64-bit vector variant: bitcasts %a to i8*, calls the vld3lane intrinsic on
; lane 0 with undef input vectors, broadcasts lane 0 of all three results, and
; packs them into %struct.int16x4x3_t. Expected: one ld3r of three .4h
; registers from [x0].
563 ; CHECK-LABEL: test_vld3_dup_s16
564 ; CHECK: ld3r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
566 %0 = bitcast i16* %a to i8*
567 %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
568 %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
569 %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
570 %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
571 %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
572 %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
573 %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
574 %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %lane, 0, 0
575 %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
576 %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
577 ret %struct.int16x4x3_t %.fca.0.2.insert
580 define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) {
; 64-bit vector variant: bitcasts %a to i8*, calls the vld3lane intrinsic on
; lane 0 with undef input vectors, broadcasts lane 0 of all three results, and
; packs them into %struct.int32x2x3_t. Expected: one ld3r of three .2s
; registers from [x0].
581 ; CHECK-LABEL: test_vld3_dup_s32
582 ; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
584 %0 = bitcast i32* %a to i8*
585 %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
586 %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
587 %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
588 %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
589 %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
590 %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
591 %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
592 %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %lane, 0, 0
593 %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
594 %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
595 ret %struct.int32x2x3_t %.fca.0.2.insert
598 define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) {
; Single-lane variant: unlike the multi-lane dup tests this calls the plain
; vld3 intrinsic (not vld3lane) on <1 x i64> and uses the three results
; directly, with no shuffles. Expected: an ld1 of three .1d registers from
; [x0], not ld3r.
599 ; CHECK-LABEL: test_vld3_dup_s64
600 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
602 %0 = bitcast i64* %a to i8*
603 %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %0, i32 8)
604 %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
605 %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
606 %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
607 %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
608 %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
609 %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
610 ret %struct.int64x1x3_t %.fca.0.2.insert
613 define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) {
; 64-bit vector variant: bitcasts %a to i8*, calls the vld3lane intrinsic on
; lane 0 with undef input vectors, broadcasts lane 0 of all three results, and
; packs them into %struct.float32x2x3_t. Expected: one ld3r of three .2s
; registers from [x0].
614 ; CHECK-LABEL: test_vld3_dup_f32
615 ; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
617 %0 = bitcast float* %a to i8*
618 %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
619 %1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
620 %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
621 %2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
622 %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
623 %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
624 %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
625 %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %lane, 0, 0
626 %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
627 %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
628 ret %struct.float32x2x3_t %.fca.0.2.insert
631 define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) {
; Single-lane variant: unlike the multi-lane dup tests this calls the plain
; vld3 intrinsic (not vld3lane) on <1 x double> and uses the three results
; directly, with no shuffles. Expected: an ld1 of three .1d registers from
; [x0], not ld3r.
632 ; CHECK-LABEL: test_vld3_dup_f64
633 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
635 %0 = bitcast double* %a to i8*
636 %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %0, i32 8)
637 %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
638 %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
639 %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
640 %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
641 %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
642 %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
643 ret %struct.float64x1x3_t %.fca.0.2.insert
; vld4 "dup" on <16 x i8>: %a is already i8*, so it is passed to
; vld4lane.v16i8 without a bitcast. The four results are each splatted from
; element 0 (zero-mask shufflevector) and packed into the return struct.
; FileCheck expects a single ld4r on .16b registers based at x0.
646 define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) {
647 ; CHECK-LABEL: test_vld4q_dup_s8
648 ; CHECK: ld4r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
650 %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
651 %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
652 %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
653 %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
654 %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
655 %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
656 %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
657 %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 3
658 %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> zeroinitializer
659 %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %lane, 0, 0
660 %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
661 %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
662 %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %lane3, 0, 3
663 ret %struct.int8x16x4_t %.fca.0.3.insert
; vld4 "dup" on <8 x i16>: the i16* argument is bitcast to i8*, fed to
; vld4lane.v8i16 together with four undef vectors, and every result is
; splatted from element 0 before being packed into the return struct.
; FileCheck expects a single ld4r on .8h registers based at x0.
666 define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) {
667 ; CHECK-LABEL: test_vld4q_dup_s16
668 ; CHECK: ld4r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
670 %0 = bitcast i16* %a to i8*
671 %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
672 %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
673 %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
674 %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
675 %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
676 %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
677 %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
678 %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 3
679 %lane3 = shufflevector <8 x i16> %4, <8 x i16> undef, <8 x i32> zeroinitializer
680 %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %lane, 0, 0
681 %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
682 %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
683 %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %lane3, 0, 3
684 ret %struct.int16x8x4_t %.fca.0.3.insert
; vld4 "dup" on <4 x i32>: vld4lane.v4i32 on undef inputs followed by four
; element-0 splats (zero-mask shufflevector), packed into the return struct.
; FileCheck expects a single ld4r on .4s registers based at x0.
687 define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) {
688 ; CHECK-LABEL: test_vld4q_dup_s32
689 ; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
691 %0 = bitcast i32* %a to i8*
692 %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
693 %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
694 %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
695 %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
696 %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
697 %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
698 %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
699 %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 3
700 %lane3 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer
701 %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %lane, 0, 0
702 %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
703 %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
704 %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %lane3, 0, 3
705 ret %struct.int32x4x4_t %.fca.0.3.insert
; vld4 "dup" on <2 x i64>: vld4lane.v2i64 on undef inputs followed by four
; element-0 splats (zero-mask shufflevector), packed into the return struct.
; FileCheck expects a single ld4r on .2d registers based at x0.
708 define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) {
709 ; CHECK-LABEL: test_vld4q_dup_s64
710 ; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
712 %0 = bitcast i64* %a to i8*
713 %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
714 %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
715 %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
716 %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
717 %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
718 %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
719 %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
720 %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 3
721 %lane3 = shufflevector <2 x i64> %4, <2 x i64> undef, <2 x i32> zeroinitializer
722 %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %lane, 0, 0
723 %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
724 %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
725 %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %lane3, 0, 3
726 ret %struct.int64x2x4_t %.fca.0.3.insert
; vld4 "dup" on <4 x float>: vld4lane.v4f32 on undef inputs followed by four
; element-0 splats (zero-mask shufflevector), packed into the return struct.
; FileCheck expects a single ld4r on .4s registers based at x0.
729 define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) {
730 ; CHECK-LABEL: test_vld4q_dup_f32
731 ; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
733 %0 = bitcast float* %a to i8*
734 %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
735 %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
736 %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
737 %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
738 %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
739 %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
740 %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
741 %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 3
742 %lane3 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> zeroinitializer
743 %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %lane, 0, 0
744 %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
745 %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
746 %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %lane3, 0, 3
747 ret %struct.float32x4x4_t %.fca.0.3.insert
; vld4 "dup" on <2 x double>: vld4lane.v2f64 on undef inputs followed by four
; element-0 splats (zero-mask shufflevector), packed into the return struct.
; FileCheck expects a single ld4r on .2d registers based at x0.
750 define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) {
751 ; CHECK-LABEL: test_vld4q_dup_f64
752 ; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
754 %0 = bitcast double* %a to i8*
755 %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
756 %1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
757 %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
758 %2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
759 %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
760 %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
761 %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
762 %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 3
763 %lane3 = shufflevector <2 x double> %4, <2 x double> undef, <2 x i32> zeroinitializer
764 %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %lane, 0, 0
765 %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
766 %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
767 %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %lane3, 0, 3
768 ret %struct.float64x2x4_t %.fca.0.3.insert
; vld4 "dup" on the 64-bit <8 x i8> type: %a is already i8*, so it goes to
; vld4lane.v8i8 without a bitcast; the four results are splatted from element 0
; and packed into the return struct.
; FileCheck expects a single ld4r on .8b registers based at x0.
771 define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) {
772 ; CHECK-LABEL: test_vld4_dup_s8
773 ; CHECK: ld4r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
775 %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
776 %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
777 %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
778 %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
779 %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
780 %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
781 %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
782 %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 3
783 %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer
784 %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %lane, 0, 0
785 %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
786 %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
787 %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %lane3, 0, 3
788 ret %struct.int8x8x4_t %.fca.0.3.insert
; vld4 "dup" on <4 x i16>: vld4lane.v4i16 on undef inputs followed by four
; element-0 splats (zero-mask shufflevector), packed into the return struct.
; FileCheck expects a single ld4r on .4h registers based at x0.
791 define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) {
792 ; CHECK-LABEL: test_vld4_dup_s16
793 ; CHECK: ld4r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
795 %0 = bitcast i16* %a to i8*
796 %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
797 %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
798 %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
799 %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
800 %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
801 %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
802 %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
803 %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 3
804 %lane3 = shufflevector <4 x i16> %4, <4 x i16> undef, <4 x i32> zeroinitializer
805 %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %lane, 0, 0
806 %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
807 %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
808 %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %lane3, 0, 3
809 ret %struct.int16x4x4_t %.fca.0.3.insert
; vld4 "dup" on <2 x i32>: vld4lane.v2i32 on undef inputs followed by four
; element-0 splats (zero-mask shufflevector), packed into the return struct.
; FileCheck expects a single ld4r on .2s registers based at x0.
812 define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) {
813 ; CHECK-LABEL: test_vld4_dup_s32
814 ; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
816 %0 = bitcast i32* %a to i8*
817 %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
818 %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
819 %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
820 %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
821 %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
822 %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
823 %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
824 %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 3
825 %lane3 = shufflevector <2 x i32> %4, <2 x i32> undef, <2 x i32> zeroinitializer
826 %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %lane, 0, 0
827 %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
828 %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
829 %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %lane3, 0, 3
830 ret %struct.int32x2x4_t %.fca.0.3.insert
; Single-element <1 x i64> variant: the IR uses the plain vld4.v1i64 intrinsic
; (no lane load, no splat shuffles) and repacks the four results into the
; return struct. FileCheck expects a four-register ld1 with .1d arrangement
; here instead of ld4r.
833 define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) {
834 ; CHECK-LABEL: test_vld4_dup_s64
835 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
837 %0 = bitcast i64* %a to i8*
838 %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %0, i32 8)
839 %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
840 %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
841 %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
842 %vld_dup.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 3
843 %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
844 %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
845 %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
846 %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld_dup.fca.3.extract, 0, 3
847 ret %struct.int64x1x4_t %.fca.0.3.insert
; vld4 "dup" on <2 x float>: vld4lane.v2f32 on undef inputs followed by four
; element-0 splats (zero-mask shufflevector), packed into the return struct.
; FileCheck expects a single ld4r on .2s registers based at x0.
850 define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) {
851 ; CHECK-LABEL: test_vld4_dup_f32
852 ; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
854 %0 = bitcast float* %a to i8*
855 %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
856 %1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
857 %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
858 %2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
859 %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
860 %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
861 %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
862 %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 3
863 %lane3 = shufflevector <2 x float> %4, <2 x float> undef, <2 x i32> zeroinitializer
864 %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %lane, 0, 0
865 %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
866 %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
867 %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %lane3, 0, 3
868 ret %struct.float32x2x4_t %.fca.0.3.insert
; Single-element <1 x double> variant: the IR uses the plain vld4.v1f64
; intrinsic (no lane load, no splat shuffles) and repacks the four results into
; the return struct. FileCheck expects a four-register ld1 with .1d arrangement
; here instead of ld4r.
871 define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) {
872 ; CHECK-LABEL: test_vld4_dup_f64
873 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
875 %0 = bitcast double* %a to i8*
876 %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %0, i32 8)
877 %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
878 %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
879 %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
880 %vld_dup.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 3
881 %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
882 %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
883 %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
884 %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld_dup.fca.3.extract, 0, 3
885 ret %struct.float64x1x4_t %.fca.0.3.insert
; Scalar i8 load from %a inserted into element 15 of %b. FileCheck expects the
; load+insert pair to become one single-lane ld1 on a .b element based at x0.
888 define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) {
889 ; CHECK-LABEL: test_vld1q_lane_s8
890 ; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
892 %0 = load i8* %a, align 1
893 %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15
894 ret <16 x i8> %vld1_lane
; Scalar i16 load from %a inserted into element 7 of %b. FileCheck expects the
; load+insert pair to become one single-lane ld1 on a .h element based at x0.
897 define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) {
898 ; CHECK-LABEL: test_vld1q_lane_s16
899 ; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
901 %0 = load i16* %a, align 2
902 %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7
903 ret <8 x i16> %vld1_lane
; Scalar i32 load from %a inserted into element 3 of %b. FileCheck expects the
; load+insert pair to become one single-lane ld1 on a .s element based at x0.
906 define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) {
907 ; CHECK-LABEL: test_vld1q_lane_s32
908 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
910 %0 = load i32* %a, align 4
911 %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3
912 ret <4 x i32> %vld1_lane
; Scalar i64 load from %a inserted into element 1 of %b. FileCheck expects the
; load+insert pair to become one single-lane ld1 on a .d element based at x0.
915 define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) {
916 ; CHECK-LABEL: test_vld1q_lane_s64
917 ; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
919 %0 = load i64* %a, align 8
920 %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1
921 ret <2 x i64> %vld1_lane
; Scalar float load from %a inserted into element 3 of %b. FileCheck expects
; the load+insert pair to become one single-lane ld1 on a .s element at x0.
924 define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) {
925 ; CHECK-LABEL: test_vld1q_lane_f32
926 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
928 %0 = load float* %a, align 4
929 %vld1_lane = insertelement <4 x float> %b, float %0, i32 3
930 ret <4 x float> %vld1_lane
; Scalar double load from %a inserted into element 1 of %b. FileCheck expects
; the load+insert pair to become one single-lane ld1 on a .d element at x0.
933 define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) {
934 ; CHECK-LABEL: test_vld1q_lane_f64
935 ; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
937 %0 = load double* %a, align 8
938 %vld1_lane = insertelement <2 x double> %b, double %0, i32 1
939 ret <2 x double> %vld1_lane
; 64-bit vector form: scalar i8 load from %a inserted into element 7 of %b.
; FileCheck expects one single-lane ld1 on a .b element based at x0.
942 define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) {
943 ; CHECK-LABEL: test_vld1_lane_s8
944 ; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
946 %0 = load i8* %a, align 1
947 %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7
948 ret <8 x i8> %vld1_lane
; 64-bit vector form: scalar i16 load from %a inserted into element 3 of %b.
; FileCheck expects one single-lane ld1 on a .h element based at x0.
951 define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) {
952 ; CHECK-LABEL: test_vld1_lane_s16
953 ; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
955 %0 = load i16* %a, align 2
956 %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3
957 ret <4 x i16> %vld1_lane
; 64-bit vector form: scalar i32 load from %a inserted into element 1 of %b.
; FileCheck expects one single-lane ld1 on a .s element based at x0.
960 define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) {
961 ; CHECK-LABEL: test_vld1_lane_s32
962 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
964 %0 = load i32* %a, align 4
965 %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1
966 ret <2 x i32> %vld1_lane
; <1 x i64> case: the loaded i64 is inserted into element 0 of undef, so the
; %b parameter is not used by the body and the result is wholly the loaded
; value. FileCheck expects a replicating ld1r with .1d arrangement here rather
; than the single-lane ld1 form used by the wider vectors.
969 define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) {
970 ; CHECK-LABEL: test_vld1_lane_s64
971 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
973 %0 = load i64* %a, align 8
974 %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0
975 ret <1 x i64> %vld1_lane
; 64-bit vector form: scalar float load from %a inserted into element 1 of %b.
; FileCheck expects one single-lane ld1 on a .s element based at x0.
978 define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) {
979 ; CHECK-LABEL: test_vld1_lane_f32
980 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
982 %0 = load float* %a, align 4
983 %vld1_lane = insertelement <2 x float> %b, float %0, i32 1
984 ret <2 x float> %vld1_lane
; <1 x double> case: the loaded double is inserted into element 0 of undef, so
; the %b parameter is not used by the body and the result is wholly the loaded
; value. FileCheck expects a replicating ld1r with .1d arrangement here rather
; than the single-lane ld1 form used by the wider vectors.
987 define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) {
988 ; CHECK-LABEL: test_vld1_lane_f64
989 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
991 %0 = load double* %a, align 8
992 %vld1_lane = insertelement <1 x double> undef, double %0, i32 0
993 ret <1 x double> %vld1_lane
; vld2 lane load on <8 x i16>: the two vectors of %b.coerce are unpacked and
; passed to vld2lane.v8i16 with trailing operands i32 7 and i32 2 (presumably
; lane index and alignment); both results are repacked into the return struct.
; FileCheck expects a two-register single-lane ld2 on .h elements based at x0.
996 define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
997 ; CHECK-LABEL: test_vld2q_lane_s16
998 ; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1000 %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
1001 %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
1002 %0 = bitcast i16* %a to i8*
1003 %vld2_lane = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
1004 %vld2_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 0
1005 %vld2_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 1
1006 %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2_lane.fca.0.extract, 0, 0
1007 %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2_lane.fca.1.extract, 0, 1
1008 ret %struct.int16x8x2_t %.fca.0.1.insert
; vld2 lane load on <4 x i32>: both vectors of %b.coerce are unpacked, passed
; to vld2lane.v4i32 (trailing operands i32 3, i32 4), and the two results are
; repacked into the return struct.
; FileCheck expects a two-register single-lane ld2 on .s elements based at x0.
1011 define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
1012 ; CHECK-LABEL: test_vld2q_lane_s32
1013 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1015 %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
1016 %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
1017 %0 = bitcast i32* %a to i8*
1018 %vld2_lane = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
1019 %vld2_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 0
1020 %vld2_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 1
1021 %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2_lane.fca.0.extract, 0, 0
1022 %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2_lane.fca.1.extract, 0, 1
1023 ret %struct.int32x4x2_t %.fca.0.1.insert
; vld2 lane load on <2 x i64>: both vectors of %b.coerce are unpacked, passed
; to vld2lane.v2i64 (trailing operands i32 1, i32 8), and the two results are
; repacked into the return struct.
; FileCheck expects a two-register single-lane ld2 on .d elements based at x0.
1026 define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
1027 ; CHECK-LABEL: test_vld2q_lane_s64
1028 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1030 %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
1031 %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
1032 %0 = bitcast i64* %a to i8*
1033 %vld2_lane = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
1034 %vld2_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 0
1035 %vld2_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 1
1036 %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2_lane.fca.0.extract, 0, 0
1037 %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2_lane.fca.1.extract, 0, 1
1038 ret %struct.int64x2x2_t %.fca.0.1.insert
; vld2 lane load on <4 x float>: both vectors of %b.coerce are unpacked, passed
; to vld2lane.v4f32 (trailing operands i32 3, i32 4), and the two results are
; repacked into the return struct.
; FileCheck expects a two-register single-lane ld2 on .s elements based at x0.
1041 define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
1042 ; CHECK-LABEL: test_vld2q_lane_f32
1043 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1045 %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
1046 %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
1047 %0 = bitcast float* %a to i8*
1048 %vld2_lane = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
1049 %vld2_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 0
1050 %vld2_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 1
1051 %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2_lane.fca.0.extract, 0, 0
1052 %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2_lane.fca.1.extract, 0, 1
1053 ret %struct.float32x4x2_t %.fca.0.1.insert
; vld2 lane load on <2 x double>: both vectors of %b.coerce are unpacked,
; passed to vld2lane.v2f64 (trailing operands i32 1, i32 8), and the two
; results are repacked into the return struct.
; FileCheck expects a two-register single-lane ld2 on .d elements based at x0.
1056 define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
1057 ; CHECK-LABEL: test_vld2q_lane_f64
1058 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1060 %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
1061 %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
1062 %0 = bitcast double* %a to i8*
1063 %vld2_lane = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
1064 %vld2_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 0
1065 %vld2_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 1
1066 %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2_lane.fca.0.extract, 0, 0
1067 %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2_lane.fca.1.extract, 0, 1
1068 ret %struct.float64x2x2_t %.fca.0.1.insert
; vld2 lane load on the 64-bit <8 x i8> type: %a is already i8*, so it is
; passed to vld2lane.v8i8 without a bitcast, together with both vectors of
; %b.coerce (trailing operands i32 7, i32 1); results are repacked and returned.
; FileCheck expects a two-register single-lane ld2 on .b elements based at x0.
1071 define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
1072 ; CHECK-LABEL: test_vld2_lane_s8
1073 ; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1075 %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
1076 %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
1077 %vld2_lane = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
1078 %vld2_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 0
1079 %vld2_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 1
1080 %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2_lane.fca.0.extract, 0, 0
1081 %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2_lane.fca.1.extract, 0, 1
1082 ret %struct.int8x8x2_t %.fca.0.1.insert
; vld2 lane load on <4 x i16>: both vectors of %b.coerce are unpacked, passed
; to vld2lane.v4i16 (trailing operands i32 3, i32 2), and the two results are
; repacked into the return struct.
; FileCheck expects a two-register single-lane ld2 on .h elements based at x0.
1085 define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
1086 ; CHECK-LABEL: test_vld2_lane_s16
1087 ; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1089 %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
1090 %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
1091 %0 = bitcast i16* %a to i8*
1092 %vld2_lane = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
1093 %vld2_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 0
1094 %vld2_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 1
1095 %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2_lane.fca.0.extract, 0, 0
1096 %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2_lane.fca.1.extract, 0, 1
1097 ret %struct.int16x4x2_t %.fca.0.1.insert
; vld2 lane load on <2 x i32>: both vectors of %b.coerce are unpacked, passed
; to vld2lane.v2i32 (trailing operands i32 1, i32 4), and the two results are
; repacked into the return struct.
; FileCheck expects a two-register single-lane ld2 on .s elements based at x0.
1100 define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
1101 ; CHECK-LABEL: test_vld2_lane_s32
1102 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1104 %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
1105 %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
1106 %0 = bitcast i32* %a to i8*
1107 %vld2_lane = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
1108 %vld2_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 0
1109 %vld2_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 1
1110 %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2_lane.fca.0.extract, 0, 0
1111 %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2_lane.fca.1.extract, 0, 1
1112 ret %struct.int32x2x2_t %.fca.0.1.insert
; vld2 lane load on the single-element <1 x i64> type: both vectors of
; %b.coerce are unpacked and passed to vld2lane.v1i64 (trailing operands
; i32 0, i32 8); the two results are repacked into the return struct.
; FileCheck expects a two-register single-lane ld2 on .d elements based at x0.
1115 define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
1116 ; CHECK-LABEL: test_vld2_lane_s64
1117 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1119 %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
1120 %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
1121 %0 = bitcast i64* %a to i8*
1122 %vld2_lane = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
1123 %vld2_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 0
1124 %vld2_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 1
1125 %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2_lane.fca.0.extract, 0, 0
1126 %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2_lane.fca.1.extract, 0, 1
1127 ret %struct.int64x1x2_t %.fca.0.1.insert
; vld2 lane load on <2 x float>: both vectors of %b.coerce are unpacked, passed
; to vld2lane.v2f32 (trailing operands i32 1, i32 4), and the two results are
; repacked into the return struct.
; FileCheck expects a two-register single-lane ld2 on .s elements based at x0.
1130 define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
1131 ; CHECK-LABEL: test_vld2_lane_f32
1132 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1134 %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
1135 %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
1136 %0 = bitcast float* %a to i8*
1137 %vld2_lane = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
1138 %vld2_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 0
1139 %vld2_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 1
1140 %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2_lane.fca.0.extract, 0, 0
1141 %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2_lane.fca.1.extract, 0, 1
1142 ret %struct.float32x2x2_t %.fca.0.1.insert
; vld2 lane load on the single-element <1 x double> type: both vectors of
; %b.coerce are unpacked and passed to vld2lane.v1f64 (trailing operands
; i32 0, i32 8); the two results are repacked into the return struct.
; FileCheck expects a two-register single-lane ld2 on .d elements based at x0.
1145 define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
1146 ; CHECK-LABEL: test_vld2_lane_f64
1147 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1149 %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
1150 %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
1151 %0 = bitcast double* %a to i8*
1152 %vld2_lane = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
1153 %vld2_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 0
1154 %vld2_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 1
1155 %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2_lane.fca.0.extract, 0, 0
1156 %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2_lane.fca.1.extract, 0, 1
1157 ret %struct.float64x1x2_t %.fca.0.1.insert
; Lane-load test: three-register structure load of an i16 lane into <8 x i16> vectors.
; Splits %b.coerce into its three vectors, calls @llvm.arm.neon.vld3lane.v8i16 on %a
; (cast to i8*), and packs the three results into %struct.int16x8x3_t.
; Expected output: one "ld3" naming three ".h" registers, a lane index, and base [x0].
1160 define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
1161 ; CHECK-LABEL: test_vld3q_lane_s16
1162 ; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1164 %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
1165 %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
1166 %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1167 %0 = bitcast i16* %a to i8*
; NOTE(review): the trailing "i32 7, i32 2" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1168 %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
; Repack the loaded vectors into the struct return value.
1169 %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0
1170 %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1
1171 %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2
1172 %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0
1173 %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1
1174 %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2
1175 ret %struct.int16x8x3_t %.fca.0.2.insert
; Lane-load test: three-register structure load of an i32 lane into <4 x i32> vectors.
; Splits %b.coerce into its three vectors, calls @llvm.arm.neon.vld3lane.v4i32 on %a
; (cast to i8*), and packs the three results into %struct.int32x4x3_t.
; Expected output: one "ld3" naming three ".s" registers, a lane index, and base [x0].
1178 define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
1179 ; CHECK-LABEL: test_vld3q_lane_s32
1180 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1182 %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
1183 %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
1184 %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1185 %0 = bitcast i32* %a to i8*
; NOTE(review): the trailing "i32 3, i32 4" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1186 %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
; Repack the loaded vectors into the struct return value.
1187 %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0
1188 %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1
1189 %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2
1190 %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0
1191 %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1
1192 %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2
1193 ret %struct.int32x4x3_t %.fca.0.2.insert
; Lane-load test: three-register structure load of an i64 lane into <2 x i64> vectors.
; Splits %b.coerce into its three vectors, calls @llvm.arm.neon.vld3lane.v2i64 on %a
; (cast to i8*), and packs the three results into %struct.int64x2x3_t.
; Expected output: one "ld3" naming three ".d" registers, a lane index, and base [x0].
1196 define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
1197 ; CHECK-LABEL: test_vld3q_lane_s64
1198 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1200 %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
1201 %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
1202 %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1203 %0 = bitcast i64* %a to i8*
; NOTE(review): the trailing "i32 1, i32 8" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1204 %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
; Repack the loaded vectors into the struct return value.
1205 %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0
1206 %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1
1207 %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2
1208 %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0
1209 %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1
1210 %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2
1211 ret %struct.int64x2x3_t %.fca.0.2.insert
; Lane-load test: three-register structure load of a float lane into <4 x float> vectors.
; Splits %b.coerce into its three vectors, calls @llvm.arm.neon.vld3lane.v4f32 on %a
; (cast to i8*), and packs the three results into %struct.float32x4x3_t.
; Expected output: one "ld3" naming three ".s" registers, a lane index, and base [x0].
1214 define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
1215 ; CHECK-LABEL: test_vld3q_lane_f32
1216 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1218 %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
1219 %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
1220 %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1221 %0 = bitcast float* %a to i8*
; NOTE(review): the trailing "i32 3, i32 4" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1222 %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
; Repack the loaded vectors into the struct return value.
1223 %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0
1224 %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1
1225 %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2
1226 %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0
1227 %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1
1228 %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2
1229 ret %struct.float32x4x3_t %.fca.0.2.insert
; Lane-load test: three-register structure load of a double lane into <2 x double> vectors.
; Splits %b.coerce into its three vectors, calls @llvm.arm.neon.vld3lane.v2f64 on %a
; (cast to i8*), and packs the three results into %struct.float64x2x3_t.
; Expected output: one "ld3" naming three ".d" registers, a lane index, and base [x0].
1232 define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
1233 ; CHECK-LABEL: test_vld3q_lane_f64
1234 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1236 %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
1237 %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
1238 %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1239 %0 = bitcast double* %a to i8*
; NOTE(review): the trailing "i32 1, i32 8" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1240 %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
; Repack the loaded vectors into the struct return value.
1241 %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0
1242 %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1
1243 %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2
1244 %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0
1245 %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1
1246 %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2
1247 ret %struct.float64x2x3_t %.fca.0.2.insert
; Lane-load test: three-register structure load of an i8 lane into <8 x i8> vectors.
; Splits %b.coerce into its three vectors, calls @llvm.arm.neon.vld3lane.v8i8 on %a,
; and packs the three results into %struct.int8x8x3_t.
; Expected output: one "ld3" naming three ".b" registers, a lane index, and base [x0].
1250 define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
1251 ; CHECK-LABEL: test_vld3_lane_s8
1252 ; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1254 %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
1255 %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
1256 %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
; %a is already an i8*, so it is passed to the intrinsic without a cast.
; NOTE(review): the trailing "i32 7, i32 1" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1257 %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
; Repack the loaded vectors into the struct return value.
1258 %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0
1259 %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1
1260 %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2
1261 %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0
1262 %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1
1263 %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2
1264 ret %struct.int8x8x3_t %.fca.0.2.insert
; Lane-load test: three-register structure load of an i16 lane into <4 x i16> vectors.
; Splits %b.coerce into its three vectors, calls @llvm.arm.neon.vld3lane.v4i16 on %a
; (cast to i8*), and packs the three results into %struct.int16x4x3_t.
; Expected output: one "ld3" naming three ".h" registers, a lane index, and base [x0].
1267 define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
1268 ; CHECK-LABEL: test_vld3_lane_s16
1269 ; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1271 %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
1272 %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
1273 %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1274 %0 = bitcast i16* %a to i8*
; NOTE(review): the trailing "i32 3, i32 2" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1275 %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
; Repack the loaded vectors into the struct return value.
1276 %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0
1277 %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1
1278 %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2
1279 %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0
1280 %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1
1281 %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2
1282 ret %struct.int16x4x3_t %.fca.0.2.insert
; Lane-load test: three-register structure load of an i32 lane into <2 x i32> vectors.
; Splits %b.coerce into its three vectors, calls @llvm.arm.neon.vld3lane.v2i32 on %a
; (cast to i8*), and packs the three results into %struct.int32x2x3_t.
; Expected output: one "ld3" naming three ".s" registers, a lane index, and base [x0].
1285 define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
1286 ; CHECK-LABEL: test_vld3_lane_s32
1287 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1289 %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
1290 %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
1291 %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1292 %0 = bitcast i32* %a to i8*
; NOTE(review): the trailing "i32 1, i32 4" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1293 %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
; Repack the loaded vectors into the struct return value.
1294 %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0
1295 %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1
1296 %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2
1297 %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0
1298 %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1
1299 %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2
1300 ret %struct.int32x2x3_t %.fca.0.2.insert
; Lane-load test: three-register structure load of an i64 lane into <1 x i64> vectors.
; Splits %b.coerce into its three vectors, calls @llvm.arm.neon.vld3lane.v1i64 on %a
; (cast to i8*), and packs the three results into %struct.int64x1x3_t.
; Expected output: one "ld3" naming three ".d" registers, a lane index, and base [x0].
1303 define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
1304 ; CHECK-LABEL: test_vld3_lane_s64
1305 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1307 %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
1308 %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
1309 %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1310 %0 = bitcast i64* %a to i8*
; NOTE(review): the trailing "i32 0, i32 8" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1311 %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
; Repack the loaded vectors into the struct return value.
1312 %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0
1313 %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1
1314 %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2
1315 %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0
1316 %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1
1317 %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2
1318 ret %struct.int64x1x3_t %.fca.0.2.insert
; Lane-load test: three-register structure load of a float lane into <2 x float> vectors.
; Splits %b.coerce into its three vectors, calls @llvm.arm.neon.vld3lane.v2f32 on %a
; (cast to i8*), and packs the three results into %struct.float32x2x3_t.
; Expected output: one "ld3" naming three ".s" registers, a lane index, and base [x0].
1321 define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
1322 ; CHECK-LABEL: test_vld3_lane_f32
1323 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1325 %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
1326 %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
1327 %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1328 %0 = bitcast float* %a to i8*
; NOTE(review): the trailing "i32 1, i32 4" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1329 %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
; Repack the loaded vectors into the struct return value.
1330 %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
1331 %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
1332 %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
1333 %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
1334 %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
1335 %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
1336 ret %struct.float32x2x3_t %.fca.0.2.insert
; Lane-load test: three-register structure load of a double lane into <1 x double> vectors.
; Splits %b.coerce into its three vectors, calls @llvm.arm.neon.vld3lane.v1f64 on %a
; (cast to i8*), and packs the three results into %struct.float64x1x3_t.
; Expected output: one "ld3" naming three ".d" registers, a lane index, and base [x0].
1339 define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
1340 ; CHECK-LABEL: test_vld3_lane_f64
1341 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1343 %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
1344 %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
1345 %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1346 %0 = bitcast double* %a to i8*
; NOTE(review): the trailing "i32 0, i32 8" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1347 %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
; Repack the loaded vectors into the struct return value.
1348 %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
1349 %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
1350 %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
1351 %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
1352 %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
1353 %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
1354 ret %struct.float64x1x3_t %.fca.0.2.insert
; Lane-load test: four-register structure load of an i8 lane into <16 x i8> vectors.
; Splits %b.coerce into its four vectors, calls @llvm.arm.neon.vld4lane.v16i8 on %a,
; and packs the four results into %struct.int8x16x4_t.
; Expected output: one "ld4" naming four ".b" registers, a lane index, and base [x0].
1357 define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
1358 ; CHECK-LABEL: test_vld4q_lane_s8
1359 ; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1361 %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
1362 %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
1363 %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
1364 %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
; %a is already an i8*, so it is passed to the intrinsic without a cast.
; The call result is named %vld4_lane to match the four-register intrinsic it comes from.
; NOTE(review): the trailing "i32 15, i32 1" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1365 %vld4_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1)
; Repack the loaded vectors into the struct return value.
1366 %vld4_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 0
1367 %vld4_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 1
1368 %vld4_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 2
1369 %vld4_lane.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 3
1370 %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4_lane.fca.0.extract, 0, 0
1371 %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4_lane.fca.1.extract, 0, 1
1372 %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4_lane.fca.2.extract, 0, 2
1373 %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4_lane.fca.3.extract, 0, 3
1374 ret %struct.int8x16x4_t %.fca.0.3.insert
; Lane-load test: four-register structure load of an i16 lane into <8 x i16> vectors.
; Splits %b.coerce into its four vectors, calls @llvm.arm.neon.vld4lane.v8i16 on %a
; (cast to i8*), and packs the four results into %struct.int16x8x4_t.
; Expected output: one "ld4" naming four ".h" registers, a lane index, and base [x0].
1377 define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
1378 ; CHECK-LABEL: test_vld4q_lane_s16
1379 ; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1381 %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
1382 %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
1383 %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
1384 %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1385 %0 = bitcast i16* %a to i8*
; The call result is named %vld4_lane to match the four-register intrinsic it comes from.
; NOTE(review): the trailing "i32 7, i32 2" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1386 %vld4_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
; Repack the loaded vectors into the struct return value.
1387 %vld4_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 0
1388 %vld4_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 1
1389 %vld4_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 2
1390 %vld4_lane.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 3
1391 %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4_lane.fca.0.extract, 0, 0
1392 %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4_lane.fca.1.extract, 0, 1
1393 %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4_lane.fca.2.extract, 0, 2
1394 %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4_lane.fca.3.extract, 0, 3
1395 ret %struct.int16x8x4_t %.fca.0.3.insert
; Lane-load test: four-register structure load of an i32 lane into <4 x i32> vectors.
; Splits %b.coerce into its four vectors, calls @llvm.arm.neon.vld4lane.v4i32 on %a
; (cast to i8*), and packs the four results into %struct.int32x4x4_t.
; Expected output: one "ld4" naming four ".s" registers, a lane index, and base [x0].
1398 define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
1399 ; CHECK-LABEL: test_vld4q_lane_s32
1400 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1402 %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
1403 %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
1404 %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
1405 %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1406 %0 = bitcast i32* %a to i8*
; The call result is named %vld4_lane to match the four-register intrinsic it comes from.
; NOTE(review): the trailing "i32 3, i32 4" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1407 %vld4_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
; Repack the loaded vectors into the struct return value.
1408 %vld4_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 0
1409 %vld4_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 1
1410 %vld4_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 2
1411 %vld4_lane.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 3
1412 %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4_lane.fca.0.extract, 0, 0
1413 %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4_lane.fca.1.extract, 0, 1
1414 %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4_lane.fca.2.extract, 0, 2
1415 %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4_lane.fca.3.extract, 0, 3
1416 ret %struct.int32x4x4_t %.fca.0.3.insert
; Lane-load test: four-register structure load of an i64 lane into <2 x i64> vectors.
; Splits %b.coerce into its four vectors, calls @llvm.arm.neon.vld4lane.v2i64 on %a
; (cast to i8*), and packs the four results into %struct.int64x2x4_t.
; Expected output: one "ld4" naming four ".d" registers, a lane index, and base [x0].
1419 define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
1420 ; CHECK-LABEL: test_vld4q_lane_s64
1421 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1423 %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
1424 %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
1425 %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
1426 %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1427 %0 = bitcast i64* %a to i8*
; The call result is named %vld4_lane to match the four-register intrinsic it comes from.
; NOTE(review): the trailing "i32 1, i32 8" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1428 %vld4_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
; Repack the loaded vectors into the struct return value.
1429 %vld4_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 0
1430 %vld4_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 1
1431 %vld4_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 2
1432 %vld4_lane.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 3
1433 %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4_lane.fca.0.extract, 0, 0
1434 %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4_lane.fca.1.extract, 0, 1
1435 %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4_lane.fca.2.extract, 0, 2
1436 %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4_lane.fca.3.extract, 0, 3
1437 ret %struct.int64x2x4_t %.fca.0.3.insert
; Lane-load test: four-register structure load of a float lane into <4 x float> vectors.
; Splits %b.coerce into its four vectors, calls @llvm.arm.neon.vld4lane.v4f32 on %a
; (cast to i8*), and packs the four results into %struct.float32x4x4_t.
; Expected output: one "ld4" naming four ".s" registers, a lane index, and base [x0].
1440 define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
1441 ; CHECK-LABEL: test_vld4q_lane_f32
1442 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1444 %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
1445 %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
1446 %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
1447 %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1448 %0 = bitcast float* %a to i8*
; The call result is named %vld4_lane to match the four-register intrinsic it comes from.
; NOTE(review): the trailing "i32 3, i32 4" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1449 %vld4_lane = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
; Repack the loaded vectors into the struct return value.
1450 %vld4_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 0
1451 %vld4_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 1
1452 %vld4_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 2
1453 %vld4_lane.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 3
1454 %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4_lane.fca.0.extract, 0, 0
1455 %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4_lane.fca.1.extract, 0, 1
1456 %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4_lane.fca.2.extract, 0, 2
1457 %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4_lane.fca.3.extract, 0, 3
1458 ret %struct.float32x4x4_t %.fca.0.3.insert
; Lane-load test: four-register structure load of a double lane into <2 x double> vectors.
; Splits %b.coerce into its four vectors, calls @llvm.arm.neon.vld4lane.v2f64 on %a
; (cast to i8*), and packs the four results into %struct.float64x2x4_t.
; Expected output: one "ld4" naming four ".d" registers, a lane index, and base [x0].
1461 define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
1462 ; CHECK-LABEL: test_vld4q_lane_f64
1463 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1465 %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
1466 %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
1467 %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
1468 %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1469 %0 = bitcast double* %a to i8*
; The call result is named %vld4_lane to match the four-register intrinsic it comes from.
; NOTE(review): the trailing "i32 1, i32 8" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1470 %vld4_lane = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
; Repack the loaded vectors into the struct return value.
1471 %vld4_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 0
1472 %vld4_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 1
1473 %vld4_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 2
1474 %vld4_lane.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 3
1475 %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4_lane.fca.0.extract, 0, 0
1476 %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4_lane.fca.1.extract, 0, 1
1477 %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4_lane.fca.2.extract, 0, 2
1478 %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4_lane.fca.3.extract, 0, 3
1479 ret %struct.float64x2x4_t %.fca.0.3.insert
; Lane-load test: four-register structure load of an i8 lane into <8 x i8> vectors.
; Splits %b.coerce into its four vectors, calls @llvm.arm.neon.vld4lane.v8i8 on %a,
; and packs the four results into %struct.int8x8x4_t.
; Expected output: one "ld4" naming four ".b" registers, a lane index, and base [x0].
1482 define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
1483 ; CHECK-LABEL: test_vld4_lane_s8
1484 ; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1486 %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
1487 %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
1488 %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
1489 %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
; %a is already an i8*, so it is passed to the intrinsic without a cast.
; The call result is named %vld4_lane to match the four-register intrinsic it comes from.
; NOTE(review): the trailing "i32 7, i32 1" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1490 %vld4_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
; Repack the loaded vectors into the struct return value.
1491 %vld4_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 0
1492 %vld4_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 1
1493 %vld4_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 2
1494 %vld4_lane.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 3
1495 %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4_lane.fca.0.extract, 0, 0
1496 %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4_lane.fca.1.extract, 0, 1
1497 %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4_lane.fca.2.extract, 0, 2
1498 %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4_lane.fca.3.extract, 0, 3
1499 ret %struct.int8x8x4_t %.fca.0.3.insert
; Lane-load test: four-register structure load of an i16 lane into <4 x i16> vectors.
; Splits %b.coerce into its four vectors, calls @llvm.arm.neon.vld4lane.v4i16 on %a
; (cast to i8*), and packs the four results into %struct.int16x4x4_t.
; Expected output: one "ld4" naming four ".h" registers, a lane index, and base [x0].
1502 define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
1503 ; CHECK-LABEL: test_vld4_lane_s16
1504 ; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1506 %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
1507 %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
1508 %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
1509 %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1510 %0 = bitcast i16* %a to i8*
; The call result is named %vld4_lane to match the four-register intrinsic it comes from.
; NOTE(review): the trailing "i32 3, i32 2" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1511 %vld4_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
; Repack the loaded vectors into the struct return value.
1512 %vld4_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 0
1513 %vld4_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 1
1514 %vld4_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 2
1515 %vld4_lane.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 3
1516 %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld4_lane.fca.0.extract, 0, 0
1517 %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld4_lane.fca.1.extract, 0, 1
1518 %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld4_lane.fca.2.extract, 0, 2
1519 %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld4_lane.fca.3.extract, 0, 3
1520 ret %struct.int16x4x4_t %.fca.0.3.insert
; Lane-load test: four-register structure load of an i32 lane into <2 x i32> vectors.
; Splits %b.coerce into its four vectors, calls @llvm.arm.neon.vld4lane.v2i32 on %a
; (cast to i8*), and packs the four results into %struct.int32x2x4_t.
; Expected output: one "ld4" naming four ".s" registers, a lane index, and base [x0].
1523 define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
1524 ; CHECK-LABEL: test_vld4_lane_s32
1525 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1527 %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
1528 %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
1529 %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
1530 %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1531 %0 = bitcast i32* %a to i8*
; The call result is named %vld4_lane to match the four-register intrinsic it comes from.
; NOTE(review): the trailing "i32 1, i32 4" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1532 %vld4_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
; Repack the loaded vectors into the struct return value.
1533 %vld4_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 0
1534 %vld4_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 1
1535 %vld4_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 2
1536 %vld4_lane.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 3
1537 %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld4_lane.fca.0.extract, 0, 0
1538 %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld4_lane.fca.1.extract, 0, 1
1539 %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld4_lane.fca.2.extract, 0, 2
1540 %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld4_lane.fca.3.extract, 0, 3
1541 ret %struct.int32x2x4_t %.fca.0.3.insert
; Lane-load test: four-register structure load of an i64 lane into <1 x i64> vectors.
; Splits %b.coerce into its four vectors, calls @llvm.arm.neon.vld4lane.v1i64 on %a
; (cast to i8*), and packs the four results into %struct.int64x1x4_t.
; Expected output: one "ld4" naming four ".d" registers, a lane index, and base [x0].
1544 define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
1545 ; CHECK-LABEL: test_vld4_lane_s64
1546 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
; Unpack the coerced array argument into the vectors handed to the intrinsic.
1548 %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
1549 %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
1550 %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
1551 %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
; The intrinsic is called with an i8* address, so the typed pointer is cast first.
1552 %0 = bitcast i64* %a to i8*
; The call result is named %vld4_lane to match the four-register intrinsic it comes from.
; NOTE(review): the trailing "i32 0, i32 8" operands are presumably lane index and alignment -- confirm against the intrinsic declaration.
1553 %vld4_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
; Repack the loaded vectors into the struct return value.
1554 %vld4_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 0
1555 %vld4_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 1
1556 %vld4_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 2
1557 %vld4_lane.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 3
1558 %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld4_lane.fca.0.extract, 0, 0
1559 %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld4_lane.fca.1.extract, 0, 1
1560 %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld4_lane.fca.2.extract, 0, 2
1561 %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld4_lane.fca.3.extract, 0, 3
1562 ret %struct.int64x1x4_t %.fca.0.3.insert
; test_vld4_lane_f32: splits %b.coerce into four <2 x float> vectors, passes them
; together with the i8*-cast pointer to llvm.arm.neon.vld4lane.v2f32, and repacks
; the four results into %struct.float32x2x4_t. The FileCheck pattern expects a
; lane-indexed ld4 on .s registers addressed by [x0].
; Trailing operands i32 1, i32 4 are presumably lane index and alignment - confirm.
; NOTE(review): the call result is named %vld3_lane although the intrinsic called
; is vld4lane; the name is only a label.
1565 define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
1566 ; CHECK-LABEL: test_vld4_lane_f32
1567 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1569 %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
1570 %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
1571 %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
1572 %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
1573 %0 = bitcast float* %a to i8*
1574 %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
1575 %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
1576 %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
1577 %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
1578 %vld3_lane.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 3
1579 %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
1580 %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
1581 %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
1582 %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld3_lane.fca.3.extract, 0, 3
1583 ret %struct.float32x2x4_t %.fca.0.3.insert
; test_vld4_lane_f64: splits %b.coerce into four <1 x double> vectors, passes them
; together with the i8*-cast pointer to llvm.arm.neon.vld4lane.v1f64, and repacks
; the four results into %struct.float64x1x4_t. The FileCheck pattern expects a
; lane-indexed ld4 on .d registers addressed by [x0].
; Trailing operands i32 0, i32 8 are presumably lane index and alignment - confirm.
; NOTE(review): the call result is named %vld3_lane although the intrinsic called
; is vld4lane; the name is only a label.
1586 define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
1587 ; CHECK-LABEL: test_vld4_lane_f64
1588 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1590 %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
1591 %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
1592 %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
1593 %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
1594 %0 = bitcast double* %a to i8*
1595 %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
1596 %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
1597 %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
1598 %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
1599 %vld3_lane.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 3
1600 %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
1601 %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
1602 %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
1603 %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld3_lane.fca.3.extract, 0, 3
1604 ret %struct.float64x1x4_t %.fca.0.3.insert
; test_vst1q_lane_s8: extracts element 15 of <16 x i8> %b and stores it through
; %a (align 1). The FileCheck pattern expects a lane-indexed st1 on a .b register
; addressed by [x0].
1607 define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) {
1608 ; CHECK-LABEL: test_vst1q_lane_s8
1609 ; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1611 %0 = extractelement <16 x i8> %b, i32 15
1612 store i8 %0, i8* %a, align 1
; test_vst1q_lane_s16: extracts element 7 of <8 x i16> %b and stores it through
; %a (align 2). The FileCheck pattern expects a lane-indexed st1 on a .h register
; addressed by [x0].
1616 define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) {
1617 ; CHECK-LABEL: test_vst1q_lane_s16
1618 ; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1620 %0 = extractelement <8 x i16> %b, i32 7
1621 store i16 %0, i16* %a, align 2
; test_vst1q_lane_s32: extracts element 3 of <4 x i32> %b and stores it through
; %a (align 4). The FileCheck pattern expects a lane-indexed st1 on a .s register
; addressed by [x0].
1625 define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
1626 ; CHECK-LABEL: test_vst1q_lane_s32
1627 ; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1629 %0 = extractelement <4 x i32> %b, i32 3
1630 store i32 %0, i32* %a, align 4
; test_vst1q_lane_s64: extracts element 1 of <2 x i64> %b and stores it through
; %a (align 8). The FileCheck pattern expects a lane-indexed st1 on a .d register
; addressed by [x0].
1634 define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
1635 ; CHECK-LABEL: test_vst1q_lane_s64
1636 ; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1638 %0 = extractelement <2 x i64> %b, i32 1
1639 store i64 %0, i64* %a, align 8
; test_vst1q_lane_f32: extracts element 3 of <4 x float> %b and stores it through
; %a (align 4). The FileCheck pattern expects a lane-indexed st1 on a .s register
; addressed by [x0].
1643 define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
1644 ; CHECK-LABEL: test_vst1q_lane_f32
1645 ; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1647 %0 = extractelement <4 x float> %b, i32 3
1648 store float %0, float* %a, align 4
; test_vst1q_lane_f64: extracts element 1 of <2 x double> %b and stores it through
; %a (align 8). The FileCheck pattern expects a lane-indexed st1 on a .d register
; addressed by [x0].
1652 define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
1653 ; CHECK-LABEL: test_vst1q_lane_f64
1654 ; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1656 %0 = extractelement <2 x double> %b, i32 1
1657 store double %0, double* %a, align 8
; test_vst1_lane_s8: extracts element 7 of <8 x i8> %b and stores it through
; %a (align 1). The FileCheck pattern expects a lane-indexed st1 on a .b register
; addressed by [x0].
1661 define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
1662 ; CHECK-LABEL: test_vst1_lane_s8
1663 ; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1665 %0 = extractelement <8 x i8> %b, i32 7
1666 store i8 %0, i8* %a, align 1
; test_vst1_lane_s16: extracts element 3 of <4 x i16> %b and stores it through
; %a (align 2). The FileCheck pattern expects a lane-indexed st1 on a .h register
; addressed by [x0].
1670 define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) {
1671 ; CHECK-LABEL: test_vst1_lane_s16
1672 ; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1674 %0 = extractelement <4 x i16> %b, i32 3
1675 store i16 %0, i16* %a, align 2
; test_vst1_lane_s32: extracts element 1 of <2 x i32> %b and stores it through
; %a (align 4). The FileCheck pattern expects a lane-indexed st1 on a .s register
; addressed by [x0].
1679 define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
1680 ; CHECK-LABEL: test_vst1_lane_s32
1681 ; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1683 %0 = extractelement <2 x i32> %b, i32 1
1684 store i32 %0, i32* %a, align 4
; test_vst1_lane_s64: extracts the only element (index 0) of <1 x i64> %b and
; stores it through %a (align 8). The FileCheck pattern expects a lane-indexed
; st1 on a .d register addressed by [x0].
1688 define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
1689 ; CHECK-LABEL: test_vst1_lane_s64
1690 ; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1692 %0 = extractelement <1 x i64> %b, i32 0
1693 store i64 %0, i64* %a, align 8
; test_vst1_lane_f32: extracts element 1 of <2 x float> %b and stores it through
; %a (align 4). The FileCheck pattern expects a lane-indexed st1 on a .s register
; addressed by [x0].
1697 define void @test_vst1_lane_f32(float* %a, <2 x float> %b) {
1698 ; CHECK-LABEL: test_vst1_lane_f32
1699 ; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1701 %0 = extractelement <2 x float> %b, i32 1
1702 store float %0, float* %a, align 4
; test_vst1_lane_f64: extracts the only element (index 0) of <1 x double> %b and
; stores it through %a (align 8). The FileCheck pattern expects a lane-indexed
; st1 on a .d register addressed by [x0].
1706 define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
1707 ; CHECK-LABEL: test_vst1_lane_f64
1708 ; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1710 %0 = extractelement <1 x double> %b, i32 0
1711 store double %0, double* %a, align 8
; test_vst2q_lane_s8: unpacks the two <16 x i8> members of %b.coerce and passes
; them with %a to llvm.arm.neon.vst2lane.v16i8; trailing operands i32 15, i32 1
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st2 on .b registers addressed by [x0].
1715 define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
1716 ; CHECK-LABEL: test_vst2q_lane_s8
1717 ; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1719 %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0
1720 %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
1721 tail call void @llvm.arm.neon.vst2lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 15, i32 1)
; test_vst2q_lane_s16: unpacks the two <8 x i16> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst2lane.v8i16; trailing operands i32 7, i32 2
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st2 on .h registers addressed by [x0].
1725 define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
1726 ; CHECK-LABEL: test_vst2q_lane_s16
1727 ; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1729 %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
1730 %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
1731 %0 = bitcast i16* %a to i8*
1732 tail call void @llvm.arm.neon.vst2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
; test_vst2q_lane_s32: unpacks the two <4 x i32> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst2lane.v4i32; trailing operands i32 3, i32 4
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st2 on .s registers addressed by [x0].
1736 define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
1737 ; CHECK-LABEL: test_vst2q_lane_s32
1738 ; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1740 %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
1741 %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
1742 %0 = bitcast i32* %a to i8*
1743 tail call void @llvm.arm.neon.vst2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
; test_vst2q_lane_s64: unpacks the two <2 x i64> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst2lane.v2i64; trailing operands i32 1, i32 8
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st2 on .d registers addressed by [x0].
1747 define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
1748 ; CHECK-LABEL: test_vst2q_lane_s64
1749 ; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1751 %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
1752 %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
1753 %0 = bitcast i64* %a to i8*
1754 tail call void @llvm.arm.neon.vst2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
; test_vst2q_lane_f32: unpacks the two <4 x float> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst2lane.v4f32; trailing operands i32 3, i32 4
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st2 on .s registers addressed by [x0].
1758 define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
1759 ; CHECK-LABEL: test_vst2q_lane_f32
1760 ; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1762 %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
1763 %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
1764 %0 = bitcast float* %a to i8*
1765 tail call void @llvm.arm.neon.vst2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
; test_vst2q_lane_f64: unpacks the two <2 x double> members of %b.coerce, casts
; %a to i8*, and calls llvm.arm.neon.vst2lane.v2f64; trailing operands i32 1,
; i32 8 are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st2 on .d registers addressed by [x0].
1769 define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
1770 ; CHECK-LABEL: test_vst2q_lane_f64
1771 ; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1773 %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
1774 %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
1775 %0 = bitcast double* %a to i8*
1776 tail call void @llvm.arm.neon.vst2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
; test_vst2_lane_s8: unpacks the two <8 x i8> members of %b.coerce and passes
; them with %a to llvm.arm.neon.vst2lane.v8i8; trailing operands i32 7, i32 1
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st2 on .b registers addressed by [x0].
1780 define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
1781 ; CHECK-LABEL: test_vst2_lane_s8
1782 ; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1784 %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
1785 %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
1786 tail call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
; test_vst2_lane_s16: unpacks the two <4 x i16> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst2lane.v4i16; trailing operands i32 3, i32 2
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st2 on .h registers addressed by [x0].
1790 define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
1791 ; CHECK-LABEL: test_vst2_lane_s16
1792 ; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1794 %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
1795 %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
1796 %0 = bitcast i16* %a to i8*
1797 tail call void @llvm.arm.neon.vst2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
; test_vst2_lane_s32: unpacks the two <2 x i32> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst2lane.v2i32; trailing operands i32 1, i32 4
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st2 on .s registers addressed by [x0].
1801 define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
1802 ; CHECK-LABEL: test_vst2_lane_s32
1803 ; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1805 %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
1806 %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
1807 %0 = bitcast i32* %a to i8*
1808 tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
; test_vst2_lane_s64: unpacks the two <1 x i64> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst2lane.v1i64; trailing operands i32 0, i32 8
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st2 on .d registers addressed by [x0].
1812 define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
1813 ; CHECK-LABEL: test_vst2_lane_s64
1814 ; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1816 %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
1817 %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
1818 %0 = bitcast i64* %a to i8*
1819 tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
; test_vst2_lane_f32: unpacks the two <2 x float> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst2lane.v2f32; trailing operands i32 1, i32 4
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st2 on .s registers addressed by [x0].
1823 define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
1824 ; CHECK-LABEL: test_vst2_lane_f32
1825 ; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1827 %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
1828 %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
1829 %0 = bitcast float* %a to i8*
1830 tail call void @llvm.arm.neon.vst2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
; test_vst2_lane_f64: unpacks the two <1 x double> members of %b.coerce, casts
; %a to i8*, and calls llvm.arm.neon.vst2lane.v1f64; trailing operands i32 0,
; i32 8 are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st2 on .d registers addressed by [x0].
1834 define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
1835 ; CHECK-LABEL: test_vst2_lane_f64
1836 ; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1838 %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
1839 %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
1840 %0 = bitcast double* %a to i8*
1841 tail call void @llvm.arm.neon.vst2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
; test_vst3q_lane_s8: unpacks the three <16 x i8> members of %b.coerce and passes
; them with %a to llvm.arm.neon.vst3lane.v16i8; trailing operands i32 15, i32 1
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st3 on .b registers addressed by [x0].
1845 define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
1846 ; CHECK-LABEL: test_vst3q_lane_s8
1847 ; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1849 %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0
1850 %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
1851 %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
1852 tail call void @llvm.arm.neon.vst3lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 15, i32 1)
; test_vst3q_lane_s16: unpacks the three <8 x i16> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst3lane.v8i16; trailing operands i32 7, i32 2
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st3 on .h registers addressed by [x0].
1856 define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
1857 ; CHECK-LABEL: test_vst3q_lane_s16
1858 ; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1860 %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
1861 %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
1862 %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
1863 %0 = bitcast i16* %a to i8*
1864 tail call void @llvm.arm.neon.vst3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
; test_vst3q_lane_s32: unpacks the three <4 x i32> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst3lane.v4i32; trailing operands i32 3, i32 4
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st3 on .s registers addressed by [x0].
1868 define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
1869 ; CHECK-LABEL: test_vst3q_lane_s32
1870 ; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1872 %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
1873 %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
1874 %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
1875 %0 = bitcast i32* %a to i8*
1876 tail call void @llvm.arm.neon.vst3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
; test_vst3q_lane_s64: unpacks the three <2 x i64> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst3lane.v2i64; trailing operands i32 1, i32 8
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st3 on .d registers addressed by [x0].
1880 define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
1881 ; CHECK-LABEL: test_vst3q_lane_s64
1882 ; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1884 %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
1885 %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
1886 %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
1887 %0 = bitcast i64* %a to i8*
1888 tail call void @llvm.arm.neon.vst3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
; test_vst3q_lane_f32: unpacks the three <4 x float> members of %b.coerce, casts
; %a to i8*, and calls llvm.arm.neon.vst3lane.v4f32; trailing operands i32 3,
; i32 4 are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st3 on .s registers addressed by [x0].
1892 define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
1893 ; CHECK-LABEL: test_vst3q_lane_f32
1894 ; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1896 %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
1897 %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
1898 %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
1899 %0 = bitcast float* %a to i8*
1900 tail call void @llvm.arm.neon.vst3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
; test_vst3q_lane_f64: unpacks the three <2 x double> members of %b.coerce, casts
; %a to i8*, and calls llvm.arm.neon.vst3lane.v2f64; trailing operands i32 1,
; i32 8 are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st3 on .d registers addressed by [x0].
1904 define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
1905 ; CHECK-LABEL: test_vst3q_lane_f64
1906 ; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1908 %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
1909 %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
1910 %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
1911 %0 = bitcast double* %a to i8*
1912 tail call void @llvm.arm.neon.vst3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
; test_vst3_lane_s8: unpacks the three <8 x i8> members of %b.coerce and passes
; them with %a to llvm.arm.neon.vst3lane.v8i8; trailing operands i32 7, i32 1
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st3 on .b registers addressed by [x0].
1916 define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
1917 ; CHECK-LABEL: test_vst3_lane_s8
1918 ; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1920 %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
1921 %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
1922 %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
1923 tail call void @llvm.arm.neon.vst3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
; test_vst3_lane_s16: unpacks the three <4 x i16> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst3lane.v4i16; trailing operands i32 3, i32 2
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st3 on .h registers addressed by [x0].
1927 define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
1928 ; CHECK-LABEL: test_vst3_lane_s16
1929 ; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1931 %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
1932 %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
1933 %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
1934 %0 = bitcast i16* %a to i8*
1935 tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
; test_vst3_lane_s32: unpacks the three <2 x i32> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst3lane.v2i32; trailing operands i32 1, i32 4
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st3 on .s registers addressed by [x0].
1939 define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
1940 ; CHECK-LABEL: test_vst3_lane_s32
1941 ; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1943 %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
1944 %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
1945 %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
1946 %0 = bitcast i32* %a to i8*
1947 tail call void @llvm.arm.neon.vst3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
; test_vst3_lane_s64: unpacks the three <1 x i64> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst3lane.v1i64; trailing operands i32 0, i32 8
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st3 on .d registers addressed by [x0].
1951 define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
1952 ; CHECK-LABEL: test_vst3_lane_s64
1953 ; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1955 %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
1956 %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
1957 %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
1958 %0 = bitcast i64* %a to i8*
1959 tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
; test_vst3_lane_f32: unpacks the three <2 x float> members of %b.coerce, casts
; %a to i8*, and calls llvm.arm.neon.vst3lane.v2f32; trailing operands i32 1,
; i32 4 are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st3 on .s registers addressed by [x0].
1963 define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
1964 ; CHECK-LABEL: test_vst3_lane_f32
1965 ; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1967 %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
1968 %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
1969 %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
1970 %0 = bitcast float* %a to i8*
1971 tail call void @llvm.arm.neon.vst3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
; test_vst3_lane_f64: unpacks the three <1 x double> members of %b.coerce, casts
; %a to i8*, and calls llvm.arm.neon.vst3lane.v1f64; trailing operands i32 0,
; i32 8 are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st3 on .d registers addressed by [x0].
1975 define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
1976 ; CHECK-LABEL: test_vst3_lane_f64
1977 ; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1979 %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
1980 %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
1981 %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
1982 %0 = bitcast double* %a to i8*
1983 tail call void @llvm.arm.neon.vst3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
; test_vst4q_lane_s8: unpacks the four <16 x i8> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst4lane.v16i8 with trailing operands i32 15,
; i32 2 (presumably lane index and alignment - confirm). The FileCheck pattern
; expects a lane-indexed st4 on .b registers addressed by [x0].
; NOTE(review): %a is declared i16* and the last operand is 2 here, whereas
; test_vst2q_lane_s8, test_vst3q_lane_s8 and test_vst4_lane_s8 take i8* %a and
; pass 1. Confirm whether this difference is intentional.
1987 define void @test_vst4q_lane_s8(i16* %a, [4 x <16 x i8>] %b.coerce) {
1988 ; CHECK-LABEL: test_vst4q_lane_s8
1989 ; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1991 %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
1992 %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
1993 %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
1994 %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
1995 %0 = bitcast i16* %a to i8*
1996 tail call void @llvm.arm.neon.vst4lane.v16i8(i8* %0, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 2)
; test_vst4q_lane_s16: unpacks the four <8 x i16> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst4lane.v8i16; trailing operands i32 7, i32 2
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st4 on .h registers addressed by [x0].
2000 define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
2001 ; CHECK-LABEL: test_vst4q_lane_s16
2002 ; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
2004 %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
2005 %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
2006 %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
2007 %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
2008 %0 = bitcast i16* %a to i8*
2009 tail call void @llvm.arm.neon.vst4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
; test_vst4q_lane_s32: unpacks the four <4 x i32> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst4lane.v4i32; trailing operands i32 3, i32 4
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st4 on .s registers addressed by [x0].
2013 define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
2014 ; CHECK-LABEL: test_vst4q_lane_s32
2015 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
2017 %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
2018 %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
2019 %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
2020 %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
2021 %0 = bitcast i32* %a to i8*
2022 tail call void @llvm.arm.neon.vst4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
; test_vst4q_lane_s64: unpacks the four <2 x i64> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst4lane.v2i64; trailing operands i32 1, i32 8
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st4 on .d registers addressed by [x0].
2026 define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
2027 ; CHECK-LABEL: test_vst4q_lane_s64
2028 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2030 %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
2031 %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
2032 %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
2033 %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
2034 %0 = bitcast i64* %a to i8*
2035 tail call void @llvm.arm.neon.vst4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
; test_vst4q_lane_f32: unpacks the four <4 x float> members of %b.coerce, casts
; %a to i8*, and calls llvm.arm.neon.vst4lane.v4f32; trailing operands i32 3,
; i32 4 are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st4 on .s registers addressed by [x0].
2039 define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
2040 ; CHECK-LABEL: test_vst4q_lane_f32
2041 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
2043 %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
2044 %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
2045 %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
2046 %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
2047 %0 = bitcast float* %a to i8*
2048 tail call void @llvm.arm.neon.vst4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
; test_vst4q_lane_f64: unpacks the four <2 x double> members of %b.coerce, casts
; %a to i8*, and calls llvm.arm.neon.vst4lane.v2f64; trailing operands i32 1,
; i32 8 are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st4 on .d registers addressed by [x0].
2052 define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
2053 ; CHECK-LABEL: test_vst4q_lane_f64
2054 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2056 %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
2057 %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
2058 %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
2059 %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
2060 %0 = bitcast double* %a to i8*
2061 tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
; test_vst4_lane_s8: unpacks the four <8 x i8> members of %b.coerce and passes
; them with %a to llvm.arm.neon.vst4lane.v8i8; trailing operands i32 7, i32 1
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st4 on .b registers addressed by [x0].
2065 define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
2066 ; CHECK-LABEL: test_vst4_lane_s8
2067 ; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
2069 %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
2070 %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
2071 %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
2072 %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
2073 tail call void @llvm.arm.neon.vst4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
; test_vst4_lane_s16: unpacks the four <4 x i16> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst4lane.v4i16; trailing operands i32 3, i32 2
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st4 on .h registers addressed by [x0].
2077 define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
2078 ; CHECK-LABEL: test_vst4_lane_s16
2079 ; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
2081 %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
2082 %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
2083 %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
2084 %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
2085 %0 = bitcast i16* %a to i8*
2086 tail call void @llvm.arm.neon.vst4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
; test_vst4_lane_s32: unpacks the four <2 x i32> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst4lane.v2i32; trailing operands i32 1, i32 4
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st4 on .s registers addressed by [x0].
2090 define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
2091 ; CHECK-LABEL: test_vst4_lane_s32
2092 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
2094 %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
2095 %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
2096 %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
2097 %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
2098 %0 = bitcast i32* %a to i8*
2099 tail call void @llvm.arm.neon.vst4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
; test_vst4_lane_s64: unpacks the four <1 x i64> members of %b.coerce, casts %a
; to i8*, and calls llvm.arm.neon.vst4lane.v1i64; trailing operands i32 0, i32 8
; are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st4 on .d registers addressed by [x0].
2103 define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
2104 ; CHECK-LABEL: test_vst4_lane_s64
2105 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2107 %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
2108 %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
2109 %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
2110 %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
2111 %0 = bitcast i64* %a to i8*
2112 tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
; test_vst4_lane_f32: unpacks the four <2 x float> members of %b.coerce, casts
; %a to i8*, and calls llvm.arm.neon.vst4lane.v2f32; trailing operands i32 1,
; i32 4 are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st4 on .s registers addressed by [x0].
2116 define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
2117 ; CHECK-LABEL: test_vst4_lane_f32
2118 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
2120 %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
2121 %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
2122 %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
2123 %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
2124 %0 = bitcast float* %a to i8*
2125 tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
; test_vst4_lane_f64: unpacks the four <1 x double> members of %b.coerce, casts
; %a to i8*, and calls llvm.arm.neon.vst4lane.v1f64; trailing operands i32 0,
; i32 8 are presumably lane index and alignment - confirm. The FileCheck pattern
; expects a lane-indexed st4 on .d registers addressed by [x0].
2129 define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
2130 ; CHECK-LABEL: test_vst4_lane_f64
2131 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2133 %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
2134 %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
2135 %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
2136 %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
2137 %0 = bitcast double* %a to i8*
2138 tail call void @llvm.arm.neon.vst4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
; Declarations of the two-result NEON load intrinsics. Most entries are the
; vld2lane form (i8* pointer, two input vectors, two i32 operands). The v1i64
; and v1f64 entries are the non-lane vld2 form (i8* pointer, one i32 operand).
2142 declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
2143 declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
2144 declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
2145 declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
2146 declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
2147 declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
2148 declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
2149 declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
2150 declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
2151 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
2152 declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
2153 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
; Declarations of the three-result NEON load intrinsics. Most entries are the
; vld3lane form (i8* pointer, three input vectors, two i32 operands). The v1i64
; and v1f64 entries are the non-lane vld3 form (i8* pointer, one i32 operand).
2154 declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2155 declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2156 declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2157 declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2158 declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2159 declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2160 declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2161 declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2162 declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2163 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
2164 declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2165 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
; Four-vector NEON load intrinsic declarations. Each vld4lane form takes an
; i8* pointer, four input vectors and two i32 operands, and returns a struct
; of four vectors of the same type as the inputs. The v1i64 and v1f64 entries
; in this run are the plain vld4 form instead, which takes only (i8*, i32).
; NOTE(review): the two trailing i32 operands of the lane forms are presumably
; the lane index and the alignment -- confirm against the intrinsic definitions.
2166 declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2167 declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2168 declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2169 declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2170 declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2171 declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2172 declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2173 declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2174 declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2175 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
2176 declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2177 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
; Lane-load intrinsic declarations for the single-element vector types
; <1 x i64> and <1 x double>, in two-, three- and four-vector variants
; (vld2lane, vld3lane, vld4lane). Each takes an i8* pointer, N input vectors
; and two i32 operands, and returns a struct of N vectors of the same type.
; NOTE(review): the two trailing i32 operands are presumably the lane index
; and the alignment -- confirm against the intrinsic definitions.
2178 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
2179 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
2180 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2181 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
2182 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2183 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
; Two-vector NEON lane-store intrinsic declarations (vst2lane), one per vector
; type. Each takes an i8* pointer, two vectors of the same type and two i32
; operands, and returns void.
; NOTE(review): the two trailing i32 operands are presumably the lane index
; and the alignment -- confirm against the intrinsic definitions.
2184 declare void @llvm.arm.neon.vst2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
2185 declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
2186 declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
2187 declare void @llvm.arm.neon.vst2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
2188 declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
2189 declare void @llvm.arm.neon.vst2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
2190 declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
2191 declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
2192 declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
2193 declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
2194 declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
2195 declare void @llvm.arm.neon.vst2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
; Three-vector NEON lane-store intrinsic declarations (vst3lane), one per
; vector type. Each takes an i8* pointer, three vectors of the same type and
; two i32 operands, and returns void.
; NOTE(review): the two trailing i32 operands are presumably the lane index
; and the alignment -- confirm against the intrinsic definitions.
2196 declare void @llvm.arm.neon.vst3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2197 declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2198 declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2199 declare void @llvm.arm.neon.vst3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2200 declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2201 declare void @llvm.arm.neon.vst3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2202 declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2203 declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2204 declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2205 declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2206 declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2207 declare void @llvm.arm.neon.vst3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
; Four-vector NEON lane-store intrinsic declarations (vst4lane), one per
; vector type. Each takes an i8* pointer, four vectors of the same type and
; two i32 operands, and returns void.
; NOTE(review): the two trailing i32 operands are presumably the lane index
; and the alignment -- confirm against the intrinsic definitions.
2208 declare void @llvm.arm.neon.vst4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2209 declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2210 declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2211 declare void @llvm.arm.neon.vst4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2212 declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2213 declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2214 declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2215 declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2216 declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2217 declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2218 declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2219 declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)