1 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
2 ; The interesting parts of this test were copied into the arm64 directory as aarch64-neon-simd-ldst-one.ll.
; Named struct types, each wrapping a single array of 2, 3 or 4 vectors.
; Several of them are used as return types by the multi-register load tests in this file.
4 %struct.uint8x16x2_t = type { [2 x <16 x i8>] }
5 %struct.poly8x16x2_t = type { [2 x <16 x i8>] }
6 %struct.uint8x16x3_t = type { [3 x <16 x i8>] }
7 %struct.int8x16x2_t = type { [2 x <16 x i8>] }
8 %struct.int16x8x2_t = type { [2 x <8 x i16>] }
9 %struct.int32x4x2_t = type { [2 x <4 x i32>] }
10 %struct.int64x2x2_t = type { [2 x <2 x i64>] }
11 %struct.float32x4x2_t = type { [2 x <4 x float>] }
12 %struct.float64x2x2_t = type { [2 x <2 x double>] }
13 %struct.int8x8x2_t = type { [2 x <8 x i8>] }
14 %struct.int16x4x2_t = type { [2 x <4 x i16>] }
15 %struct.int32x2x2_t = type { [2 x <2 x i32>] }
16 %struct.int64x1x2_t = type { [2 x <1 x i64>] }
17 %struct.float32x2x2_t = type { [2 x <2 x float>] }
18 %struct.float64x1x2_t = type { [2 x <1 x double>] }
19 %struct.int8x16x3_t = type { [3 x <16 x i8>] }
20 %struct.int16x8x3_t = type { [3 x <8 x i16>] }
21 %struct.int32x4x3_t = type { [3 x <4 x i32>] }
22 %struct.int64x2x3_t = type { [3 x <2 x i64>] }
23 %struct.float32x4x3_t = type { [3 x <4 x float>] }
24 %struct.float64x2x3_t = type { [3 x <2 x double>] }
25 %struct.int8x8x3_t = type { [3 x <8 x i8>] }
26 %struct.int16x4x3_t = type { [3 x <4 x i16>] }
27 %struct.int32x2x3_t = type { [3 x <2 x i32>] }
28 %struct.int64x1x3_t = type { [3 x <1 x i64>] }
29 %struct.float32x2x3_t = type { [3 x <2 x float>] }
30 %struct.float64x1x3_t = type { [3 x <1 x double>] }
31 %struct.int8x16x4_t = type { [4 x <16 x i8>] }
32 %struct.int16x8x4_t = type { [4 x <8 x i16>] }
33 %struct.int32x4x4_t = type { [4 x <4 x i32>] }
34 %struct.int64x2x4_t = type { [4 x <2 x i64>] }
35 %struct.float32x4x4_t = type { [4 x <4 x float>] }
36 %struct.float64x2x4_t = type { [4 x <2 x double>] }
37 %struct.int8x8x4_t = type { [4 x <8 x i8>] }
38 %struct.int16x4x4_t = type { [4 x <4 x i16>] }
39 %struct.int32x2x4_t = type { [4 x <2 x i32>] }
40 %struct.int64x1x4_t = type { [4 x <1 x i64>] }
41 %struct.float32x2x4_t = type { [4 x <2 x float>] }
42 %struct.float64x1x4_t = type { [4 x <1 x double>] }
; Adds a constant <16 x i8> vector to %a. The patterns expect an adrp of a
; '.'-prefixed label, immediately followed by a q-register ldr that uses the
; :lo12: offset of such a label.
; NOTE(review): "poll" in the name presumably means the constant pool - confirm.
; NOTE(review): the twelfth constant element is i8 2, which breaks the 1..16
; sequence; it looks like a typo for 12, but the checked instructions do not
; depend on the element values.
44 define <16 x i8> @test_ld_from_poll_v16i8(<16 x i8> %a) {
45 ; CHECK-LABEL: test_ld_from_poll_v16i8
46 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
47 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
49 %b = add <16 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 2, i8 13, i8 14, i8 15, i8 16>
; Adds a constant <8 x i16> vector to %a. The patterns expect an adrp of a
; '.'-prefixed label, immediately followed by a q-register ldr that uses the
; :lo12: offset of such a label.
53 define <8 x i16> @test_ld_from_poll_v8i16(<8 x i16> %a) {
54 ; CHECK-LABEL: test_ld_from_poll_v8i16
55 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
56 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
58 %b = add <8 x i16> %a, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
; Adds a constant <4 x i32> vector to %a. The patterns expect an adrp of a
; '.'-prefixed label, immediately followed by a q-register ldr that uses the
; :lo12: offset of such a label.
62 define <4 x i32> @test_ld_from_poll_v4i32(<4 x i32> %a) {
63 ; CHECK-LABEL: test_ld_from_poll_v4i32
64 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
65 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
67 %b = add <4 x i32> %a, <i32 1, i32 2, i32 3, i32 4>
; Adds a constant <2 x i64> vector to %a. The patterns expect an adrp of a
; '.'-prefixed label, immediately followed by a q-register ldr that uses the
; :lo12: offset of such a label.
71 define <2 x i64> @test_ld_from_poll_v2i64(<2 x i64> %a) {
72 ; CHECK-LABEL: test_ld_from_poll_v2i64
73 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
74 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
76 %b = add <2 x i64> %a, <i64 1, i64 2>
; Floating-point variant: fadds a constant <4 x float> vector to %a. The
; patterns expect an adrp of a '.'-prefixed label, immediately followed by a
; q-register ldr that uses the :lo12: offset of such a label.
80 define <4 x float> @test_ld_from_poll_v4f32(<4 x float> %a) {
81 ; CHECK-LABEL: test_ld_from_poll_v4f32
82 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
83 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
85 %b = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
; Floating-point variant: fadds a constant <2 x double> vector to %a. The
; patterns expect an adrp of a '.'-prefixed label, immediately followed by a
; q-register ldr that uses the :lo12: offset of such a label.
89 define <2 x double> @test_ld_from_poll_v2f64(<2 x double> %a) {
90 ; CHECK-LABEL: test_ld_from_poll_v2f64
91 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
92 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
94 %b = fadd <2 x double> %a, <double 1.0, double 2.0>
; 64-bit vector variant: adds a constant <8 x i8> vector to %a. The patterns
; expect an adrp of a '.'-prefixed label, immediately followed by a d-register
; ldr (rather than a q-register one) that uses the :lo12: offset of such a label.
98 define <8 x i8> @test_ld_from_poll_v8i8(<8 x i8> %a) {
99 ; CHECK-LABEL: test_ld_from_poll_v8i8
100 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
101 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
103 %b = add <8 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
; 64-bit vector variant: adds a constant <4 x i16> vector to %a. The patterns
; expect an adrp of a '.'-prefixed label, immediately followed by a d-register
; ldr that uses the :lo12: offset of such a label.
107 define <4 x i16> @test_ld_from_poll_v4i16(<4 x i16> %a) {
108 ; CHECK-LABEL: test_ld_from_poll_v4i16
109 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
110 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
112 %b = add <4 x i16> %a, <i16 1, i16 2, i16 3, i16 4>
; 64-bit vector variant: adds a constant <2 x i32> vector to %a. The patterns
; expect an adrp of a '.'-prefixed label, immediately followed by a d-register
; ldr that uses the :lo12: offset of such a label.
116 define <2 x i32> @test_ld_from_poll_v2i32(<2 x i32> %a) {
117 ; CHECK-LABEL: test_ld_from_poll_v2i32
118 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
119 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
121 %b = add <2 x i32> %a, <i32 1, i32 2>
; Loads one i8 from %a, inserts it into lane 0 of a <16 x i8> and splats it
; with an all-zero shuffle mask. The pattern expects a single ld1r into a
; .16b register addressed by x0.
125 define <16 x i8> @test_vld1q_dup_s8(i8* %a) {
126 ; CHECK-LABEL: test_vld1q_dup_s8
127 ; CHECK: ld1r {{{v[0-9]+}}.16b}, [x0]
129 %0 = load i8* %a, align 1
130 %1 = insertelement <16 x i8> undef, i8 %0, i32 0
131 %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
; Loads one i16 from %a, inserts it into lane 0 of an <8 x i16> and splats it
; with an all-zero shuffle mask. The pattern expects a single ld1r into a
; .8h register addressed by x0.
135 define <8 x i16> @test_vld1q_dup_s16(i16* %a) {
136 ; CHECK-LABEL: test_vld1q_dup_s16
137 ; CHECK: ld1r {{{v[0-9]+}}.8h}, [x0]
139 %0 = load i16* %a, align 2
140 %1 = insertelement <8 x i16> undef, i16 %0, i32 0
141 %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
; Loads one i32 from %a, inserts it into lane 0 of a <4 x i32> and splats it
; with an all-zero shuffle mask. The pattern expects a single ld1r into a
; .4s register addressed by x0.
145 define <4 x i32> @test_vld1q_dup_s32(i32* %a) {
146 ; CHECK-LABEL: test_vld1q_dup_s32
147 ; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
149 %0 = load i32* %a, align 4
150 %1 = insertelement <4 x i32> undef, i32 %0, i32 0
151 %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
; Loads one i64 from %a, inserts it into lane 0 of a <2 x i64> and splats it
; with an all-zero shuffle mask. The pattern expects a single ld1r into a
; .2d register addressed by x0.
155 define <2 x i64> @test_vld1q_dup_s64(i64* %a) {
156 ; CHECK-LABEL: test_vld1q_dup_s64
157 ; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
159 %0 = load i64* %a, align 8
160 %1 = insertelement <2 x i64> undef, i64 %0, i32 0
161 %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
; Loads one float from %a, inserts it into lane 0 of a <4 x float>, splats it
; with an all-zero shuffle mask and returns the splat. The pattern expects a
; single ld1r into a .4s register addressed by x0.
165 define <4 x float> @test_vld1q_dup_f32(float* %a) {
166 ; CHECK-LABEL: test_vld1q_dup_f32
167 ; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
169 %0 = load float* %a, align 4
170 %1 = insertelement <4 x float> undef, float %0, i32 0
171 %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
172 ret <4 x float> %lane
; Loads one double from %a, inserts it into lane 0 of a <2 x double>, splats it
; with an all-zero shuffle mask and returns the splat. The pattern expects a
; single ld1r into a .2d register addressed by x0.
175 define <2 x double> @test_vld1q_dup_f64(double* %a) {
176 ; CHECK-LABEL: test_vld1q_dup_f64
177 ; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
179 %0 = load double* %a, align 8
180 %1 = insertelement <2 x double> undef, double %0, i32 0
181 %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
182 ret <2 x double> %lane
; 64-bit vector variant: loads one i8 from %a, inserts it into lane 0 of an
; <8 x i8> and splats it with an all-zero shuffle mask. The pattern expects a
; single ld1r into a .8b register addressed by x0.
185 define <8 x i8> @test_vld1_dup_s8(i8* %a) {
186 ; CHECK-LABEL: test_vld1_dup_s8
187 ; CHECK: ld1r {{{v[0-9]+}}.8b}, [x0]
189 %0 = load i8* %a, align 1
190 %1 = insertelement <8 x i8> undef, i8 %0, i32 0
191 %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
; 64-bit vector variant: loads one i16 from %a, inserts it into lane 0 of a
; <4 x i16> and splats it with an all-zero shuffle mask. The pattern expects a
; single ld1r into a .4h register addressed by x0.
195 define <4 x i16> @test_vld1_dup_s16(i16* %a) {
196 ; CHECK-LABEL: test_vld1_dup_s16
197 ; CHECK: ld1r {{{v[0-9]+}}.4h}, [x0]
199 %0 = load i16* %a, align 2
200 %1 = insertelement <4 x i16> undef, i16 %0, i32 0
201 %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
; 64-bit vector variant: loads one i32 from %a, inserts it into lane 0 of a
; <2 x i32> and splats it with an all-zero shuffle mask. The pattern expects a
; single ld1r into a .2s register addressed by x0.
205 define <2 x i32> @test_vld1_dup_s32(i32* %a) {
206 ; CHECK-LABEL: test_vld1_dup_s32
207 ; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
209 %0 = load i32* %a, align 4
210 %1 = insertelement <2 x i32> undef, i32 %0, i32 0
211 %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
; Single-element case: loads one i64 from %a and inserts it into lane 0 of a
; <1 x i64> (no shuffle is present in the visible body). The pattern expects
; ld1r with a .1d arrangement addressed by x0.
215 define <1 x i64> @test_vld1_dup_s64(i64* %a) {
216 ; CHECK-LABEL: test_vld1_dup_s64
217 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
219 %0 = load i64* %a, align 8
220 %1 = insertelement <1 x i64> undef, i64 %0, i32 0
; 64-bit vector variant: loads one float from %a, inserts it into lane 0 of a
; <2 x float>, splats it with an all-zero shuffle mask and returns the splat.
; The pattern expects a single ld1r into a .2s register addressed by x0.
224 define <2 x float> @test_vld1_dup_f32(float* %a) {
225 ; CHECK-LABEL: test_vld1_dup_f32
226 ; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
228 %0 = load float* %a, align 4
229 %1 = insertelement <2 x float> undef, float %0, i32 0
230 %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
231 ret <2 x float> %lane
; Single-element case: loads one double from %a and inserts it into lane 0 of
; a <1 x double> (no shuffle is present in the visible body). The pattern
; expects ld1r with a .1d arrangement addressed by x0.
234 define <1 x double> @test_vld1_dup_f64(double* %a) {
235 ; CHECK-LABEL: test_vld1_dup_f64
236 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
238 %0 = load double* %a, align 8
239 %1 = insertelement <1 x double> undef, double %0, i32 0
; Loads an i64 from %a, stores the same value to %b, and also returns it
; wrapped in a <1 x i64>. The loaded value therefore has two users.
243 define <1 x i64> @testDUP.v1i64(i64* %a, i64* %b) #0 {
244 ; As there is a store operation depending on %1, LD1R pattern can't be selected.
245 ; So LDR and FMOV should be emitted.
246 ; CHECK-LABEL: testDUP.v1i64
247 ; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}]
248 ; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
249 ; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}]
250 %1 = load i64* %a, align 8
251 store i64 %1, i64* %b, align 8
252 %vecinit.i = insertelement <1 x i64> undef, i64 %1, i32 0
253 ret <1 x i64> %vecinit.i
; Loads a double from %a, stores the same value to %b, and also returns it
; wrapped in a <1 x double>. The loaded value therefore has two users.
256 define <1 x double> @testDUP.v1f64(double* %a, double* %b) #0 {
257 ; As there is a store operation depending on %1, LD1R pattern can't be selected.
258 ; So a plain LDR and STR of a D register are expected; unlike the i64 case, no FMOV is checked for.
259 ; CHECK-LABEL: testDUP.v1f64
260 ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}]
261 ; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}]
262 %1 = load double* %a, align 8
263 store double %1, double* %b, align 8
264 %vecinit.i = insertelement <1 x double> undef, double %1, i32 0
265 ret <1 x double> %vecinit.i
; Calls llvm.arm.neon.vld2lane.v16i8 on lane 0 with undef source vectors,
; splats lane 0 of both results with all-zero shuffle masks, and returns them
; packed in %struct.int8x16x2_t. The pattern expects a single ld2r of two
; .16b registers addressed by x0.
268 define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) {
269 ; CHECK-LABEL: test_vld2q_dup_s8
270 ; CHECK: ld2r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
272 %vld_dup = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
273 %0 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 0
274 %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
275 %1 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 1
276 %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
277 %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %lane, 0, 0
278 %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
279 ret %struct.int8x16x2_t %.fca.0.1.insert
; Bitcasts %a to i8*, calls llvm.arm.neon.vld2lane.v8i16 on lane 0 with undef
; source vectors, splats lane 0 of both results, and returns them packed in
; %struct.int16x8x2_t. The pattern expects a single ld2r of two .8h registers
; addressed by x0.
282 define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) {
283 ; CHECK-LABEL: test_vld2q_dup_s16
284 ; CHECK: ld2r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
286 %0 = bitcast i16* %a to i8*
287 %vld_dup = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
288 %1 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 0
289 %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
290 %2 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 1
291 %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
292 %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %lane, 0, 0
293 %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
294 ret %struct.int16x8x2_t %.fca.0.1.insert
; Bitcasts %a to i8*, calls llvm.arm.neon.vld2lane.v4i32 on lane 0 with undef
; source vectors, splats lane 0 of both results, and returns them packed in
; %struct.int32x4x2_t. The pattern expects a single ld2r of two .4s registers
; addressed by x0.
297 define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) {
298 ; CHECK-LABEL: test_vld2q_dup_s32
299 ; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
301 %0 = bitcast i32* %a to i8*
302 %vld_dup = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
303 %1 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 0
304 %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
305 %2 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 1
306 %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
307 %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %lane, 0, 0
308 %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
309 ret %struct.int32x4x2_t %.fca.0.1.insert
; Bitcasts %a to i8*, calls llvm.arm.neon.vld2lane.v2i64 on lane 0 with undef
; source vectors, splats lane 0 of both results, and returns them packed in
; %struct.int64x2x2_t. The pattern expects a single ld2r of two .2d registers
; addressed by x0.
312 define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) {
313 ; CHECK-LABEL: test_vld2q_dup_s64
314 ; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
316 %0 = bitcast i64* %a to i8*
317 %vld_dup = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
318 %1 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 0
319 %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
320 %2 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 1
321 %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
322 %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %lane, 0, 0
323 %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
324 ret %struct.int64x2x2_t %.fca.0.1.insert
; Bitcasts %a to i8*, calls llvm.arm.neon.vld2lane.v4f32 on lane 0 with undef
; source vectors, splats lane 0 of both results, and returns them packed in
; %struct.float32x4x2_t. The pattern expects a single ld2r of two .4s
; registers addressed by x0.
327 define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) {
328 ; CHECK-LABEL: test_vld2q_dup_f32
329 ; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
331 %0 = bitcast float* %a to i8*
332 %vld_dup = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
333 %1 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 0
334 %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
335 %2 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 1
336 %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
337 %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %lane, 0, 0
338 %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
339 ret %struct.float32x4x2_t %.fca.0.1.insert
; Bitcasts %a to i8*, calls llvm.arm.neon.vld2lane.v2f64 on lane 0 with undef
; source vectors, splats lane 0 of both results, and returns them packed in
; %struct.float64x2x2_t. The pattern expects a single ld2r of two .2d
; registers addressed by x0.
342 define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) {
343 ; CHECK-LABEL: test_vld2q_dup_f64
344 ; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
346 %0 = bitcast double* %a to i8*
347 %vld_dup = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
348 %1 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 0
349 %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
350 %2 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 1
351 %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
352 %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %lane, 0, 0
353 %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
354 ret %struct.float64x2x2_t %.fca.0.1.insert
; 64-bit vector variant: calls llvm.arm.neon.vld2lane.v8i8 on lane 0 with
; undef source vectors, splats lane 0 of both results, and returns them packed
; in %struct.int8x8x2_t. The pattern expects a single ld2r of two .8b
; registers addressed by x0.
357 define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) {
358 ; CHECK-LABEL: test_vld2_dup_s8
359 ; CHECK: ld2r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
361 %vld_dup = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
362 %0 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 0
363 %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
364 %1 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 1
365 %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
366 %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %lane, 0, 0
367 %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
368 ret %struct.int8x8x2_t %.fca.0.1.insert
; 64-bit vector variant: bitcasts %a to i8*, calls
; llvm.arm.neon.vld2lane.v4i16 on lane 0 with undef source vectors, splats
; lane 0 of both results, and returns them packed in %struct.int16x4x2_t.
; The pattern expects a single ld2r of two .4h registers addressed by x0.
371 define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) {
372 ; CHECK-LABEL: test_vld2_dup_s16
373 ; CHECK: ld2r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
375 %0 = bitcast i16* %a to i8*
376 %vld_dup = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
377 %1 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 0
378 %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
379 %2 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 1
380 %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
381 %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %lane, 0, 0
382 %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
383 ret %struct.int16x4x2_t %.fca.0.1.insert
; 64-bit vector variant: bitcasts %a to i8*, calls
; llvm.arm.neon.vld2lane.v2i32 on lane 0 with undef source vectors, splats
; lane 0 of both results, and returns them packed in %struct.int32x2x2_t.
; The pattern expects a single ld2r of two .2s registers addressed by x0.
386 define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) {
387 ; CHECK-LABEL: test_vld2_dup_s32
388 ; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
390 %0 = bitcast i32* %a to i8*
391 %vld_dup = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
392 %1 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 0
393 %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
394 %2 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 1
395 %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
396 %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %lane, 0, 0
397 %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
398 ret %struct.int32x2x2_t %.fca.0.1.insert
; Single-element case: bitcasts %a to i8* and calls llvm.arm.neon.vld2.v1i64
; (the non-lane intrinsic, with no splat shuffles), then returns both results
; packed in %struct.int64x1x2_t. The pattern expects a two-register ld1 with
; .1d arrangement addressed by x0, not an ld2r.
401 define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) {
402 ; CHECK-LABEL: test_vld2_dup_s64
403 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
405 %0 = bitcast i64* %a to i8*
406 %vld_dup = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %0, i32 8)
407 %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 0
408 %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 1
409 %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
410 %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
411 ret %struct.int64x1x2_t %.fca.0.1.insert
; 64-bit vector variant: bitcasts %a to i8*, calls
; llvm.arm.neon.vld2lane.v2f32 on lane 0 with undef source vectors, splats
; lane 0 of both results, and returns them packed in %struct.float32x2x2_t.
; The pattern expects a single ld2r of two .2s registers addressed by x0.
414 define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) {
415 ; CHECK-LABEL: test_vld2_dup_f32
416 ; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
418 %0 = bitcast float* %a to i8*
419 %vld_dup = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
420 %1 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 0
421 %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
422 %2 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 1
423 %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
424 %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %lane, 0, 0
425 %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
426 ret %struct.float32x2x2_t %.fca.0.1.insert
; Single-element case: bitcasts %a to i8* and calls llvm.arm.neon.vld2.v1f64
; (the non-lane intrinsic, with no splat shuffles), then returns both results
; packed in %struct.float64x1x2_t. The pattern expects a two-register ld1 with
; .1d arrangement addressed by x0, not an ld2r.
429 define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) {
430 ; CHECK-LABEL: test_vld2_dup_f64
431 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
433 %0 = bitcast double* %a to i8*
434 %vld_dup = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %0, i32 8)
435 %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 0
436 %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 1
437 %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
438 %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
439 ret %struct.float64x1x2_t %.fca.0.1.insert
; Calls llvm.arm.neon.vld3lane.v16i8 on lane 0 with undef source vectors,
; splats lane 0 of all three results with all-zero shuffle masks, and returns
; them packed in %struct.int8x16x3_t. The pattern expects a single ld3r of
; three .16b registers addressed by x0.
442 define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) {
443 ; CHECK-LABEL: test_vld3q_dup_s8
444 ; CHECK: ld3r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
446 %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
447 %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
448 %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
449 %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
450 %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
451 %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
452 %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
453 %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %lane, 0, 0
454 %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
455 %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
456 ret %struct.int8x16x3_t %.fca.0.2.insert
; Bitcasts %a to i8*, calls llvm.arm.neon.vld3lane.v8i16 on lane 0 with undef
; source vectors, splats lane 0 of all three results, and returns them packed
; in %struct.int16x8x3_t. The pattern expects a single ld3r of three .8h
; registers addressed by x0.
459 define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) {
460 ; CHECK-LABEL: test_vld3q_dup_s16
461 ; CHECK: ld3r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
463 %0 = bitcast i16* %a to i8*
464 %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
465 %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
466 %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
467 %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
468 %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
469 %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
470 %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
471 %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %lane, 0, 0
472 %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
473 %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
474 ret %struct.int16x8x3_t %.fca.0.2.insert
; Bitcasts %a to i8*, calls llvm.arm.neon.vld3lane.v4i32 on lane 0 with undef
; source vectors, splats lane 0 of all three results, and returns them packed
; in %struct.int32x4x3_t. The pattern expects a single ld3r of three .4s
; registers addressed by x0.
477 define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) {
478 ; CHECK-LABEL: test_vld3q_dup_s32
479 ; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
481 %0 = bitcast i32* %a to i8*
482 %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
483 %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
484 %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
485 %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
486 %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
487 %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
488 %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
489 %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %lane, 0, 0
490 %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
491 %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
492 ret %struct.int32x4x3_t %.fca.0.2.insert
; Bitcasts %a to i8*, calls llvm.arm.neon.vld3lane.v2i64 on lane 0 with undef
; source vectors, splats lane 0 of all three results, and returns them packed
; in %struct.int64x2x3_t. The pattern expects a single ld3r of three .2d
; registers addressed by x0.
495 define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) {
496 ; CHECK-LABEL: test_vld3q_dup_s64
497 ; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
499 %0 = bitcast i64* %a to i8*
500 %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
501 %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
502 %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
503 %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
504 %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
505 %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
506 %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
507 %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %lane, 0, 0
508 %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
509 %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
510 ret %struct.int64x2x3_t %.fca.0.2.insert
; Bitcasts %a to i8*, calls llvm.arm.neon.vld3lane.v4f32 on lane 0 with undef
; source vectors, splats lane 0 of all three results, and returns them packed
; in %struct.float32x4x3_t. The pattern expects a single ld3r of three .4s
; registers addressed by x0.
513 define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) {
514 ; CHECK-LABEL: test_vld3q_dup_f32
515 ; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
517 %0 = bitcast float* %a to i8*
518 %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
519 %1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
520 %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
521 %2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
522 %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
523 %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
524 %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
525 %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %lane, 0, 0
526 %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
527 %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
528 ret %struct.float32x4x3_t %.fca.0.2.insert
; Bitcasts %a to i8*, calls llvm.arm.neon.vld3lane.v2f64 on lane 0 with undef
; source vectors, splats lane 0 of all three results, and returns them packed
; in %struct.float64x2x3_t. The pattern expects a single ld3r of three .2d
; registers addressed by x0.
531 define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) {
532 ; CHECK-LABEL: test_vld3q_dup_f64
533 ; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
535 %0 = bitcast double* %a to i8*
536 %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
537 %1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
538 %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
539 %2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
540 %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
541 %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
542 %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
543 %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %lane, 0, 0
544 %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
545 %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
546 ret %struct.float64x2x3_t %.fca.0.2.insert
; 64-bit vector variant: calls llvm.arm.neon.vld3lane.v8i8 on lane 0 with
; undef source vectors, splats lane 0 of all three results, and returns them
; packed in %struct.int8x8x3_t. The pattern expects a single ld3r of three
; .8b registers addressed by x0.
549 define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) {
550 ; CHECK-LABEL: test_vld3_dup_s8
551 ; CHECK: ld3r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
553 %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
554 %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
555 %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
556 %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
557 %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
558 %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
559 %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
560 %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %lane, 0, 0
561 %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
562 %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
563 ret %struct.int8x8x3_t %.fca.0.2.insert
; 64-bit vector variant: bitcasts %a to i8*, calls
; llvm.arm.neon.vld3lane.v4i16 on lane 0 with undef source vectors, splats
; lane 0 of all three results, and returns them packed in
; %struct.int16x4x3_t. The pattern expects a single ld3r of three .4h
; registers addressed by x0.
566 define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) {
567 ; CHECK-LABEL: test_vld3_dup_s16
568 ; CHECK: ld3r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
570 %0 = bitcast i16* %a to i8*
571 %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
572 %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
573 %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
574 %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
575 %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
576 %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
577 %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
578 %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %lane, 0, 0
579 %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
580 %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
581 ret %struct.int16x4x3_t %.fca.0.2.insert
; 64-bit vector variant: bitcasts %a to i8*, calls
; llvm.arm.neon.vld3lane.v2i32 on lane 0 with undef source vectors, splats
; lane 0 of all three results, and returns them packed in
; %struct.int32x2x3_t. The pattern expects a single ld3r of three .2s
; registers addressed by x0.
584 define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) {
585 ; CHECK-LABEL: test_vld3_dup_s32
586 ; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
588 %0 = bitcast i32* %a to i8*
589 %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
590 %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
591 %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
592 %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
593 %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
594 %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
595 %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
596 %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %lane, 0, 0
597 %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
598 %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
599 ret %struct.int32x2x3_t %.fca.0.2.insert
; Single-element case: bitcasts %a to i8* and calls llvm.arm.neon.vld3.v1i64
; (the non-lane intrinsic, with no splat shuffles), then returns the three
; results packed in %struct.int64x1x3_t. The pattern expects a three-register
; ld1 with .1d arrangement addressed by x0, not an ld3r.
602 define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) {
603 ; CHECK-LABEL: test_vld3_dup_s64
604 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
606 %0 = bitcast i64* %a to i8*
607 %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %0, i32 8)
608 %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
609 %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
610 %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
611 %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
612 %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
613 %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
614 ret %struct.int64x1x3_t %.fca.0.2.insert
; 64-bit vector variant: bitcasts %a to i8*, calls
; llvm.arm.neon.vld3lane.v2f32 on lane 0 with undef source vectors, splats
; lane 0 of all three results, and returns them packed in
; %struct.float32x2x3_t. The pattern expects a single ld3r of three .2s
; registers addressed by x0.
617 define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) {
618 ; CHECK-LABEL: test_vld3_dup_f32
619 ; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
621 %0 = bitcast float* %a to i8*
622 %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
623 %1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
624 %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
625 %2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
626 %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
627 %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
628 %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
629 %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %lane, 0, 0
630 %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
631 %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
632 ret %struct.float32x2x3_t %.fca.0.2.insert
; Single-element case: bitcasts %a to i8* and calls llvm.arm.neon.vld3.v1f64
; (the non-lane intrinsic, with no splat shuffles), then returns the three
; results packed in %struct.float64x1x3_t. The pattern expects a
; three-register ld1 with .1d arrangement addressed by x0, not an ld3r.
635 define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) {
636 ; CHECK-LABEL: test_vld3_dup_f64
637 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
639 %0 = bitcast double* %a to i8*
640 %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %0, i32 8)
641 %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
642 %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
643 %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
644 %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
645 %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
646 %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
647 ret %struct.float64x1x3_t %.fca.0.2.insert
; Codegen test: calls vld4lane.v16i8 on undef vectors with lane operand 0, then
; splats each of the four results with a zero-mask shufflevector. The FileCheck
; pattern below expects one ld4r on four .16b registers addressed by [x0].
; %a is already i8*, so no bitcast precedes the call.
650 define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) {
651 ; CHECK-LABEL: test_vld4q_dup_s8
652 ; CHECK: ld4r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
654 %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
655 %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
656 %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
657 %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
658 %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
659 %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
660 %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
661 %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 3
662 %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> zeroinitializer
663 %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %lane, 0, 0
664 %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
665 %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
666 %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %lane3, 0, 3
667 ret %struct.int8x16x4_t %.fca.0.3.insert
; Codegen test: calls vld4lane.v8i16 on undef vectors with lane operand 0, then
; splats each of the four results with a zero-mask shufflevector. The FileCheck
; pattern below expects one ld4r on four .8h registers addressed by [x0].
670 define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) {
671 ; CHECK-LABEL: test_vld4q_dup_s16
672 ; CHECK: ld4r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
674 %0 = bitcast i16* %a to i8*
675 %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
676 %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
677 %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
678 %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
679 %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
680 %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
681 %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
682 %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 3
683 %lane3 = shufflevector <8 x i16> %4, <8 x i16> undef, <8 x i32> zeroinitializer
684 %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %lane, 0, 0
685 %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
686 %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
687 %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %lane3, 0, 3
688 ret %struct.int16x8x4_t %.fca.0.3.insert
; Codegen test: calls vld4lane.v4i32 on undef vectors with lane operand 0, then
; splats each of the four results with a zero-mask shufflevector. The FileCheck
; pattern below expects one ld4r on four .4s registers addressed by [x0].
691 define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) {
692 ; CHECK-LABEL: test_vld4q_dup_s32
693 ; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
695 %0 = bitcast i32* %a to i8*
696 %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
697 %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
698 %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
699 %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
700 %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
701 %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
702 %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
703 %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 3
704 %lane3 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer
705 %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %lane, 0, 0
706 %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
707 %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
708 %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %lane3, 0, 3
709 ret %struct.int32x4x4_t %.fca.0.3.insert
; Codegen test: calls vld4lane.v2i64 on undef vectors with lane operand 0, then
; splats each of the four results with a zero-mask shufflevector. The FileCheck
; pattern below expects one ld4r on four .2d registers addressed by [x0].
712 define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) {
713 ; CHECK-LABEL: test_vld4q_dup_s64
714 ; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
716 %0 = bitcast i64* %a to i8*
717 %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
718 %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
719 %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
720 %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
721 %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
722 %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
723 %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
724 %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 3
725 %lane3 = shufflevector <2 x i64> %4, <2 x i64> undef, <2 x i32> zeroinitializer
726 %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %lane, 0, 0
727 %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
728 %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
729 %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %lane3, 0, 3
730 ret %struct.int64x2x4_t %.fca.0.3.insert
; Codegen test: calls vld4lane.v4f32 on undef vectors with lane operand 0, then
; splats each of the four results with a zero-mask shufflevector. The FileCheck
; pattern below expects one ld4r on four .4s registers addressed by [x0].
733 define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) {
734 ; CHECK-LABEL: test_vld4q_dup_f32
735 ; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
737 %0 = bitcast float* %a to i8*
738 %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
739 %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
740 %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
741 %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
742 %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
743 %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
744 %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
745 %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 3
746 %lane3 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> zeroinitializer
747 %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %lane, 0, 0
748 %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
749 %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
750 %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %lane3, 0, 3
751 ret %struct.float32x4x4_t %.fca.0.3.insert
; Codegen test: calls vld4lane.v2f64 on undef vectors with lane operand 0, then
; splats each of the four results with a zero-mask shufflevector. The FileCheck
; pattern below expects one ld4r on four .2d registers addressed by [x0].
754 define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) {
755 ; CHECK-LABEL: test_vld4q_dup_f64
756 ; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
758 %0 = bitcast double* %a to i8*
759 %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
760 %1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
761 %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
762 %2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
763 %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
764 %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
765 %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
766 %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 3
767 %lane3 = shufflevector <2 x double> %4, <2 x double> undef, <2 x i32> zeroinitializer
768 %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %lane, 0, 0
769 %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
770 %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
771 %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %lane3, 0, 3
772 ret %struct.float64x2x4_t %.fca.0.3.insert
; Codegen test: calls vld4lane.v8i8 on undef vectors with lane operand 0, then
; splats each of the four results with a zero-mask shufflevector. The FileCheck
; pattern below expects one ld4r on four .8b registers addressed by [x0].
; %a is already i8*, so no bitcast precedes the call.
775 define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) {
776 ; CHECK-LABEL: test_vld4_dup_s8
777 ; CHECK: ld4r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
779 %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
780 %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
781 %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
782 %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
783 %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
784 %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
785 %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
786 %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 3
787 %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer
788 %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %lane, 0, 0
789 %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
790 %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
791 %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %lane3, 0, 3
792 ret %struct.int8x8x4_t %.fca.0.3.insert
; Codegen test: calls vld4lane.v4i16 on undef vectors with lane operand 0, then
; splats each of the four results with a zero-mask shufflevector. The FileCheck
; pattern below expects one ld4r on four .4h registers addressed by [x0].
795 define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) {
796 ; CHECK-LABEL: test_vld4_dup_s16
797 ; CHECK: ld4r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
799 %0 = bitcast i16* %a to i8*
800 %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
801 %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
802 %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
803 %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
804 %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
805 %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
806 %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
807 %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 3
808 %lane3 = shufflevector <4 x i16> %4, <4 x i16> undef, <4 x i32> zeroinitializer
809 %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %lane, 0, 0
810 %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
811 %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
812 %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %lane3, 0, 3
813 ret %struct.int16x4x4_t %.fca.0.3.insert
; Codegen test: calls vld4lane.v2i32 on undef vectors with lane operand 0, then
; splats each of the four results with a zero-mask shufflevector. The FileCheck
; pattern below expects one ld4r on four .2s registers addressed by [x0].
816 define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) {
817 ; CHECK-LABEL: test_vld4_dup_s32
818 ; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
820 %0 = bitcast i32* %a to i8*
821 %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
822 %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
823 %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
824 %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
825 %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
826 %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
827 %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
828 %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 3
829 %lane3 = shufflevector <2 x i32> %4, <2 x i32> undef, <2 x i32> zeroinitializer
830 %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %lane, 0, 0
831 %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
832 %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
833 %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %lane3, 0, 3
834 ret %struct.int32x2x4_t %.fca.0.3.insert
; Codegen test: for <1 x i64> the IR calls the plain vld4.v1i64 intrinsic (no
; lane operand, no shufflevector) and repacks the four results. The FileCheck
; pattern below expects an ld1 on four .1d registers, not an ld4r.
837 define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) {
838 ; CHECK-LABEL: test_vld4_dup_s64
839 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
841 %0 = bitcast i64* %a to i8*
842 %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %0, i32 8)
843 %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
844 %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
845 %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
846 %vld_dup.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 3
847 %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
848 %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
849 %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
850 %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld_dup.fca.3.extract, 0, 3
851 ret %struct.int64x1x4_t %.fca.0.3.insert
; Codegen test: calls vld4lane.v2f32 on undef vectors with lane operand 0, then
; splats each of the four results with a zero-mask shufflevector. The FileCheck
; pattern below expects one ld4r on four .2s registers addressed by [x0].
854 define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) {
855 ; CHECK-LABEL: test_vld4_dup_f32
856 ; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
858 %0 = bitcast float* %a to i8*
859 %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
860 %1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
861 %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
862 %2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
863 %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
864 %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
865 %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
866 %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 3
867 %lane3 = shufflevector <2 x float> %4, <2 x float> undef, <2 x i32> zeroinitializer
868 %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %lane, 0, 0
869 %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
870 %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
871 %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %lane3, 0, 3
872 ret %struct.float32x2x4_t %.fca.0.3.insert
; Codegen test: for <1 x double> the IR calls the plain vld4.v1f64 intrinsic
; (no lane operand, no shufflevector) and repacks the four results. The
; FileCheck pattern below expects an ld1 on four .1d registers, not an ld4r.
875 define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) {
876 ; CHECK-LABEL: test_vld4_dup_f64
877 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
879 %0 = bitcast double* %a to i8*
880 %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %0, i32 8)
881 %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
882 %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
883 %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
884 %vld_dup.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 3
885 %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
886 %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
887 %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
888 %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld_dup.fca.3.extract, 0, 3
889 ret %struct.float64x1x4_t %.fca.0.3.insert
; Codegen test: a scalar i8 load from %a inserted into lane 15 of %b. The
; FileCheck pattern below expects a single-lane ld1 on a .b register from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 15.
892 define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) {
893 ; CHECK-LABEL: test_vld1q_lane_s8
894 ; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
896 %0 = load i8* %a, align 1
897 %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15
898 ret <16 x i8> %vld1_lane
; Codegen test: a scalar i16 load from %a inserted into lane 7 of %b. The
; FileCheck pattern below expects a single-lane ld1 on a .h register from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 7.
901 define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) {
902 ; CHECK-LABEL: test_vld1q_lane_s16
903 ; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
905 %0 = load i16* %a, align 2
906 %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7
907 ret <8 x i16> %vld1_lane
; Codegen test: a scalar i32 load from %a inserted into lane 3 of %b. The
; FileCheck pattern below expects a single-lane ld1 on a .s register from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 3.
910 define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) {
911 ; CHECK-LABEL: test_vld1q_lane_s32
912 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
914 %0 = load i32* %a, align 4
915 %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3
916 ret <4 x i32> %vld1_lane
; Codegen test: a scalar i64 load from %a inserted into lane 1 of %b. The
; FileCheck pattern below expects a single-lane ld1 on a .d register from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 1.
919 define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) {
920 ; CHECK-LABEL: test_vld1q_lane_s64
921 ; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
923 %0 = load i64* %a, align 8
924 %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1
925 ret <2 x i64> %vld1_lane
; Codegen test: a scalar float load from %a inserted into lane 3 of %b. The
; FileCheck pattern below expects a single-lane ld1 on a .s register from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 3.
928 define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) {
929 ; CHECK-LABEL: test_vld1q_lane_f32
930 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
932 %0 = load float* %a, align 4
933 %vld1_lane = insertelement <4 x float> %b, float %0, i32 3
934 ret <4 x float> %vld1_lane
; Codegen test: a scalar double load from %a inserted into lane 1 of %b. The
; FileCheck pattern below expects a single-lane ld1 on a .d register from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 1.
937 define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) {
938 ; CHECK-LABEL: test_vld1q_lane_f64
939 ; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
941 %0 = load double* %a, align 8
942 %vld1_lane = insertelement <2 x double> %b, double %0, i32 1
943 ret <2 x double> %vld1_lane
; Codegen test: a scalar i8 load from %a inserted into lane 7 of %b. The
; FileCheck pattern below expects a single-lane ld1 on a .b register from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 7.
946 define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) {
947 ; CHECK-LABEL: test_vld1_lane_s8
948 ; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
950 %0 = load i8* %a, align 1
951 %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7
952 ret <8 x i8> %vld1_lane
; Codegen test: a scalar i16 load from %a inserted into lane 3 of %b. The
; FileCheck pattern below expects a single-lane ld1 on a .h register from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 3.
955 define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) {
956 ; CHECK-LABEL: test_vld1_lane_s16
957 ; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
959 %0 = load i16* %a, align 2
960 %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3
961 ret <4 x i16> %vld1_lane
; Codegen test: a scalar i32 load from %a inserted into lane 1 of %b. The
; FileCheck pattern below expects a single-lane ld1 on a .s register from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 1.
964 define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) {
965 ; CHECK-LABEL: test_vld1_lane_s32
966 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
968 %0 = load i32* %a, align 4
969 %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1
970 ret <2 x i32> %vld1_lane
; Codegen test: a scalar i64 load from %a inserted into lane 0 of an undef
; <1 x i64>; the %b argument is not used by the body. The FileCheck pattern
; below expects ld1r on a .1d register rather than a single-lane ld1.
973 define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) {
974 ; CHECK-LABEL: test_vld1_lane_s64
975 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
977 %0 = load i64* %a, align 8
978 %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0
979 ret <1 x i64> %vld1_lane
; Codegen test: a scalar float load from %a inserted into lane 1 of %b. The
; FileCheck pattern below expects a single-lane ld1 on a .s register from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 1.
982 define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) {
983 ; CHECK-LABEL: test_vld1_lane_f32
984 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
986 %0 = load float* %a, align 4
987 %vld1_lane = insertelement <2 x float> %b, float %0, i32 1
988 ret <2 x float> %vld1_lane
; Codegen test: a scalar double load from %a inserted into lane 0 of an undef
; <1 x double>; the %b argument is not used by the body. The FileCheck pattern
; below expects ld1r on a .1d register rather than a single-lane ld1.
991 define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) {
992 ; CHECK-LABEL: test_vld1_lane_f64
993 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
995 %0 = load double* %a, align 8
996 %vld1_lane = insertelement <1 x double> undef, double %0, i32 0
997 ret <1 x double> %vld1_lane
; Codegen test: unpacks the two <8 x i16> vectors of %b.coerce, passes them to
; vld2lane.v8i16 with lane operand 7, and repacks both results into the
; returned struct. The FileCheck pattern below expects a single-lane ld2 on two
; .h registers from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 7.
1000 define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
1001 ; CHECK-LABEL: test_vld2q_lane_s16
1002 ; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1004 %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
1005 %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
1006 %0 = bitcast i16* %a to i8*
1007 %vld2_lane = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
1008 %vld2_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 0
1009 %vld2_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 1
1010 %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2_lane.fca.0.extract, 0, 0
1011 %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2_lane.fca.1.extract, 0, 1
1012 ret %struct.int16x8x2_t %.fca.0.1.insert
; Codegen test: unpacks the two <4 x i32> vectors of %b.coerce, passes them to
; vld2lane.v4i32 with lane operand 3, and repacks both results into the
; returned struct. The FileCheck pattern below expects a single-lane ld2 on two
; .s registers from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 3.
1015 define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
1016 ; CHECK-LABEL: test_vld2q_lane_s32
1017 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1019 %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
1020 %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
1021 %0 = bitcast i32* %a to i8*
1022 %vld2_lane = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
1023 %vld2_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 0
1024 %vld2_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 1
1025 %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2_lane.fca.0.extract, 0, 0
1026 %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2_lane.fca.1.extract, 0, 1
1027 ret %struct.int32x4x2_t %.fca.0.1.insert
; Codegen test: unpacks the two <2 x i64> vectors of %b.coerce, passes them to
; vld2lane.v2i64 with lane operand 1, and repacks both results into the
; returned struct. The FileCheck pattern below expects a single-lane ld2 on two
; .d registers from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 1.
1030 define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
1031 ; CHECK-LABEL: test_vld2q_lane_s64
1032 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1034 %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
1035 %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
1036 %0 = bitcast i64* %a to i8*
1037 %vld2_lane = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
1038 %vld2_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 0
1039 %vld2_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 1
1040 %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2_lane.fca.0.extract, 0, 0
1041 %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2_lane.fca.1.extract, 0, 1
1042 ret %struct.int64x2x2_t %.fca.0.1.insert
; Codegen test: unpacks the two <4 x float> vectors of %b.coerce, passes them
; to vld2lane.v4f32 with lane operand 3, and repacks both results into the
; returned struct. The FileCheck pattern below expects a single-lane ld2 on two
; .s registers from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 3.
1045 define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
1046 ; CHECK-LABEL: test_vld2q_lane_f32
1047 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1049 %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
1050 %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
1051 %0 = bitcast float* %a to i8*
1052 %vld2_lane = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
1053 %vld2_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 0
1054 %vld2_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 1
1055 %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2_lane.fca.0.extract, 0, 0
1056 %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2_lane.fca.1.extract, 0, 1
1057 ret %struct.float32x4x2_t %.fca.0.1.insert
; Codegen test: unpacks the two <2 x double> vectors of %b.coerce, passes them
; to vld2lane.v2f64 with lane operand 1, and repacks both results into the
; returned struct. The FileCheck pattern below expects a single-lane ld2 on two
; .d registers from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 1.
1060 define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
1061 ; CHECK-LABEL: test_vld2q_lane_f64
1062 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1064 %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
1065 %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
1066 %0 = bitcast double* %a to i8*
1067 %vld2_lane = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
1068 %vld2_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 0
1069 %vld2_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 1
1070 %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2_lane.fca.0.extract, 0, 0
1071 %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2_lane.fca.1.extract, 0, 1
1072 ret %struct.float64x2x2_t %.fca.0.1.insert
; Codegen test: unpacks the two <8 x i8> vectors of %b.coerce, passes them to
; vld2lane.v8i8 with lane operand 7, and repacks both results into the returned
; struct. The FileCheck pattern below expects a single-lane ld2 on two .b
; registers from [x0]. %a is already i8*, so no bitcast precedes the call.
; NOTE(review): the pattern accepts any lane index, not specifically 7.
1075 define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
1076 ; CHECK-LABEL: test_vld2_lane_s8
1077 ; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1079 %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
1080 %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
1081 %vld2_lane = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
1082 %vld2_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 0
1083 %vld2_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 1
1084 %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2_lane.fca.0.extract, 0, 0
1085 %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2_lane.fca.1.extract, 0, 1
1086 ret %struct.int8x8x2_t %.fca.0.1.insert
; Codegen test: unpacks the two <4 x i16> vectors of %b.coerce, passes them to
; vld2lane.v4i16 with lane operand 3, and repacks both results into the
; returned struct. The FileCheck pattern below expects a single-lane ld2 on two
; .h registers from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 3.
1089 define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
1090 ; CHECK-LABEL: test_vld2_lane_s16
1091 ; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1093 %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
1094 %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
1095 %0 = bitcast i16* %a to i8*
1096 %vld2_lane = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
1097 %vld2_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 0
1098 %vld2_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 1
1099 %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2_lane.fca.0.extract, 0, 0
1100 %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2_lane.fca.1.extract, 0, 1
1101 ret %struct.int16x4x2_t %.fca.0.1.insert
; Codegen test: unpacks the two <2 x i32> vectors of %b.coerce, passes them to
; vld2lane.v2i32 with lane operand 1, and repacks both results into the
; returned struct. The FileCheck pattern below expects a single-lane ld2 on two
; .s registers from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 1.
1104 define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
1105 ; CHECK-LABEL: test_vld2_lane_s32
1106 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1108 %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
1109 %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
1110 %0 = bitcast i32* %a to i8*
1111 %vld2_lane = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
1112 %vld2_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 0
1113 %vld2_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 1
1114 %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2_lane.fca.0.extract, 0, 0
1115 %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2_lane.fca.1.extract, 0, 1
1116 ret %struct.int32x2x2_t %.fca.0.1.insert
; Codegen test: unpacks the two <1 x i64> vectors of %b.coerce, passes them to
; vld2lane.v1i64 with lane operand 0, and repacks both results into the
; returned struct. The FileCheck pattern below expects a single-lane ld2 on two
; .d registers from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 0.
1119 define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
1120 ; CHECK-LABEL: test_vld2_lane_s64
1121 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1123 %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
1124 %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
1125 %0 = bitcast i64* %a to i8*
1126 %vld2_lane = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
1127 %vld2_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 0
1128 %vld2_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 1
1129 %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2_lane.fca.0.extract, 0, 0
1130 %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2_lane.fca.1.extract, 0, 1
1131 ret %struct.int64x1x2_t %.fca.0.1.insert
; Codegen test: unpacks the two <2 x float> vectors of %b.coerce, passes them
; to vld2lane.v2f32 with lane operand 1, and repacks both results into the
; returned struct. The FileCheck pattern below expects a single-lane ld2 on two
; .s registers from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 1.
1134 define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
1135 ; CHECK-LABEL: test_vld2_lane_f32
1136 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1138 %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
1139 %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
1140 %0 = bitcast float* %a to i8*
1141 %vld2_lane = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
1142 %vld2_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 0
1143 %vld2_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 1
1144 %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2_lane.fca.0.extract, 0, 0
1145 %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2_lane.fca.1.extract, 0, 1
1146 ret %struct.float32x2x2_t %.fca.0.1.insert
; Codegen test: unpacks the two <1 x double> vectors of %b.coerce, passes them
; to vld2lane.v1f64 with lane operand 0, and repacks both results into the
; returned struct. The FileCheck pattern below expects a single-lane ld2 on two
; .d registers from [x0].
; NOTE(review): the pattern accepts any lane index, not specifically 0.
1149 define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
1150 ; CHECK-LABEL: test_vld2_lane_f64
1151 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1153 %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
1154 %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
1155 %0 = bitcast double* %a to i8*
1156 %vld2_lane = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
1157 %vld2_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 0
1158 %vld2_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 1
1159 %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2_lane.fca.0.extract, 0, 0
1160 %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2_lane.fca.1.extract, 0, 1
1161 ret %struct.float64x1x2_t %.fca.0.1.insert
; test_vld3q_lane_s16: splits %b.coerce into its three <8 x i16> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld3lane.v8i16, and repacks the three returned vectors into
; %struct.int16x8x3_t.
; The FileCheck pattern below expects one `ld3` with `.h` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 7, i32 2` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1164 define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
1165 ; CHECK-LABEL: test_vld3q_lane_s16
1166 ; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1168 %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
1169 %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
1170 %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
1171 %0 = bitcast i16* %a to i8*
1172 %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
1173 %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0
1174 %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1
1175 %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2
1176 %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0
1177 %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1
1178 %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2
1179 ret %struct.int16x8x3_t %.fca.0.2.insert
; test_vld3q_lane_s32: splits %b.coerce into its three <4 x i32> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld3lane.v4i32, and repacks the three returned vectors into
; %struct.int32x4x3_t.
; The FileCheck pattern below expects one `ld3` with `.s` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 3, i32 4` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1182 define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
1183 ; CHECK-LABEL: test_vld3q_lane_s32
1184 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1186 %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
1187 %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
1188 %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
1189 %0 = bitcast i32* %a to i8*
1190 %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
1191 %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0
1192 %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1
1193 %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2
1194 %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0
1195 %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1
1196 %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2
1197 ret %struct.int32x4x3_t %.fca.0.2.insert
; test_vld3q_lane_s64: splits %b.coerce into its three <2 x i64> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld3lane.v2i64, and repacks the three returned vectors into
; %struct.int64x2x3_t.
; The FileCheck pattern below expects one `ld3` with `.d` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 1, i32 8` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1200 define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
1201 ; CHECK-LABEL: test_vld3q_lane_s64
1202 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1204 %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
1205 %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
1206 %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
1207 %0 = bitcast i64* %a to i8*
1208 %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
1209 %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0
1210 %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1
1211 %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2
1212 %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0
1213 %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1
1214 %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2
1215 ret %struct.int64x2x3_t %.fca.0.2.insert
; test_vld3q_lane_f32: splits %b.coerce into its three <4 x float> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld3lane.v4f32, and repacks the three returned vectors into
; %struct.float32x4x3_t.
; The FileCheck pattern below expects one `ld3` with `.s` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 3, i32 4` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1218 define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
1219 ; CHECK-LABEL: test_vld3q_lane_f32
1220 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1222 %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
1223 %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
1224 %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
1225 %0 = bitcast float* %a to i8*
1226 %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
1227 %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0
1228 %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1
1229 %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2
1230 %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0
1231 %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1
1232 %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2
1233 ret %struct.float32x4x3_t %.fca.0.2.insert
; test_vld3q_lane_f64: splits %b.coerce into its three <2 x double> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld3lane.v2f64, and repacks the three returned vectors into
; %struct.float64x2x3_t.
; The FileCheck pattern below expects one `ld3` with `.d` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 1, i32 8` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1236 define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
1237 ; CHECK-LABEL: test_vld3q_lane_f64
1238 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1240 %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
1241 %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
1242 %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
1243 %0 = bitcast double* %a to i8*
1244 %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
1245 %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0
1246 %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1
1247 %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2
1248 %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0
1249 %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1
1250 %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2
1251 ret %struct.float64x2x3_t %.fca.0.2.insert
; test_vld3_lane_s8: splits %b.coerce into its three <8 x i8> vectors, passes
; them together with %a (already an i8*, so no cast is needed) to
; @llvm.arm.neon.vld3lane.v8i8, and repacks the three returned vectors into
; %struct.int8x8x3_t.
; The FileCheck pattern below expects one `ld3` with `.b` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 7, i32 1` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1254 define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
1255 ; CHECK-LABEL: test_vld3_lane_s8
1256 ; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1258 %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
1259 %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
1260 %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
1261 %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
1262 %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0
1263 %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1
1264 %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2
1265 %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0
1266 %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1
1267 %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2
1268 ret %struct.int8x8x3_t %.fca.0.2.insert
; test_vld3_lane_s16: splits %b.coerce into its three <4 x i16> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld3lane.v4i16, and repacks the three returned vectors into
; %struct.int16x4x3_t.
; The FileCheck pattern below expects one `ld3` with `.h` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 3, i32 2` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1271 define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
1272 ; CHECK-LABEL: test_vld3_lane_s16
1273 ; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1275 %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
1276 %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
1277 %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
1278 %0 = bitcast i16* %a to i8*
1279 %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
1280 %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0
1281 %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1
1282 %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2
1283 %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0
1284 %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1
1285 %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2
1286 ret %struct.int16x4x3_t %.fca.0.2.insert
; test_vld3_lane_s32: splits %b.coerce into its three <2 x i32> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld3lane.v2i32, and repacks the three returned vectors into
; %struct.int32x2x3_t.
; The FileCheck pattern below expects one `ld3` with `.s` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 1, i32 4` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1289 define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
1290 ; CHECK-LABEL: test_vld3_lane_s32
1291 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1293 %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
1294 %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
1295 %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
1296 %0 = bitcast i32* %a to i8*
1297 %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
1298 %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0
1299 %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1
1300 %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2
1301 %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0
1302 %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1
1303 %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2
1304 ret %struct.int32x2x3_t %.fca.0.2.insert
; test_vld3_lane_s64: splits %b.coerce into its three <1 x i64> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld3lane.v1i64, and repacks the three returned vectors into
; %struct.int64x1x3_t.
; The FileCheck pattern below expects one `ld3` with `.d` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 0, i32 8` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1307 define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
1308 ; CHECK-LABEL: test_vld3_lane_s64
1309 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1311 %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
1312 %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
1313 %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
1314 %0 = bitcast i64* %a to i8*
1315 %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
1316 %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0
1317 %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1
1318 %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2
1319 %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0
1320 %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1
1321 %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2
1322 ret %struct.int64x1x3_t %.fca.0.2.insert
; test_vld3_lane_f32: splits %b.coerce into its three <2 x float> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld3lane.v2f32, and repacks the three returned vectors into
; %struct.float32x2x3_t.
; The FileCheck pattern below expects one `ld3` with `.s` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 1, i32 4` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1325 define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
1326 ; CHECK-LABEL: test_vld3_lane_f32
1327 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1329 %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
1330 %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
1331 %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
1332 %0 = bitcast float* %a to i8*
1333 %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
1334 %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
1335 %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
1336 %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
1337 %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
1338 %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
1339 %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
1340 ret %struct.float32x2x3_t %.fca.0.2.insert
; test_vld3_lane_f64: splits %b.coerce into its three <1 x double> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld3lane.v1f64, and repacks the three returned vectors into
; %struct.float64x1x3_t.
; The FileCheck pattern below expects one `ld3` with `.d` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 0, i32 8` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1343 define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
1344 ; CHECK-LABEL: test_vld3_lane_f64
1345 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1347 %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
1348 %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
1349 %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
1350 %0 = bitcast double* %a to i8*
1351 %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
1352 %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
1353 %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
1354 %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
1355 %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
1356 %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
1357 %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
1358 ret %struct.float64x1x3_t %.fca.0.2.insert
; test_vld4q_lane_s8: splits %b.coerce into its four <16 x i8> vectors, passes
; them together with %a (already an i8*, so no cast is needed) to
; @llvm.arm.neon.vld4lane.v16i8, and repacks the four returned vectors into
; %struct.int8x16x4_t. The call result is named %vld4_lane to match the
; intrinsic it comes from.
; The FileCheck pattern below expects one `ld4` with `.b` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 15, i32 1` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1361 define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
1362 ; CHECK-LABEL: test_vld4q_lane_s8
1363 ; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1365 %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
1366 %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
1367 %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
1368 %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
1369 %vld4_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1)
1370 %vld4_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 0
1371 %vld4_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 1
1372 %vld4_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 2
1373 %vld4_lane.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 3
1374 %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4_lane.fca.0.extract, 0, 0
1375 %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4_lane.fca.1.extract, 0, 1
1376 %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4_lane.fca.2.extract, 0, 2
1377 %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4_lane.fca.3.extract, 0, 3
1378 ret %struct.int8x16x4_t %.fca.0.3.insert
; test_vld4q_lane_s16: splits %b.coerce into its four <8 x i16> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld4lane.v8i16, and repacks the four returned vectors into
; %struct.int16x8x4_t. The call result is named %vld4_lane to match the
; intrinsic it comes from.
; The FileCheck pattern below expects one `ld4` with `.h` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 7, i32 2` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1381 define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
1382 ; CHECK-LABEL: test_vld4q_lane_s16
1383 ; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1385 %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
1386 %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
1387 %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
1388 %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
1389 %0 = bitcast i16* %a to i8*
1390 %vld4_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
1391 %vld4_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 0
1392 %vld4_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 1
1393 %vld4_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 2
1394 %vld4_lane.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 3
1395 %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4_lane.fca.0.extract, 0, 0
1396 %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4_lane.fca.1.extract, 0, 1
1397 %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4_lane.fca.2.extract, 0, 2
1398 %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4_lane.fca.3.extract, 0, 3
1399 ret %struct.int16x8x4_t %.fca.0.3.insert
; test_vld4q_lane_s32: splits %b.coerce into its four <4 x i32> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld4lane.v4i32, and repacks the four returned vectors into
; %struct.int32x4x4_t. The call result is named %vld4_lane to match the
; intrinsic it comes from.
; The FileCheck pattern below expects one `ld4` with `.s` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 3, i32 4` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1402 define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
1403 ; CHECK-LABEL: test_vld4q_lane_s32
1404 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1406 %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
1407 %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
1408 %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
1409 %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
1410 %0 = bitcast i32* %a to i8*
1411 %vld4_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
1412 %vld4_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 0
1413 %vld4_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 1
1414 %vld4_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 2
1415 %vld4_lane.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 3
1416 %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4_lane.fca.0.extract, 0, 0
1417 %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4_lane.fca.1.extract, 0, 1
1418 %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4_lane.fca.2.extract, 0, 2
1419 %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4_lane.fca.3.extract, 0, 3
1420 ret %struct.int32x4x4_t %.fca.0.3.insert
; test_vld4q_lane_s64: splits %b.coerce into its four <2 x i64> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld4lane.v2i64, and repacks the four returned vectors into
; %struct.int64x2x4_t. The call result is named %vld4_lane to match the
; intrinsic it comes from.
; The FileCheck pattern below expects one `ld4` with `.d` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 1, i32 8` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1423 define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
1424 ; CHECK-LABEL: test_vld4q_lane_s64
1425 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1427 %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
1428 %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
1429 %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
1430 %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
1431 %0 = bitcast i64* %a to i8*
1432 %vld4_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
1433 %vld4_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 0
1434 %vld4_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 1
1435 %vld4_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 2
1436 %vld4_lane.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 3
1437 %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4_lane.fca.0.extract, 0, 0
1438 %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4_lane.fca.1.extract, 0, 1
1439 %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4_lane.fca.2.extract, 0, 2
1440 %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4_lane.fca.3.extract, 0, 3
1441 ret %struct.int64x2x4_t %.fca.0.3.insert
; test_vld4q_lane_f32: splits %b.coerce into its four <4 x float> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld4lane.v4f32, and repacks the four returned vectors into
; %struct.float32x4x4_t. The call result is named %vld4_lane to match the
; intrinsic it comes from.
; The FileCheck pattern below expects one `ld4` with `.s` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 3, i32 4` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1444 define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
1445 ; CHECK-LABEL: test_vld4q_lane_f32
1446 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1448 %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
1449 %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
1450 %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
1451 %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
1452 %0 = bitcast float* %a to i8*
1453 %vld4_lane = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
1454 %vld4_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 0
1455 %vld4_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 1
1456 %vld4_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 2
1457 %vld4_lane.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 3
1458 %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4_lane.fca.0.extract, 0, 0
1459 %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4_lane.fca.1.extract, 0, 1
1460 %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4_lane.fca.2.extract, 0, 2
1461 %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4_lane.fca.3.extract, 0, 3
1462 ret %struct.float32x4x4_t %.fca.0.3.insert
; test_vld4q_lane_f64: splits %b.coerce into its four <2 x double> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld4lane.v2f64, and repacks the four returned vectors into
; %struct.float64x2x4_t. The call result is named %vld4_lane to match the
; intrinsic it comes from.
; The FileCheck pattern below expects one `ld4` with `.d` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 1, i32 8` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1465 define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
1466 ; CHECK-LABEL: test_vld4q_lane_f64
1467 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1469 %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
1470 %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
1471 %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
1472 %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
1473 %0 = bitcast double* %a to i8*
1474 %vld4_lane = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
1475 %vld4_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 0
1476 %vld4_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 1
1477 %vld4_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 2
1478 %vld4_lane.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 3
1479 %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4_lane.fca.0.extract, 0, 0
1480 %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4_lane.fca.1.extract, 0, 1
1481 %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4_lane.fca.2.extract, 0, 2
1482 %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4_lane.fca.3.extract, 0, 3
1483 ret %struct.float64x2x4_t %.fca.0.3.insert
; test_vld4_lane_s8: splits %b.coerce into its four <8 x i8> vectors, passes
; them together with %a (already an i8*, so no cast is needed) to
; @llvm.arm.neon.vld4lane.v8i8, and repacks the four returned vectors into
; %struct.int8x8x4_t. The call result is named %vld4_lane to match the
; intrinsic it comes from.
; The FileCheck pattern below expects one `ld4` with `.b` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 7, i32 1` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1486 define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
1487 ; CHECK-LABEL: test_vld4_lane_s8
1488 ; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1490 %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
1491 %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
1492 %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
1493 %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
1494 %vld4_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
1495 %vld4_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 0
1496 %vld4_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 1
1497 %vld4_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 2
1498 %vld4_lane.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 3
1499 %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4_lane.fca.0.extract, 0, 0
1500 %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4_lane.fca.1.extract, 0, 1
1501 %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4_lane.fca.2.extract, 0, 2
1502 %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4_lane.fca.3.extract, 0, 3
1503 ret %struct.int8x8x4_t %.fca.0.3.insert
; test_vld4_lane_s16: splits %b.coerce into its four <4 x i16> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld4lane.v4i16, and repacks the four returned vectors into
; %struct.int16x4x4_t. The call result is named %vld4_lane to match the
; intrinsic it comes from.
; The FileCheck pattern below expects one `ld4` with `.h` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 3, i32 2` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1506 define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
1507 ; CHECK-LABEL: test_vld4_lane_s16
1508 ; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1510 %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
1511 %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
1512 %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
1513 %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
1514 %0 = bitcast i16* %a to i8*
1515 %vld4_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
1516 %vld4_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 0
1517 %vld4_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 1
1518 %vld4_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 2
1519 %vld4_lane.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 3
1520 %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld4_lane.fca.0.extract, 0, 0
1521 %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld4_lane.fca.1.extract, 0, 1
1522 %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld4_lane.fca.2.extract, 0, 2
1523 %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld4_lane.fca.3.extract, 0, 3
1524 ret %struct.int16x4x4_t %.fca.0.3.insert
; test_vld4_lane_s32: splits %b.coerce into its four <2 x i32> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld4lane.v2i32, and repacks the four returned vectors into
; %struct.int32x2x4_t. The call result is named %vld4_lane to match the
; intrinsic it comes from.
; The FileCheck pattern below expects one `ld4` with `.s` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 1, i32 4` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1527 define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
1528 ; CHECK-LABEL: test_vld4_lane_s32
1529 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1531 %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
1532 %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
1533 %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
1534 %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
1535 %0 = bitcast i32* %a to i8*
1536 %vld4_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
1537 %vld4_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 0
1538 %vld4_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 1
1539 %vld4_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 2
1540 %vld4_lane.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 3
1541 %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld4_lane.fca.0.extract, 0, 0
1542 %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld4_lane.fca.1.extract, 0, 1
1543 %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld4_lane.fca.2.extract, 0, 2
1544 %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld4_lane.fca.3.extract, 0, 3
1545 ret %struct.int32x2x4_t %.fca.0.3.insert
; test_vld4_lane_s64: splits %b.coerce into its four <1 x i64> vectors,
; passes them together with %a (cast to i8*) to
; @llvm.arm.neon.vld4lane.v1i64, and repacks the four returned vectors into
; %struct.int64x1x4_t. The call result is named %vld4_lane to match the
; intrinsic it comes from.
; The FileCheck pattern below expects one `ld4` with `.d` lane operands
; addressed through x0.
; NOTE(review): the trailing `i32 0, i32 8` operands are presumably the lane
; index and the alignment -- confirm against the intrinsic's definition.
1548 define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
1549 ; CHECK-LABEL: test_vld4_lane_s64
1550 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1552 %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
1553 %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
1554 %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
1555 %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
1556 %0 = bitcast i64* %a to i8*
1557 %vld4_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
1558 %vld4_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 0
1559 %vld4_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 1
1560 %vld4_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 2
1561 %vld4_lane.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 3
1562 %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld4_lane.fca.0.extract, 0, 0
1563 %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld4_lane.fca.1.extract, 0, 1
1564 %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld4_lane.fca.2.extract, 0, 2
1565 %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld4_lane.fca.3.extract, 0, 3
1566 ret %struct.int64x1x4_t %.fca.0.3.insert
; test_vld4_lane_f32: unpacks the four <2 x float> members of %b.coerce, passes
; them together with %a (bitcast to i8*) to @llvm.arm.neon.vld4lane.v2f32
; (trailing i32 operands 1 and 4), and returns the four results repacked into
; %struct.float32x2x4_t. FileCheck expects a lane-indexed ld4 of four .s
; registers with base [x0].
; NOTE(review): the call result is named %vld3_lane although the intrinsic is
; vld4lane -- naming only.
1569 define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
1570 ; CHECK-LABEL: test_vld4_lane_f32
1571 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1573 %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
1574 %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
1575 %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
1576 %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
1577 %0 = bitcast float* %a to i8*
1578 %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
1579 %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
1580 %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
1581 %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
1582 %vld3_lane.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 3
1583 %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
1584 %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
1585 %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
1586 %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld3_lane.fca.3.extract, 0, 3
1587 ret %struct.float32x2x4_t %.fca.0.3.insert
; test_vld4_lane_f64: unpacks the four <1 x double> members of %b.coerce,
; passes them together with %a (bitcast to i8*) to
; @llvm.arm.neon.vld4lane.v1f64 (trailing i32 operands 0 and 8), and returns
; the four results repacked into %struct.float64x1x4_t. FileCheck expects a
; lane-indexed ld4 of four .d registers with base [x0].
; NOTE(review): the call result is named %vld3_lane although the intrinsic is
; vld4lane -- naming only.
1590 define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
1591 ; CHECK-LABEL: test_vld4_lane_f64
1592 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1594 %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
1595 %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
1596 %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
1597 %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
1598 %0 = bitcast double* %a to i8*
1599 %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
1600 %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
1601 %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
1602 %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
1603 %vld3_lane.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 3
1604 %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
1605 %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
1606 %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
1607 %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld3_lane.fca.3.extract, 0, 3
1608 ret %struct.float64x1x4_t %.fca.0.3.insert
; test_vst1q_lane_s8: extracts element 15 of <16 x i8> %b and stores it to %a
; (align 1). FileCheck expects a lane-indexed st1 of one .b register with
; base [x0].
1611 define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) {
1612 ; CHECK-LABEL: test_vst1q_lane_s8
1613 ; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1615 %0 = extractelement <16 x i8> %b, i32 15
1616 store i8 %0, i8* %a, align 1
; test_vst1q_lane_s16: extracts element 7 of <8 x i16> %b and stores it to %a
; (align 2). FileCheck expects a lane-indexed st1 of one .h register with
; base [x0].
1620 define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) {
1621 ; CHECK-LABEL: test_vst1q_lane_s16
1622 ; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1624 %0 = extractelement <8 x i16> %b, i32 7
1625 store i16 %0, i16* %a, align 2
; test_vst1q_lane_s32: extracts element 3 of <4 x i32> %b and stores it to %a
; (align 4). FileCheck expects a lane-indexed st1 of one .s register with
; base [x0].
1629 define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
1630 ; CHECK-LABEL: test_vst1q_lane_s32
1631 ; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1633 %0 = extractelement <4 x i32> %b, i32 3
1634 store i32 %0, i32* %a, align 4
; test_vst1q_lane_s64: extracts element 1 of <2 x i64> %b and stores it to %a
; (align 8). FileCheck expects a lane-indexed st1 of one .d register with
; base [x0].
1638 define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
1639 ; CHECK-LABEL: test_vst1q_lane_s64
1640 ; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1642 %0 = extractelement <2 x i64> %b, i32 1
1643 store i64 %0, i64* %a, align 8
; test_vst1q_lane_f32: extracts element 3 of <4 x float> %b and stores it to
; %a (align 4). FileCheck expects a lane-indexed st1 of one .s register with
; base [x0].
1647 define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
1648 ; CHECK-LABEL: test_vst1q_lane_f32
1649 ; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1651 %0 = extractelement <4 x float> %b, i32 3
1652 store float %0, float* %a, align 4
; test_vst1q_lane_f64: extracts element 1 of <2 x double> %b and stores it to
; %a (align 8). FileCheck expects a lane-indexed st1 of one .d register with
; base [x0].
1656 define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
1657 ; CHECK-LABEL: test_vst1q_lane_f64
1658 ; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1660 %0 = extractelement <2 x double> %b, i32 1
1661 store double %0, double* %a, align 8
; test_vst1_lane_s8: extracts element 7 of <8 x i8> %b and stores it to %a
; (align 1). FileCheck expects a lane-indexed st1 of one .b register with
; base [x0].
1665 define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
1666 ; CHECK-LABEL: test_vst1_lane_s8
1667 ; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1669 %0 = extractelement <8 x i8> %b, i32 7
1670 store i8 %0, i8* %a, align 1
; test_vst1_lane_s16: extracts element 3 of <4 x i16> %b and stores it to %a
; (align 2). FileCheck expects a lane-indexed st1 of one .h register with
; base [x0].
1674 define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) {
1675 ; CHECK-LABEL: test_vst1_lane_s16
1676 ; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1678 %0 = extractelement <4 x i16> %b, i32 3
1679 store i16 %0, i16* %a, align 2
; test_vst1_lane_s32: extracts element 1 of <2 x i32> %b and stores it to %a
; (align 4). FileCheck expects a lane-indexed st1 of one .s register with
; base [x0].
1683 define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
1684 ; CHECK-LABEL: test_vst1_lane_s32
1685 ; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1687 %0 = extractelement <2 x i32> %b, i32 1
1688 store i32 %0, i32* %a, align 4
; test_vst1_lane_s64: extracts element 0 (the only element) of <1 x i64> %b
; and stores it to %a (align 8). FileCheck expects a lane-indexed st1 of one
; .d register with base [x0].
1692 define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
1693 ; CHECK-LABEL: test_vst1_lane_s64
1694 ; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1696 %0 = extractelement <1 x i64> %b, i32 0
1697 store i64 %0, i64* %a, align 8
; test_vst1_lane_f32: extracts element 1 of <2 x float> %b and stores it to
; %a (align 4). FileCheck expects a lane-indexed st1 of one .s register with
; base [x0].
1701 define void @test_vst1_lane_f32(float* %a, <2 x float> %b) {
1702 ; CHECK-LABEL: test_vst1_lane_f32
1703 ; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1705 %0 = extractelement <2 x float> %b, i32 1
1706 store float %0, float* %a, align 4
; test_vst1_lane_f64: extracts element 0 (the only element) of <1 x double> %b
; and stores it to %a (align 8). FileCheck expects a lane-indexed st1 of one
; .d register with base [x0].
1710 define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
1711 ; CHECK-LABEL: test_vst1_lane_f64
1712 ; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1714 %0 = extractelement <1 x double> %b, i32 0
1715 store double %0, double* %a, align 8
; test_vst2q_lane_s8: unpacks the two <16 x i8> members of %b.coerce and
; passes them with %a to @llvm.arm.neon.vst2lane.v16i8 (trailing i32 operands
; 15 and 1). FileCheck expects a lane-indexed st2 of two .b registers with
; base [x0].
1719 define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
1720 ; CHECK-LABEL: test_vst2q_lane_s8
1721 ; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1723 %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0
1724 %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
1725 tail call void @llvm.arm.neon.vst2lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 15, i32 1)
; test_vst2q_lane_s16: unpacks the two <8 x i16> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v8i16
; (trailing i32 operands 7 and 2). FileCheck expects a lane-indexed st2 of two
; .h registers with base [x0].
1729 define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
1730 ; CHECK-LABEL: test_vst2q_lane_s16
1731 ; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1733 %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
1734 %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
1735 %0 = bitcast i16* %a to i8*
1736 tail call void @llvm.arm.neon.vst2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
; test_vst2q_lane_s32: unpacks the two <4 x i32> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v4i32
; (trailing i32 operands 3 and 4). FileCheck expects a lane-indexed st2 of two
; .s registers with base [x0].
1740 define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
1741 ; CHECK-LABEL: test_vst2q_lane_s32
1742 ; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1744 %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
1745 %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
1746 %0 = bitcast i32* %a to i8*
1747 tail call void @llvm.arm.neon.vst2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
; test_vst2q_lane_s64: unpacks the two <2 x i64> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v2i64
; (trailing i32 operands 1 and 8). FileCheck expects a lane-indexed st2 of two
; .d registers with base [x0].
1751 define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
1752 ; CHECK-LABEL: test_vst2q_lane_s64
1753 ; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1755 %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
1756 %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
1757 %0 = bitcast i64* %a to i8*
1758 tail call void @llvm.arm.neon.vst2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
; test_vst2q_lane_f32: unpacks the two <4 x float> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v4f32
; (trailing i32 operands 3 and 4). FileCheck expects a lane-indexed st2 of two
; .s registers with base [x0].
1762 define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
1763 ; CHECK-LABEL: test_vst2q_lane_f32
1764 ; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1766 %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
1767 %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
1768 %0 = bitcast float* %a to i8*
1769 tail call void @llvm.arm.neon.vst2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
; test_vst2q_lane_f64: unpacks the two <2 x double> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v2f64
; (trailing i32 operands 1 and 8). FileCheck expects a lane-indexed st2 of two
; .d registers with base [x0].
1773 define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
1774 ; CHECK-LABEL: test_vst2q_lane_f64
1775 ; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1777 %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
1778 %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
1779 %0 = bitcast double* %a to i8*
1780 tail call void @llvm.arm.neon.vst2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
; test_vst2_lane_s8: unpacks the two <8 x i8> members of %b.coerce and passes
; them with %a to @llvm.arm.neon.vst2lane.v8i8 (trailing i32 operands 7 and
; 1). FileCheck expects a lane-indexed st2 of two .b registers with base [x0].
1784 define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
1785 ; CHECK-LABEL: test_vst2_lane_s8
1786 ; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1788 %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
1789 %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
1790 tail call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
; test_vst2_lane_s16: unpacks the two <4 x i16> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v4i16
; (trailing i32 operands 3 and 2). FileCheck expects a lane-indexed st2 of two
; .h registers with base [x0].
1794 define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
1795 ; CHECK-LABEL: test_vst2_lane_s16
1796 ; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1798 %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
1799 %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
1800 %0 = bitcast i16* %a to i8*
1801 tail call void @llvm.arm.neon.vst2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
; test_vst2_lane_s32: unpacks the two <2 x i32> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v2i32
; (trailing i32 operands 1 and 4). FileCheck expects a lane-indexed st2 of two
; .s registers with base [x0].
1805 define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
1806 ; CHECK-LABEL: test_vst2_lane_s32
1807 ; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1809 %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
1810 %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
1811 %0 = bitcast i32* %a to i8*
1812 tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
; test_vst2_lane_s64: unpacks the two <1 x i64> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v1i64
; (trailing i32 operands 0 and 8). FileCheck expects a lane-indexed st2 of two
; .d registers with base [x0].
1816 define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
1817 ; CHECK-LABEL: test_vst2_lane_s64
1818 ; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1820 %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
1821 %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
1822 %0 = bitcast i64* %a to i8*
1823 tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
; test_vst2_lane_f32: unpacks the two <2 x float> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v2f32
; (trailing i32 operands 1 and 4). FileCheck expects a lane-indexed st2 of two
; .s registers with base [x0].
1827 define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
1828 ; CHECK-LABEL: test_vst2_lane_f32
1829 ; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1831 %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
1832 %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
1833 %0 = bitcast float* %a to i8*
1834 tail call void @llvm.arm.neon.vst2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
; test_vst2_lane_f64: unpacks the two <1 x double> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v1f64
; (trailing i32 operands 0 and 8). FileCheck expects a lane-indexed st2 of two
; .d registers with base [x0].
1838 define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
1839 ; CHECK-LABEL: test_vst2_lane_f64
1840 ; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1842 %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
1843 %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
1844 %0 = bitcast double* %a to i8*
1845 tail call void @llvm.arm.neon.vst2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
; test_vst3q_lane_s8: unpacks the three <16 x i8> members of %b.coerce and
; passes them with %a to @llvm.arm.neon.vst3lane.v16i8 (trailing i32 operands
; 15 and 1). FileCheck expects a lane-indexed st3 of three .b registers with
; base [x0].
1849 define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
1850 ; CHECK-LABEL: test_vst3q_lane_s8
1851 ; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1853 %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0
1854 %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
1855 %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
1856 tail call void @llvm.arm.neon.vst3lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 15, i32 1)
; test_vst3q_lane_s16: unpacks the three <8 x i16> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v8i16
; (trailing i32 operands 7 and 2). FileCheck expects a lane-indexed st3 of
; three .h registers with base [x0].
1860 define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
1861 ; CHECK-LABEL: test_vst3q_lane_s16
1862 ; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1864 %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
1865 %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
1866 %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
1867 %0 = bitcast i16* %a to i8*
1868 tail call void @llvm.arm.neon.vst3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
; test_vst3q_lane_s32: unpacks the three <4 x i32> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v4i32
; (trailing i32 operands 3 and 4). FileCheck expects a lane-indexed st3 of
; three .s registers with base [x0].
1872 define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
1873 ; CHECK-LABEL: test_vst3q_lane_s32
1874 ; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1876 %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
1877 %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
1878 %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
1879 %0 = bitcast i32* %a to i8*
1880 tail call void @llvm.arm.neon.vst3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
; test_vst3q_lane_s64: unpacks the three <2 x i64> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v2i64
; (trailing i32 operands 1 and 8). FileCheck expects a lane-indexed st3 of
; three .d registers with base [x0].
1884 define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
1885 ; CHECK-LABEL: test_vst3q_lane_s64
1886 ; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1888 %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
1889 %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
1890 %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
1891 %0 = bitcast i64* %a to i8*
1892 tail call void @llvm.arm.neon.vst3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
; test_vst3q_lane_f32: unpacks the three <4 x float> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v4f32
; (trailing i32 operands 3 and 4). FileCheck expects a lane-indexed st3 of
; three .s registers with base [x0].
1896 define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
1897 ; CHECK-LABEL: test_vst3q_lane_f32
1898 ; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1900 %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
1901 %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
1902 %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
1903 %0 = bitcast float* %a to i8*
1904 tail call void @llvm.arm.neon.vst3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
; test_vst3q_lane_f64: unpacks the three <2 x double> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v2f64
; (trailing i32 operands 1 and 8). FileCheck expects a lane-indexed st3 of
; three .d registers with base [x0].
1908 define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
1909 ; CHECK-LABEL: test_vst3q_lane_f64
1910 ; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1912 %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
1913 %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
1914 %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
1915 %0 = bitcast double* %a to i8*
1916 tail call void @llvm.arm.neon.vst3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
; test_vst3_lane_s8: unpacks the three <8 x i8> members of %b.coerce and
; passes them with %a to @llvm.arm.neon.vst3lane.v8i8 (trailing i32 operands 7
; and 1). FileCheck expects a lane-indexed st3 of three .b registers with
; base [x0].
1920 define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
1921 ; CHECK-LABEL: test_vst3_lane_s8
1922 ; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1924 %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
1925 %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
1926 %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
1927 tail call void @llvm.arm.neon.vst3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
; test_vst3_lane_s16: unpacks the three <4 x i16> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v4i16
; (trailing i32 operands 3 and 2). FileCheck expects a lane-indexed st3 of
; three .h registers with base [x0].
1931 define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
1932 ; CHECK-LABEL: test_vst3_lane_s16
1933 ; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1935 %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
1936 %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
1937 %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
1938 %0 = bitcast i16* %a to i8*
1939 tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
; test_vst3_lane_s32: unpacks the three <2 x i32> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v2i32
; (trailing i32 operands 1 and 4). FileCheck expects a lane-indexed st3 of
; three .s registers with base [x0].
1943 define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
1944 ; CHECK-LABEL: test_vst3_lane_s32
1945 ; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1947 %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
1948 %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
1949 %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
1950 %0 = bitcast i32* %a to i8*
1951 tail call void @llvm.arm.neon.vst3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
; test_vst3_lane_s64: unpacks the three <1 x i64> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v1i64
; (trailing i32 operands 0 and 8). FileCheck expects a lane-indexed st3 of
; three .d registers with base [x0].
1955 define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
1956 ; CHECK-LABEL: test_vst3_lane_s64
1957 ; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1959 %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
1960 %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
1961 %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
1962 %0 = bitcast i64* %a to i8*
1963 tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
; test_vst3_lane_f32: unpacks the three <2 x float> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v2f32
; (trailing i32 operands 1 and 4). FileCheck expects a lane-indexed st3 of
; three .s registers with base [x0].
1967 define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
1968 ; CHECK-LABEL: test_vst3_lane_f32
1969 ; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1971 %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
1972 %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
1973 %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
1974 %0 = bitcast float* %a to i8*
1975 tail call void @llvm.arm.neon.vst3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
; test_vst3_lane_f64: unpacks the three <1 x double> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v1f64
; (trailing i32 operands 0 and 8). FileCheck expects a lane-indexed st3 of
; three .d registers with base [x0].
1979 define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
1980 ; CHECK-LABEL: test_vst3_lane_f64
1981 ; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1983 %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
1984 %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
1985 %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
1986 %0 = bitcast double* %a to i8*
1987 tail call void @llvm.arm.neon.vst3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
; test_vst4q_lane_s8: unpacks the four <16 x i8> members of %b.coerce and
; passes them with %a (bitcast from i16* to i8*) to
; @llvm.arm.neon.vst4lane.v16i8 (trailing i32 operands 15 and 2). FileCheck
; expects a lane-indexed st4 of four .b registers with base [x0].
; NOTE(review): this test declares %a as i16* and passes 2 as the final
; operand, whereas test_vst2q_lane_s8, test_vst3q_lane_s8 and
; test_vst4_lane_s8 take i8* %a directly and pass 1. Confirm whether the
; difference is intentional or a copy/paste slip.
1991 define void @test_vst4q_lane_s8(i16* %a, [4 x <16 x i8>] %b.coerce) {
1992 ; CHECK-LABEL: test_vst4q_lane_s8
1993 ; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1995 %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
1996 %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
1997 %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
1998 %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
1999 %0 = bitcast i16* %a to i8*
2000 tail call void @llvm.arm.neon.vst4lane.v16i8(i8* %0, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 2)
; test_vst4q_lane_s16: unpacks the four <8 x i16> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst4lane.v8i16
; (trailing i32 operands 7 and 2). FileCheck expects a lane-indexed st4 of
; four .h registers with base [x0].
2004 define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
2005 ; CHECK-LABEL: test_vst4q_lane_s16
2006 ; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
2008 %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
2009 %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
2010 %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
2011 %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
2012 %0 = bitcast i16* %a to i8*
2013 tail call void @llvm.arm.neon.vst4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
; test_vst4q_lane_s32: unpacks the four <4 x i32> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst4lane.v4i32
; (trailing i32 operands 3 and 4). FileCheck expects a lane-indexed st4 of
; four .s registers with base [x0].
2017 define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
2018 ; CHECK-LABEL: test_vst4q_lane_s32
2019 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
2021 %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
2022 %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
2023 %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
2024 %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
2025 %0 = bitcast i32* %a to i8*
2026 tail call void @llvm.arm.neon.vst4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
; test_vst4q_lane_s64: unpacks the four <2 x i64> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst4lane.v2i64
; (trailing i32 operands 1 and 8). FileCheck expects a lane-indexed st4 of
; four .d registers with base [x0].
2030 define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
2031 ; CHECK-LABEL: test_vst4q_lane_s64
2032 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2034 %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
2035 %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
2036 %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
2037 %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
2038 %0 = bitcast i64* %a to i8*
2039 tail call void @llvm.arm.neon.vst4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
; test_vst4q_lane_f32: unpacks the four <4 x float> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst4lane.v4f32
; (trailing i32 operands 3 and 4). FileCheck expects a lane-indexed st4 of
; four .s registers with base [x0].
2043 define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
2044 ; CHECK-LABEL: test_vst4q_lane_f32
2045 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
2047 %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
2048 %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
2049 %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
2050 %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
2051 %0 = bitcast float* %a to i8*
2052 tail call void @llvm.arm.neon.vst4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
; test_vst4q_lane_f64: unpacks the four <2 x double> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst4lane.v2f64
; (trailing i32 operands 1 and 8). FileCheck expects a lane-indexed st4 of
; four .d registers with base [x0].
2056 define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
2057 ; CHECK-LABEL: test_vst4q_lane_f64
2058 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2060 %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
2061 %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
2062 %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
2063 %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
2064 %0 = bitcast double* %a to i8*
2065 tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
; test_vst4_lane_s8: unpacks the four <8 x i8> members of %b.coerce and passes
; them with %a to @llvm.arm.neon.vst4lane.v8i8 (trailing i32 operands 7 and
; 1). FileCheck expects a lane-indexed st4 of four .b registers with
; base [x0].
2069 define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
2070 ; CHECK-LABEL: test_vst4_lane_s8
2071 ; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
2073 %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
2074 %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
2075 %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
2076 %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
2077 tail call void @llvm.arm.neon.vst4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
; test_vst4_lane_s16: unpacks the four <4 x i16> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst4lane.v4i16
; (trailing i32 operands 3 and 2). FileCheck expects a lane-indexed st4 of
; four .h registers with base [x0].
2081 define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
2082 ; CHECK-LABEL: test_vst4_lane_s16
2083 ; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
2085 %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
2086 %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
2087 %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
2088 %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
2089 %0 = bitcast i16* %a to i8*
2090 tail call void @llvm.arm.neon.vst4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
; test_vst4_lane_s32: unpacks the four <2 x i32> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst4lane.v2i32
; (trailing i32 operands 1 and 4). FileCheck expects a lane-indexed st4 of
; four .s registers with base [x0].
2094 define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
2095 ; CHECK-LABEL: test_vst4_lane_s32
2096 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
2098 %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
2099 %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
2100 %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
2101 %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
2102 %0 = bitcast i32* %a to i8*
2103 tail call void @llvm.arm.neon.vst4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
; test_vst4_lane_s64: unpacks the four <1 x i64> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst4lane.v1i64
; (trailing i32 operands 0 and 8). FileCheck expects a lane-indexed st4 of
; four .d registers with base [x0].
2107 define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
2108 ; CHECK-LABEL: test_vst4_lane_s64
2109 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2111 %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
2112 %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
2113 %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
2114 %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
2115 %0 = bitcast i64* %a to i8*
2116 tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
; test_vst4_lane_f32: unpacks the four <2 x float> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst4lane.v2f32
; (trailing i32 operands 1 and 4). FileCheck expects a lane-indexed st4 of
; four .s registers with base [x0].
2120 define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
2121 ; CHECK-LABEL: test_vst4_lane_f32
2122 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
2124 %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
2125 %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
2126 %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
2127 %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
2128 %0 = bitcast float* %a to i8*
2129 tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
; test_vst4_lane_f64: unpacks the four <1 x double> members of %b.coerce and
; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst4lane.v1f64
; (trailing i32 operands 0 and 8). FileCheck expects a lane-indexed st4 of
; four .d registers with base [x0].
2133 define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
2134 ; CHECK-LABEL: test_vst4_lane_f64
2135 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2137 %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
2138 %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
2139 %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
2140 %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
2141 %0 = bitcast double* %a to i8*
2142 tail call void @llvm.arm.neon.vst4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
; Declarations of the two-register NEON load intrinsics. The vld2lane forms
; take an i8* pointer, two vectors of the overloaded type and two trailing i32
; operands, and return a two-member struct of that vector type.
; NOTE(review): for v1i64 and v1f64 the non-lane @llvm.arm.neon.vld2 form with
; an (i8*, i32) signature is declared here instead of vld2lane -- confirm that
; this matches what the corresponding tests call.
2146 declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
2147 declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
2148 declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
2149 declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
2150 declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
2151 declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
2152 declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
2153 declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
2154 declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
2155 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
2156 declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
2157 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
; Declarations of the three-register NEON load intrinsics. The vld3lane forms
; take an i8* pointer, three vectors of the overloaded type and two trailing
; i32 operands, and return a three-member struct of that vector type.
; NOTE(review): for v1i64 and v1f64 the non-lane @llvm.arm.neon.vld3 form with
; an (i8*, i32) signature is declared here instead of vld3lane -- confirm that
; this matches what the corresponding tests call.
2158 declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2159 declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2160 declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2161 declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2162 declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2163 declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2164 declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2165 declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2166 declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2167 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
2168 declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2169 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
; Four-vector load intrinsic declarations.
; Each vld4lane entry takes an i8* pointer, four vectors of the named type and
; two trailing i32 operands, and returns a struct holding four vectors of that
; same type.
; The v1i64 and v1f64 entries in this group are the non-lane vld4 form, which
; takes only the pointer and a single i32.
2170 declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2171 declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2172 declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2173 declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2174 declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2175 declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2176 declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2177 declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2178 declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2179 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
2180 declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2181 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
; Lane-load intrinsic declarations for the single-element vector types
; <1 x i64> and <1 x double>, in two-, three- and four-vector variants.
; Each takes an i8* pointer, N vectors and two trailing i32 operands, and
; returns a struct of N vectors.
2182 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
2183 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
2184 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2185 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
2186 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2187 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
; Two-vector lane-store intrinsic declarations.
; Each takes an i8* pointer, two vectors of the named type and two trailing
; i32 operands, and returns void.
2188 declare void @llvm.arm.neon.vst2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
2189 declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
2190 declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
2191 declare void @llvm.arm.neon.vst2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
2192 declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
2193 declare void @llvm.arm.neon.vst2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
2194 declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
2195 declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
2196 declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
2197 declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
2198 declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
2199 declare void @llvm.arm.neon.vst2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
; Three-vector lane-store intrinsic declarations.
; Each takes an i8* pointer, three vectors of the named type and two trailing
; i32 operands, and returns void.
2200 declare void @llvm.arm.neon.vst3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2201 declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2202 declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2203 declare void @llvm.arm.neon.vst3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2204 declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2205 declare void @llvm.arm.neon.vst3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2206 declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2207 declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2208 declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2209 declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2210 declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2211 declare void @llvm.arm.neon.vst3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
; Four-vector lane-store intrinsic declarations.
; Each takes an i8* pointer, four vectors of the named type and two trailing
; i32 operands, and returns void.
2212 declare void @llvm.arm.neon.vst4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2213 declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2214 declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2215 declare void @llvm.arm.neon.vst4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2216 declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2217 declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2218 declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2219 declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2220 declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2221 declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2222 declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2223 declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
; Two-vector lane load on <16 x i8>, returned as %struct.int8x16x2_t.
; The two vectors packed in %src.coerce are passed to
; @llvm.arm.neon.vld2lane.v16i8 together with %ptr, the i32 operand 15 and a
; final i32 operand of 1 (presumably the alignment; not shown in this file).
; The expected assembly is an ld2 on .b elements with lane index [15],
; addressed through x0.
2225 define %struct.int8x16x2_t @test_vld2q_lane_s8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
2226 ; CHECK-LABEL: test_vld2q_lane_s8
2227 ; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
; Unpack the array argument into its two vectors.
2229 %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
2230 %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
2231 %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
; Repack both result vectors into the returned struct's inner array.
2232 %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
2233 %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
2234 %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
2235 %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
2236 ret %struct.int8x16x2_t %.fca.0.1.insert
; Two-vector lane load on <16 x i8>, returned as %struct.uint8x16x2_t.
; The two vectors packed in %src.coerce are passed to
; @llvm.arm.neon.vld2lane.v16i8 together with %ptr, the i32 operand 15 and a
; final i32 operand of 1 (presumably the alignment; not shown in this file).
; The expected assembly is an ld2 on .b elements with lane index [15],
; addressed through x0.
2239 define %struct.uint8x16x2_t @test_vld2q_lane_u8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
2240 ; CHECK-LABEL: test_vld2q_lane_u8
2241 ; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
; Unpack the array argument into its two vectors.
2243 %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
2244 %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
2245 %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
; Repack both result vectors into the returned struct's inner array.
2246 %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
2247 %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
2248 %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
2249 %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
2250 ret %struct.uint8x16x2_t %.fca.0.1.insert
; Two-vector lane load on <16 x i8>, returned as %struct.poly8x16x2_t.
; The two vectors packed in %src.coerce are passed to
; @llvm.arm.neon.vld2lane.v16i8 together with %ptr, the i32 operand 15 and a
; final i32 operand of 1 (presumably the alignment; not shown in this file).
; The expected assembly is an ld2 on .b elements with lane index [15],
; addressed through x0.
2253 define %struct.poly8x16x2_t @test_vld2q_lane_p8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
2254 ; CHECK-LABEL: test_vld2q_lane_p8
2255 ; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
; Unpack the array argument into its two vectors.
2257 %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
2258 %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
2259 %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
; Repack both result vectors into the returned struct's inner array.
2260 %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
2261 %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
2262 %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
2263 %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
2264 ret %struct.poly8x16x2_t %.fca.0.1.insert
; Three-vector lane load on <16 x i8>, returned as %struct.int8x16x3_t.
; The three vectors packed in %src.coerce are passed to
; @llvm.arm.neon.vld3lane.v16i8 together with %ptr, the i32 operand 15 and a
; final i32 operand of 1 (presumably the alignment; not shown in this file).
; The expected assembly is an ld3 on .b elements with lane index [15],
; addressed through x0.
2267 define %struct.int8x16x3_t @test_vld3q_lane_s8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) {
2268 ; CHECK-LABEL: test_vld3q_lane_s8
2269 ; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
; Unpack the array argument into its three vectors.
2271 %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0
2272 %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1
2273 %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2
2274 %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1)
; Repack the three result vectors into the returned struct's inner array.
2275 %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
2276 %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
2277 %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
2278 %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
2279 %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
2280 %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
2281 ret %struct.int8x16x3_t %.fca.0.2.insert
; Three-vector lane load on <16 x i8>, returned as %struct.uint8x16x3_t.
; The three vectors packed in %src.coerce are passed to
; @llvm.arm.neon.vld3lane.v16i8 together with %ptr, the i32 operand 15 and a
; final i32 operand of 1 (presumably the alignment; not shown in this file).
; The expected assembly is an ld3 on .b elements with lane index [15],
; addressed through x0.
2284 define %struct.uint8x16x3_t @test_vld3q_lane_u8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) {
2285 ; CHECK-LABEL: test_vld3q_lane_u8
2286 ; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
; Unpack the array argument into its three vectors.
2288 %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0
2289 %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1
2290 %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2
2291 %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1)
; Repack the three result vectors into the returned struct's inner array.
2292 %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
2293 %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
2294 %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
2295 %.fca.0.0.insert = insertvalue %struct.uint8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
2296 %.fca.0.1.insert = insertvalue %struct.uint8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
2297 %.fca.0.2.insert = insertvalue %struct.uint8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
2298 ret %struct.uint8x16x3_t %.fca.0.2.insert