1 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
; Named aggregate types. Each one wraps a single array of 2, 3 or 4 vectors of
; the same element type; several are used as return types by the multi-vector
; load tests further down.
3 %struct.uint8x16x2_t = type { [2 x <16 x i8>] }
4 %struct.poly8x16x2_t = type { [2 x <16 x i8>] }
5 %struct.uint8x16x3_t = type { [3 x <16 x i8>] }
6 %struct.int8x16x2_t = type { [2 x <16 x i8>] }
7 %struct.int16x8x2_t = type { [2 x <8 x i16>] }
8 %struct.int32x4x2_t = type { [2 x <4 x i32>] }
9 %struct.int64x2x2_t = type { [2 x <2 x i64>] }
10 %struct.float32x4x2_t = type { [2 x <4 x float>] }
11 %struct.float64x2x2_t = type { [2 x <2 x double>] }
12 %struct.int8x8x2_t = type { [2 x <8 x i8>] }
13 %struct.int16x4x2_t = type { [2 x <4 x i16>] }
14 %struct.int32x2x2_t = type { [2 x <2 x i32>] }
15 %struct.int64x1x2_t = type { [2 x <1 x i64>] }
16 %struct.float32x2x2_t = type { [2 x <2 x float>] }
17 %struct.float64x1x2_t = type { [2 x <1 x double>] }
18 %struct.int8x16x3_t = type { [3 x <16 x i8>] }
19 %struct.int16x8x3_t = type { [3 x <8 x i16>] }
20 %struct.int32x4x3_t = type { [3 x <4 x i32>] }
21 %struct.int64x2x3_t = type { [3 x <2 x i64>] }
22 %struct.float32x4x3_t = type { [3 x <4 x float>] }
23 %struct.float64x2x3_t = type { [3 x <2 x double>] }
24 %struct.int8x8x3_t = type { [3 x <8 x i8>] }
25 %struct.int16x4x3_t = type { [3 x <4 x i16>] }
26 %struct.int32x2x3_t = type { [3 x <2 x i32>] }
27 %struct.int64x1x3_t = type { [3 x <1 x i64>] }
28 %struct.float32x2x3_t = type { [3 x <2 x float>] }
29 %struct.float64x1x3_t = type { [3 x <1 x double>] }
30 %struct.int8x16x4_t = type { [4 x <16 x i8>] }
31 %struct.int16x8x4_t = type { [4 x <8 x i16>] }
32 %struct.int32x4x4_t = type { [4 x <4 x i32>] }
33 %struct.int64x2x4_t = type { [4 x <2 x i64>] }
34 %struct.float32x4x4_t = type { [4 x <4 x float>] }
35 %struct.float64x2x4_t = type { [4 x <2 x double>] }
36 %struct.int8x8x4_t = type { [4 x <8 x i8>] }
37 %struct.int16x4x4_t = type { [4 x <4 x i16>] }
38 %struct.int32x2x4_t = type { [4 x <2 x i32>] }
39 %struct.int64x1x4_t = type { [4 x <1 x i64>] }
40 %struct.float32x2x4_t = type { [4 x <2 x float>] }
41 %struct.float64x1x4_t = type { [4 x <1 x double>] }
; Adds a constant <16 x i8> vector to %a. The expected output is an adrp
; immediately followed by a Q-register ldr whose address uses the :lo12: offset
; of a '.'-prefixed label (presumably the literal-pool entry for the constant).
; NOTE(review): the twelfth constant element below is i8 2 rather than i8 12;
; it looks like a typo, though the patterns here do not inspect the constant's
; value - confirm before changing.
43 define <16 x i8> @test_ld_from_poll_v16i8(<16 x i8> %a) {
44 ; CHECK-LABEL: test_ld_from_poll_v16i8
45 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
46 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
48 %b = add <16 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 2, i8 13, i8 14, i8 15, i8 16>
; Adds a constant <8 x i16> vector to %a; an adrp immediately followed by a
; Q-register ldr with a :lo12: label offset is expected.
52 define <8 x i16> @test_ld_from_poll_v8i16(<8 x i16> %a) {
53 ; CHECK-LABEL: test_ld_from_poll_v8i16
54 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
55 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
57 %b = add <8 x i16> %a, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
; Adds a constant <4 x i32> vector to %a; an adrp immediately followed by a
; Q-register ldr with a :lo12: label offset is expected.
61 define <4 x i32> @test_ld_from_poll_v4i32(<4 x i32> %a) {
62 ; CHECK-LABEL: test_ld_from_poll_v4i32
63 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
64 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
66 %b = add <4 x i32> %a, <i32 1, i32 2, i32 3, i32 4>
; Adds a constant <2 x i64> vector to %a; an adrp immediately followed by a
; Q-register ldr with a :lo12: label offset is expected.
70 define <2 x i64> @test_ld_from_poll_v2i64(<2 x i64> %a) {
71 ; CHECK-LABEL: test_ld_from_poll_v2i64
72 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
73 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
75 %b = add <2 x i64> %a, <i64 1, i64 2>
; Floating-point variant: fadd of a constant <4 x float> vector to %a; an adrp
; immediately followed by a Q-register ldr with a :lo12: label offset is
; expected.
79 define <4 x float> @test_ld_from_poll_v4f32(<4 x float> %a) {
80 ; CHECK-LABEL: test_ld_from_poll_v4f32
81 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
82 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
84 %b = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
; Floating-point variant: fadd of a constant <2 x double> vector to %a; an adrp
; immediately followed by a Q-register ldr with a :lo12: label offset is
; expected.
88 define <2 x double> @test_ld_from_poll_v2f64(<2 x double> %a) {
89 ; CHECK-LABEL: test_ld_from_poll_v2f64
90 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
91 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
93 %b = fadd <2 x double> %a, <double 1.0, double 2.0>
; 64-bit vector variant: adds a constant <8 x i8> vector to %a; the ldr that
; directly follows the adrp is expected to target a D register.
97 define <8 x i8> @test_ld_from_poll_v8i8(<8 x i8> %a) {
98 ; CHECK-LABEL: test_ld_from_poll_v8i8
99 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
100 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
102 %b = add <8 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
; 64-bit vector variant: adds a constant <4 x i16> vector to %a; the ldr that
; directly follows the adrp is expected to target a D register.
106 define <4 x i16> @test_ld_from_poll_v4i16(<4 x i16> %a) {
107 ; CHECK-LABEL: test_ld_from_poll_v4i16
108 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
109 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
111 %b = add <4 x i16> %a, <i16 1, i16 2, i16 3, i16 4>
; 64-bit vector variant: adds a constant <2 x i32> vector to %a; the ldr that
; directly follows the adrp is expected to target a D register.
115 define <2 x i32> @test_ld_from_poll_v2i32(<2 x i32> %a) {
116 ; CHECK-LABEL: test_ld_from_poll_v2i32
117 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
118 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
120 %b = add <2 x i32> %a, <i32 1, i32 2>
; Loads one i8 through %a, inserts it into lane 0 of an undef vector and
; broadcasts it with an all-zero shuffle mask. A single ld1r into a .16b
; register addressed by x0 is expected.
124 define <16 x i8> @test_vld1q_dup_s8(i8* %a) {
125 ; CHECK-LABEL: test_vld1q_dup_s8
126 ; CHECK: ld1r {{{v[0-9]+}}.16b}, [x0]
128 %0 = load i8* %a, align 1
129 %1 = insertelement <16 x i8> undef, i8 %0, i32 0
130 %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
; Loads one i16 through %a, inserts it into lane 0 and broadcasts it with an
; all-zero shuffle mask. A single ld1r into a .8h register is expected.
134 define <8 x i16> @test_vld1q_dup_s16(i16* %a) {
135 ; CHECK-LABEL: test_vld1q_dup_s16
136 ; CHECK: ld1r {{{v[0-9]+}}.8h}, [x0]
138 %0 = load i16* %a, align 2
139 %1 = insertelement <8 x i16> undef, i16 %0, i32 0
140 %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
; Loads one i32 through %a, inserts it into lane 0 and broadcasts it with an
; all-zero shuffle mask. A single ld1r into a .4s register is expected.
144 define <4 x i32> @test_vld1q_dup_s32(i32* %a) {
145 ; CHECK-LABEL: test_vld1q_dup_s32
146 ; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
148 %0 = load i32* %a, align 4
149 %1 = insertelement <4 x i32> undef, i32 %0, i32 0
150 %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
; Loads one i64 through %a, inserts it into lane 0 and broadcasts it with an
; all-zero shuffle mask. A single ld1r into a .2d register is expected.
154 define <2 x i64> @test_vld1q_dup_s64(i64* %a) {
155 ; CHECK-LABEL: test_vld1q_dup_s64
156 ; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
158 %0 = load i64* %a, align 8
159 %1 = insertelement <2 x i64> undef, i64 %0, i32 0
160 %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
; Loads one float through %a, inserts it into lane 0, broadcasts it with an
; all-zero shuffle mask and returns the splat. A single ld1r into a .4s
; register is expected.
164 define <4 x float> @test_vld1q_dup_f32(float* %a) {
165 ; CHECK-LABEL: test_vld1q_dup_f32
166 ; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
168 %0 = load float* %a, align 4
169 %1 = insertelement <4 x float> undef, float %0, i32 0
170 %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
171 ret <4 x float> %lane
; Loads one double through %a, inserts it into lane 0, broadcasts it with an
; all-zero shuffle mask and returns the splat. A single ld1r into a .2d
; register is expected.
174 define <2 x double> @test_vld1q_dup_f64(double* %a) {
175 ; CHECK-LABEL: test_vld1q_dup_f64
176 ; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
178 %0 = load double* %a, align 8
179 %1 = insertelement <2 x double> undef, double %0, i32 0
180 %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
181 ret <2 x double> %lane
; 64-bit vector variant: loads one i8 through %a and broadcasts it across an
; <8 x i8> via insertelement plus an all-zero shuffle mask. A single ld1r into
; a .8b register is expected.
184 define <8 x i8> @test_vld1_dup_s8(i8* %a) {
185 ; CHECK-LABEL: test_vld1_dup_s8
186 ; CHECK: ld1r {{{v[0-9]+}}.8b}, [x0]
188 %0 = load i8* %a, align 1
189 %1 = insertelement <8 x i8> undef, i8 %0, i32 0
190 %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
; 64-bit vector variant: loads one i16 through %a and broadcasts it across a
; <4 x i16>. A single ld1r into a .4h register is expected.
194 define <4 x i16> @test_vld1_dup_s16(i16* %a) {
195 ; CHECK-LABEL: test_vld1_dup_s16
196 ; CHECK: ld1r {{{v[0-9]+}}.4h}, [x0]
198 %0 = load i16* %a, align 2
199 %1 = insertelement <4 x i16> undef, i16 %0, i32 0
200 %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
; 64-bit vector variant: loads one i32 through %a and broadcasts it across a
; <2 x i32>. A single ld1r into a .2s register is expected.
204 define <2 x i32> @test_vld1_dup_s32(i32* %a) {
205 ; CHECK-LABEL: test_vld1_dup_s32
206 ; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
208 %0 = load i32* %a, align 4
209 %1 = insertelement <2 x i32> undef, i32 %0, i32 0
210 %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
; Single-element case: loads one i64 through %a and inserts it into lane 0 of
; a <1 x i64>. A single ld1r into a .1d register is expected.
214 define <1 x i64> @test_vld1_dup_s64(i64* %a) {
215 ; CHECK-LABEL: test_vld1_dup_s64
216 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
218 %0 = load i64* %a, align 8
219 %1 = insertelement <1 x i64> undef, i64 %0, i32 0
; 64-bit vector variant: loads one float through %a, broadcasts it across a
; <2 x float> and returns the splat. A single ld1r into a .2s register is
; expected.
223 define <2 x float> @test_vld1_dup_f32(float* %a) {
224 ; CHECK-LABEL: test_vld1_dup_f32
225 ; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
227 %0 = load float* %a, align 4
228 %1 = insertelement <2 x float> undef, float %0, i32 0
229 %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
230 ret <2 x float> %lane
; Single-element case: loads one double through %a and inserts it into lane 0
; of a <1 x double>. A single ld1r into a .1d register is expected.
233 define <1 x double> @test_vld1_dup_f64(double* %a) {
234 ; CHECK-LABEL: test_vld1_dup_f64
235 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
237 %0 = load double* %a, align 8
238 %1 = insertelement <1 x double> undef, double %0, i32 0
; Loads an i64 through %a, stores the same value through %b, and also returns
; it as the only element of a <1 x i64>. The patterns expect, in order, a GPR
; ldr, an fmov from that GPR into a D register, and a GPR str.
242 define <1 x i64> @testDUP.v1i64(i64* %a, i64* %b) #0 {
243 ; As there is a store operation depending on %1, LD1R pattern can't be selected.
244 ; So LDR and FMOV should be emitted.
245 ; CHECK-LABEL: testDUP.v1i64
246 ; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}]
247 ; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
248 ; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}]
249 %1 = load i64* %a, align 8
250 store i64 %1, i64* %b, align 8
251 %vecinit.i = insertelement <1 x i64> undef, i64 %1, i32 0
252 ret <1 x i64> %vecinit.i
; Loads a double through %a, stores the same value through %b, and also
; returns it as the only element of a <1 x double>.
255 define <1 x double> @testDUP.v1f64(double* %a, double* %b) #0 {
256 ; As there is a store operation depending on %1, LD1R pattern can't be selected.
257 ; So a plain LDR into a D register followed by an STR of a D register is expected; no FMOV is matched in this variant.
258 ; CHECK-LABEL: testDUP.v1f64
259 ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}]
260 ; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}]
261 %1 = load double* %a, align 8
262 store double %1, double* %b, align 8
263 %vecinit.i = insertelement <1 x double> undef, double %1, i32 0
264 ret <1 x double> %vecinit.i
; Calls the vld2lane intrinsic on undef vectors with lane index 0, splats lane
; 0 of each of the two results with an all-zero shuffle mask and packs them
; into the x2 struct. A single ld2r on two .16b registers is expected.
267 define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) {
268 ; CHECK-LABEL: test_vld2q_dup_s8
269 ; CHECK: ld2r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
271 %vld_dup = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
272 %0 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 0
273 %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
274 %1 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 1
275 %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
276 %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %lane, 0, 0
277 %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
278 ret %struct.int8x16x2_t %.fca.0.1.insert
; Lane-0 vld2lane on undef <8 x i16> vectors (pointer bitcast to i8*), with
; both results splatted and packed into the x2 struct. A single ld2r on two
; .8h registers is expected.
281 define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) {
282 ; CHECK-LABEL: test_vld2q_dup_s16
283 ; CHECK: ld2r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
285 %0 = bitcast i16* %a to i8*
286 %vld_dup = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
287 %1 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 0
288 %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
289 %2 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 1
290 %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
291 %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %lane, 0, 0
292 %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
293 ret %struct.int16x8x2_t %.fca.0.1.insert
; Lane-0 vld2lane on undef <4 x i32> vectors, with both results splatted and
; packed into the x2 struct. A single ld2r on two .4s registers is expected.
296 define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) {
297 ; CHECK-LABEL: test_vld2q_dup_s32
298 ; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
300 %0 = bitcast i32* %a to i8*
301 %vld_dup = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
302 %1 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 0
303 %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
304 %2 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 1
305 %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
306 %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %lane, 0, 0
307 %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
308 ret %struct.int32x4x2_t %.fca.0.1.insert
; Lane-0 vld2lane on undef <2 x i64> vectors, with both results splatted and
; packed into the x2 struct. A single ld2r on two .2d registers is expected.
311 define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) {
312 ; CHECK-LABEL: test_vld2q_dup_s64
313 ; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
315 %0 = bitcast i64* %a to i8*
316 %vld_dup = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
317 %1 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 0
318 %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
319 %2 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 1
320 %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
321 %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %lane, 0, 0
322 %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
323 ret %struct.int64x2x2_t %.fca.0.1.insert
; Lane-0 vld2lane on undef <4 x float> vectors, with both results splatted and
; packed into the x2 struct. A single ld2r on two .4s registers is expected.
326 define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) {
327 ; CHECK-LABEL: test_vld2q_dup_f32
328 ; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
330 %0 = bitcast float* %a to i8*
331 %vld_dup = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
332 %1 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 0
333 %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
334 %2 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 1
335 %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
336 %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %lane, 0, 0
337 %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
338 ret %struct.float32x4x2_t %.fca.0.1.insert
; Lane-0 vld2lane on undef <2 x double> vectors, with both results splatted and
; packed into the x2 struct. A single ld2r on two .2d registers is expected.
341 define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) {
342 ; CHECK-LABEL: test_vld2q_dup_f64
343 ; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
345 %0 = bitcast double* %a to i8*
346 %vld_dup = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
347 %1 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 0
348 %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
349 %2 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 1
350 %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
351 %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %lane, 0, 0
352 %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
353 ret %struct.float64x2x2_t %.fca.0.1.insert
; 64-bit vector variant: lane-0 vld2lane on undef <8 x i8> vectors, with both
; results splatted and packed into the x2 struct. A single ld2r on two .8b
; registers is expected.
356 define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) {
357 ; CHECK-LABEL: test_vld2_dup_s8
358 ; CHECK: ld2r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
360 %vld_dup = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
361 %0 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 0
362 %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
363 %1 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 1
364 %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
365 %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %lane, 0, 0
366 %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
367 ret %struct.int8x8x2_t %.fca.0.1.insert
; 64-bit vector variant: lane-0 vld2lane on undef <4 x i16> vectors, with both
; results splatted and packed into the x2 struct. A single ld2r on two .4h
; registers is expected.
370 define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) {
371 ; CHECK-LABEL: test_vld2_dup_s16
372 ; CHECK: ld2r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
374 %0 = bitcast i16* %a to i8*
375 %vld_dup = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
376 %1 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 0
377 %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
378 %2 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 1
379 %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
380 %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %lane, 0, 0
381 %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
382 ret %struct.int16x4x2_t %.fca.0.1.insert
; 64-bit vector variant: lane-0 vld2lane on undef <2 x i32> vectors, with both
; results splatted and packed into the x2 struct. A single ld2r on two .2s
; registers is expected.
385 define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) {
386 ; CHECK-LABEL: test_vld2_dup_s32
387 ; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
389 %0 = bitcast i32* %a to i8*
390 %vld_dup = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
391 %1 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 0
392 %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
393 %2 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 1
394 %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
395 %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %lane, 0, 0
396 %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
397 ret %struct.int32x2x2_t %.fca.0.1.insert
; Single-element case: unlike the wider variants this calls the plain vld2
; intrinsic (no lane operand, no shuffles) on <1 x i64> and repacks the two
; results. The expected instruction is an ld1 of two .1d registers rather than
; an ld2r.
400 define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) {
401 ; CHECK-LABEL: test_vld2_dup_s64
402 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
404 %0 = bitcast i64* %a to i8*
405 %vld_dup = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %0, i32 8)
406 %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 0
407 %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 1
408 %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
409 %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
410 ret %struct.int64x1x2_t %.fca.0.1.insert
; 64-bit vector variant: lane-0 vld2lane on undef <2 x float> vectors, with
; both results splatted and packed into the x2 struct. A single ld2r on two
; .2s registers is expected.
413 define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) {
414 ; CHECK-LABEL: test_vld2_dup_f32
415 ; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
417 %0 = bitcast float* %a to i8*
418 %vld_dup = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
419 %1 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 0
420 %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
421 %2 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 1
422 %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
423 %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %lane, 0, 0
424 %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
425 ret %struct.float32x2x2_t %.fca.0.1.insert
; Single-element case: calls the plain vld2 intrinsic (no lane operand, no
; shuffles) on <1 x double> and repacks the two results. The expected
; instruction is an ld1 of two .1d registers rather than an ld2r.
428 define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) {
429 ; CHECK-LABEL: test_vld2_dup_f64
430 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
432 %0 = bitcast double* %a to i8*
433 %vld_dup = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %0, i32 8)
434 %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 0
435 %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 1
436 %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
437 %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
438 ret %struct.float64x1x2_t %.fca.0.1.insert
; Calls the vld3lane intrinsic on undef vectors with lane index 0, splats lane
; 0 of each of the three results with an all-zero shuffle mask and packs them
; into the x3 struct. A single ld3r on three .16b registers is expected.
441 define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) {
442 ; CHECK-LABEL: test_vld3q_dup_s8
443 ; CHECK: ld3r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
445 %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
446 %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
447 %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
448 %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
449 %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
450 %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
451 %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
452 %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %lane, 0, 0
453 %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
454 %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
455 ret %struct.int8x16x3_t %.fca.0.2.insert
; Lane-0 vld3lane on undef <8 x i16> vectors, with all three results splatted
; and packed into the x3 struct. A single ld3r on three .8h registers is
; expected.
458 define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) {
459 ; CHECK-LABEL: test_vld3q_dup_s16
460 ; CHECK: ld3r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
462 %0 = bitcast i16* %a to i8*
463 %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
464 %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
465 %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
466 %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
467 %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
468 %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
469 %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
470 %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %lane, 0, 0
471 %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
472 %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
473 ret %struct.int16x8x3_t %.fca.0.2.insert
; Lane-0 vld3lane on undef <4 x i32> vectors, with all three results splatted
; and packed into the x3 struct. A single ld3r on three .4s registers is
; expected.
476 define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) {
477 ; CHECK-LABEL: test_vld3q_dup_s32
478 ; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
480 %0 = bitcast i32* %a to i8*
481 %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
482 %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
483 %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
484 %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
485 %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
486 %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
487 %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
488 %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %lane, 0, 0
489 %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
490 %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
491 ret %struct.int32x4x3_t %.fca.0.2.insert
; Lane-0 vld3lane on undef <2 x i64> vectors, with all three results splatted
; and packed into the x3 struct. A single ld3r on three .2d registers is
; expected.
494 define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) {
495 ; CHECK-LABEL: test_vld3q_dup_s64
496 ; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
498 %0 = bitcast i64* %a to i8*
499 %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
500 %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
501 %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
502 %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
503 %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
504 %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
505 %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
506 %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %lane, 0, 0
507 %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
508 %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
509 ret %struct.int64x2x3_t %.fca.0.2.insert
; Lane-0 vld3lane on undef <4 x float> vectors, with all three results
; splatted and packed into the x3 struct. A single ld3r on three .4s registers
; is expected.
512 define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) {
513 ; CHECK-LABEL: test_vld3q_dup_f32
514 ; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
516 %0 = bitcast float* %a to i8*
517 %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
518 %1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
519 %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
520 %2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
521 %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
522 %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
523 %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
524 %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %lane, 0, 0
525 %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
526 %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
527 ret %struct.float32x4x3_t %.fca.0.2.insert
; Lane-0 vld3lane on undef <2 x double> vectors, with all three results
; splatted and packed into the x3 struct. A single ld3r on three .2d registers
; is expected.
530 define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) {
531 ; CHECK-LABEL: test_vld3q_dup_f64
532 ; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
534 %0 = bitcast double* %a to i8*
535 %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
536 %1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
537 %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
538 %2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
539 %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
540 %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
541 %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
542 %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %lane, 0, 0
543 %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
544 %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
545 ret %struct.float64x2x3_t %.fca.0.2.insert
; 64-bit vector variant: lane-0 vld3lane on undef <8 x i8> vectors, with all
; three results splatted and packed into the x3 struct. A single ld3r on three
; .8b registers is expected.
548 define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) {
549 ; CHECK-LABEL: test_vld3_dup_s8
550 ; CHECK: ld3r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
552 %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
553 %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
554 %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
555 %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
556 %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
557 %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
558 %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
559 %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %lane, 0, 0
560 %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
561 %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
562 ret %struct.int8x8x3_t %.fca.0.2.insert
; 64-bit vector variant: lane-0 vld3lane on undef <4 x i16> vectors, with all
; three results splatted and packed into the x3 struct. A single ld3r on three
; .4h registers is expected.
565 define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) {
566 ; CHECK-LABEL: test_vld3_dup_s16
567 ; CHECK: ld3r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
569 %0 = bitcast i16* %a to i8*
570 %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
571 %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
572 %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
573 %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
574 %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
575 %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
576 %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
577 %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %lane, 0, 0
578 %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
579 %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
580 ret %struct.int16x4x3_t %.fca.0.2.insert
; 64-bit vector variant: lane-0 vld3lane on undef <2 x i32> vectors, with all
; three results splatted and packed into the x3 struct. A single ld3r on three
; .2s registers is expected.
583 define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) {
584 ; CHECK-LABEL: test_vld3_dup_s32
585 ; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
587 %0 = bitcast i32* %a to i8*
588 %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
589 %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
590 %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
591 %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
592 %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
593 %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
594 %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
595 %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %lane, 0, 0
596 %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
597 %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
598 ret %struct.int32x2x3_t %.fca.0.2.insert
; Single-element case: calls the plain vld3 intrinsic (no lane operand, no
; shuffles) on <1 x i64> and repacks the three results. The expected
; instruction is an ld1 of three .1d registers rather than an ld3r.
601 define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) {
602 ; CHECK-LABEL: test_vld3_dup_s64
603 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
605 %0 = bitcast i64* %a to i8*
606 %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %0, i32 8)
607 %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
608 %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
609 %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
610 %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
611 %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
612 %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
613 ret %struct.int64x1x3_t %.fca.0.2.insert
; 64-bit vector variant: lane-0 vld3lane on undef <2 x float> vectors, with
; all three results splatted and packed into the x3 struct. A single ld3r on
; three .2s registers is expected.
616 define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) {
617 ; CHECK-LABEL: test_vld3_dup_f32
618 ; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
620 %0 = bitcast float* %a to i8*
621 %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
622 %1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
623 %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
624 %2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
625 %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
626 %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
627 %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
628 %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %lane, 0, 0
629 %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
630 %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
631 ret %struct.float32x2x3_t %.fca.0.2.insert
; Single-element case: calls the plain vld3 intrinsic (no lane operand, no
; shuffles) on <1 x double> and repacks the three results. The expected
; instruction is an ld1 of three .1d registers rather than an ld3r.
634 define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) {
635 ; CHECK-LABEL: test_vld3_dup_f64
636 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
638 %0 = bitcast double* %a to i8*
639 %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %0, i32 8)
640 %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
641 %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
642 %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
643 %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
644 %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
645 %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
646 ret %struct.float64x1x3_t %.fca.0.2.insert
; vld4 "dup" on <16 x i8>: calls vld4lane.v16i8 directly on %a (no bitcast needed
; for an i8 pointer) with four undef input vectors, splats each of the four
; results with a zeroinitializer mask and packs them into %struct.int8x16x4_t.
; The expected output is a single ld4r over four .16b registers.
649 define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) {
650 ; CHECK-LABEL: test_vld4q_dup_s8
651 ; CHECK: ld4r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
653 %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
654 %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
655 %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
656 %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
657 %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
658 %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
659 %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
660 %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 3
661 %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> zeroinitializer
662 %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %lane, 0, 0
663 %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
664 %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
665 %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %lane3, 0, 3
666 ret %struct.int8x16x4_t %.fca.0.3.insert
; vld4 "dup" on <8 x i16>: bitcasts %a to i8*, calls vld4lane.v8i16 with four
; undef input vectors, splats each result with a zeroinitializer mask and packs
; them into %struct.int16x8x4_t. The expected output is a single ld4r over four
; .8h registers.
669 define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) {
670 ; CHECK-LABEL: test_vld4q_dup_s16
671 ; CHECK: ld4r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
673 %0 = bitcast i16* %a to i8*
674 %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
675 %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
676 %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
677 %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
678 %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
679 %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
680 %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
681 %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 3
682 %lane3 = shufflevector <8 x i16> %4, <8 x i16> undef, <8 x i32> zeroinitializer
683 %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %lane, 0, 0
684 %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
685 %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
686 %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %lane3, 0, 3
687 ret %struct.int16x8x4_t %.fca.0.3.insert
; vld4 "dup" on <4 x i32>: bitcasts %a to i8*, calls vld4lane.v4i32 with four
; undef input vectors, splats each result with a zeroinitializer mask and packs
; them into %struct.int32x4x4_t. The expected output is a single ld4r over four
; .4s registers.
690 define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) {
691 ; CHECK-LABEL: test_vld4q_dup_s32
692 ; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
694 %0 = bitcast i32* %a to i8*
695 %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
696 %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
697 %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
698 %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
699 %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
700 %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
701 %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
702 %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 3
703 %lane3 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer
704 %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %lane, 0, 0
705 %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
706 %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
707 %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %lane3, 0, 3
708 ret %struct.int32x4x4_t %.fca.0.3.insert
; vld4 "dup" on <2 x i64>: bitcasts %a to i8*, calls vld4lane.v2i64 with four
; undef input vectors, splats each result with a zeroinitializer mask and packs
; them into %struct.int64x2x4_t. The expected output is a single ld4r over four
; .2d registers.
711 define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) {
712 ; CHECK-LABEL: test_vld4q_dup_s64
713 ; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
715 %0 = bitcast i64* %a to i8*
716 %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
717 %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
718 %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
719 %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
720 %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
721 %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
722 %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
723 %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 3
724 %lane3 = shufflevector <2 x i64> %4, <2 x i64> undef, <2 x i32> zeroinitializer
725 %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %lane, 0, 0
726 %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
727 %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
728 %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %lane3, 0, 3
729 ret %struct.int64x2x4_t %.fca.0.3.insert
; vld4 "dup" on <4 x float>: bitcasts %a to i8*, calls vld4lane.v4f32 with four
; undef input vectors, splats each result with a zeroinitializer mask and packs
; them into %struct.float32x4x4_t. The expected output is a single ld4r over
; four .4s registers.
732 define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) {
733 ; CHECK-LABEL: test_vld4q_dup_f32
734 ; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
736 %0 = bitcast float* %a to i8*
737 %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
738 %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
739 %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
740 %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
741 %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
742 %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
743 %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
744 %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 3
745 %lane3 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> zeroinitializer
746 %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %lane, 0, 0
747 %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
748 %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
749 %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %lane3, 0, 3
750 ret %struct.float32x4x4_t %.fca.0.3.insert
; vld4 "dup" on <2 x double>: bitcasts %a to i8*, calls vld4lane.v2f64 with four
; undef input vectors, splats each result with a zeroinitializer mask and packs
; them into %struct.float64x2x4_t. The expected output is a single ld4r over
; four .2d registers.
753 define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) {
754 ; CHECK-LABEL: test_vld4q_dup_f64
755 ; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
757 %0 = bitcast double* %a to i8*
758 %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
759 %1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
760 %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
761 %2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
762 %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
763 %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
764 %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
765 %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 3
766 %lane3 = shufflevector <2 x double> %4, <2 x double> undef, <2 x i32> zeroinitializer
767 %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %lane, 0, 0
768 %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
769 %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
770 %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %lane3, 0, 3
771 ret %struct.float64x2x4_t %.fca.0.3.insert
; vld4 "dup" on <8 x i8>: calls vld4lane.v8i8 directly on %a with four undef
; input vectors, splats each result with a zeroinitializer mask and packs them
; into %struct.int8x8x4_t. The expected output is a single ld4r over four .8b
; registers.
774 define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) {
775 ; CHECK-LABEL: test_vld4_dup_s8
776 ; CHECK: ld4r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
778 %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
779 %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
780 %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
781 %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
782 %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
783 %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
784 %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
785 %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 3
786 %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer
787 %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %lane, 0, 0
788 %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
789 %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
790 %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %lane3, 0, 3
791 ret %struct.int8x8x4_t %.fca.0.3.insert
; vld4 "dup" on <4 x i16>: bitcasts %a to i8*, calls vld4lane.v4i16 with four
; undef input vectors, splats each result with a zeroinitializer mask and packs
; them into %struct.int16x4x4_t. The expected output is a single ld4r over four
; .4h registers.
794 define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) {
795 ; CHECK-LABEL: test_vld4_dup_s16
796 ; CHECK: ld4r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
798 %0 = bitcast i16* %a to i8*
799 %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
800 %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
801 %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
802 %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
803 %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
804 %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
805 %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
806 %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 3
807 %lane3 = shufflevector <4 x i16> %4, <4 x i16> undef, <4 x i32> zeroinitializer
808 %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %lane, 0, 0
809 %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
810 %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
811 %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %lane3, 0, 3
812 ret %struct.int16x4x4_t %.fca.0.3.insert
; vld4 "dup" on <2 x i32>: bitcasts %a to i8*, calls vld4lane.v2i32 with four
; undef input vectors, splats each result with a zeroinitializer mask and packs
; them into %struct.int32x2x4_t. The expected output is a single ld4r over four
; .2s registers.
815 define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) {
816 ; CHECK-LABEL: test_vld4_dup_s32
817 ; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
819 %0 = bitcast i32* %a to i8*
820 %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
821 %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
822 %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
823 %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
824 %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
825 %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
826 %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
827 %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 3
828 %lane3 = shufflevector <2 x i32> %4, <2 x i32> undef, <2 x i32> zeroinitializer
829 %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %lane, 0, 0
830 %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
831 %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
832 %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %lane3, 0, 3
833 ret %struct.int32x2x4_t %.fca.0.3.insert
; vld4 "dup" on <1 x i64>: unlike the multi-element dup tests, this calls the
; plain vld4.v1i64 intrinsic (no lane operand, no shufflevector) and repacks the
; four results into %struct.int64x1x4_t. The expected output is an ld1 over four
; .1d registers rather than an ld4r.
836 define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) {
837 ; CHECK-LABEL: test_vld4_dup_s64
838 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
840 %0 = bitcast i64* %a to i8*
841 %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %0, i32 8)
842 %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
843 %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
844 %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
845 %vld_dup.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 3
846 %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
847 %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
848 %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
849 %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld_dup.fca.3.extract, 0, 3
850 ret %struct.int64x1x4_t %.fca.0.3.insert
; vld4 "dup" on <2 x float>: bitcasts %a to i8*, calls vld4lane.v2f32 with four
; undef input vectors, splats each result with a zeroinitializer mask and packs
; them into %struct.float32x2x4_t. The expected output is a single ld4r over
; four .2s registers.
853 define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) {
854 ; CHECK-LABEL: test_vld4_dup_f32
855 ; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
857 %0 = bitcast float* %a to i8*
858 %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
859 %1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
860 %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
861 %2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
862 %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
863 %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
864 %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
865 %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 3
866 %lane3 = shufflevector <2 x float> %4, <2 x float> undef, <2 x i32> zeroinitializer
867 %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %lane, 0, 0
868 %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
869 %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
870 %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %lane3, 0, 3
871 ret %struct.float32x2x4_t %.fca.0.3.insert
; vld4 "dup" on <1 x double>: calls the plain vld4.v1f64 intrinsic (no lane
; operand, no shufflevector) and repacks the four results into
; %struct.float64x1x4_t. The expected output is an ld1 over four .1d registers
; rather than an ld4r.
874 define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) {
875 ; CHECK-LABEL: test_vld4_dup_f64
876 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
878 %0 = bitcast double* %a to i8*
879 %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %0, i32 8)
880 %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
881 %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
882 %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
883 %vld_dup.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 3
884 %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
885 %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
886 %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
887 %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld_dup.fca.3.extract, 0, 3
888 ret %struct.float64x1x4_t %.fca.0.3.insert
; vld1 lane on <16 x i8>: a scalar i8 load from %a inserted into element 15 of
; %b. The expected output is the single-lane ld1 form on a .b register, with any
; lane index accepted by the pattern.
891 define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) {
892 ; CHECK-LABEL: test_vld1q_lane_s8
893 ; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
895 %0 = load i8* %a, align 1
896 %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15
897 ret <16 x i8> %vld1_lane
; vld1 lane on <8 x i16>: a scalar i16 load from %a inserted into element 7 of
; %b. The expected output is the single-lane ld1 form on a .h register, with any
; lane index accepted by the pattern.
900 define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) {
901 ; CHECK-LABEL: test_vld1q_lane_s16
902 ; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
904 %0 = load i16* %a, align 2
905 %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7
906 ret <8 x i16> %vld1_lane
; vld1 lane on <4 x i32>: a scalar i32 load from %a inserted into element 3 of
; %b. The expected output is the single-lane ld1 form on a .s register, with any
; lane index accepted by the pattern.
909 define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) {
910 ; CHECK-LABEL: test_vld1q_lane_s32
911 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
913 %0 = load i32* %a, align 4
914 %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3
915 ret <4 x i32> %vld1_lane
; vld1 lane on <2 x i64>: a scalar i64 load from %a inserted into element 1 of
; %b. The expected output is the single-lane ld1 form on a .d register, with any
; lane index accepted by the pattern.
918 define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) {
919 ; CHECK-LABEL: test_vld1q_lane_s64
920 ; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
922 %0 = load i64* %a, align 8
923 %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1
924 ret <2 x i64> %vld1_lane
; vld1 lane on <4 x float>: a scalar float load from %a inserted into element 3
; of %b. The expected output is the single-lane ld1 form on a .s register, with
; any lane index accepted by the pattern.
927 define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) {
928 ; CHECK-LABEL: test_vld1q_lane_f32
929 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
931 %0 = load float* %a, align 4
932 %vld1_lane = insertelement <4 x float> %b, float %0, i32 3
933 ret <4 x float> %vld1_lane
; vld1 lane on <2 x double>: a scalar double load from %a inserted into element
; 1 of %b. The expected output is the single-lane ld1 form on a .d register,
; with any lane index accepted by the pattern.
936 define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) {
937 ; CHECK-LABEL: test_vld1q_lane_f64
938 ; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
940 %0 = load double* %a, align 8
941 %vld1_lane = insertelement <2 x double> %b, double %0, i32 1
942 ret <2 x double> %vld1_lane
; vld1 lane on <8 x i8>: a scalar i8 load from %a inserted into element 7 of %b.
; The expected output is the single-lane ld1 form on a .b register, with any
; lane index accepted by the pattern.
945 define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) {
946 ; CHECK-LABEL: test_vld1_lane_s8
947 ; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
949 %0 = load i8* %a, align 1
950 %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7
951 ret <8 x i8> %vld1_lane
; vld1 lane on <4 x i16>: a scalar i16 load from %a inserted into element 3 of
; %b. The expected output is the single-lane ld1 form on a .h register, with any
; lane index accepted by the pattern.
954 define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) {
955 ; CHECK-LABEL: test_vld1_lane_s16
956 ; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
958 %0 = load i16* %a, align 2
959 %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3
960 ret <4 x i16> %vld1_lane
; vld1 lane on <2 x i32>: a scalar i32 load from %a inserted into element 1 of
; %b. The expected output is the single-lane ld1 form on a .s register, with any
; lane index accepted by the pattern.
963 define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) {
964 ; CHECK-LABEL: test_vld1_lane_s32
965 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
967 %0 = load i32* %a, align 4
968 %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1
969 ret <2 x i32> %vld1_lane
; vld1 lane on <1 x i64>: the loaded i64 is inserted into element 0 of an undef
; vector, so the %b argument is never read. The expected output is the
; replicating ld1r form on a .1d register rather than a lane-indexed ld1.
972 define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) {
973 ; CHECK-LABEL: test_vld1_lane_s64
974 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
976 %0 = load i64* %a, align 8
977 %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0
978 ret <1 x i64> %vld1_lane
; vld1 lane on <2 x float>: a scalar float load from %a inserted into element 1
; of %b. The expected output is the single-lane ld1 form on a .s register, with
; any lane index accepted by the pattern.
981 define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) {
982 ; CHECK-LABEL: test_vld1_lane_f32
983 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
985 %0 = load float* %a, align 4
986 %vld1_lane = insertelement <2 x float> %b, float %0, i32 1
987 ret <2 x float> %vld1_lane
; vld1 lane on <1 x double>: the loaded double is inserted into element 0 of an
; undef vector, so the %b argument is never read. The expected output is the
; replicating ld1r form on a .1d register rather than a lane-indexed ld1.
990 define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) {
991 ; CHECK-LABEL: test_vld1_lane_f64
992 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
994 %0 = load double* %a, align 8
995 %vld1_lane = insertelement <1 x double> undef, double %0, i32 0
996 ret <1 x double> %vld1_lane
; vld2 lane on <8 x i16>: unpacks the two vectors of %b.coerce, passes them to
; vld2lane.v8i16 with trailing operands i32 7, i32 2 (presumably lane index and
; alignment, following the vldNlane intrinsic convention -- confirm), and
; repacks the two results into %struct.int16x8x2_t. The expected output is a
; lane-indexed ld2 over two .h registers.
999 define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
1000 ; CHECK-LABEL: test_vld2q_lane_s16
1001 ; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1003 %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
1004 %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
1005 %0 = bitcast i16* %a to i8*
1006 %vld2_lane = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
1007 %vld2_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 0
1008 %vld2_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 1
1009 %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2_lane.fca.0.extract, 0, 0
1010 %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2_lane.fca.1.extract, 0, 1
1011 ret %struct.int16x8x2_t %.fca.0.1.insert
; vld2 lane on <4 x i32>: unpacks the two vectors of %b.coerce, passes them to
; vld2lane.v4i32 with trailing operands i32 3, i32 4, and repacks the two
; results into %struct.int32x4x2_t. The expected output is a lane-indexed ld2
; over two .s registers.
1014 define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
1015 ; CHECK-LABEL: test_vld2q_lane_s32
1016 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1018 %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
1019 %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
1020 %0 = bitcast i32* %a to i8*
1021 %vld2_lane = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
1022 %vld2_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 0
1023 %vld2_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 1
1024 %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2_lane.fca.0.extract, 0, 0
1025 %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2_lane.fca.1.extract, 0, 1
1026 ret %struct.int32x4x2_t %.fca.0.1.insert
; vld2 lane on <2 x i64>: unpacks the two vectors of %b.coerce, passes them to
; vld2lane.v2i64 with trailing operands i32 1, i32 8, and repacks the two
; results into %struct.int64x2x2_t. The expected output is a lane-indexed ld2
; over two .d registers.
1029 define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
1030 ; CHECK-LABEL: test_vld2q_lane_s64
1031 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1033 %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
1034 %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
1035 %0 = bitcast i64* %a to i8*
1036 %vld2_lane = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
1037 %vld2_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 0
1038 %vld2_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 1
1039 %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2_lane.fca.0.extract, 0, 0
1040 %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2_lane.fca.1.extract, 0, 1
1041 ret %struct.int64x2x2_t %.fca.0.1.insert
; vld2 lane on <4 x float>: unpacks the two vectors of %b.coerce, passes them to
; vld2lane.v4f32 with trailing operands i32 3, i32 4, and repacks the two
; results into %struct.float32x4x2_t. The expected output is a lane-indexed ld2
; over two .s registers.
1044 define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
1045 ; CHECK-LABEL: test_vld2q_lane_f32
1046 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1048 %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
1049 %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
1050 %0 = bitcast float* %a to i8*
1051 %vld2_lane = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
1052 %vld2_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 0
1053 %vld2_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 1
1054 %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2_lane.fca.0.extract, 0, 0
1055 %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2_lane.fca.1.extract, 0, 1
1056 ret %struct.float32x4x2_t %.fca.0.1.insert
; vld2 lane on <2 x double>: unpacks the two vectors of %b.coerce, passes them
; to vld2lane.v2f64 with trailing operands i32 1, i32 8, and repacks the two
; results into %struct.float64x2x2_t. The expected output is a lane-indexed ld2
; over two .d registers.
1059 define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
1060 ; CHECK-LABEL: test_vld2q_lane_f64
1061 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1063 %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
1064 %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
1065 %0 = bitcast double* %a to i8*
1066 %vld2_lane = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
1067 %vld2_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 0
1068 %vld2_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 1
1069 %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2_lane.fca.0.extract, 0, 0
1070 %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2_lane.fca.1.extract, 0, 1
1071 ret %struct.float64x2x2_t %.fca.0.1.insert
; vld2 lane on <8 x i8>: unpacks the two vectors of %b.coerce, passes them to
; vld2lane.v8i8 directly on %a (no bitcast for an i8 pointer) with trailing
; operands i32 7, i32 1, and repacks the two results into %struct.int8x8x2_t.
; The expected output is a lane-indexed ld2 over two .b registers.
1074 define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
1075 ; CHECK-LABEL: test_vld2_lane_s8
1076 ; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1078 %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
1079 %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
1080 %vld2_lane = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
1081 %vld2_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 0
1082 %vld2_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 1
1083 %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2_lane.fca.0.extract, 0, 0
1084 %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2_lane.fca.1.extract, 0, 1
1085 ret %struct.int8x8x2_t %.fca.0.1.insert
; vld2 lane on <4 x i16>: unpacks the two vectors of %b.coerce, passes them to
; vld2lane.v4i16 with trailing operands i32 3, i32 2, and repacks the two
; results into %struct.int16x4x2_t. The expected output is a lane-indexed ld2
; over two .h registers.
1088 define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
1089 ; CHECK-LABEL: test_vld2_lane_s16
1090 ; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1092 %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
1093 %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
1094 %0 = bitcast i16* %a to i8*
1095 %vld2_lane = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
1096 %vld2_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 0
1097 %vld2_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 1
1098 %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2_lane.fca.0.extract, 0, 0
1099 %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2_lane.fca.1.extract, 0, 1
1100 ret %struct.int16x4x2_t %.fca.0.1.insert
; vld2 lane on <2 x i32>: unpacks the two vectors of %b.coerce, passes them to
; vld2lane.v2i32 with trailing operands i32 1, i32 4, and repacks the two
; results into %struct.int32x2x2_t. The expected output is a lane-indexed ld2
; over two .s registers.
1103 define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
1104 ; CHECK-LABEL: test_vld2_lane_s32
1105 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1107 %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
1108 %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
1109 %0 = bitcast i32* %a to i8*
1110 %vld2_lane = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
1111 %vld2_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 0
1112 %vld2_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 1
1113 %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2_lane.fca.0.extract, 0, 0
1114 %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2_lane.fca.1.extract, 0, 1
1115 ret %struct.int32x2x2_t %.fca.0.1.insert
; vld2 lane on <1 x i64>: unpacks the two single-element vectors of %b.coerce,
; passes them to vld2lane.v1i64 with trailing operands i32 0, i32 8, and repacks
; the two results into %struct.int64x1x2_t. The expected output is a
; lane-indexed ld2 over two .d registers.
1118 define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
1119 ; CHECK-LABEL: test_vld2_lane_s64
1120 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1122 %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
1123 %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
1124 %0 = bitcast i64* %a to i8*
1125 %vld2_lane = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
1126 %vld2_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 0
1127 %vld2_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 1
1128 %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2_lane.fca.0.extract, 0, 0
1129 %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2_lane.fca.1.extract, 0, 1
1130 ret %struct.int64x1x2_t %.fca.0.1.insert
; vld2 lane on <2 x float>: unpacks the two vectors of %b.coerce, passes them to
; vld2lane.v2f32 with trailing operands i32 1, i32 4, and repacks the two
; results into %struct.float32x2x2_t. The expected output is a lane-indexed ld2
; over two .s registers.
1133 define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
1134 ; CHECK-LABEL: test_vld2_lane_f32
1135 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1137 %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
1138 %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
1139 %0 = bitcast float* %a to i8*
1140 %vld2_lane = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
1141 %vld2_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 0
1142 %vld2_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 1
1143 %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2_lane.fca.0.extract, 0, 0
1144 %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2_lane.fca.1.extract, 0, 1
1145 ret %struct.float32x2x2_t %.fca.0.1.insert
; vld2 lane on <1 x double>: unpacks the two single-element vectors of
; %b.coerce, passes them to vld2lane.v1f64 with trailing operands i32 0, i32 8,
; and repacks the two results into %struct.float64x1x2_t. The expected output is
; a lane-indexed ld2 over two .d registers.
1148 define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
1149 ; CHECK-LABEL: test_vld2_lane_f64
1150 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1152 %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
1153 %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
1154 %0 = bitcast double* %a to i8*
1155 %vld2_lane = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
1156 %vld2_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 0
1157 %vld2_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 1
1158 %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2_lane.fca.0.extract, 0, 0
1159 %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2_lane.fca.1.extract, 0, 1
1160 ret %struct.float64x1x2_t %.fca.0.1.insert
; vld3q_lane_s16: loads a 3-element structure from %a into lane 7 of the three
; <8 x i16> inputs. Expected code: one single-lane ld3 on .h elements via x0.
define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
; CHECK-LABEL: test_vld3q_lane_s16
; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [3 x <8 x i16>] %b.coerce, 0
  %src1 = extractvalue [3 x <8 x i16>] %b.coerce, 1
  %src2 = extractvalue [3 x <8 x i16>] %b.coerce, 2
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast i16* %a to i8*
  ; Trailing i32 operands: lane 7, alignment 2.
  %vld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %addr, <8 x i16> %src0, <8 x i16> %src1, <8 x i16> %src2, i32 7, i32 2)
  %res0 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 0
  %res1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 1
  %res2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 2
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.int16x8x3_t undef, <8 x i16> %res0, 0, 0
  %ret1 = insertvalue %struct.int16x8x3_t %ret0, <8 x i16> %res1, 0, 1
  %ret2 = insertvalue %struct.int16x8x3_t %ret1, <8 x i16> %res2, 0, 2
  ret %struct.int16x8x3_t %ret2
; vld3q_lane_s32: loads a 3-element structure from %a into lane 3 of the three
; <4 x i32> inputs. Expected code: one single-lane ld3 on .s elements via x0.
define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
; CHECK-LABEL: test_vld3q_lane_s32
; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [3 x <4 x i32>] %b.coerce, 0
  %src1 = extractvalue [3 x <4 x i32>] %b.coerce, 1
  %src2 = extractvalue [3 x <4 x i32>] %b.coerce, 2
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast i32* %a to i8*
  ; Trailing i32 operands: lane 3, alignment 4.
  %vld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %addr, <4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2, i32 3, i32 4)
  %res0 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %res1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
  %res2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.int32x4x3_t undef, <4 x i32> %res0, 0, 0
  %ret1 = insertvalue %struct.int32x4x3_t %ret0, <4 x i32> %res1, 0, 1
  %ret2 = insertvalue %struct.int32x4x3_t %ret1, <4 x i32> %res2, 0, 2
  ret %struct.int32x4x3_t %ret2
; vld3q_lane_s64: loads a 3-element structure from %a into lane 1 of the three
; <2 x i64> inputs. Expected code: one single-lane ld3 on .d elements via x0.
define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
; CHECK-LABEL: test_vld3q_lane_s64
; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [3 x <2 x i64>] %b.coerce, 0
  %src1 = extractvalue [3 x <2 x i64>] %b.coerce, 1
  %src2 = extractvalue [3 x <2 x i64>] %b.coerce, 2
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast i64* %a to i8*
  ; Trailing i32 operands: lane 1, alignment 8.
  %vld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %addr, <2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2, i32 1, i32 8)
  %res0 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 0
  %res1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 1
  %res2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 2
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.int64x2x3_t undef, <2 x i64> %res0, 0, 0
  %ret1 = insertvalue %struct.int64x2x3_t %ret0, <2 x i64> %res1, 0, 1
  %ret2 = insertvalue %struct.int64x2x3_t %ret1, <2 x i64> %res2, 0, 2
  ret %struct.int64x2x3_t %ret2
; vld3q_lane_f32: loads a 3-element structure from %a into lane 3 of the three
; <4 x float> inputs. Expected code: one single-lane ld3 on .s elements via x0.
define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
; CHECK-LABEL: test_vld3q_lane_f32
; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [3 x <4 x float>] %b.coerce, 0
  %src1 = extractvalue [3 x <4 x float>] %b.coerce, 1
  %src2 = extractvalue [3 x <4 x float>] %b.coerce, 2
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast float* %a to i8*
  ; Trailing i32 operands: lane 3, alignment 4.
  %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %addr, <4 x float> %src0, <4 x float> %src1, <4 x float> %src2, i32 3, i32 4)
  %res0 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0
  %res1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 1
  %res2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 2
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.float32x4x3_t undef, <4 x float> %res0, 0, 0
  %ret1 = insertvalue %struct.float32x4x3_t %ret0, <4 x float> %res1, 0, 1
  %ret2 = insertvalue %struct.float32x4x3_t %ret1, <4 x float> %res2, 0, 2
  ret %struct.float32x4x3_t %ret2
; vld3q_lane_f64: loads a 3-element structure from %a into lane 1 of the three
; <2 x double> inputs. Expected code: one single-lane ld3 on .d elements via x0.
define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
; CHECK-LABEL: test_vld3q_lane_f64
; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [3 x <2 x double>] %b.coerce, 0
  %src1 = extractvalue [3 x <2 x double>] %b.coerce, 1
  %src2 = extractvalue [3 x <2 x double>] %b.coerce, 2
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast double* %a to i8*
  ; Trailing i32 operands: lane 1, alignment 8.
  %vld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %addr, <2 x double> %src0, <2 x double> %src1, <2 x double> %src2, i32 1, i32 8)
  %res0 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 0
  %res1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 1
  %res2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 2
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.float64x2x3_t undef, <2 x double> %res0, 0, 0
  %ret1 = insertvalue %struct.float64x2x3_t %ret0, <2 x double> %res1, 0, 1
  %ret2 = insertvalue %struct.float64x2x3_t %ret1, <2 x double> %res2, 0, 2
  ret %struct.float64x2x3_t %ret2
; vld3_lane_s8: loads a 3-element structure from %a into lane 7 of the three
; <8 x i8> inputs. Expected code: one single-lane ld3 on .b elements via x0.
define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
; CHECK-LABEL: test_vld3_lane_s8
; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [3 x <8 x i8>] %b.coerce, 0
  %src1 = extractvalue [3 x <8 x i8>] %b.coerce, 1
  %src2 = extractvalue [3 x <8 x i8>] %b.coerce, 2
  ; %a is already i8*, so it is passed straight through.
  ; Trailing i32 operands: lane 7, alignment 1.
  %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> %src0, <8 x i8> %src1, <8 x i8> %src2, i32 7, i32 1)
  %res0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0
  %res1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1
  %res2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.int8x8x3_t undef, <8 x i8> %res0, 0, 0
  %ret1 = insertvalue %struct.int8x8x3_t %ret0, <8 x i8> %res1, 0, 1
  %ret2 = insertvalue %struct.int8x8x3_t %ret1, <8 x i8> %res2, 0, 2
  ret %struct.int8x8x3_t %ret2
; vld3_lane_s16: loads a 3-element structure from %a into lane 3 of the three
; <4 x i16> inputs. Expected code: one single-lane ld3 on .h elements via x0.
define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
; CHECK-LABEL: test_vld3_lane_s16
; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [3 x <4 x i16>] %b.coerce, 0
  %src1 = extractvalue [3 x <4 x i16>] %b.coerce, 1
  %src2 = extractvalue [3 x <4 x i16>] %b.coerce, 2
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast i16* %a to i8*
  ; Trailing i32 operands: lane 3, alignment 2.
  %vld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %addr, <4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2, i32 3, i32 2)
  %res0 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 0
  %res1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 1
  %res2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 2
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.int16x4x3_t undef, <4 x i16> %res0, 0, 0
  %ret1 = insertvalue %struct.int16x4x3_t %ret0, <4 x i16> %res1, 0, 1
  %ret2 = insertvalue %struct.int16x4x3_t %ret1, <4 x i16> %res2, 0, 2
  ret %struct.int16x4x3_t %ret2
; vld3_lane_s32: loads a 3-element structure from %a into lane 1 of the three
; <2 x i32> inputs. Expected code: one single-lane ld3 on .s elements via x0.
define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
; CHECK-LABEL: test_vld3_lane_s32
; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [3 x <2 x i32>] %b.coerce, 0
  %src1 = extractvalue [3 x <2 x i32>] %b.coerce, 1
  %src2 = extractvalue [3 x <2 x i32>] %b.coerce, 2
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast i32* %a to i8*
  ; Trailing i32 operands: lane 1, alignment 4.
  %vld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %addr, <2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2, i32 1, i32 4)
  %res0 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 0
  %res1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 1
  %res2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 2
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.int32x2x3_t undef, <2 x i32> %res0, 0, 0
  %ret1 = insertvalue %struct.int32x2x3_t %ret0, <2 x i32> %res1, 0, 1
  %ret2 = insertvalue %struct.int32x2x3_t %ret1, <2 x i32> %res2, 0, 2
  ret %struct.int32x2x3_t %ret2
; vld3_lane_s64: loads a 3-element structure from %a into lane 0 of the three
; <1 x i64> inputs. Expected code: one single-lane ld3 on .d elements via x0.
define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
; CHECK-LABEL: test_vld3_lane_s64
; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [3 x <1 x i64>] %b.coerce, 0
  %src1 = extractvalue [3 x <1 x i64>] %b.coerce, 1
  %src2 = extractvalue [3 x <1 x i64>] %b.coerce, 2
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast i64* %a to i8*
  ; Trailing i32 operands: lane 0, alignment 8.
  %vld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8* %addr, <1 x i64> %src0, <1 x i64> %src1, <1 x i64> %src2, i32 0, i32 8)
  %res0 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 0
  %res1 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 1
  %res2 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 2
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.int64x1x3_t undef, <1 x i64> %res0, 0, 0
  %ret1 = insertvalue %struct.int64x1x3_t %ret0, <1 x i64> %res1, 0, 1
  %ret2 = insertvalue %struct.int64x1x3_t %ret1, <1 x i64> %res2, 0, 2
  ret %struct.int64x1x3_t %ret2
; vld3_lane_f32: loads a 3-element structure from %a into lane 1 of the three
; <2 x float> inputs. Expected code: one single-lane ld3 on .s elements via x0.
define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
; CHECK-LABEL: test_vld3_lane_f32
; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [3 x <2 x float>] %b.coerce, 0
  %src1 = extractvalue [3 x <2 x float>] %b.coerce, 1
  %src2 = extractvalue [3 x <2 x float>] %b.coerce, 2
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast float* %a to i8*
  ; Trailing i32 operands: lane 1, alignment 4.
  %vld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %addr, <2 x float> %src0, <2 x float> %src1, <2 x float> %src2, i32 1, i32 4)
  %res0 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 0
  %res1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 1
  %res2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 2
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.float32x2x3_t undef, <2 x float> %res0, 0, 0
  %ret1 = insertvalue %struct.float32x2x3_t %ret0, <2 x float> %res1, 0, 1
  %ret2 = insertvalue %struct.float32x2x3_t %ret1, <2 x float> %res2, 0, 2
  ret %struct.float32x2x3_t %ret2
; vld3_lane_f64: loads a 3-element structure from %a into lane 0 of the three
; <1 x double> inputs. Expected code: one single-lane ld3 on .d elements via x0.
define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
; CHECK-LABEL: test_vld3_lane_f64
; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [3 x <1 x double>] %b.coerce, 0
  %src1 = extractvalue [3 x <1 x double>] %b.coerce, 1
  %src2 = extractvalue [3 x <1 x double>] %b.coerce, 2
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast double* %a to i8*
  ; Trailing i32 operands: lane 0, alignment 8.
  %vld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8* %addr, <1 x double> %src0, <1 x double> %src1, <1 x double> %src2, i32 0, i32 8)
  %res0 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 0
  %res1 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 1
  %res2 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 2
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.float64x1x3_t undef, <1 x double> %res0, 0, 0
  %ret1 = insertvalue %struct.float64x1x3_t %ret0, <1 x double> %res1, 0, 1
  %ret2 = insertvalue %struct.float64x1x3_t %ret1, <1 x double> %res2, 0, 2
  ret %struct.float64x1x3_t %ret2
; vld4q_lane_s8: loads a 4-element structure from %a into lane 15 of the four
; <16 x i8> inputs. Expected code: one single-lane ld4 on .b elements via x0.
define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
; CHECK-LABEL: test_vld4q_lane_s8
; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [4 x <16 x i8>] %b.coerce, 0
  %src1 = extractvalue [4 x <16 x i8>] %b.coerce, 1
  %src2 = extractvalue [4 x <16 x i8>] %b.coerce, 2
  %src3 = extractvalue [4 x <16 x i8>] %b.coerce, 3
  ; %a is already i8*, so it is passed straight through.
  ; Trailing i32 operands: lane 15, alignment 1.
  %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> %src0, <16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3, i32 15, i32 1)
  %res0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0
  %res1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1
  %res2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2
  %res3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.int8x16x4_t undef, <16 x i8> %res0, 0, 0
  %ret1 = insertvalue %struct.int8x16x4_t %ret0, <16 x i8> %res1, 0, 1
  %ret2 = insertvalue %struct.int8x16x4_t %ret1, <16 x i8> %res2, 0, 2
  %ret3 = insertvalue %struct.int8x16x4_t %ret2, <16 x i8> %res3, 0, 3
  ret %struct.int8x16x4_t %ret3
; vld4q_lane_s16: loads a 4-element structure from %a into lane 7 of the four
; <8 x i16> inputs. Expected code: one single-lane ld4 on .h elements via x0.
define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
; CHECK-LABEL: test_vld4q_lane_s16
; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [4 x <8 x i16>] %b.coerce, 0
  %src1 = extractvalue [4 x <8 x i16>] %b.coerce, 1
  %src2 = extractvalue [4 x <8 x i16>] %b.coerce, 2
  %src3 = extractvalue [4 x <8 x i16>] %b.coerce, 3
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast i16* %a to i8*
  ; Trailing i32 operands: lane 7, alignment 2.
  %vld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %addr, <8 x i16> %src0, <8 x i16> %src1, <8 x i16> %src2, <8 x i16> %src3, i32 7, i32 2)
  %res0 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 0
  %res1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 1
  %res2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 2
  %res3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 3
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.int16x8x4_t undef, <8 x i16> %res0, 0, 0
  %ret1 = insertvalue %struct.int16x8x4_t %ret0, <8 x i16> %res1, 0, 1
  %ret2 = insertvalue %struct.int16x8x4_t %ret1, <8 x i16> %res2, 0, 2
  %ret3 = insertvalue %struct.int16x8x4_t %ret2, <8 x i16> %res3, 0, 3
  ret %struct.int16x8x4_t %ret3
; vld4q_lane_s32: loads a 4-element structure from %a into lane 3 of the four
; <4 x i32> inputs. Expected code: one single-lane ld4 on .s elements via x0.
define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
; CHECK-LABEL: test_vld4q_lane_s32
; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [4 x <4 x i32>] %b.coerce, 0
  %src1 = extractvalue [4 x <4 x i32>] %b.coerce, 1
  %src2 = extractvalue [4 x <4 x i32>] %b.coerce, 2
  %src3 = extractvalue [4 x <4 x i32>] %b.coerce, 3
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast i32* %a to i8*
  ; Trailing i32 operands: lane 3, alignment 4.
  %vld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %addr, <4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2, <4 x i32> %src3, i32 3, i32 4)
  %res0 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 0
  %res1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 1
  %res2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 2
  %res3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 3
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.int32x4x4_t undef, <4 x i32> %res0, 0, 0
  %ret1 = insertvalue %struct.int32x4x4_t %ret0, <4 x i32> %res1, 0, 1
  %ret2 = insertvalue %struct.int32x4x4_t %ret1, <4 x i32> %res2, 0, 2
  %ret3 = insertvalue %struct.int32x4x4_t %ret2, <4 x i32> %res3, 0, 3
  ret %struct.int32x4x4_t %ret3
; vld4q_lane_s64: loads a 4-element structure from %a into lane 1 of the four
; <2 x i64> inputs. Expected code: one single-lane ld4 on .d elements via x0.
define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
; CHECK-LABEL: test_vld4q_lane_s64
; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [4 x <2 x i64>] %b.coerce, 0
  %src1 = extractvalue [4 x <2 x i64>] %b.coerce, 1
  %src2 = extractvalue [4 x <2 x i64>] %b.coerce, 2
  %src3 = extractvalue [4 x <2 x i64>] %b.coerce, 3
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast i64* %a to i8*
  ; Trailing i32 operands: lane 1, alignment 8.
  %vld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %addr, <2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2, <2 x i64> %src3, i32 1, i32 8)
  %res0 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 0
  %res1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 1
  %res2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 2
  %res3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 3
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.int64x2x4_t undef, <2 x i64> %res0, 0, 0
  %ret1 = insertvalue %struct.int64x2x4_t %ret0, <2 x i64> %res1, 0, 1
  %ret2 = insertvalue %struct.int64x2x4_t %ret1, <2 x i64> %res2, 0, 2
  %ret3 = insertvalue %struct.int64x2x4_t %ret2, <2 x i64> %res3, 0, 3
  ret %struct.int64x2x4_t %ret3
; vld4q_lane_f32: loads a 4-element structure from %a into lane 3 of the four
; <4 x float> inputs. Expected code: one single-lane ld4 on .s elements via x0.
define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
; CHECK-LABEL: test_vld4q_lane_f32
; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [4 x <4 x float>] %b.coerce, 0
  %src1 = extractvalue [4 x <4 x float>] %b.coerce, 1
  %src2 = extractvalue [4 x <4 x float>] %b.coerce, 2
  %src3 = extractvalue [4 x <4 x float>] %b.coerce, 3
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast float* %a to i8*
  ; Trailing i32 operands: lane 3, alignment 4.
  %vld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %addr, <4 x float> %src0, <4 x float> %src1, <4 x float> %src2, <4 x float> %src3, i32 3, i32 4)
  %res0 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 0
  %res1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 1
  %res2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 2
  %res3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 3
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.float32x4x4_t undef, <4 x float> %res0, 0, 0
  %ret1 = insertvalue %struct.float32x4x4_t %ret0, <4 x float> %res1, 0, 1
  %ret2 = insertvalue %struct.float32x4x4_t %ret1, <4 x float> %res2, 0, 2
  %ret3 = insertvalue %struct.float32x4x4_t %ret2, <4 x float> %res3, 0, 3
  ret %struct.float32x4x4_t %ret3
; vld4q_lane_f64: loads a 4-element structure from %a into lane 1 of the four
; <2 x double> inputs. Expected code: one single-lane ld4 on .d elements via x0.
define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
; CHECK-LABEL: test_vld4q_lane_f64
; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [4 x <2 x double>] %b.coerce, 0
  %src1 = extractvalue [4 x <2 x double>] %b.coerce, 1
  %src2 = extractvalue [4 x <2 x double>] %b.coerce, 2
  %src3 = extractvalue [4 x <2 x double>] %b.coerce, 3
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast double* %a to i8*
  ; Trailing i32 operands: lane 1, alignment 8.
  %vld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %addr, <2 x double> %src0, <2 x double> %src1, <2 x double> %src2, <2 x double> %src3, i32 1, i32 8)
  %res0 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 0
  %res1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 1
  %res2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 2
  %res3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 3
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.float64x2x4_t undef, <2 x double> %res0, 0, 0
  %ret1 = insertvalue %struct.float64x2x4_t %ret0, <2 x double> %res1, 0, 1
  %ret2 = insertvalue %struct.float64x2x4_t %ret1, <2 x double> %res2, 0, 2
  %ret3 = insertvalue %struct.float64x2x4_t %ret2, <2 x double> %res3, 0, 3
  ret %struct.float64x2x4_t %ret3
; vld4_lane_s8: loads a 4-element structure from %a into lane 7 of the four
; <8 x i8> inputs. Expected code: one single-lane ld4 on .b elements via x0.
define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
; CHECK-LABEL: test_vld4_lane_s8
; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [4 x <8 x i8>] %b.coerce, 0
  %src1 = extractvalue [4 x <8 x i8>] %b.coerce, 1
  %src2 = extractvalue [4 x <8 x i8>] %b.coerce, 2
  %src3 = extractvalue [4 x <8 x i8>] %b.coerce, 3
  ; %a is already i8*, so it is passed straight through.
  ; Trailing i32 operands: lane 7, alignment 1.
  %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> %src0, <8 x i8> %src1, <8 x i8> %src2, <8 x i8> %src3, i32 7, i32 1)
  %res0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0
  %res1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1
  %res2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2
  %res3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.int8x8x4_t undef, <8 x i8> %res0, 0, 0
  %ret1 = insertvalue %struct.int8x8x4_t %ret0, <8 x i8> %res1, 0, 1
  %ret2 = insertvalue %struct.int8x8x4_t %ret1, <8 x i8> %res2, 0, 2
  %ret3 = insertvalue %struct.int8x8x4_t %ret2, <8 x i8> %res3, 0, 3
  ret %struct.int8x8x4_t %ret3
; vld4_lane_s16: loads a 4-element structure from %a into lane 3 of the four
; <4 x i16> inputs. Expected code: one single-lane ld4 on .h elements via x0.
define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
; CHECK-LABEL: test_vld4_lane_s16
; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [4 x <4 x i16>] %b.coerce, 0
  %src1 = extractvalue [4 x <4 x i16>] %b.coerce, 1
  %src2 = extractvalue [4 x <4 x i16>] %b.coerce, 2
  %src3 = extractvalue [4 x <4 x i16>] %b.coerce, 3
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast i16* %a to i8*
  ; Trailing i32 operands: lane 3, alignment 2.
  %vld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %addr, <4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2, <4 x i16> %src3, i32 3, i32 2)
  %res0 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 0
  %res1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 1
  %res2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 2
  %res3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 3
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.int16x4x4_t undef, <4 x i16> %res0, 0, 0
  %ret1 = insertvalue %struct.int16x4x4_t %ret0, <4 x i16> %res1, 0, 1
  %ret2 = insertvalue %struct.int16x4x4_t %ret1, <4 x i16> %res2, 0, 2
  %ret3 = insertvalue %struct.int16x4x4_t %ret2, <4 x i16> %res3, 0, 3
  ret %struct.int16x4x4_t %ret3
; vld4_lane_s32: loads a 4-element structure from %a into lane 1 of the four
; <2 x i32> inputs. Expected code: one single-lane ld4 on .s elements via x0.
define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
; CHECK-LABEL: test_vld4_lane_s32
; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [4 x <2 x i32>] %b.coerce, 0
  %src1 = extractvalue [4 x <2 x i32>] %b.coerce, 1
  %src2 = extractvalue [4 x <2 x i32>] %b.coerce, 2
  %src3 = extractvalue [4 x <2 x i32>] %b.coerce, 3
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast i32* %a to i8*
  ; Trailing i32 operands: lane 1, alignment 4.
  %vld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %addr, <2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2, <2 x i32> %src3, i32 1, i32 4)
  %res0 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 0
  %res1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 1
  %res2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 2
  %res3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 3
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.int32x2x4_t undef, <2 x i32> %res0, 0, 0
  %ret1 = insertvalue %struct.int32x2x4_t %ret0, <2 x i32> %res1, 0, 1
  %ret2 = insertvalue %struct.int32x2x4_t %ret1, <2 x i32> %res2, 0, 2
  %ret3 = insertvalue %struct.int32x2x4_t %ret2, <2 x i32> %res3, 0, 3
  ret %struct.int32x2x4_t %ret3
; vld4_lane_s64: loads a 4-element structure from %a into lane 0 of the four
; <1 x i64> inputs. Expected code: one single-lane ld4 on .d elements via x0.
define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
; CHECK-LABEL: test_vld4_lane_s64
; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
  ; Split the coerced array argument into its vectors.
  %src0 = extractvalue [4 x <1 x i64>] %b.coerce, 0
  %src1 = extractvalue [4 x <1 x i64>] %b.coerce, 1
  %src2 = extractvalue [4 x <1 x i64>] %b.coerce, 2
  %src3 = extractvalue [4 x <1 x i64>] %b.coerce, 3
  ; The intrinsic addresses memory through i8*.
  %addr = bitcast i64* %a to i8*
  ; Trailing i32 operands: lane 0, alignment 8.
  %vld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8* %addr, <1 x i64> %src0, <1 x i64> %src1, <1 x i64> %src2, <1 x i64> %src3, i32 0, i32 8)
  %res0 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 0
  %res1 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 1
  %res2 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 2
  %res3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 3
  ; Rebuild the aggregate return value.
  %ret0 = insertvalue %struct.int64x1x4_t undef, <1 x i64> %res0, 0, 0
  %ret1 = insertvalue %struct.int64x1x4_t %ret0, <1 x i64> %res1, 0, 1
  %ret2 = insertvalue %struct.int64x1x4_t %ret1, <1 x i64> %res2, 0, 2
  %ret3 = insertvalue %struct.int64x1x4_t %ret2, <1 x i64> %res3, 0, 3
  ret %struct.int64x1x4_t %ret3
; Single-lane 4-register load, <2 x float> elements: unpacks the four vectors
; of %b.coerce, passes them with %a (cast to i8*) to
; @llvm.arm.neon.vld4lane.v2f32, and repacks the four results into
; %struct.float32x2x4_t. The pattern below expects an ld4 of four .s lanes
; addressed through [x0].
; NOTE(review): the trailing i32 operands (1, 4) are presumably lane index and
; alignment - confirm. The call result is named %vld3_lane despite being a
; vld4lane call.
1568 define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
1569 ; CHECK-LABEL: test_vld4_lane_f32
1570 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1572 %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
1573 %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
1574 %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
1575 %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
1576 %0 = bitcast float* %a to i8*
1577 %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
1578 %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
1579 %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
1580 %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
1581 %vld3_lane.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 3
1582 %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
1583 %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
1584 %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
1585 %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld3_lane.fca.3.extract, 0, 3
1586 ret %struct.float32x2x4_t %.fca.0.3.insert
; Single-lane 4-register load, <1 x double> elements: unpacks the four vectors
; of %b.coerce, passes them with %a (cast to i8*) to
; @llvm.arm.neon.vld4lane.v1f64, and repacks the four results into
; %struct.float64x1x4_t. The pattern below expects an ld4 of four .d lanes
; addressed through [x0].
; NOTE(review): the trailing i32 operands (0, 8) are presumably lane index and
; alignment - confirm. The call result is named %vld3_lane despite being a
; vld4lane call.
1589 define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
1590 ; CHECK-LABEL: test_vld4_lane_f64
1591 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1593 %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
1594 %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
1595 %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
1596 %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
1597 %0 = bitcast double* %a to i8*
1598 %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
1599 %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
1600 %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
1601 %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
1602 %vld3_lane.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 3
1603 %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
1604 %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
1605 %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
1606 %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld3_lane.fca.3.extract, 0, 3
1607 ret %struct.float64x1x4_t %.fca.0.3.insert
; Extracts element 15 of the <16 x i8> argument and stores it through %a
; (align 1). The pattern below expects a single-lane st1 of a .b lane to [x0].
1610 define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) {
1611 ; CHECK-LABEL: test_vst1q_lane_s8
1612 ; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1614 %0 = extractelement <16 x i8> %b, i32 15
1615 store i8 %0, i8* %a, align 1
; Extracts element 7 of the <8 x i16> argument and stores it through %a
; (align 2). The pattern below expects a single-lane st1 of a .h lane to [x0].
1619 define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) {
1620 ; CHECK-LABEL: test_vst1q_lane_s16
1621 ; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1623 %0 = extractelement <8 x i16> %b, i32 7
1624 store i16 %0, i16* %a, align 2
; Extracts element 3 of the <4 x i32> argument and stores it through %a
; (align 4). The pattern below expects a single-lane st1 of a .s lane to [x0].
1628 define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
1629 ; CHECK-LABEL: test_vst1q_lane_s32
1630 ; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1632 %0 = extractelement <4 x i32> %b, i32 3
1633 store i32 %0, i32* %a, align 4
; Extracts element 1 of the <2 x i64> argument and stores it through %a
; (align 8). The pattern below expects a single-lane st1 of a .d lane to [x0].
1637 define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
1638 ; CHECK-LABEL: test_vst1q_lane_s64
1639 ; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1641 %0 = extractelement <2 x i64> %b, i32 1
1642 store i64 %0, i64* %a, align 8
; Extracts element 3 of the <4 x float> argument and stores it through %a
; (align 4). The pattern below expects a single-lane st1 of a .s lane to [x0].
1646 define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
1647 ; CHECK-LABEL: test_vst1q_lane_f32
1648 ; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1650 %0 = extractelement <4 x float> %b, i32 3
1651 store float %0, float* %a, align 4
; Extracts element 1 of the <2 x double> argument and stores it through %a
; (align 8). The pattern below expects a single-lane st1 of a .d lane to [x0].
1655 define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
1656 ; CHECK-LABEL: test_vst1q_lane_f64
1657 ; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1659 %0 = extractelement <2 x double> %b, i32 1
1660 store double %0, double* %a, align 8
; Extracts element 7 of the <8 x i8> argument and stores it through %a
; (align 1). The pattern below expects a single-lane st1 of a .b lane to [x0].
1664 define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
1665 ; CHECK-LABEL: test_vst1_lane_s8
1666 ; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1668 %0 = extractelement <8 x i8> %b, i32 7
1669 store i8 %0, i8* %a, align 1
; Extracts element 3 of the <4 x i16> argument and stores it through %a
; (align 2). The pattern below expects a single-lane st1 of a .h lane to [x0].
1673 define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) {
1674 ; CHECK-LABEL: test_vst1_lane_s16
1675 ; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1677 %0 = extractelement <4 x i16> %b, i32 3
1678 store i16 %0, i16* %a, align 2
; Extracts element 1 of the <2 x i32> argument and stores it through %a
; (align 4). The pattern below expects a single-lane st1 of a .s lane to [x0].
1682 define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
1683 ; CHECK-LABEL: test_vst1_lane_s32
1684 ; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1686 %0 = extractelement <2 x i32> %b, i32 1
1687 store i32 %0, i32* %a, align 4
; Extracts the only element (index 0) of the <1 x i64> argument and stores it
; through %a (align 8). The pattern below expects a single-lane st1 of a .d
; lane to [x0].
1691 define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
1692 ; CHECK-LABEL: test_vst1_lane_s64
1693 ; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1695 %0 = extractelement <1 x i64> %b, i32 0
1696 store i64 %0, i64* %a, align 8
; Extracts element 1 of the <2 x float> argument and stores it through %a
; (align 4). The pattern below expects a single-lane st1 of a .s lane to [x0].
1700 define void @test_vst1_lane_f32(float* %a, <2 x float> %b) {
1701 ; CHECK-LABEL: test_vst1_lane_f32
1702 ; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1704 %0 = extractelement <2 x float> %b, i32 1
1705 store float %0, float* %a, align 4
; Extracts the only element (index 0) of the <1 x double> argument and stores
; it through %a (align 8). The pattern below expects a single-lane st1 of a .d
; lane to [x0].
1709 define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
1710 ; CHECK-LABEL: test_vst1_lane_f64
1711 ; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1713 %0 = extractelement <1 x double> %b, i32 0
1714 store double %0, double* %a, align 8
; Unpacks the two <16 x i8> vectors of %b.coerce and passes them with %a to
; @llvm.arm.neon.vst2lane.v16i8 (trailing i32 operands 15 and 1, presumably
; lane index and alignment). The pattern below expects a single-lane st2 of
; two .b lanes to [x0].
1718 define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
1719 ; CHECK-LABEL: test_vst2q_lane_s8
1720 ; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1722 %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0
1723 %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
1724 tail call void @llvm.arm.neon.vst2lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 15, i32 1)
; Unpacks the two <8 x i16> vectors of %b.coerce and passes them, with %a cast
; to i8*, to @llvm.arm.neon.vst2lane.v8i16 (trailing i32 operands 7 and 2,
; presumably lane index and alignment). The pattern below expects a
; single-lane st2 of two .h lanes to [x0].
1728 define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
1729 ; CHECK-LABEL: test_vst2q_lane_s16
1730 ; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1732 %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
1733 %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
1734 %0 = bitcast i16* %a to i8*
1735 tail call void @llvm.arm.neon.vst2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
; Unpacks the two <4 x i32> vectors of %b.coerce and passes them, with %a cast
; to i8*, to @llvm.arm.neon.vst2lane.v4i32 (trailing i32 operands 3 and 4,
; presumably lane index and alignment). The pattern below expects a
; single-lane st2 of two .s lanes to [x0].
1739 define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
1740 ; CHECK-LABEL: test_vst2q_lane_s32
1741 ; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1743 %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
1744 %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
1745 %0 = bitcast i32* %a to i8*
1746 tail call void @llvm.arm.neon.vst2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
; Unpacks the two <2 x i64> vectors of %b.coerce and passes them, with %a cast
; to i8*, to @llvm.arm.neon.vst2lane.v2i64 (trailing i32 operands 1 and 8,
; presumably lane index and alignment). The pattern below expects a
; single-lane st2 of two .d lanes to [x0].
1750 define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
1751 ; CHECK-LABEL: test_vst2q_lane_s64
1752 ; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1754 %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
1755 %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
1756 %0 = bitcast i64* %a to i8*
1757 tail call void @llvm.arm.neon.vst2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
; Unpacks the two <4 x float> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst2lane.v4f32 (trailing i32 operands 3 and 4,
; presumably lane index and alignment). The pattern below expects a
; single-lane st2 of two .s lanes to [x0].
1761 define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
1762 ; CHECK-LABEL: test_vst2q_lane_f32
1763 ; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1765 %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
1766 %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
1767 %0 = bitcast float* %a to i8*
1768 tail call void @llvm.arm.neon.vst2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
; Unpacks the two <2 x double> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst2lane.v2f64 (trailing i32 operands 1 and 8,
; presumably lane index and alignment). The pattern below expects a
; single-lane st2 of two .d lanes to [x0].
1772 define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
1773 ; CHECK-LABEL: test_vst2q_lane_f64
1774 ; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1776 %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
1777 %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
1778 %0 = bitcast double* %a to i8*
1779 tail call void @llvm.arm.neon.vst2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
; Unpacks the two <8 x i8> vectors of %b.coerce and passes them with %a to
; @llvm.arm.neon.vst2lane.v8i8 (trailing i32 operands 7 and 1, presumably lane
; index and alignment). The pattern below expects a single-lane st2 of two .b
; lanes to [x0].
1783 define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
1784 ; CHECK-LABEL: test_vst2_lane_s8
1785 ; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1787 %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
1788 %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
1789 tail call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
; Unpacks the two <4 x i16> vectors of %b.coerce and passes them, with %a cast
; to i8*, to @llvm.arm.neon.vst2lane.v4i16 (trailing i32 operands 3 and 2,
; presumably lane index and alignment). The pattern below expects a
; single-lane st2 of two .h lanes to [x0].
1793 define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
1794 ; CHECK-LABEL: test_vst2_lane_s16
1795 ; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1797 %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
1798 %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
1799 %0 = bitcast i16* %a to i8*
1800 tail call void @llvm.arm.neon.vst2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
; Unpacks the two <2 x i32> vectors of %b.coerce and passes them, with %a cast
; to i8*, to @llvm.arm.neon.vst2lane.v2i32 (trailing i32 operands 1 and 4,
; presumably lane index and alignment). The pattern below expects a
; single-lane st2 of two .s lanes to [x0].
1804 define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
1805 ; CHECK-LABEL: test_vst2_lane_s32
1806 ; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1808 %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
1809 %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
1810 %0 = bitcast i32* %a to i8*
1811 tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
; Unpacks the two <1 x i64> vectors of %b.coerce and passes them, with %a cast
; to i8*, to @llvm.arm.neon.vst2lane.v1i64 (trailing i32 operands 0 and 8,
; presumably lane index and alignment). The pattern below expects a
; single-lane st2 of two .d lanes to [x0].
1815 define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
1816 ; CHECK-LABEL: test_vst2_lane_s64
1817 ; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1819 %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
1820 %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
1821 %0 = bitcast i64* %a to i8*
1822 tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
; Unpacks the two <2 x float> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst2lane.v2f32 (trailing i32 operands 1 and 4,
; presumably lane index and alignment). The pattern below expects a
; single-lane st2 of two .s lanes to [x0].
1826 define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
1827 ; CHECK-LABEL: test_vst2_lane_f32
1828 ; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1830 %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
1831 %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
1832 %0 = bitcast float* %a to i8*
1833 tail call void @llvm.arm.neon.vst2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
; Unpacks the two <1 x double> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst2lane.v1f64 (trailing i32 operands 0 and 8,
; presumably lane index and alignment). The pattern below expects a
; single-lane st2 of two .d lanes to [x0].
1837 define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
1838 ; CHECK-LABEL: test_vst2_lane_f64
1839 ; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1841 %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
1842 %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
1843 %0 = bitcast double* %a to i8*
1844 tail call void @llvm.arm.neon.vst2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
; Unpacks the three <16 x i8> vectors of %b.coerce and passes them with %a to
; @llvm.arm.neon.vst3lane.v16i8 (trailing i32 operands 15 and 1, presumably
; lane index and alignment). The pattern below expects a single-lane st3 of
; three .b lanes to [x0].
1848 define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
1849 ; CHECK-LABEL: test_vst3q_lane_s8
1850 ; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1852 %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0
1853 %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
1854 %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
1855 tail call void @llvm.arm.neon.vst3lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 15, i32 1)
; Unpacks the three <8 x i16> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst3lane.v8i16 (trailing i32 operands 7 and 2,
; presumably lane index and alignment). The pattern below expects a
; single-lane st3 of three .h lanes to [x0].
1859 define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
1860 ; CHECK-LABEL: test_vst3q_lane_s16
1861 ; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1863 %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
1864 %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
1865 %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
1866 %0 = bitcast i16* %a to i8*
1867 tail call void @llvm.arm.neon.vst3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
; Unpacks the three <4 x i32> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst3lane.v4i32 (trailing i32 operands 3 and 4,
; presumably lane index and alignment). The pattern below expects a
; single-lane st3 of three .s lanes to [x0].
1871 define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
1872 ; CHECK-LABEL: test_vst3q_lane_s32
1873 ; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1875 %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
1876 %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
1877 %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
1878 %0 = bitcast i32* %a to i8*
1879 tail call void @llvm.arm.neon.vst3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
; Unpacks the three <2 x i64> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst3lane.v2i64 (trailing i32 operands 1 and 8,
; presumably lane index and alignment). The pattern below expects a
; single-lane st3 of three .d lanes to [x0].
1883 define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
1884 ; CHECK-LABEL: test_vst3q_lane_s64
1885 ; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1887 %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
1888 %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
1889 %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
1890 %0 = bitcast i64* %a to i8*
1891 tail call void @llvm.arm.neon.vst3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
; Unpacks the three <4 x float> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst3lane.v4f32 (trailing i32 operands 3 and 4,
; presumably lane index and alignment). The pattern below expects a
; single-lane st3 of three .s lanes to [x0].
1895 define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
1896 ; CHECK-LABEL: test_vst3q_lane_f32
1897 ; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1899 %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
1900 %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
1901 %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
1902 %0 = bitcast float* %a to i8*
1903 tail call void @llvm.arm.neon.vst3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
; Unpacks the three <2 x double> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst3lane.v2f64 (trailing i32 operands 1 and 8,
; presumably lane index and alignment). The pattern below expects a
; single-lane st3 of three .d lanes to [x0].
1907 define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
1908 ; CHECK-LABEL: test_vst3q_lane_f64
1909 ; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1911 %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
1912 %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
1913 %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
1914 %0 = bitcast double* %a to i8*
1915 tail call void @llvm.arm.neon.vst3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
; Unpacks the three <8 x i8> vectors of %b.coerce and passes them with %a to
; @llvm.arm.neon.vst3lane.v8i8 (trailing i32 operands 7 and 1, presumably lane
; index and alignment). The pattern below expects a single-lane st3 of three
; .b lanes to [x0].
1919 define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
1920 ; CHECK-LABEL: test_vst3_lane_s8
1921 ; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1923 %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
1924 %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
1925 %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
1926 tail call void @llvm.arm.neon.vst3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
; Unpacks the three <4 x i16> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst3lane.v4i16 (trailing i32 operands 3 and 2,
; presumably lane index and alignment). The pattern below expects a
; single-lane st3 of three .h lanes to [x0].
1930 define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
1931 ; CHECK-LABEL: test_vst3_lane_s16
1932 ; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1934 %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
1935 %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
1936 %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
1937 %0 = bitcast i16* %a to i8*
1938 tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
; Unpacks the three <2 x i32> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst3lane.v2i32 (trailing i32 operands 1 and 4,
; presumably lane index and alignment). The pattern below expects a
; single-lane st3 of three .s lanes to [x0].
1942 define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
1943 ; CHECK-LABEL: test_vst3_lane_s32
1944 ; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1946 %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
1947 %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
1948 %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
1949 %0 = bitcast i32* %a to i8*
1950 tail call void @llvm.arm.neon.vst3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
; Unpacks the three <1 x i64> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst3lane.v1i64 (trailing i32 operands 0 and 8,
; presumably lane index and alignment). The pattern below expects a
; single-lane st3 of three .d lanes to [x0].
1954 define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
1955 ; CHECK-LABEL: test_vst3_lane_s64
1956 ; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1958 %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
1959 %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
1960 %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
1961 %0 = bitcast i64* %a to i8*
1962 tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
; Unpacks the three <2 x float> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst3lane.v2f32 (trailing i32 operands 1 and 4,
; presumably lane index and alignment). The pattern below expects a
; single-lane st3 of three .s lanes to [x0].
1966 define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
1967 ; CHECK-LABEL: test_vst3_lane_f32
1968 ; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1970 %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
1971 %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
1972 %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
1973 %0 = bitcast float* %a to i8*
1974 tail call void @llvm.arm.neon.vst3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
; Unpacks the three <1 x double> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst3lane.v1f64 (trailing i32 operands 0 and 8,
; presumably lane index and alignment). The pattern below expects a
; single-lane st3 of three .d lanes to [x0].
1978 define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
1979 ; CHECK-LABEL: test_vst3_lane_f64
1980 ; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1982 %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
1983 %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
1984 %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
1985 %0 = bitcast double* %a to i8*
1986 tail call void @llvm.arm.neon.vst3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
; Unpacks the four <16 x i8> vectors of %b.coerce and passes them with %a to
; @llvm.arm.neon.vst4lane.v16i8 (trailing i32 operands 15 and 1, presumably
; lane index and alignment). The pattern below expects a single-lane st4 of
; four .b lanes to [x0].
; The pointer is taken directly as i8* with alignment operand 1, matching the
; byte element type and the other s8 lane-store tests in this file; it was
; previously an i16* bitcast to i8* with alignment operand 2.
1990 define void @test_vst4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
1991 ; CHECK-LABEL: test_vst4q_lane_s8
1992 ; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1994 %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
1995 %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
1996 %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
1997 %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
1999 tail call void @llvm.arm.neon.vst4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1)
; Unpacks the four <8 x i16> vectors of %b.coerce and passes them, with %a cast
; to i8*, to @llvm.arm.neon.vst4lane.v8i16 (trailing i32 operands 7 and 2,
; presumably lane index and alignment). The pattern below expects a
; single-lane st4 of four .h lanes to [x0].
2003 define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
2004 ; CHECK-LABEL: test_vst4q_lane_s16
2005 ; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
2007 %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
2008 %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
2009 %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
2010 %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
2011 %0 = bitcast i16* %a to i8*
2012 tail call void @llvm.arm.neon.vst4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
; Unpacks the four <4 x i32> vectors of %b.coerce and passes them, with %a cast
; to i8*, to @llvm.arm.neon.vst4lane.v4i32 (trailing i32 operands 3 and 4,
; presumably lane index and alignment). The pattern below expects a
; single-lane st4 of four .s lanes to [x0].
2016 define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
2017 ; CHECK-LABEL: test_vst4q_lane_s32
2018 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
2020 %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
2021 %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
2022 %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
2023 %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
2024 %0 = bitcast i32* %a to i8*
2025 tail call void @llvm.arm.neon.vst4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
; Unpacks the four <2 x i64> vectors of %b.coerce and passes them, with %a cast
; to i8*, to @llvm.arm.neon.vst4lane.v2i64 (trailing i32 operands 1 and 8,
; presumably lane index and alignment). The pattern below expects a
; single-lane st4 of four .d lanes to [x0].
2029 define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
2030 ; CHECK-LABEL: test_vst4q_lane_s64
2031 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2033 %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
2034 %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
2035 %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
2036 %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
2037 %0 = bitcast i64* %a to i8*
2038 tail call void @llvm.arm.neon.vst4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
; Unpacks the four <4 x float> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst4lane.v4f32 (trailing i32 operands 3 and 4,
; presumably lane index and alignment). The pattern below expects a
; single-lane st4 of four .s lanes to [x0].
2042 define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
2043 ; CHECK-LABEL: test_vst4q_lane_f32
2044 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
2046 %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
2047 %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
2048 %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
2049 %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
2050 %0 = bitcast float* %a to i8*
2051 tail call void @llvm.arm.neon.vst4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
; Unpacks the four <2 x double> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst4lane.v2f64 (trailing i32 operands 1 and 8,
; presumably lane index and alignment). The pattern below expects a
; single-lane st4 of four .d lanes to [x0].
2055 define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
2056 ; CHECK-LABEL: test_vst4q_lane_f64
2057 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2059 %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
2060 %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
2061 %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
2062 %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
2063 %0 = bitcast double* %a to i8*
2064 tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
; Unpacks the four <8 x i8> vectors of %b.coerce and passes them with %a to
; @llvm.arm.neon.vst4lane.v8i8 (trailing i32 operands 7 and 1, presumably lane
; index and alignment). The pattern below expects a single-lane st4 of four
; .b lanes to [x0].
2068 define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
2069 ; CHECK-LABEL: test_vst4_lane_s8
2070 ; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
2072 %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
2073 %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
2074 %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
2075 %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
2076 tail call void @llvm.arm.neon.vst4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
; Unpacks the four <4 x i16> vectors of %b.coerce and passes them, with %a cast
; to i8*, to @llvm.arm.neon.vst4lane.v4i16 (trailing i32 operands 3 and 2,
; presumably lane index and alignment). The pattern below expects a
; single-lane st4 of four .h lanes to [x0].
2080 define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
2081 ; CHECK-LABEL: test_vst4_lane_s16
2082 ; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
2084 %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
2085 %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
2086 %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
2087 %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
2088 %0 = bitcast i16* %a to i8*
2089 tail call void @llvm.arm.neon.vst4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
; Unpacks the four <2 x i32> vectors of %b.coerce and passes them, with %a cast
; to i8*, to @llvm.arm.neon.vst4lane.v2i32 (trailing i32 operands 1 and 4,
; presumably lane index and alignment). The pattern below expects a
; single-lane st4 of four .s lanes to [x0].
2093 define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
2094 ; CHECK-LABEL: test_vst4_lane_s32
2095 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
2097 %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
2098 %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
2099 %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
2100 %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
2101 %0 = bitcast i32* %a to i8*
2102 tail call void @llvm.arm.neon.vst4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
; Unpacks the four <1 x i64> vectors of %b.coerce and passes them, with %a cast
; to i8*, to @llvm.arm.neon.vst4lane.v1i64 (trailing i32 operands 0 and 8,
; presumably lane index and alignment). The pattern below expects a
; single-lane st4 of four .d lanes to [x0].
2106 define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
2107 ; CHECK-LABEL: test_vst4_lane_s64
2108 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2110 %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
2111 %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
2112 %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
2113 %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
2114 %0 = bitcast i64* %a to i8*
2115 tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
; Unpacks the four <2 x float> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst4lane.v2f32 (trailing i32 operands 1 and 4,
; presumably lane index and alignment). The pattern below expects a
; single-lane st4 of four .s lanes to [x0].
2119 define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
2120 ; CHECK-LABEL: test_vst4_lane_f32
2121 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
2123 %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
2124 %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
2125 %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
2126 %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
2127 %0 = bitcast float* %a to i8*
2128 tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
; Unpacks the four <1 x double> vectors of %b.coerce and passes them, with %a
; cast to i8*, to @llvm.arm.neon.vst4lane.v1f64 (trailing i32 operands 0 and 8,
; presumably lane index and alignment). The pattern below expects a
; single-lane st4 of four .d lanes to [x0].
2132 define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
2133 ; CHECK-LABEL: test_vst4_lane_f64
2134 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2136 %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
2137 %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
2138 %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
2139 %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
2140 %0 = bitcast double* %a to i8*
2141 tail call void @llvm.arm.neon.vst4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
2145 declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
2146 declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
2147 declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
2148 declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
2149 declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
2150 declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
2151 declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
2152 declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
2153 declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
2154 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
2155 declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
2156 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
; Three-register lane-load intrinsics (llvm.arm.neon.vld3lane.*), one per
; vector type: v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8i8, v4i16, v2i32,
; v2f32. Each takes an i8* address, the three incoming vectors and two i32
; operands, and returns a three-vector aggregate of the same vector type.
; NOTE(review): the two i32 operands are presumably lane index then
; alignment -- confirm against the intrinsic definition.
; The v1i64 and v1f64 entries in this group declare the non-lane vld3 form
; (i8*, i32) rather than a lane load.
2157 declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2158 declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2159 declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2160 declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2161 declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2162 declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2163 declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2164 declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2165 declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2166 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
2167 declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2168 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
; Four-register lane-load intrinsics (llvm.arm.neon.vld4lane.*), one per
; vector type: v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8i8, v4i16, v2i32,
; v2f32. Each takes an i8* address, the four incoming vectors and two i32
; operands, and returns a four-vector aggregate of the same vector type.
; NOTE(review): the two i32 operands are presumably lane index then
; alignment -- confirm against the intrinsic definition.
; The v1i64 and v1f64 entries in this group declare the non-lane vld4 form
; (i8*, i32) rather than a lane load.
2169 declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2170 declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2171 declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2172 declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2173 declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2174 declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2175 declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2176 declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2177 declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2178 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
2179 declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2180 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
; Lane-load intrinsics for the one-element vector types (<1 x i64> and
; <1 x double>), in two-, three- and four-register forms. Their signatures
; follow the same shape as the other vldNlane declarations: an i8* address,
; N incoming vectors, then two i32 operands.
; NOTE(review): the two i32 operands are presumably lane index then
; alignment -- confirm against the intrinsic definition.
2181 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
2182 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
2183 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2184 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
2185 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2186 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
; Two-register lane-store intrinsics (llvm.arm.neon.vst2lane.*), one per
; vector type, including the one-element v1i64 and v1f64 types. Each returns
; void and takes an i8* address, the two vectors and two i32 operands.
; NOTE(review): the two i32 operands are presumably lane index then
; alignment, mirroring the lane loads -- confirm.
2187 declare void @llvm.arm.neon.vst2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
2188 declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
2189 declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
2190 declare void @llvm.arm.neon.vst2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
2191 declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
2192 declare void @llvm.arm.neon.vst2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
2193 declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
2194 declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
2195 declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
2196 declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
2197 declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
2198 declare void @llvm.arm.neon.vst2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
; Three-register lane-store intrinsics (llvm.arm.neon.vst3lane.*), one per
; vector type, including the one-element v1i64 and v1f64 types. Each returns
; void and takes an i8* address, the three vectors and two i32 operands.
; NOTE(review): the two i32 operands are presumably lane index then
; alignment, mirroring the lane loads -- confirm.
2199 declare void @llvm.arm.neon.vst3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2200 declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2201 declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2202 declare void @llvm.arm.neon.vst3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2203 declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2204 declare void @llvm.arm.neon.vst3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2205 declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2206 declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2207 declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2208 declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2209 declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2210 declare void @llvm.arm.neon.vst3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
; Four-register lane-store intrinsics (llvm.arm.neon.vst4lane.*), one per
; vector type, including the one-element v1i64 and v1f64 types. Each returns
; void and takes an i8* address, the four vectors and two i32 operands.
; NOTE(review): the two i32 operands are presumably lane index then
; alignment, mirroring the lane loads -- confirm.
2211 declare void @llvm.arm.neon.vst4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2212 declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2213 declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2214 declare void @llvm.arm.neon.vst4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2215 declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2216 declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2217 declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2218 declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2219 declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2220 declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2221 declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2222 declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
; Lane-load test returning %struct.int8x16x2_t.
; Splits the [2 x <16 x i8>] argument into its two vectors, passes them with
; %ptr to llvm.arm.neon.vld2lane.v16i8 using lane operand 15, and repacks the
; two returned vectors into the result struct. The FileCheck directives below
; expect an ld2 on two .b registers selecting lane [15] and addressing [x0].
2224 define %struct.int8x16x2_t @test_vld2q_lane_s8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
2225 ; CHECK-LABEL: test_vld2q_lane_s8
2226 ; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
; Unpack the two source vectors from the coerced array argument.
2228 %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
2229 %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
; Lane load with lane operand 15; NOTE(review): the trailing i32 1 is
; presumably the alignment -- confirm.
2230 %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
; Pull both result vectors out of the returned aggregate.
2231 %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
2232 %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
; Rebuild the return struct: indices (0, i) address element i of its inner array.
2233 %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
2234 %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
2235 ret %struct.int8x16x2_t %.fca.0.1.insert
; Lane-load test returning %struct.uint8x16x2_t.
; Same instruction sequence as the signed 8-bit variant: unpack the two
; <16 x i8> source vectors, call llvm.arm.neon.vld2lane.v16i8 with lane
; operand 15, and repack the results. The FileCheck directives below expect
; an ld2 on two .b registers selecting lane [15] and addressing [x0].
2238 define %struct.uint8x16x2_t @test_vld2q_lane_u8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
2239 ; CHECK-LABEL: test_vld2q_lane_u8
2240 ; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
; Unpack the two source vectors from the coerced array argument.
2242 %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
2243 %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
; Lane load with lane operand 15; NOTE(review): the trailing i32 1 is
; presumably the alignment -- confirm.
2244 %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
; Pull both result vectors out of the returned aggregate.
2245 %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
2246 %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
; Rebuild the return struct: indices (0, i) address element i of its inner array.
2247 %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
2248 %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
2249 ret %struct.uint8x16x2_t %.fca.0.1.insert
; Lane-load test returning %struct.poly8x16x2_t.
; Unpacks the two <16 x i8> source vectors, calls
; llvm.arm.neon.vld2lane.v16i8 with lane operand 15, and repacks the two
; returned vectors into the result struct. The FileCheck directives below
; expect an ld2 on two .b registers selecting lane [15] and addressing [x0].
2252 define %struct.poly8x16x2_t @test_vld2q_lane_p8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
2253 ; CHECK-LABEL: test_vld2q_lane_p8
2254 ; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
; Unpack the two source vectors from the coerced array argument.
2256 %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
2257 %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
; Lane load with lane operand 15; NOTE(review): the trailing i32 1 is
; presumably the alignment -- confirm.
2258 %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
; Pull both result vectors out of the returned aggregate.
2259 %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
2260 %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
; Rebuild the return struct: indices (0, i) address element i of its inner array.
2261 %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
2262 %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
2263 ret %struct.poly8x16x2_t %.fca.0.1.insert
; Lane-load test returning %struct.int8x16x3_t.
; Splits the [3 x <16 x i8>] argument into its three vectors, passes them with
; %ptr to llvm.arm.neon.vld3lane.v16i8 using lane operand 15, and repacks the
; three returned vectors into the result struct. The FileCheck directives
; below expect an ld3 on three .b registers selecting lane [15] and
; addressing [x0].
2266 define %struct.int8x16x3_t @test_vld3q_lane_s8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) {
2267 ; CHECK-LABEL: test_vld3q_lane_s8
2268 ; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
; Unpack the three source vectors from the coerced array argument.
2270 %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0
2271 %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1
2272 %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2
; Lane load with lane operand 15; NOTE(review): the trailing i32 1 is
; presumably the alignment -- confirm.
2273 %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1)
; Pull all three result vectors out of the returned aggregate.
2274 %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
2275 %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
2276 %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
; Rebuild the return struct: indices (0, i) address element i of its inner array.
2277 %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
2278 %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
2279 %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
2280 ret %struct.int8x16x3_t %.fca.0.2.insert
; Lane-load test returning %struct.uint8x16x3_t.
; Unpacks the three <16 x i8> source vectors, calls
; llvm.arm.neon.vld3lane.v16i8 with lane operand 15, and repacks the three
; returned vectors into the result struct. The FileCheck directives below
; expect an ld3 on three .b registers selecting lane [15] and addressing [x0].
2283 define %struct.uint8x16x3_t @test_vld3q_lane_u8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) {
2284 ; CHECK-LABEL: test_vld3q_lane_u8
2285 ; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
; Unpack the three source vectors from the coerced array argument.
2287 %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0
2288 %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1
2289 %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2
; Lane load with lane operand 15; NOTE(review): the trailing i32 1 is
; presumably the alignment -- confirm.
2290 %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1)
; Pull all three result vectors out of the returned aggregate.
2291 %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
2292 %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
2293 %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
; Rebuild the return struct: indices (0, i) address element i of its inner array.
2294 %.fca.0.0.insert = insertvalue %struct.uint8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
2295 %.fca.0.1.insert = insertvalue %struct.uint8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
2296 %.fca.0.2.insert = insertvalue %struct.uint8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
2297 ret %struct.uint8x16x3_t %.fca.0.2.insert