test/CodeGen/AArch64/neon-simd-ldst-one.ll

   1 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
     ; The command above compiles this file with llc for aarch64-none-linux-gnu
     ; with the neon feature and machine-instruction verification enabled, and
     ; pipes the result into FileCheck, which reads its directives from this file.
   2
     ; Named aggregate types: each one wraps an array of 2, 3 or 4 vectors of a
     ; single element type. The 64-bit-wide and 128-bit-wide vector forms are
     ; both present for every element type.
     ; NOTE(review): functions returning the x2/x3 types appear further down;
     ; users of the uint8/poly8 types and of the x4 types are presumably in a
     ; later part of the file - confirm before removing any of them.
   3 %struct.uint8x16x2_t = type { [2 x <16 x i8>] }
   4 %struct.poly8x16x2_t = type { [2 x <16 x i8>] }
   5 %struct.uint8x16x3_t = type { [3 x <16 x i8>] }
     ; Two-vector aggregates.
   6 %struct.int8x16x2_t = type { [2 x <16 x i8>] }
   7 %struct.int16x8x2_t = type { [2 x <8 x i16>] }
   8 %struct.int32x4x2_t = type { [2 x <4 x i32>] }
   9 %struct.int64x2x2_t = type { [2 x <2 x i64>] }
  10 %struct.float32x4x2_t = type { [2 x <4 x float>] }
  11 %struct.float64x2x2_t = type { [2 x <2 x double>] }
  12 %struct.int8x8x2_t = type { [2 x <8 x i8>] }
  13 %struct.int16x4x2_t = type { [2 x <4 x i16>] }
  14 %struct.int32x2x2_t = type { [2 x <2 x i32>] }
  15 %struct.int64x1x2_t = type { [2 x <1 x i64>] }
  16 %struct.float32x2x2_t = type { [2 x <2 x float>] }
  17 %struct.float64x1x2_t = type { [2 x <1 x double>] }
     ; Three-vector aggregates.
  18 %struct.int8x16x3_t = type { [3 x <16 x i8>] }
  19 %struct.int16x8x3_t = type { [3 x <8 x i16>] }
  20 %struct.int32x4x3_t = type { [3 x <4 x i32>] }
  21 %struct.int64x2x3_t = type { [3 x <2 x i64>] }
  22 %struct.float32x4x3_t = type { [3 x <4 x float>] }
  23 %struct.float64x2x3_t = type { [3 x <2 x double>] }
  24 %struct.int8x8x3_t = type { [3 x <8 x i8>] }
  25 %struct.int16x4x3_t = type { [3 x <4 x i16>] }
  26 %struct.int32x2x3_t = type { [3 x <2 x i32>] }
  27 %struct.int64x1x3_t = type { [3 x <1 x i64>] }
  28 %struct.float32x2x3_t = type { [3 x <2 x float>] }
  29 %struct.float64x1x3_t = type { [3 x <1 x double>] }
     ; Four-vector aggregates.
  30 %struct.int8x16x4_t = type { [4 x <16 x i8>] }
  31 %struct.int16x8x4_t = type { [4 x <8 x i16>] }
  32 %struct.int32x4x4_t = type { [4 x <4 x i32>] }
  33 %struct.int64x2x4_t = type { [4 x <2 x i64>] }
  34 %struct.float32x4x4_t = type { [4 x <4 x float>] }
  35 %struct.float64x2x4_t = type { [4 x <2 x double>] }
  36 %struct.int8x8x4_t = type { [4 x <8 x i8>] }
  37 %struct.int16x4x4_t = type { [4 x <4 x i16>] }
  38 %struct.int32x2x4_t = type { [4 x <2 x i32>] }
  39 %struct.int64x1x4_t = type { [4 x <1 x i64>] }
  40 %struct.float32x2x4_t = type { [4 x <2 x float>] }
  41 %struct.float64x1x4_t = type { [4 x <1 x double>] }
  42
  43 define <16 x i8> @test_ld_from_poll_v16i8(<16 x i8> %a) {
     ; Adds a non-uniform <16 x i8> constant to %a. The expected output is an
     ; adrp immediately followed by a Q-register ldr whose address uses a :lo12:
     ; reference to a dot-prefixed label (presumably the constant's pool entry).
     ; NOTE(review): element 11 of the constant is 2 where the 1..16 pattern
     ; would give 12 - looks like a typo, presumably harmless to what is
     ; matched here; confirm before changing.
  44 ; CHECK-LABEL: test_ld_from_poll_v16i8
  45 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
  46 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
  47 entry:
  48   %b = add <16 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 2, i8 13, i8 14, i8 15, i8 16>
  49   ret <16 x i8> %b
  50 }
  51
  52 define <8 x i16> @test_ld_from_poll_v8i16(<8 x i16> %a) {
     ; Adds the constant <1..8> of i16 to %a. The expected output is an adrp
     ; immediately followed by a Q-register ldr addressed through a :lo12:
     ; reference to a dot-prefixed label (presumably the constant's pool entry).
  53 ; CHECK-LABEL: test_ld_from_poll_v8i16
  54 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
  55 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
  56 entry:
  57   %b = add <8 x i16> %a, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  58   ret <8 x i16> %b
  59 }
  60
  61 define <4 x i32> @test_ld_from_poll_v4i32(<4 x i32> %a) {
     ; Adds the constant <1, 2, 3, 4> of i32 to %a. The expected output is an
     ; adrp immediately followed by a Q-register ldr addressed through a :lo12:
     ; reference to a dot-prefixed label (presumably the constant's pool entry).
  62 ; CHECK-LABEL: test_ld_from_poll_v4i32
  63 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
  64 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
  65 entry:
  66   %b = add <4 x i32> %a, <i32 1, i32 2, i32 3, i32 4>
  67   ret <4 x i32> %b
  68 }
  69
  70 define <2 x i64> @test_ld_from_poll_v2i64(<2 x i64> %a) {
     ; Adds the constant <1, 2> of i64 to %a. The expected output is an adrp
     ; immediately followed by a Q-register ldr addressed through a :lo12:
     ; reference to a dot-prefixed label (presumably the constant's pool entry).
  71 ; CHECK-LABEL: test_ld_from_poll_v2i64
  72 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
  73 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
  74 entry:
  75   %b = add <2 x i64> %a, <i64 1, i64 2>
  76   ret <2 x i64> %b
  77 }
  78
  79 define <4 x float> @test_ld_from_poll_v4f32(<4 x float> %a) {
     ; Floating-point variant: fadd of the constant <1.0, 2.0, 3.0, 4.0> to %a.
     ; The expected output is an adrp immediately followed by a Q-register ldr
     ; addressed through a :lo12: reference to a dot-prefixed label (presumably
     ; the constant's pool entry).
  80 ; CHECK-LABEL: test_ld_from_poll_v4f32
  81 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
  82 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
  83 entry:
  84   %b = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
  85   ret <4 x float> %b
  86 }
  87
  88 define <2 x double> @test_ld_from_poll_v2f64(<2 x double> %a) {
     ; Floating-point variant: fadd of the constant <1.0, 2.0> of double to %a.
     ; The expected output is an adrp immediately followed by a Q-register ldr
     ; addressed through a :lo12: reference to a dot-prefixed label (presumably
     ; the constant's pool entry).
  89 ; CHECK-LABEL: test_ld_from_poll_v2f64
  90 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
  91 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
  92 entry:
  93   %b = fadd <2 x double> %a, <double 1.0, double 2.0>
  94   ret <2 x double> %b
  95 }
  96
  97 define <8 x i8> @test_ld_from_poll_v8i8(<8 x i8> %a) {
     ; 64-bit vector variant: adds the constant <1..8> of i8 to %a. The expected
     ; output is an adrp immediately followed by a D-register ldr addressed
     ; through a :lo12: reference to a dot-prefixed label (presumably the
     ; constant's pool entry).
  98 ; CHECK-LABEL: test_ld_from_poll_v8i8
  99 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
 100 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
 101 entry:
 102   %b = add <8 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
 103   ret <8 x i8> %b
 104 }
 105
 106 define <4 x i16> @test_ld_from_poll_v4i16(<4 x i16> %a) {
     ; 64-bit vector variant: adds the constant <1, 2, 3, 4> of i16 to %a. The
     ; expected output is an adrp immediately followed by a D-register ldr
     ; addressed through a :lo12: reference to a dot-prefixed label (presumably
     ; the constant's pool entry).
 107 ; CHECK-LABEL: test_ld_from_poll_v4i16
 108 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
 109 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
 110 entry:
 111   %b = add <4 x i16> %a, <i16 1, i16 2, i16 3, i16 4>
 112   ret <4 x i16> %b
 113 }
 114
 115 define <2 x i32> @test_ld_from_poll_v2i32(<2 x i32> %a) {
     ; 64-bit vector variant: adds the constant <1, 2> of i32 to %a. The expected
     ; output is an adrp immediately followed by a D-register ldr addressed
     ; through a :lo12: reference to a dot-prefixed label (presumably the
     ; constant's pool entry).
 116 ; CHECK-LABEL: test_ld_from_poll_v2i32
 117 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
 118 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
 119 entry:
 120   %b = add <2 x i32> %a, <i32 1, i32 2>
 121   ret <2 x i32> %b
 122 }
 123
 124 define <16 x i8> @test_vld1q_dup_s8(i8* %a) {
     ; Loads one i8 from %a and broadcasts it to all 16 lanes: the scalar is
     ; inserted into lane 0 of an undef vector, then shuffled with an all-zero
     ; mask. The expected output is an ld1r into a .16b register from [x0].
 125 ; CHECK-LABEL: test_vld1q_dup_s8
 126 ; CHECK: ld1r {{{v[0-9]+}}.16b}, [x0]
 127 entry:
 128   %0 = load i8* %a, align 1
 129   %1 = insertelement <16 x i8> undef, i8 %0, i32 0
 130   %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
 131   ret <16 x i8> %lane
 132 }
 133
 134 define <8 x i16> @test_vld1q_dup_s16(i16* %a) {
     ; Loads one i16 from %a and broadcasts it to all 8 lanes (insert into
     ; lane 0, then shuffle with an all-zero mask). The expected output is an
     ; ld1r into a .8h register from [x0].
 135 ; CHECK-LABEL: test_vld1q_dup_s16
 136 ; CHECK: ld1r {{{v[0-9]+}}.8h}, [x0]
 137 entry:
 138   %0 = load i16* %a, align 2
 139   %1 = insertelement <8 x i16> undef, i16 %0, i32 0
 140   %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
 141   ret <8 x i16> %lane
 142 }
 143
 144 define <4 x i32> @test_vld1q_dup_s32(i32* %a) {
     ; Loads one i32 from %a and broadcasts it to all 4 lanes (insert into
     ; lane 0, then shuffle with an all-zero mask). The expected output is an
     ; ld1r into a .4s register from [x0].
 145 ; CHECK-LABEL: test_vld1q_dup_s32
 146 ; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
 147 entry:
 148   %0 = load i32* %a, align 4
 149   %1 = insertelement <4 x i32> undef, i32 %0, i32 0
 150   %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
 151   ret <4 x i32> %lane
 152 }
 153
 154 define <2 x i64> @test_vld1q_dup_s64(i64* %a) {
     ; Loads one i64 from %a and broadcasts it to both lanes (insert into
     ; lane 0, then shuffle with an all-zero mask). The expected output is an
     ; ld1r into a .2d register from [x0].
 155 ; CHECK-LABEL: test_vld1q_dup_s64
 156 ; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
 157 entry:
 158   %0 = load i64* %a, align 8
 159   %1 = insertelement <2 x i64> undef, i64 %0, i32 0
 160   %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
 161   ret <2 x i64> %lane
 162 }
 163
 164 define <4 x float> @test_vld1q_dup_f32(float* %a) {
     ; Loads one float from %a and broadcasts it to all 4 lanes (insert into
     ; lane 0, then shuffle with an all-zero mask). The expected output is an
     ; ld1r into a .4s register from [x0], the same form as the i32 case.
 165 ; CHECK-LABEL: test_vld1q_dup_f32
 166 ; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
 167 entry:
 168   %0 = load float* %a, align 4
 169   %1 = insertelement <4 x float> undef, float %0, i32 0
 170   %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
 171   ret <4 x float> %lane
 172 }
 173
 174 define <2 x double> @test_vld1q_dup_f64(double* %a) {
     ; Loads one double from %a and broadcasts it to both lanes (insert into
     ; lane 0, then shuffle with an all-zero mask). The expected output is an
     ; ld1r into a .2d register from [x0], the same form as the i64 case.
 175 ; CHECK-LABEL: test_vld1q_dup_f64
 176 ; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
 177 entry:
 178   %0 = load double* %a, align 8
 179   %1 = insertelement <2 x double> undef, double %0, i32 0
 180   %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
 181   ret <2 x double> %lane
 182 }
 183
 184 define <8 x i8> @test_vld1_dup_s8(i8* %a) {
     ; 64-bit vector variant: loads one i8 from %a and broadcasts it to all
     ; 8 lanes (insert into lane 0, then shuffle with an all-zero mask). The
     ; expected output is an ld1r into a .8b register from [x0].
 185 ; CHECK-LABEL: test_vld1_dup_s8
 186 ; CHECK: ld1r {{{v[0-9]+}}.8b}, [x0]
 187 entry:
 188   %0 = load i8* %a, align 1
 189   %1 = insertelement <8 x i8> undef, i8 %0, i32 0
 190   %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
 191   ret <8 x i8> %lane
 192 }
 193
 194 define <4 x i16> @test_vld1_dup_s16(i16* %a) {
     ; 64-bit vector variant: loads one i16 from %a and broadcasts it to all
     ; 4 lanes (insert into lane 0, then shuffle with an all-zero mask). The
     ; expected output is an ld1r into a .4h register from [x0].
 195 ; CHECK-LABEL: test_vld1_dup_s16
 196 ; CHECK: ld1r {{{v[0-9]+}}.4h}, [x0]
 197 entry:
 198   %0 = load i16* %a, align 2
 199   %1 = insertelement <4 x i16> undef, i16 %0, i32 0
 200   %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
 201   ret <4 x i16> %lane
 202 }
 203
 204 define <2 x i32> @test_vld1_dup_s32(i32* %a) {
     ; 64-bit vector variant: loads one i32 from %a and broadcasts it to both
     ; lanes (insert into lane 0, then shuffle with an all-zero mask). The
     ; expected output is an ld1r into a .2s register from [x0].
 205 ; CHECK-LABEL: test_vld1_dup_s32
 206 ; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
 207 entry:
 208   %0 = load i32* %a, align 4
 209   %1 = insertelement <2 x i32> undef, i32 %0, i32 0
 210   %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
 211   ret <2 x i32> %lane
 212 }
 213
 214 define <1 x i64> @test_vld1_dup_s64(i64* %a) {
     ; Single-lane case: loads one i64 from %a and inserts it into the only
     ; lane of a <1 x i64>, so there is no shuffle. The expected output is
     ; still an ld1r, here into a .1d register from [x0].
 215 ; CHECK-LABEL: test_vld1_dup_s64
 216 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
 217 entry:
 218   %0 = load i64* %a, align 8
 219   %1 = insertelement <1 x i64> undef, i64 %0, i32 0
 220   ret <1 x i64> %1
 221 }
 222
 223 define <2 x float> @test_vld1_dup_f32(float* %a) {
     ; 64-bit vector variant: loads one float from %a and broadcasts it to
     ; both lanes (insert into lane 0, then shuffle with an all-zero mask).
     ; The expected output is an ld1r into a .2s register from [x0].
 224 ; CHECK-LABEL: test_vld1_dup_f32
 225 ; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
 226 entry:
 227   %0 = load float* %a, align 4
 228   %1 = insertelement <2 x float> undef, float %0, i32 0
 229   %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
 230   ret <2 x float> %lane
 231 }
 232
 233 define <1 x double> @test_vld1_dup_f64(double* %a) {
     ; Single-lane case: loads one double from %a and inserts it into the only
     ; lane of a <1 x double>, so there is no shuffle. The expected output is
     ; still an ld1r, here into a .1d register from [x0].
 234 ; CHECK-LABEL: test_vld1_dup_f64
 235 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
 236 entry:
 237   %0 = load double* %a, align 8
 238   %1 = insertelement <1 x double> undef, double %0, i32 0
 239   ret <1 x double> %1
 240 }
 241
 242 define <1 x i64> @testDUP.v1i64(i64* %a, i64* %b) #0 {
     ; The i64 loaded from %a has two users: it is stored to %b and it is
     ; inserted into the <1 x i64> return value.
 243 ; As there is a store operation depending on %1, LD1R pattern can't be selected.
 244 ; So LDR and FMOV should be emitted.
     ; Expected sequence: ldr into an X register, fmov of that X register into
     ; a D register, and str of an X register.
 245 ; CHECK-LABEL: testDUP.v1i64
 246 ; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}]
 247 ; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
 248 ; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}]
 249   %1 = load i64* %a, align 8
 250   store i64 %1, i64* %b, align 8
 251   %vecinit.i = insertelement <1 x i64> undef, i64 %1, i32 0
 252   ret <1 x i64> %vecinit.i
 253 }
 254
 255 define <1 x double> @testDUP.v1f64(double* %a, double* %b) #0 {
     ; The double loaded from %a has two users: it is stored to %b and it is
     ; inserted into the <1 x double> return value.
 256 ; As there is a store operation depending on %1, LD1R pattern can't be selected.
 257 ; So a plain LDR into a D register and an STR of a D register should be emitted
     ; (unlike the i64 variant, no FMOV is matched for this function).
 258 ; CHECK-LABEL: testDUP.v1f64
 259 ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}]
 260 ; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}]
 261   %1 = load double* %a, align 8
 262   store double %1, double* %b, align 8
 263   %vecinit.i = insertelement <1 x double> undef, double %1, i32 0
 264   ret <1 x double> %vecinit.i
 265 }
 266
 267 define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) {
     ; Calls the vld2lane intrinsic on %a with undef input vectors and lane
     ; index 0, broadcasts lane 0 of each of the two results (all-zero shuffle
     ; mask), and packs both vectors into the returned struct. The expected
     ; output is an ld2r of two .16b registers from [x0].
 268 ; CHECK-LABEL: test_vld2q_dup_s8
 269 ; CHECK: ld2r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
 270 entry:
 271   %vld_dup = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
 272   %0 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 0
 273   %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
 274   %1 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 1
 275   %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
 276   %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %lane, 0, 0
 277   %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
 278   ret %struct.int8x16x2_t %.fca.0.1.insert
 279 }
 280
 281 define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) {
     ; Casts %a to i8*, calls the vld2lane intrinsic with undef input vectors
     ; and lane index 0, broadcasts lane 0 of each of the two results, and
     ; packs both vectors into the returned struct. The expected output is an
     ; ld2r of two .8h registers from [x0].
 282 ; CHECK-LABEL: test_vld2q_dup_s16
 283 ; CHECK: ld2r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
 284 entry:
 285   %0 = bitcast i16* %a to i8*
 286   %vld_dup = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
 287   %1 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 0
 288   %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
 289   %2 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 1
 290   %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
 291   %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %lane, 0, 0
 292   %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
 293   ret %struct.int16x8x2_t %.fca.0.1.insert
 294 }
 295
 296 define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) {
     ; Casts %a to i8*, calls the vld2lane intrinsic with undef input vectors
     ; and lane index 0, broadcasts lane 0 of each of the two results, and
     ; packs both vectors into the returned struct. The expected output is an
     ; ld2r of two .4s registers from [x0].
 297 ; CHECK-LABEL: test_vld2q_dup_s32
 298 ; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
 299 entry:
 300   %0 = bitcast i32* %a to i8*
 301   %vld_dup = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
 302   %1 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 0
 303   %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
 304   %2 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 1
 305   %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
 306   %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %lane, 0, 0
 307   %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
 308   ret %struct.int32x4x2_t %.fca.0.1.insert
 309 }
 310
 311 define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) {
     ; Casts %a to i8*, calls the vld2lane intrinsic with undef input vectors
     ; and lane index 0, broadcasts lane 0 of each of the two results, and
     ; packs both vectors into the returned struct. The expected output is an
     ; ld2r of two .2d registers from [x0].
 312 ; CHECK-LABEL: test_vld2q_dup_s64
 313 ; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
 314 entry:
 315   %0 = bitcast i64* %a to i8*
 316   %vld_dup = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
 317   %1 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 0
 318   %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
 319   %2 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 1
 320   %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
 321   %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %lane, 0, 0
 322   %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
 323   ret %struct.int64x2x2_t %.fca.0.1.insert
 324 }
 325
 326 define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) {
     ; Casts %a to i8*, calls the vld2lane intrinsic with undef input vectors
     ; and lane index 0, broadcasts lane 0 of each of the two results, and
     ; packs both vectors into the returned struct. The expected output is an
     ; ld2r of two .4s registers from [x0].
 327 ; CHECK-LABEL: test_vld2q_dup_f32
 328 ; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
 329 entry:
 330   %0 = bitcast float* %a to i8*
 331   %vld_dup = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
 332   %1 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 0
 333   %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
 334   %2 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 1
 335   %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
 336   %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %lane, 0, 0
 337   %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
 338   ret %struct.float32x4x2_t %.fca.0.1.insert
 339 }
 340
 341 define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) {
     ; Casts %a to i8*, calls the vld2lane intrinsic with undef input vectors
     ; and lane index 0, broadcasts lane 0 of each of the two results, and
     ; packs both vectors into the returned struct. The expected output is an
     ; ld2r of two .2d registers from [x0].
 342 ; CHECK-LABEL: test_vld2q_dup_f64
 343 ; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
 344 entry:
 345   %0 = bitcast double* %a to i8*
 346   %vld_dup = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
 347   %1 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 0
 348   %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
 349   %2 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 1
 350   %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
 351   %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %lane, 0, 0
 352   %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
 353   ret %struct.float64x2x2_t %.fca.0.1.insert
 354 }
 355
 356 define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) {
     ; 64-bit vector variant: calls the vld2lane intrinsic on %a with undef
     ; input vectors and lane index 0, broadcasts lane 0 of each of the two
     ; results, and packs both vectors into the returned struct. The expected
     ; output is an ld2r of two .8b registers from [x0].
 357 ; CHECK-LABEL: test_vld2_dup_s8
 358 ; CHECK: ld2r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
 359 entry:
 360   %vld_dup = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
 361   %0 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 0
 362   %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
 363   %1 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 1
 364   %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
 365   %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %lane, 0, 0
 366   %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
 367   ret %struct.int8x8x2_t %.fca.0.1.insert
 368 }
 369
 370 define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) {
     ; 64-bit vector variant: casts %a to i8*, calls the vld2lane intrinsic
     ; with undef input vectors and lane index 0, broadcasts lane 0 of each of
     ; the two results, and packs both vectors into the returned struct. The
     ; expected output is an ld2r of two .4h registers from [x0].
 371 ; CHECK-LABEL: test_vld2_dup_s16
 372 ; CHECK: ld2r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
 373 entry:
 374   %0 = bitcast i16* %a to i8*
 375   %vld_dup = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
 376   %1 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 0
 377   %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
 378   %2 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 1
 379   %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
 380   %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %lane, 0, 0
 381   %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
 382   ret %struct.int16x4x2_t %.fca.0.1.insert
 383 }
 384
 385 define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) {
     ; 64-bit vector variant: casts %a to i8*, calls the vld2lane intrinsic
     ; with undef input vectors and lane index 0, broadcasts lane 0 of each of
     ; the two results, and packs both vectors into the returned struct. The
     ; expected output is an ld2r of two .2s registers from [x0].
 386 ; CHECK-LABEL: test_vld2_dup_s32
 387 ; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
 388 entry:
 389   %0 = bitcast i32* %a to i8*
 390   %vld_dup = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
 391   %1 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 0
 392   %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
 393   %2 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 1
 394   %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
 395   %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %lane, 0, 0
 396   %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
 397   ret %struct.int32x2x2_t %.fca.0.1.insert
 398 }
 399
 400 define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) {
     ; Single-lane case: unlike its siblings this uses the plain vld2 intrinsic
     ; (not vld2lane) on <1 x i64> and returns the two results without any
     ; shuffle. The expected output is an ld1 of two .1d registers from [x0]
     ; rather than an ld2r.
 401 ; CHECK-LABEL: test_vld2_dup_s64
 402 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
 403 entry:
 404   %0 = bitcast i64* %a to i8*
 405   %vld_dup = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %0, i32 8)
 406   %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 0
 407   %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 1
 408   %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
 409   %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
 410   ret %struct.int64x1x2_t %.fca.0.1.insert
 411 }
 412
 413 define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) {
     ; 64-bit vector variant: casts %a to i8*, calls the vld2lane intrinsic
     ; with undef input vectors and lane index 0, broadcasts lane 0 of each of
     ; the two results, and packs both vectors into the returned struct. The
     ; expected output is an ld2r of two .2s registers from [x0].
 414 ; CHECK-LABEL: test_vld2_dup_f32
 415 ; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
 416 entry:
 417   %0 = bitcast float* %a to i8*
 418   %vld_dup = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
 419   %1 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 0
 420   %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
 421   %2 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 1
 422   %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
 423   %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %lane, 0, 0
 424   %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
 425   ret %struct.float32x2x2_t %.fca.0.1.insert
 426 }
 427
 428 define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) {
     ; Single-lane case: unlike its siblings this uses the plain vld2 intrinsic
     ; (not vld2lane) on <1 x double> and returns the two results without any
     ; shuffle. The expected output is an ld1 of two .1d registers from [x0]
     ; rather than an ld2r.
 429 ; CHECK-LABEL: test_vld2_dup_f64
 430 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
 431 entry:
 432   %0 = bitcast double* %a to i8*
 433   %vld_dup = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %0, i32 8)
 434   %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 0
 435   %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 1
 436   %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
 437   %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
 438   ret %struct.float64x1x2_t %.fca.0.1.insert
 439 }
 440
 441 define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) {
     ; Calls the vld3lane intrinsic on %a with undef input vectors and lane
     ; index 0, broadcasts lane 0 of each of the three results (all-zero
     ; shuffle mask), and packs the three vectors into the returned struct.
     ; The expected output is an ld3r of three .16b registers from [x0].
 442 ; CHECK-LABEL: test_vld3q_dup_s8
 443 ; CHECK: ld3r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
 444 entry:
 445   %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
 446   %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
 447   %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
 448   %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
 449   %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
 450   %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
 451   %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
 452   %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %lane, 0, 0
 453   %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
 454   %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
 455   ret %struct.int8x16x3_t %.fca.0.2.insert
 456 }
 457
 458 define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) {
     ; Casts %a to i8*, calls the vld3lane intrinsic with undef input vectors
     ; and lane index 0, broadcasts lane 0 of each of the three results, and
     ; packs the three vectors into the returned struct. The expected output
     ; is an ld3r of three .8h registers from [x0].
 459 ; CHECK-LABEL: test_vld3q_dup_s16
 460 ; CHECK: ld3r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
 461 entry:
 462   %0 = bitcast i16* %a to i8*
 463   %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
 464   %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
 465   %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
 466   %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
 467   %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
 468   %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
 469   %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
 470   %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %lane, 0, 0
 471   %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
 472   %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
 473   ret %struct.int16x8x3_t %.fca.0.2.insert
 474 }
 475
 476 define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) {
     ; Casts %a to i8*, calls the vld3lane intrinsic with undef input vectors
     ; and lane index 0, broadcasts lane 0 of each of the three results, and
     ; packs the three vectors into the returned struct. The expected output
     ; is an ld3r of three .4s registers from [x0].
 477 ; CHECK-LABEL: test_vld3q_dup_s32
 478 ; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
 479 entry:
 480   %0 = bitcast i32* %a to i8*
 481   %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
 482   %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
 483   %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
 484   %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
 485   %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
 486   %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
 487   %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
 488   %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %lane, 0, 0
 489   %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
 490   %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
 491   ret %struct.int32x4x3_t %.fca.0.2.insert
 492 }
 493
 494 define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) {
     ; Casts %a to i8*, calls the vld3lane intrinsic with undef input vectors
     ; and lane index 0, broadcasts lane 0 of each of the three results, and
     ; packs the three vectors into the returned struct. The expected output
     ; is an ld3r of three .2d registers from [x0].
 495 ; CHECK-LABEL: test_vld3q_dup_s64
 496 ; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
 497 entry:
 498   %0 = bitcast i64* %a to i8*
 499   %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
 500   %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
 501   %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
 502   %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
 503   %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
 504   %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
 505   %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
 506   %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %lane, 0, 0
 507   %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
 508   %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
 509   ret %struct.int64x2x3_t %.fca.0.2.insert
 510 }
 511
 512 define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) {
     ; Casts %a to i8*, calls the vld3lane intrinsic with undef input vectors
     ; and lane index 0, broadcasts lane 0 of each of the three results, and
     ; packs the three vectors into the returned struct. The expected output
     ; is an ld3r of three .4s registers from [x0].
 513 ; CHECK-LABEL: test_vld3q_dup_f32
 514 ; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
 515 entry:
 516   %0 = bitcast float* %a to i8*
 517   %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
 518   %1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
 519   %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
 520   %2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
 521   %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
 522   %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
 523   %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
 524   %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %lane, 0, 0
 525   %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
 526   %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
 527   ret %struct.float32x4x3_t %.fca.0.2.insert
 528 }
 529
 530 define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) {
     ; Casts %a to i8*, calls the vld3lane intrinsic with undef input vectors
     ; and lane index 0, broadcasts lane 0 of each of the three results, and
     ; packs the three vectors into the returned struct. The expected output
     ; is an ld3r of three .2d registers from [x0].
 531 ; CHECK-LABEL: test_vld3q_dup_f64
 532 ; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
 533 entry:
 534   %0 = bitcast double* %a to i8*
 535   %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
 536   %1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
 537   %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
 538   %2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
 539   %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
 540   %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
 541   %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
 542   %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %lane, 0, 0
 543   %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
 544   %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
 545   ret %struct.float64x2x3_t %.fca.0.2.insert
 546 }
 547
 548 define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) {
     ; 64-bit vector variant: calls the vld3lane intrinsic on %a with undef
     ; input vectors and lane index 0, broadcasts lane 0 of each of the three
     ; results, and packs the three vectors into the returned struct. The
     ; expected output is an ld3r of three .8b registers from [x0].
 549 ; CHECK-LABEL: test_vld3_dup_s8
 550 ; CHECK: ld3r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
 551 entry:
 552   %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
 553   %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
 554   %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
 555   %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
 556   %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
 557   %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
 558   %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
 559   %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %lane, 0, 0
 560   %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
 561   %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
 562   ret %struct.int8x8x3_t %.fca.0.2.insert
 563 }
 564
 565 define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) {
     ; 64-bit vector variant: casts %a to i8*, calls the vld3lane intrinsic
     ; with undef input vectors and lane index 0, broadcasts lane 0 of each of
     ; the three results, and packs the three vectors into the returned
     ; struct. The expected output is an ld3r of three .4h registers from [x0].
 566 ; CHECK-LABEL: test_vld3_dup_s16
 567 ; CHECK: ld3r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
 568 entry:
 569   %0 = bitcast i16* %a to i8*
 570   %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
 571   %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
 572   %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
 573   %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
 574   %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
 575   %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
 576   %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
 577   %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %lane, 0, 0
 578   %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
 579   %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
 580   ret %struct.int16x4x3_t %.fca.0.2.insert
 581 }
 582
 583 define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) {
     ; 64-bit vector variant: casts %a to i8*, calls the vld3lane intrinsic
     ; with undef input vectors and lane index 0, broadcasts lane 0 of each of
     ; the three results, and packs the three vectors into the returned
     ; struct. The expected output is an ld3r of three .2s registers from [x0].
 584 ; CHECK-LABEL: test_vld3_dup_s32
 585 ; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
 586 entry:
 587   %0 = bitcast i32* %a to i8*
 588   %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
 589   %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
 590   %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
 591   %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
 592   %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
 593   %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
 594   %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
 595   %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %lane, 0, 0
 596   %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
 597   %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
 598   ret %struct.int32x2x3_t %.fca.0.2.insert
 599 }
 600
 601 define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) {
     ; <1 x i64> variant: calls the plain vld3 intrinsic (no lane intrinsic and
     ; no splat shuffles) and repacks the three results into the return struct.
     ; The FileCheck pattern expects a three-register ld1 of .1d, not an ld3r.
 602 ; CHECK-LABEL: test_vld3_dup_s64
 603 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
 604 entry:
 605   %0 = bitcast i64* %a to i8*
 606   %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %0, i32 8)
 607   %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
 608   %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
 609   %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
 610   %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
 611   %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
 612   %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
 613   ret %struct.int64x1x3_t %.fca.0.2.insert
 614 }
 615
 616 define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) {
     ; Dup-style 3-way structure load for <2 x float>: vld3lane is called with
     ; undef input vectors and trailing operands i32 0, i32 4 (presumably lane
     ; index and alignment -- confirm), then element 0 of each result is splatted.
     ; The FileCheck pattern expects one ld3r of three .2s registers from [x0].
 617 ; CHECK-LABEL: test_vld3_dup_f32
 618 ; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
 619 entry:
 620   %0 = bitcast float* %a to i8*
 621   %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
 622   %1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
 623   %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
 624   %2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
 625   %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
 626   %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
 627   %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
 628   %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %lane, 0, 0
 629   %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
 630   %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
 631   ret %struct.float32x2x3_t %.fca.0.2.insert
 632 }
 633
 634 define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) {
     ; <1 x double> variant: calls the plain vld3 intrinsic (no lane intrinsic,
     ; no splat shuffles) and repacks the three results into the return struct.
     ; The FileCheck pattern expects a three-register ld1 of .1d, not an ld3r.
 635 ; CHECK-LABEL: test_vld3_dup_f64
 636 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
 637 entry:
 638   %0 = bitcast double* %a to i8*
 639   %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %0, i32 8)
 640   %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
 641   %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
 642   %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
 643   %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
 644   %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
 645   %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
 646   ret %struct.float64x1x3_t %.fca.0.2.insert
 647 }
 648
 649 define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) {
     ; Dup-style 4-way structure load for <16 x i8>: vld4lane is called directly
     ; on %a (no bitcast needed) with undef input vectors and trailing operands
     ; i32 0, i32 1 (presumably lane index and alignment -- confirm), then
     ; element 0 of each result is splatted with a zeroinitializer mask.
     ; The FileCheck pattern expects one ld4r of four .16b registers from [x0].
 650 ; CHECK-LABEL: test_vld4q_dup_s8
 651 ; CHECK: ld4r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
 652 entry:
 653   %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
 654   %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
 655   %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
 656   %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
 657   %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
 658   %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
 659   %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
 660   %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 3
 661   %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> zeroinitializer
 662   %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %lane, 0, 0
 663   %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
 664   %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
 665   %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %lane3, 0, 3
 666   ret %struct.int8x16x4_t %.fca.0.3.insert
 667 }
 668
 669 define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) {
     ; Dup-style 4-way structure load for <8 x i16>: vld4lane is called with
     ; undef input vectors and trailing operands i32 0, i32 2 (presumably lane
     ; index and alignment -- confirm), then element 0 of each result is splatted.
     ; The FileCheck pattern expects one ld4r of four .8h registers from [x0].
 670 ; CHECK-LABEL: test_vld4q_dup_s16
 671 ; CHECK: ld4r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
 672 entry:
 673   %0 = bitcast i16* %a to i8*
 674   %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
 675   %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
 676   %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
 677   %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
 678   %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
 679   %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
 680   %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
 681   %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 3
 682   %lane3 = shufflevector <8 x i16> %4, <8 x i16> undef, <8 x i32> zeroinitializer
 683   %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %lane, 0, 0
 684   %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
 685   %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
 686   %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %lane3, 0, 3
 687   ret %struct.int16x8x4_t %.fca.0.3.insert
 688 }
 689
 690 define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) {
     ; Dup-style 4-way structure load for <4 x i32>: vld4lane is called with
     ; undef input vectors and trailing operands i32 0, i32 4 (presumably lane
     ; index and alignment -- confirm), then element 0 of each result is splatted.
     ; The FileCheck pattern expects one ld4r of four .4s registers from [x0].
 691 ; CHECK-LABEL: test_vld4q_dup_s32
 692 ; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
 693 entry:
 694   %0 = bitcast i32* %a to i8*
 695   %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
 696   %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
 697   %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
 698   %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
 699   %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
 700   %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
 701   %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
 702   %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 3
 703   %lane3 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer
 704   %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %lane, 0, 0
 705   %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
 706   %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
 707   %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %lane3, 0, 3
 708   ret %struct.int32x4x4_t %.fca.0.3.insert
 709 }
 710
 711 define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) {
     ; Dup-style 4-way structure load for <2 x i64>: vld4lane is called with
     ; undef input vectors and trailing operands i32 0, i32 8 (presumably lane
     ; index and alignment -- confirm), then element 0 of each result is splatted.
     ; The FileCheck pattern expects one ld4r of four .2d registers from [x0].
 712 ; CHECK-LABEL: test_vld4q_dup_s64
 713 ; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
 714 entry:
 715   %0 = bitcast i64* %a to i8*
 716   %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
 717   %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
 718   %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
 719   %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
 720   %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
 721   %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
 722   %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
 723   %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 3
 724   %lane3 = shufflevector <2 x i64> %4, <2 x i64> undef, <2 x i32> zeroinitializer
 725   %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %lane, 0, 0
 726   %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
 727   %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
 728   %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %lane3, 0, 3
 729   ret %struct.int64x2x4_t %.fca.0.3.insert
 730 }
 731
 732 define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) {
     ; Dup-style 4-way structure load for <4 x float>: vld4lane is called with
     ; undef input vectors and trailing operands i32 0, i32 4 (presumably lane
     ; index and alignment -- confirm), then element 0 of each result is splatted.
     ; The FileCheck pattern expects one ld4r of four .4s registers from [x0].
 733 ; CHECK-LABEL: test_vld4q_dup_f32
 734 ; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
 735 entry:
 736   %0 = bitcast float* %a to i8*
 737   %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
 738   %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
 739   %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
 740   %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
 741   %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
 742   %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
 743   %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
 744   %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 3
 745   %lane3 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> zeroinitializer
 746   %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %lane, 0, 0
 747   %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
 748   %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
 749   %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %lane3, 0, 3
 750   ret %struct.float32x4x4_t %.fca.0.3.insert
 751 }
 752
 753 define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) {
     ; Dup-style 4-way structure load for <2 x double>: vld4lane is called with
     ; undef input vectors and trailing operands i32 0, i32 8 (presumably lane
     ; index and alignment -- confirm), then element 0 of each result is splatted.
     ; The FileCheck pattern expects one ld4r of four .2d registers from [x0].
 754 ; CHECK-LABEL: test_vld4q_dup_f64
 755 ; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
 756 entry:
 757   %0 = bitcast double* %a to i8*
 758   %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
 759   %1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
 760   %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
 761   %2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
 762   %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
 763   %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
 764   %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
 765   %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 3
 766   %lane3 = shufflevector <2 x double> %4, <2 x double> undef, <2 x i32> zeroinitializer
 767   %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %lane, 0, 0
 768   %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
 769   %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
 770   %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %lane3, 0, 3
 771   ret %struct.float64x2x4_t %.fca.0.3.insert
 772 }
 773
 774 define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) {
     ; Dup-style 4-way structure load for <8 x i8>: vld4lane is called directly
     ; on %a (no bitcast needed) with undef input vectors and trailing operands
     ; i32 0, i32 1 (presumably lane index and alignment -- confirm), then
     ; element 0 of each result is splatted with a zeroinitializer mask.
     ; The FileCheck pattern expects one ld4r of four .8b registers from [x0].
 775 ; CHECK-LABEL: test_vld4_dup_s8
 776 ; CHECK: ld4r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
 777 entry:
 778   %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
 779   %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
 780   %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
 781   %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
 782   %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
 783   %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
 784   %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
 785   %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 3
 786   %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer
 787   %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %lane, 0, 0
 788   %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
 789   %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
 790   %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %lane3, 0, 3
 791   ret %struct.int8x8x4_t %.fca.0.3.insert
 792 }
 793
 794 define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) {
     ; Dup-style 4-way structure load for <4 x i16>: vld4lane is called with
     ; undef input vectors and trailing operands i32 0, i32 2 (presumably lane
     ; index and alignment -- confirm), then element 0 of each result is splatted.
     ; The FileCheck pattern expects one ld4r of four .4h registers from [x0].
 795 ; CHECK-LABEL: test_vld4_dup_s16
 796 ; CHECK: ld4r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
 797 entry:
 798   %0 = bitcast i16* %a to i8*
 799   %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
 800   %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
 801   %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
 802   %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
 803   %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
 804   %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
 805   %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
 806   %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 3
 807   %lane3 = shufflevector <4 x i16> %4, <4 x i16> undef, <4 x i32> zeroinitializer
 808   %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %lane, 0, 0
 809   %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
 810   %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
 811   %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %lane3, 0, 3
 812   ret %struct.int16x4x4_t %.fca.0.3.insert
 813 }
 814
 815 define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) {
     ; Dup-style 4-way structure load for <2 x i32>: vld4lane is called with
     ; undef input vectors and trailing operands i32 0, i32 4 (presumably lane
     ; index and alignment -- confirm), then element 0 of each result is splatted.
     ; The FileCheck pattern expects one ld4r of four .2s registers from [x0].
 816 ; CHECK-LABEL: test_vld4_dup_s32
 817 ; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
 818 entry:
 819   %0 = bitcast i32* %a to i8*
 820   %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
 821   %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
 822   %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
 823   %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
 824   %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
 825   %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
 826   %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
 827   %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 3
 828   %lane3 = shufflevector <2 x i32> %4, <2 x i32> undef, <2 x i32> zeroinitializer
 829   %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %lane, 0, 0
 830   %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
 831   %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
 832   %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %lane3, 0, 3
 833   ret %struct.int32x2x4_t %.fca.0.3.insert
 834 }
 835
 836 define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) {
     ; <1 x i64> variant: calls the plain vld4 intrinsic (no lane intrinsic and
     ; no splat shuffles) and repacks the four results into the return struct.
     ; The FileCheck pattern expects a four-register ld1 of .1d, not an ld4r.
 837 ; CHECK-LABEL: test_vld4_dup_s64
 838 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
 839 entry:
 840   %0 = bitcast i64* %a to i8*
 841   %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %0, i32 8)
 842   %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
 843   %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
 844   %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
 845   %vld_dup.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 3
 846   %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
 847   %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
 848   %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
 849   %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld_dup.fca.3.extract, 0, 3
 850   ret %struct.int64x1x4_t %.fca.0.3.insert
 851 }
 852
 853 define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) {
     ; Dup-style 4-way structure load for <2 x float>: vld4lane is called with
     ; undef input vectors and trailing operands i32 0, i32 4 (presumably lane
     ; index and alignment -- confirm), then element 0 of each result is splatted.
     ; The FileCheck pattern expects one ld4r of four .2s registers from [x0].
 854 ; CHECK-LABEL: test_vld4_dup_f32
 855 ; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
 856 entry:
 857   %0 = bitcast float* %a to i8*
 858   %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
 859   %1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
 860   %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
 861   %2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
 862   %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
 863   %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
 864   %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
 865   %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 3
 866   %lane3 = shufflevector <2 x float> %4, <2 x float> undef, <2 x i32> zeroinitializer
 867   %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %lane, 0, 0
 868   %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
 869   %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
 870   %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %lane3, 0, 3
 871   ret %struct.float32x2x4_t %.fca.0.3.insert
 872 }
 873
 874 define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) {
     ; <1 x double> variant: calls the plain vld4 intrinsic (no lane intrinsic,
     ; no splat shuffles) and repacks the four results into the return struct.
     ; The FileCheck pattern expects a four-register ld1 of .1d, not an ld4r.
 875 ; CHECK-LABEL: test_vld4_dup_f64
 876 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
 877 entry:
 878   %0 = bitcast double* %a to i8*
 879   %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %0, i32 8)
 880   %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
 881   %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
 882   %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
 883   %vld_dup.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 3
 884   %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
 885   %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
 886   %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
 887   %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld_dup.fca.3.extract, 0, 3
 888   ret %struct.float64x1x4_t %.fca.0.3.insert
 889 }
 890
 891 define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) {
     ; Loads one i8 from %a and inserts it into element 15 of %b. The FileCheck
     ; pattern expects a single-lane ld1 on a .b register from [x0]; the lane
     ; number is matched by a regex, so it is not pinned to 15.
 892 ; CHECK-LABEL: test_vld1q_lane_s8
 893 ; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
 894 entry:
 895   %0 = load i8* %a, align 1
 896   %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15
 897   ret <16 x i8> %vld1_lane
 898 }
 899
 900 define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) {
     ; Loads one i16 from %a and inserts it into element 7 of %b. The FileCheck
     ; pattern expects a single-lane ld1 on a .h register from [x0]; the lane
     ; number is matched by a regex, so it is not pinned to 7.
 901 ; CHECK-LABEL: test_vld1q_lane_s16
 902 ; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
 903 entry:
 904   %0 = load i16* %a, align 2
 905   %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7
 906   ret <8 x i16> %vld1_lane
 907 }
 908
 909 define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) {
     ; Loads one i32 from %a and inserts it into element 3 of %b. The FileCheck
     ; pattern expects a single-lane ld1 on a .s register from [x0]; the lane
     ; number is matched by a regex, so it is not pinned to 3.
 910 ; CHECK-LABEL: test_vld1q_lane_s32
 911 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
 912 entry:
 913   %0 = load i32* %a, align 4
 914   %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3
 915   ret <4 x i32> %vld1_lane
 916 }
 917
 918 define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) {
     ; Loads one i64 from %a and inserts it into element 1 of %b. The FileCheck
     ; pattern expects a single-lane ld1 on a .d register from [x0]; the lane
     ; number is matched by a regex, so it is not pinned to 1.
 919 ; CHECK-LABEL: test_vld1q_lane_s64
 920 ; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
 921 entry:
 922   %0 = load i64* %a, align 8
 923   %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1
 924   ret <2 x i64> %vld1_lane
 925 }
 926
 927 define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) {
     ; Loads one float from %a and inserts it into element 3 of %b. The FileCheck
     ; pattern expects a single-lane ld1 on a .s register from [x0]; the lane
     ; number is matched by a regex, so it is not pinned to 3.
 928 ; CHECK-LABEL: test_vld1q_lane_f32
 929 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
 930 entry:
 931   %0 = load float* %a, align 4
 932   %vld1_lane = insertelement <4 x float> %b, float %0, i32 3
 933   ret <4 x float> %vld1_lane
 934 }
 935
 936 define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) {
     ; Loads one double from %a and inserts it into element 1 of %b. The FileCheck
     ; pattern expects a single-lane ld1 on a .d register from [x0]; the lane
     ; number is matched by a regex, so it is not pinned to 1.
 937 ; CHECK-LABEL: test_vld1q_lane_f64
 938 ; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
 939 entry:
 940   %0 = load double* %a, align 8
 941   %vld1_lane = insertelement <2 x double> %b, double %0, i32 1
 942   ret <2 x double> %vld1_lane
 943 }
 944
 945 define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) {
     ; Loads one i8 from %a and inserts it into element 7 of the <8 x i8> %b.
     ; The FileCheck pattern expects a single-lane ld1 on a .b register from
     ; [x0]; the lane number is matched by a regex, so it is not pinned to 7.
 946 ; CHECK-LABEL: test_vld1_lane_s8
 947 ; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
 948 entry:
 949   %0 = load i8* %a, align 1
 950   %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7
 951   ret <8 x i8> %vld1_lane
 952 }
 953
 954 define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) {
     ; Loads one i16 from %a and inserts it into element 3 of the <4 x i16> %b.
     ; The FileCheck pattern expects a single-lane ld1 on a .h register from
     ; [x0]; the lane number is matched by a regex, so it is not pinned to 3.
 955 ; CHECK-LABEL: test_vld1_lane_s16
 956 ; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
 957 entry:
 958   %0 = load i16* %a, align 2
 959   %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3
 960   ret <4 x i16> %vld1_lane
 961 }
 962
 963 define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) {
     ; Loads one i32 from %a and inserts it into element 1 of the <2 x i32> %b.
     ; The FileCheck pattern expects a single-lane ld1 on a .s register from
     ; [x0]; the lane number is matched by a regex, so it is not pinned to 1.
 964 ; CHECK-LABEL: test_vld1_lane_s32
 965 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
 966 entry:
 967   %0 = load i32* %a, align 4
 968   %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1
 969   ret <2 x i32> %vld1_lane
 970 }
 971
 972 define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) {
     ; Single-element vector case: the loaded i64 is inserted at index 0 of an
     ; undef <1 x i64>, so the %b argument is not used by the body. The FileCheck
     ; pattern expects an ld1r of one .1d register from [x0] instead of a
     ; single-lane ld1.
 973 ; CHECK-LABEL: test_vld1_lane_s64
 974 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
 975 entry:
 976   %0 = load i64* %a, align 8
 977   %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0
 978   ret <1 x i64> %vld1_lane
 979 }
 980
 981 define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) {
     ; Loads one float from %a and inserts it into element 1 of the <2 x float>
     ; %b. The FileCheck pattern expects a single-lane ld1 on a .s register from
     ; [x0]; the lane number is matched by a regex, so it is not pinned to 1.
 982 ; CHECK-LABEL: test_vld1_lane_f32
 983 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
 984 entry:
 985   %0 = load float* %a, align 4
 986   %vld1_lane = insertelement <2 x float> %b, float %0, i32 1
 987   ret <2 x float> %vld1_lane
 988 }
 989
 990 define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) {
     ; Single-element vector case: the loaded double is inserted at index 0 of an
     ; undef <1 x double>, so the %b argument is not used by the body. The
     ; FileCheck pattern expects an ld1r of one .1d register from [x0] instead of
     ; a single-lane ld1.
 991 ; CHECK-LABEL: test_vld1_lane_f64
 992 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
 993 entry:
 994   %0 = load double* %a, align 8
 995   %vld1_lane = insertelement <1 x double> undef, double %0, i32 0
 996   ret <1 x double> %vld1_lane
 997 }
 998
 999 define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
     ; Unpacks the two <8 x i16> vectors from %b.coerce and passes them to the
     ; vld2lane intrinsic with trailing operands i32 7, i32 2 (presumably lane
     ; index and alignment -- confirm against the intrinsic), then repacks both
     ; results into the return struct. The FileCheck pattern expects a single-lane
     ; ld2 on two .h registers from [x0]; the lane number is matched by a regex.
1000 ; CHECK-LABEL: test_vld2q_lane_s16
1001 ; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1002 entry:
1003   %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
1004   %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
1005   %0 = bitcast i16* %a to i8*
1006   %vld2_lane = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
1007   %vld2_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 0
1008   %vld2_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 1
1009   %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2_lane.fca.0.extract, 0, 0
1010   %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2_lane.fca.1.extract, 0, 1
1011   ret %struct.int16x8x2_t %.fca.0.1.insert
1012 }
1013
1014 define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
     ; Unpacks the two <4 x i32> vectors from %b.coerce and passes them to the
     ; vld2lane intrinsic with trailing operands i32 3, i32 4 (presumably lane
     ; index and alignment -- confirm against the intrinsic), then repacks both
     ; results into the return struct. The FileCheck pattern expects a single-lane
     ; ld2 on two .s registers from [x0]; the lane number is matched by a regex.
1015 ; CHECK-LABEL: test_vld2q_lane_s32
1016 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1017 entry:
1018   %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
1019   %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
1020   %0 = bitcast i32* %a to i8*
1021   %vld2_lane = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
1022   %vld2_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 0
1023   %vld2_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 1
1024   %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2_lane.fca.0.extract, 0, 0
1025   %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2_lane.fca.1.extract, 0, 1
1026   ret %struct.int32x4x2_t %.fca.0.1.insert
1027 }
1028
1029 define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
     ; Unpacks the two <2 x i64> vectors from %b.coerce and passes them to the
     ; vld2lane intrinsic with trailing operands i32 1, i32 8 (presumably lane
     ; index and alignment -- confirm against the intrinsic), then repacks both
     ; results into the return struct. The FileCheck pattern expects a single-lane
     ; ld2 on two .d registers from [x0]; the lane number is matched by a regex.
1030 ; CHECK-LABEL: test_vld2q_lane_s64
1031 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1032 entry:
1033   %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
1034   %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
1035   %0 = bitcast i64* %a to i8*
1036   %vld2_lane = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
1037   %vld2_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 0
1038   %vld2_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 1
1039   %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2_lane.fca.0.extract, 0, 0
1040   %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2_lane.fca.1.extract, 0, 1
1041   ret %struct.int64x2x2_t %.fca.0.1.insert
1042 }
1043
1044 define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
     ; Unpacks the two <4 x float> vectors from %b.coerce and passes them to the
     ; vld2lane intrinsic with trailing operands i32 3, i32 4 (presumably lane
     ; index and alignment -- confirm against the intrinsic), then repacks both
     ; results into the return struct. The FileCheck pattern expects a single-lane
     ; ld2 on two .s registers from [x0]; the lane number is matched by a regex.
1045 ; CHECK-LABEL: test_vld2q_lane_f32
1046 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1047 entry:
1048   %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
1049   %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
1050   %0 = bitcast float* %a to i8*
1051   %vld2_lane = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
1052   %vld2_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 0
1053   %vld2_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 1
1054   %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2_lane.fca.0.extract, 0, 0
1055   %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2_lane.fca.1.extract, 0, 1
1056   ret %struct.float32x4x2_t %.fca.0.1.insert
1057 }
1058
1059 define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
     ; Unpacks the two <2 x double> vectors from %b.coerce and passes them to the
     ; vld2lane intrinsic with trailing operands i32 1, i32 8 (presumably lane
     ; index and alignment -- confirm against the intrinsic), then repacks both
     ; results into the return struct. The FileCheck pattern expects a single-lane
     ; ld2 on two .d registers from [x0]; the lane number is matched by a regex.
1060 ; CHECK-LABEL: test_vld2q_lane_f64
1061 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1062 entry:
1063   %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
1064   %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
1065   %0 = bitcast double* %a to i8*
1066   %vld2_lane = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
1067   %vld2_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 0
1068   %vld2_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 1
1069   %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2_lane.fca.0.extract, 0, 0
1070   %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2_lane.fca.1.extract, 0, 1
1071   ret %struct.float64x2x2_t %.fca.0.1.insert
1072 }
1073
1074 define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
     ; Unpacks the two <8 x i8> vectors from %b.coerce and passes them, with %a
     ; directly (no bitcast needed), to the vld2lane intrinsic with trailing
     ; operands i32 7, i32 1 (presumably lane index and alignment -- confirm),
     ; then repacks both results into the return struct. The FileCheck pattern
     ; expects a single-lane ld2 on two .b registers from [x0]; the lane number
     ; is matched by a regex.
1075 ; CHECK-LABEL: test_vld2_lane_s8
1076 ; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1077 entry:
1078   %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
1079   %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
1080   %vld2_lane = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
1081   %vld2_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 0
1082   %vld2_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 1
1083   %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2_lane.fca.0.extract, 0, 0
1084   %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2_lane.fca.1.extract, 0, 1
1085   ret %struct.int8x8x2_t %.fca.0.1.insert
1086 }
1087
1088 define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
     ; Unpacks the two <4 x i16> vectors from %b.coerce and passes them to the
     ; vld2lane intrinsic with trailing operands i32 3, i32 2 (presumably lane
     ; index and alignment -- confirm against the intrinsic), then repacks both
     ; results into the return struct. The FileCheck pattern expects a single-lane
     ; ld2 on two .h registers from [x0]; the lane number is matched by a regex.
1089 ; CHECK-LABEL: test_vld2_lane_s16
1090 ; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1091 entry:
1092   %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
1093   %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
1094   %0 = bitcast i16* %a to i8*
1095   %vld2_lane = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
1096   %vld2_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 0
1097   %vld2_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 1
1098   %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2_lane.fca.0.extract, 0, 0
1099   %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2_lane.fca.1.extract, 0, 1
1100   ret %struct.int16x4x2_t %.fca.0.1.insert
1101 }
1102
; vld2_lane on <2 x i32> pairs: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld2lane.v2i32 (lane operand 1, alignment operand 4) and
; returns both result vectors repacked; expects a single-lane .s ld2 off x0.
define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
; CHECK-LABEL: test_vld2_lane_s32
; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %vec0 = extractvalue [2 x <2 x i32>] %b.coerce, 0
  %vec1 = extractvalue [2 x <2 x i32>] %b.coerce, 1
  %ptr = bitcast i32* %a to i8*
  %ld = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %ptr, <2 x i32> %vec0, <2 x i32> %vec1, i32 1, i32 4)
  %ld0 = extractvalue { <2 x i32>, <2 x i32> } %ld, 0
  %ld1 = extractvalue { <2 x i32>, <2 x i32> } %ld, 1
  %ret0 = insertvalue %struct.int32x2x2_t undef, <2 x i32> %ld0, 0, 0
  %ret1 = insertvalue %struct.int32x2x2_t %ret0, <2 x i32> %ld1, 0, 1
  ret %struct.int32x2x2_t %ret1
}
1117
; vld2_lane on <1 x i64> pairs: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld2lane.v1i64 (lane operand 0, alignment operand 8) and
; returns both result vectors repacked; expects a single-lane .d ld2 off x0.
define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
; CHECK-LABEL: test_vld2_lane_s64
; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %vec0 = extractvalue [2 x <1 x i64>] %b.coerce, 0
  %vec1 = extractvalue [2 x <1 x i64>] %b.coerce, 1
  %ptr = bitcast i64* %a to i8*
  %ld = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8* %ptr, <1 x i64> %vec0, <1 x i64> %vec1, i32 0, i32 8)
  %ld0 = extractvalue { <1 x i64>, <1 x i64> } %ld, 0
  %ld1 = extractvalue { <1 x i64>, <1 x i64> } %ld, 1
  %ret0 = insertvalue %struct.int64x1x2_t undef, <1 x i64> %ld0, 0, 0
  %ret1 = insertvalue %struct.int64x1x2_t %ret0, <1 x i64> %ld1, 0, 1
  ret %struct.int64x1x2_t %ret1
}
1132
; vld2_lane on <2 x float> pairs: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld2lane.v2f32 (lane operand 1, alignment operand 4) and
; returns both result vectors repacked; expects a single-lane .s ld2 off x0.
define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
; CHECK-LABEL: test_vld2_lane_f32
; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %vec0 = extractvalue [2 x <2 x float>] %b.coerce, 0
  %vec1 = extractvalue [2 x <2 x float>] %b.coerce, 1
  %ptr = bitcast float* %a to i8*
  %ld = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %ptr, <2 x float> %vec0, <2 x float> %vec1, i32 1, i32 4)
  %ld0 = extractvalue { <2 x float>, <2 x float> } %ld, 0
  %ld1 = extractvalue { <2 x float>, <2 x float> } %ld, 1
  %ret0 = insertvalue %struct.float32x2x2_t undef, <2 x float> %ld0, 0, 0
  %ret1 = insertvalue %struct.float32x2x2_t %ret0, <2 x float> %ld1, 0, 1
  ret %struct.float32x2x2_t %ret1
}
1147
; vld2_lane on <1 x double> pairs: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld2lane.v1f64 (lane operand 0, alignment operand 8) and
; returns both result vectors repacked; expects a single-lane .d ld2 off x0.
define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
; CHECK-LABEL: test_vld2_lane_f64
; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %vec0 = extractvalue [2 x <1 x double>] %b.coerce, 0
  %vec1 = extractvalue [2 x <1 x double>] %b.coerce, 1
  %ptr = bitcast double* %a to i8*
  %ld = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8* %ptr, <1 x double> %vec0, <1 x double> %vec1, i32 0, i32 8)
  %ld0 = extractvalue { <1 x double>, <1 x double> } %ld, 0
  %ld1 = extractvalue { <1 x double>, <1 x double> } %ld, 1
  %ret0 = insertvalue %struct.float64x1x2_t undef, <1 x double> %ld0, 0, 0
  %ret1 = insertvalue %struct.float64x1x2_t %ret0, <1 x double> %ld1, 0, 1
  ret %struct.float64x1x2_t %ret1
}
1162
; vld3q_lane on <8 x i16> triples: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld3lane.v8i16 (lane operand 7, alignment operand 2) and
; returns the three result vectors repacked; expects a single-lane .h ld3 off x0.
define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
; CHECK-LABEL: test_vld3q_lane_s16
; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %vec0 = extractvalue [3 x <8 x i16>] %b.coerce, 0
  %vec1 = extractvalue [3 x <8 x i16>] %b.coerce, 1
  %vec2 = extractvalue [3 x <8 x i16>] %b.coerce, 2
  %ptr = bitcast i16* %a to i8*
  %ld = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %ptr, <8 x i16> %vec0, <8 x i16> %vec1, <8 x i16> %vec2, i32 7, i32 2)
  %ld0 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %ld, 0
  %ld1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %ld, 1
  %ld2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %ld, 2
  %ret0 = insertvalue %struct.int16x8x3_t undef, <8 x i16> %ld0, 0, 0
  %ret1 = insertvalue %struct.int16x8x3_t %ret0, <8 x i16> %ld1, 0, 1
  %ret2 = insertvalue %struct.int16x8x3_t %ret1, <8 x i16> %ld2, 0, 2
  ret %struct.int16x8x3_t %ret2
}
1180
; vld3q_lane on <4 x i32> triples: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld3lane.v4i32 (lane operand 3, alignment operand 4) and
; returns the three result vectors repacked; expects a single-lane .s ld3 off x0.
define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
; CHECK-LABEL: test_vld3q_lane_s32
; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %vec0 = extractvalue [3 x <4 x i32>] %b.coerce, 0
  %vec1 = extractvalue [3 x <4 x i32>] %b.coerce, 1
  %vec2 = extractvalue [3 x <4 x i32>] %b.coerce, 2
  %ptr = bitcast i32* %a to i8*
  %ld = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %ptr, <4 x i32> %vec0, <4 x i32> %vec1, <4 x i32> %vec2, i32 3, i32 4)
  %ld0 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %ld, 0
  %ld1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %ld, 1
  %ld2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %ld, 2
  %ret0 = insertvalue %struct.int32x4x3_t undef, <4 x i32> %ld0, 0, 0
  %ret1 = insertvalue %struct.int32x4x3_t %ret0, <4 x i32> %ld1, 0, 1
  %ret2 = insertvalue %struct.int32x4x3_t %ret1, <4 x i32> %ld2, 0, 2
  ret %struct.int32x4x3_t %ret2
}
1198
; vld3q_lane on <2 x i64> triples: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld3lane.v2i64 (lane operand 1, alignment operand 8) and
; returns the three result vectors repacked; expects a single-lane .d ld3 off x0.
define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
; CHECK-LABEL: test_vld3q_lane_s64
; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %vec0 = extractvalue [3 x <2 x i64>] %b.coerce, 0
  %vec1 = extractvalue [3 x <2 x i64>] %b.coerce, 1
  %vec2 = extractvalue [3 x <2 x i64>] %b.coerce, 2
  %ptr = bitcast i64* %a to i8*
  %ld = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %ptr, <2 x i64> %vec0, <2 x i64> %vec1, <2 x i64> %vec2, i32 1, i32 8)
  %ld0 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %ld, 0
  %ld1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %ld, 1
  %ld2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %ld, 2
  %ret0 = insertvalue %struct.int64x2x3_t undef, <2 x i64> %ld0, 0, 0
  %ret1 = insertvalue %struct.int64x2x3_t %ret0, <2 x i64> %ld1, 0, 1
  %ret2 = insertvalue %struct.int64x2x3_t %ret1, <2 x i64> %ld2, 0, 2
  ret %struct.int64x2x3_t %ret2
}
1216
; vld3q_lane on <4 x float> triples: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld3lane.v4f32 (lane operand 3, alignment operand 4) and
; returns the three result vectors repacked; expects a single-lane .s ld3 off x0.
define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
; CHECK-LABEL: test_vld3q_lane_f32
; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %vec0 = extractvalue [3 x <4 x float>] %b.coerce, 0
  %vec1 = extractvalue [3 x <4 x float>] %b.coerce, 1
  %vec2 = extractvalue [3 x <4 x float>] %b.coerce, 2
  %ptr = bitcast float* %a to i8*
  %ld = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %ptr, <4 x float> %vec0, <4 x float> %vec1, <4 x float> %vec2, i32 3, i32 4)
  %ld0 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %ld, 0
  %ld1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %ld, 1
  %ld2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %ld, 2
  %ret0 = insertvalue %struct.float32x4x3_t undef, <4 x float> %ld0, 0, 0
  %ret1 = insertvalue %struct.float32x4x3_t %ret0, <4 x float> %ld1, 0, 1
  %ret2 = insertvalue %struct.float32x4x3_t %ret1, <4 x float> %ld2, 0, 2
  ret %struct.float32x4x3_t %ret2
}
1234
; vld3q_lane on <2 x double> triples: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld3lane.v2f64 (lane operand 1, alignment operand 8) and
; returns the three result vectors repacked; expects a single-lane .d ld3 off x0.
define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
; CHECK-LABEL: test_vld3q_lane_f64
; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %vec0 = extractvalue [3 x <2 x double>] %b.coerce, 0
  %vec1 = extractvalue [3 x <2 x double>] %b.coerce, 1
  %vec2 = extractvalue [3 x <2 x double>] %b.coerce, 2
  %ptr = bitcast double* %a to i8*
  %ld = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %ptr, <2 x double> %vec0, <2 x double> %vec1, <2 x double> %vec2, i32 1, i32 8)
  %ld0 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %ld, 0
  %ld1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %ld, 1
  %ld2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %ld, 2
  %ret0 = insertvalue %struct.float64x2x3_t undef, <2 x double> %ld0, 0, 0
  %ret1 = insertvalue %struct.float64x2x3_t %ret0, <2 x double> %ld1, 0, 1
  %ret2 = insertvalue %struct.float64x2x3_t %ret1, <2 x double> %ld2, 0, 2
  ret %struct.float64x2x3_t %ret2
}
1252
; vld3_lane on <8 x i8> triples: unpacks %b.coerce, calls
; llvm.arm.neon.vld3lane.v8i8 on %a (lane operand 7, alignment operand 1) and
; returns the three result vectors repacked; expects a single-lane .b ld3 off x0.
define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
; CHECK-LABEL: test_vld3_lane_s8
; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %vec0 = extractvalue [3 x <8 x i8>] %b.coerce, 0
  %vec1 = extractvalue [3 x <8 x i8>] %b.coerce, 1
  %vec2 = extractvalue [3 x <8 x i8>] %b.coerce, 2
  %ld = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i32 7, i32 1)
  %ld0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %ld, 0
  %ld1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %ld, 1
  %ld2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %ld, 2
  %ret0 = insertvalue %struct.int8x8x3_t undef, <8 x i8> %ld0, 0, 0
  %ret1 = insertvalue %struct.int8x8x3_t %ret0, <8 x i8> %ld1, 0, 1
  %ret2 = insertvalue %struct.int8x8x3_t %ret1, <8 x i8> %ld2, 0, 2
  ret %struct.int8x8x3_t %ret2
}
1269
; vld3_lane on <4 x i16> triples: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld3lane.v4i16 (lane operand 3, alignment operand 2) and
; returns the three result vectors repacked; expects a single-lane .h ld3 off x0.
define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
; CHECK-LABEL: test_vld3_lane_s16
; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %vec0 = extractvalue [3 x <4 x i16>] %b.coerce, 0
  %vec1 = extractvalue [3 x <4 x i16>] %b.coerce, 1
  %vec2 = extractvalue [3 x <4 x i16>] %b.coerce, 2
  %ptr = bitcast i16* %a to i8*
  %ld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %ptr, <4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2, i32 3, i32 2)
  %ld0 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %ld, 0
  %ld1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %ld, 1
  %ld2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %ld, 2
  %ret0 = insertvalue %struct.int16x4x3_t undef, <4 x i16> %ld0, 0, 0
  %ret1 = insertvalue %struct.int16x4x3_t %ret0, <4 x i16> %ld1, 0, 1
  %ret2 = insertvalue %struct.int16x4x3_t %ret1, <4 x i16> %ld2, 0, 2
  ret %struct.int16x4x3_t %ret2
}
1287
; vld3_lane on <2 x i32> triples: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld3lane.v2i32 (lane operand 1, alignment operand 4) and
; returns the three result vectors repacked; expects a single-lane .s ld3 off x0.
define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
; CHECK-LABEL: test_vld3_lane_s32
; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %vec0 = extractvalue [3 x <2 x i32>] %b.coerce, 0
  %vec1 = extractvalue [3 x <2 x i32>] %b.coerce, 1
  %vec2 = extractvalue [3 x <2 x i32>] %b.coerce, 2
  %ptr = bitcast i32* %a to i8*
  %ld = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %ptr, <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2, i32 1, i32 4)
  %ld0 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %ld, 0
  %ld1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %ld, 1
  %ld2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %ld, 2
  %ret0 = insertvalue %struct.int32x2x3_t undef, <2 x i32> %ld0, 0, 0
  %ret1 = insertvalue %struct.int32x2x3_t %ret0, <2 x i32> %ld1, 0, 1
  %ret2 = insertvalue %struct.int32x2x3_t %ret1, <2 x i32> %ld2, 0, 2
  ret %struct.int32x2x3_t %ret2
}
1305
; vld3_lane on <1 x i64> triples: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld3lane.v1i64 (lane operand 0, alignment operand 8) and
; returns the three result vectors repacked; expects a single-lane .d ld3 off x0.
define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
; CHECK-LABEL: test_vld3_lane_s64
; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %vec0 = extractvalue [3 x <1 x i64>] %b.coerce, 0
  %vec1 = extractvalue [3 x <1 x i64>] %b.coerce, 1
  %vec2 = extractvalue [3 x <1 x i64>] %b.coerce, 2
  %ptr = bitcast i64* %a to i8*
  %ld = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8* %ptr, <1 x i64> %vec0, <1 x i64> %vec1, <1 x i64> %vec2, i32 0, i32 8)
  %ld0 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %ld, 0
  %ld1 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %ld, 1
  %ld2 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %ld, 2
  %ret0 = insertvalue %struct.int64x1x3_t undef, <1 x i64> %ld0, 0, 0
  %ret1 = insertvalue %struct.int64x1x3_t %ret0, <1 x i64> %ld1, 0, 1
  %ret2 = insertvalue %struct.int64x1x3_t %ret1, <1 x i64> %ld2, 0, 2
  ret %struct.int64x1x3_t %ret2
}
1323
; vld3_lane on <2 x float> triples: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld3lane.v2f32 (lane operand 1, alignment operand 4) and
; returns the three result vectors repacked; expects a single-lane .s ld3 off x0.
define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
; CHECK-LABEL: test_vld3_lane_f32
; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %vec0 = extractvalue [3 x <2 x float>] %b.coerce, 0
  %vec1 = extractvalue [3 x <2 x float>] %b.coerce, 1
  %vec2 = extractvalue [3 x <2 x float>] %b.coerce, 2
  %ptr = bitcast float* %a to i8*
  %ld = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %ptr, <2 x float> %vec0, <2 x float> %vec1, <2 x float> %vec2, i32 1, i32 4)
  %ld0 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %ld, 0
  %ld1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %ld, 1
  %ld2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %ld, 2
  %ret0 = insertvalue %struct.float32x2x3_t undef, <2 x float> %ld0, 0, 0
  %ret1 = insertvalue %struct.float32x2x3_t %ret0, <2 x float> %ld1, 0, 1
  %ret2 = insertvalue %struct.float32x2x3_t %ret1, <2 x float> %ld2, 0, 2
  ret %struct.float32x2x3_t %ret2
}
1341
; vld3_lane on <1 x double> triples: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld3lane.v1f64 (lane operand 0, alignment operand 8) and
; returns the three result vectors repacked; expects a single-lane .d ld3 off x0.
define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
; CHECK-LABEL: test_vld3_lane_f64
; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %vec0 = extractvalue [3 x <1 x double>] %b.coerce, 0
  %vec1 = extractvalue [3 x <1 x double>] %b.coerce, 1
  %vec2 = extractvalue [3 x <1 x double>] %b.coerce, 2
  %ptr = bitcast double* %a to i8*
  %ld = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8* %ptr, <1 x double> %vec0, <1 x double> %vec1, <1 x double> %vec2, i32 0, i32 8)
  %ld0 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %ld, 0
  %ld1 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %ld, 1
  %ld2 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %ld, 2
  %ret0 = insertvalue %struct.float64x1x3_t undef, <1 x double> %ld0, 0, 0
  %ret1 = insertvalue %struct.float64x1x3_t %ret0, <1 x double> %ld1, 0, 1
  %ret2 = insertvalue %struct.float64x1x3_t %ret1, <1 x double> %ld2, 0, 2
  ret %struct.float64x1x3_t %ret2
}
1359
; vld4q_lane on <16 x i8> quads: unpacks %b.coerce, calls
; llvm.arm.neon.vld4lane.v16i8 on %a (lane operand 15, alignment operand 1) and
; returns the four result vectors repacked; expects a single-lane .b ld4 off x0.
; The intrinsic result is named %vld4_lane to match the vld4lane call it holds.
define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
; CHECK-LABEL: test_vld4q_lane_s8
; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
  %vld4_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1)
  %vld4_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 0
  %vld4_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 1
  %vld4_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 2
  %vld4_lane.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 3
  %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4_lane.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4_lane.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4_lane.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4_lane.fca.3.extract, 0, 3
  ret %struct.int8x16x4_t %.fca.0.3.insert
}
1379
; vld4q_lane on <8 x i16> quads: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld4lane.v8i16 (lane operand 7, alignment operand 2) and
; returns the four result vectors repacked; expects a single-lane .h ld4 off x0.
; The intrinsic result is named %vld4_lane to match the vld4lane call it holds.
define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
; CHECK-LABEL: test_vld4q_lane_s16
; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
  %0 = bitcast i16* %a to i8*
  %vld4_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
  %vld4_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 0
  %vld4_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 1
  %vld4_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 2
  %vld4_lane.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 3
  %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4_lane.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4_lane.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4_lane.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4_lane.fca.3.extract, 0, 3
  ret %struct.int16x8x4_t %.fca.0.3.insert
}
1400
; vld4q_lane on <4 x i32> quads: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld4lane.v4i32 (lane operand 3, alignment operand 4) and
; returns the four result vectors repacked; expects a single-lane .s ld4 off x0.
; The intrinsic result is named %vld4_lane to match the vld4lane call it holds.
define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
; CHECK-LABEL: test_vld4q_lane_s32
; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
  %0 = bitcast i32* %a to i8*
  %vld4_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
  %vld4_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 0
  %vld4_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 1
  %vld4_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 2
  %vld4_lane.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 3
  %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4_lane.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4_lane.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4_lane.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4_lane.fca.3.extract, 0, 3
  ret %struct.int32x4x4_t %.fca.0.3.insert
}
1421
; vld4q_lane on <2 x i64> quads: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld4lane.v2i64 (lane operand 1, alignment operand 8) and
; returns the four result vectors repacked; expects a single-lane .d ld4 off x0.
; The intrinsic result is named %vld4_lane to match the vld4lane call it holds.
define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
; CHECK-LABEL: test_vld4q_lane_s64
; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
  %0 = bitcast i64* %a to i8*
  %vld4_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
  %vld4_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 0
  %vld4_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 1
  %vld4_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 2
  %vld4_lane.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 3
  %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4_lane.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4_lane.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4_lane.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4_lane.fca.3.extract, 0, 3
  ret %struct.int64x2x4_t %.fca.0.3.insert
}
1442
; vld4q_lane on <4 x float> quads: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld4lane.v4f32 (lane operand 3, alignment operand 4) and
; returns the four result vectors repacked; expects a single-lane .s ld4 off x0.
; The intrinsic result is named %vld4_lane to match the vld4lane call it holds.
define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
; CHECK-LABEL: test_vld4q_lane_f32
; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
  %0 = bitcast float* %a to i8*
  %vld4_lane = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
  %vld4_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 0
  %vld4_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 1
  %vld4_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 2
  %vld4_lane.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 3
  %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4_lane.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4_lane.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4_lane.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4_lane.fca.3.extract, 0, 3
  ret %struct.float32x4x4_t %.fca.0.3.insert
}
1463
; vld4q_lane on <2 x double> quads: unpacks %b.coerce, casts %a to i8*, calls
; llvm.arm.neon.vld4lane.v2f64 (lane operand 1, alignment operand 8) and
; returns the four result vectors repacked; expects a single-lane .d ld4 off x0.
; The intrinsic result is named %vld4_lane to match the vld4lane call it holds.
define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
; CHECK-LABEL: test_vld4q_lane_f64
; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
  %0 = bitcast double* %a to i8*
  %vld4_lane = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
  %vld4_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 0
  %vld4_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 1
  %vld4_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 2
  %vld4_lane.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 3
  %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4_lane.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4_lane.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4_lane.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4_lane.fca.3.extract, 0, 3
  ret %struct.float64x2x4_t %.fca.0.3.insert
}
1484
; vld4_lane on <8 x i8> quads: unpacks %b.coerce, calls
; llvm.arm.neon.vld4lane.v8i8 on %a (lane operand 7, alignment operand 1) and
; returns the four result vectors repacked; expects a single-lane .b ld4 off x0.
; The intrinsic result is named %vld4_lane to match the vld4lane call it holds.
define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
; CHECK-LABEL: test_vld4_lane_s8
; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
  %vld4_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
  %vld4_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 0
  %vld4_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 1
  %vld4_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 2
  %vld4_lane.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 3
  %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4_lane.fca.0.extract, 0, 0
  %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4_lane.fca.1.extract, 0, 1
  %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4_lane.fca.2.extract, 0, 2
  %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4_lane.fca.3.extract, 0, 3
  ret %struct.int8x8x4_t %.fca.0.3.insert
}
1504
; Four-vector lane load on <4 x i16>: unpack %b.coerce, call the vld4lane
; intrinsic on %a (cast to i8*), and repack the results into the return struct.
define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
; CHECK-LABEL: test_vld4_lane_s16
; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i16* %a to i8*
  ; Split the incoming array aggregate into its four vectors.
  %src0 = extractvalue [4 x <4 x i16>] %b.coerce, 0
  %src1 = extractvalue [4 x <4 x i16>] %b.coerce, 1
  %src2 = extractvalue [4 x <4 x i16>] %b.coerce, 2
  %src3 = extractvalue [4 x <4 x i16>] %b.coerce, 3
  ; Trailing i32 operands of the intrinsic are 3 and 2.
  %vld4_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %addr, <4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2, <4 x i16> %src3, i32 3, i32 2)
  ; Move each result vector into the [4 x ...] member of the return struct.
  %res0 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 0
  %res1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 1
  %res2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 2
  %res3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 3
  %ret0 = insertvalue %struct.int16x4x4_t undef, <4 x i16> %res0, 0, 0
  %ret1 = insertvalue %struct.int16x4x4_t %ret0, <4 x i16> %res1, 0, 1
  %ret2 = insertvalue %struct.int16x4x4_t %ret1, <4 x i16> %res2, 0, 2
  %ret3 = insertvalue %struct.int16x4x4_t %ret2, <4 x i16> %res3, 0, 3
  ret %struct.int16x4x4_t %ret3
}
1525
; Four-vector lane load on <2 x i32>: unpack %b.coerce, call the vld4lane
; intrinsic on %a (cast to i8*), and repack the results into the return struct.
define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
; CHECK-LABEL: test_vld4_lane_s32
; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i32* %a to i8*
  ; Split the incoming array aggregate into its four vectors.
  %src0 = extractvalue [4 x <2 x i32>] %b.coerce, 0
  %src1 = extractvalue [4 x <2 x i32>] %b.coerce, 1
  %src2 = extractvalue [4 x <2 x i32>] %b.coerce, 2
  %src3 = extractvalue [4 x <2 x i32>] %b.coerce, 3
  ; Trailing i32 operands of the intrinsic are 1 and 4.
  %vld4_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %addr, <2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2, <2 x i32> %src3, i32 1, i32 4)
  ; Move each result vector into the [4 x ...] member of the return struct.
  %res0 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 0
  %res1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 1
  %res2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 2
  %res3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 3
  %ret0 = insertvalue %struct.int32x2x4_t undef, <2 x i32> %res0, 0, 0
  %ret1 = insertvalue %struct.int32x2x4_t %ret0, <2 x i32> %res1, 0, 1
  %ret2 = insertvalue %struct.int32x2x4_t %ret1, <2 x i32> %res2, 0, 2
  %ret3 = insertvalue %struct.int32x2x4_t %ret2, <2 x i32> %res3, 0, 3
  ret %struct.int32x2x4_t %ret3
}
1546
; Four-vector lane load on <1 x i64>: unpack %b.coerce, call the vld4lane
; intrinsic on %a (cast to i8*), and repack the results into the return struct.
define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
; CHECK-LABEL: test_vld4_lane_s64
; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i64* %a to i8*
  ; Split the incoming array aggregate into its four vectors.
  %src0 = extractvalue [4 x <1 x i64>] %b.coerce, 0
  %src1 = extractvalue [4 x <1 x i64>] %b.coerce, 1
  %src2 = extractvalue [4 x <1 x i64>] %b.coerce, 2
  %src3 = extractvalue [4 x <1 x i64>] %b.coerce, 3
  ; Trailing i32 operands of the intrinsic are 0 and 8.
  %vld4_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8* %addr, <1 x i64> %src0, <1 x i64> %src1, <1 x i64> %src2, <1 x i64> %src3, i32 0, i32 8)
  ; Move each result vector into the [4 x ...] member of the return struct.
  %res0 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 0
  %res1 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 1
  %res2 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 2
  %res3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 3
  %ret0 = insertvalue %struct.int64x1x4_t undef, <1 x i64> %res0, 0, 0
  %ret1 = insertvalue %struct.int64x1x4_t %ret0, <1 x i64> %res1, 0, 1
  %ret2 = insertvalue %struct.int64x1x4_t %ret1, <1 x i64> %res2, 0, 2
  %ret3 = insertvalue %struct.int64x1x4_t %ret2, <1 x i64> %res3, 0, 3
  ret %struct.int64x1x4_t %ret3
}
1567
; Four-vector lane load on <2 x float>: unpack %b.coerce, call the vld4lane
; intrinsic on %a (cast to i8*), and repack the results into the return struct.
define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
; CHECK-LABEL: test_vld4_lane_f32
; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast float* %a to i8*
  ; Split the incoming array aggregate into its four vectors.
  %src0 = extractvalue [4 x <2 x float>] %b.coerce, 0
  %src1 = extractvalue [4 x <2 x float>] %b.coerce, 1
  %src2 = extractvalue [4 x <2 x float>] %b.coerce, 2
  %src3 = extractvalue [4 x <2 x float>] %b.coerce, 3
  ; Trailing i32 operands of the intrinsic are 1 and 4.
  %vld4_lane = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %addr, <2 x float> %src0, <2 x float> %src1, <2 x float> %src2, <2 x float> %src3, i32 1, i32 4)
  ; Move each result vector into the [4 x ...] member of the return struct.
  %res0 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 0
  %res1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 1
  %res2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 2
  %res3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 3
  %ret0 = insertvalue %struct.float32x2x4_t undef, <2 x float> %res0, 0, 0
  %ret1 = insertvalue %struct.float32x2x4_t %ret0, <2 x float> %res1, 0, 1
  %ret2 = insertvalue %struct.float32x2x4_t %ret1, <2 x float> %res2, 0, 2
  %ret3 = insertvalue %struct.float32x2x4_t %ret2, <2 x float> %res3, 0, 3
  ret %struct.float32x2x4_t %ret3
}
1588
; Four-vector lane load on <1 x double>: unpack %b.coerce, call the vld4lane
; intrinsic on %a (cast to i8*), and repack the results into the return struct.
define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
; CHECK-LABEL: test_vld4_lane_f64
; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast double* %a to i8*
  ; Split the incoming array aggregate into its four vectors.
  %src0 = extractvalue [4 x <1 x double>] %b.coerce, 0
  %src1 = extractvalue [4 x <1 x double>] %b.coerce, 1
  %src2 = extractvalue [4 x <1 x double>] %b.coerce, 2
  %src3 = extractvalue [4 x <1 x double>] %b.coerce, 3
  ; Trailing i32 operands of the intrinsic are 0 and 8.
  %vld4_lane = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8* %addr, <1 x double> %src0, <1 x double> %src1, <1 x double> %src2, <1 x double> %src3, i32 0, i32 8)
  ; Move each result vector into the [4 x ...] member of the return struct.
  %res0 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 0
  %res1 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 1
  %res2 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 2
  %res3 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 3
  %ret0 = insertvalue %struct.float64x1x4_t undef, <1 x double> %res0, 0, 0
  %ret1 = insertvalue %struct.float64x1x4_t %ret0, <1 x double> %res1, 0, 1
  %ret2 = insertvalue %struct.float64x1x4_t %ret1, <1 x double> %res2, 0, 2
  %ret3 = insertvalue %struct.float64x1x4_t %ret2, <1 x double> %res3, 0, 3
  ret %struct.float64x1x4_t %ret3
}
1609
; Stores element 15 of the <16 x i8> vector %b through %a.
define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) {
; CHECK-LABEL: test_vst1q_lane_s8
; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <16 x i8> %b, i32 15
  store i8 %lane, i8* %a, align 1
  ret void
}
1618
; Stores element 7 of the <8 x i16> vector %b through %a.
define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) {
; CHECK-LABEL: test_vst1q_lane_s16
; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <8 x i16> %b, i32 7
  store i16 %lane, i16* %a, align 2
  ret void
}
1627
; Stores element 3 of the <4 x i32> vector %b through %a.
define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
; CHECK-LABEL: test_vst1q_lane_s32
; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <4 x i32> %b, i32 3
  store i32 %lane, i32* %a, align 4
  ret void
}
1636
; Stores element 1 of the <2 x i64> vector %b through %a.
define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
; CHECK-LABEL: test_vst1q_lane_s64
; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <2 x i64> %b, i32 1
  store i64 %lane, i64* %a, align 8
  ret void
}
1645
; Stores element 3 of the <4 x float> vector %b through %a.
define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
; CHECK-LABEL: test_vst1q_lane_f32
; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <4 x float> %b, i32 3
  store float %lane, float* %a, align 4
  ret void
}
1654
; Stores element 1 of the <2 x double> vector %b through %a.
define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
; CHECK-LABEL: test_vst1q_lane_f64
; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <2 x double> %b, i32 1
  store double %lane, double* %a, align 8
  ret void
}
1663
; Stores element 7 of the <8 x i8> vector %b through %a.
define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
; CHECK-LABEL: test_vst1_lane_s8
; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <8 x i8> %b, i32 7
  store i8 %lane, i8* %a, align 1
  ret void
}
1672
; Stores element 3 of the <4 x i16> vector %b through %a.
define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) {
; CHECK-LABEL: test_vst1_lane_s16
; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <4 x i16> %b, i32 3
  store i16 %lane, i16* %a, align 2
  ret void
}
1681
; Stores element 1 of the <2 x i32> vector %b through %a.
define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
; CHECK-LABEL: test_vst1_lane_s32
; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <2 x i32> %b, i32 1
  store i32 %lane, i32* %a, align 4
  ret void
}
1690
; Stores element 0 of the <1 x i64> vector %b through %a.
define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
; CHECK-LABEL: test_vst1_lane_s64
; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <1 x i64> %b, i32 0
  store i64 %lane, i64* %a, align 8
  ret void
}
1699
; Stores element 1 of the <2 x float> vector %b through %a.
define void @test_vst1_lane_f32(float* %a, <2 x float> %b) {
; CHECK-LABEL: test_vst1_lane_f32
; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <2 x float> %b, i32 1
  store float %lane, float* %a, align 4
  ret void
}
1708
; Stores element 0 of the <1 x double> vector %b through %a.
define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
; CHECK-LABEL: test_vst1_lane_f64
; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <1 x double> %b, i32 0
  store double %lane, double* %a, align 8
  ret void
}
1717
; Two-vector lane store on <16 x i8>: unpack %b.coerce and hand both vectors
; to the vst2lane intrinsic (trailing i32 operands 15 and 1).
define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_s8
; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %src0 = extractvalue [2 x <16 x i8>] %b.coerce, 0
  %src1 = extractvalue [2 x <16 x i8>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v16i8(i8* %a, <16 x i8> %src0, <16 x i8> %src1, i32 15, i32 1)
  ret void
}
1727
; Two-vector lane store on <8 x i16>: unpack %b.coerce and hand both vectors
; to the vst2lane intrinsic (trailing i32 operands 7 and 2).
define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_s16
; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i16* %a to i8*
  %src0 = extractvalue [2 x <8 x i16>] %b.coerce, 0
  %src1 = extractvalue [2 x <8 x i16>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v8i16(i8* %addr, <8 x i16> %src0, <8 x i16> %src1, i32 7, i32 2)
  ret void
}
1738
; Two-vector lane store on <4 x i32>: unpack %b.coerce and hand both vectors
; to the vst2lane intrinsic (trailing i32 operands 3 and 4).
define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_s32
; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i32* %a to i8*
  %src0 = extractvalue [2 x <4 x i32>] %b.coerce, 0
  %src1 = extractvalue [2 x <4 x i32>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v4i32(i8* %addr, <4 x i32> %src0, <4 x i32> %src1, i32 3, i32 4)
  ret void
}
1749
; Two-vector lane store on <2 x i64>: unpack %b.coerce and hand both vectors
; to the vst2lane intrinsic (trailing i32 operands 1 and 8).
define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_s64
; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i64* %a to i8*
  %src0 = extractvalue [2 x <2 x i64>] %b.coerce, 0
  %src1 = extractvalue [2 x <2 x i64>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v2i64(i8* %addr, <2 x i64> %src0, <2 x i64> %src1, i32 1, i32 8)
  ret void
}
1760
; Two-vector lane store on <4 x float>: unpack %b.coerce and hand both vectors
; to the vst2lane intrinsic (trailing i32 operands 3 and 4).
define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_f32
; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast float* %a to i8*
  %src0 = extractvalue [2 x <4 x float>] %b.coerce, 0
  %src1 = extractvalue [2 x <4 x float>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v4f32(i8* %addr, <4 x float> %src0, <4 x float> %src1, i32 3, i32 4)
  ret void
}
1771
; Two-vector lane store on <2 x double>: unpack %b.coerce and hand both vectors
; to the vst2lane intrinsic (trailing i32 operands 1 and 8).
define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_f64
; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast double* %a to i8*
  %src0 = extractvalue [2 x <2 x double>] %b.coerce, 0
  %src1 = extractvalue [2 x <2 x double>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v2f64(i8* %addr, <2 x double> %src0, <2 x double> %src1, i32 1, i32 8)
  ret void
}
1782
; Two-vector lane store on <8 x i8>: unpack %b.coerce and hand both vectors
; to the vst2lane intrinsic (trailing i32 operands 7 and 1).
define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_s8
; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %src0 = extractvalue [2 x <8 x i8>] %b.coerce, 0
  %src1 = extractvalue [2 x <8 x i8>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %src0, <8 x i8> %src1, i32 7, i32 1)
  ret void
}
1792
; Two-vector lane store on <4 x i16>: unpack %b.coerce and hand both vectors
; to the vst2lane intrinsic (trailing i32 operands 3 and 2).
define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_s16
; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i16* %a to i8*
  %src0 = extractvalue [2 x <4 x i16>] %b.coerce, 0
  %src1 = extractvalue [2 x <4 x i16>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v4i16(i8* %addr, <4 x i16> %src0, <4 x i16> %src1, i32 3, i32 2)
  ret void
}
1803
; Two-vector lane store on <2 x i32>: unpack %b.coerce and hand both vectors
; to the vst2lane intrinsic (trailing i32 operands 1 and 4).
define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_s32
; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i32* %a to i8*
  %src0 = extractvalue [2 x <2 x i32>] %b.coerce, 0
  %src1 = extractvalue [2 x <2 x i32>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %addr, <2 x i32> %src0, <2 x i32> %src1, i32 1, i32 4)
  ret void
}
1814
; Two-vector lane store on <1 x i64>: unpack %b.coerce and hand both vectors
; to the vst2lane intrinsic (trailing i32 operands 0 and 8).
define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_s64
; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i64* %a to i8*
  %src0 = extractvalue [2 x <1 x i64>] %b.coerce, 0
  %src1 = extractvalue [2 x <1 x i64>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %addr, <1 x i64> %src0, <1 x i64> %src1, i32 0, i32 8)
  ret void
}
1825
; Two-vector lane store on <2 x float>: unpack %b.coerce and hand both vectors
; to the vst2lane intrinsic (trailing i32 operands 1 and 4).
define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_f32
; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast float* %a to i8*
  %src0 = extractvalue [2 x <2 x float>] %b.coerce, 0
  %src1 = extractvalue [2 x <2 x float>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v2f32(i8* %addr, <2 x float> %src0, <2 x float> %src1, i32 1, i32 4)
  ret void
}
1836
; Two-vector lane store on <1 x double>: unpack %b.coerce and hand both vectors
; to the vst2lane intrinsic (trailing i32 operands 0 and 8).
define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_f64
; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast double* %a to i8*
  %src0 = extractvalue [2 x <1 x double>] %b.coerce, 0
  %src1 = extractvalue [2 x <1 x double>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v1f64(i8* %addr, <1 x double> %src0, <1 x double> %src1, i32 0, i32 8)
  ret void
}
1847
; Three-vector lane store on <16 x i8>: unpack %b.coerce and hand the vectors
; to the vst3lane intrinsic (trailing i32 operands 15 and 1).
define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_s8
; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %src0 = extractvalue [3 x <16 x i8>] %b.coerce, 0
  %src1 = extractvalue [3 x <16 x i8>] %b.coerce, 1
  %src2 = extractvalue [3 x <16 x i8>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v16i8(i8* %a, <16 x i8> %src0, <16 x i8> %src1, <16 x i8> %src2, i32 15, i32 1)
  ret void
}
1858
; Three-vector lane store on <8 x i16>: unpack %b.coerce and hand the vectors
; to the vst3lane intrinsic (trailing i32 operands 7 and 2).
define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_s16
; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i16* %a to i8*
  %src0 = extractvalue [3 x <8 x i16>] %b.coerce, 0
  %src1 = extractvalue [3 x <8 x i16>] %b.coerce, 1
  %src2 = extractvalue [3 x <8 x i16>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v8i16(i8* %addr, <8 x i16> %src0, <8 x i16> %src1, <8 x i16> %src2, i32 7, i32 2)
  ret void
}
1870
; Three-vector lane store on <4 x i32>: unpack %b.coerce and hand the vectors
; to the vst3lane intrinsic (trailing i32 operands 3 and 4).
define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_s32
; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i32* %a to i8*
  %src0 = extractvalue [3 x <4 x i32>] %b.coerce, 0
  %src1 = extractvalue [3 x <4 x i32>] %b.coerce, 1
  %src2 = extractvalue [3 x <4 x i32>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v4i32(i8* %addr, <4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2, i32 3, i32 4)
  ret void
}
1882
; Three-vector lane store on <2 x i64>: unpack %b.coerce and hand the vectors
; to the vst3lane intrinsic (trailing i32 operands 1 and 8).
define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_s64
; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i64* %a to i8*
  %src0 = extractvalue [3 x <2 x i64>] %b.coerce, 0
  %src1 = extractvalue [3 x <2 x i64>] %b.coerce, 1
  %src2 = extractvalue [3 x <2 x i64>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v2i64(i8* %addr, <2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2, i32 1, i32 8)
  ret void
}
1894
; Three-vector lane store on <4 x float>: unpack %b.coerce and hand the vectors
; to the vst3lane intrinsic (trailing i32 operands 3 and 4).
define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_f32
; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast float* %a to i8*
  %src0 = extractvalue [3 x <4 x float>] %b.coerce, 0
  %src1 = extractvalue [3 x <4 x float>] %b.coerce, 1
  %src2 = extractvalue [3 x <4 x float>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v4f32(i8* %addr, <4 x float> %src0, <4 x float> %src1, <4 x float> %src2, i32 3, i32 4)
  ret void
}
1906
; Three-vector lane store on <2 x double>: unpack %b.coerce and hand the vectors
; to the vst3lane intrinsic (trailing i32 operands 1 and 8).
define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_f64
; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast double* %a to i8*
  %src0 = extractvalue [3 x <2 x double>] %b.coerce, 0
  %src1 = extractvalue [3 x <2 x double>] %b.coerce, 1
  %src2 = extractvalue [3 x <2 x double>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v2f64(i8* %addr, <2 x double> %src0, <2 x double> %src1, <2 x double> %src2, i32 1, i32 8)
  ret void
}
1918
; Three-vector lane store on <8 x i8>: unpack %b.coerce and hand the vectors
; to the vst3lane intrinsic (trailing i32 operands 7 and 1).
define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_s8
; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %src0 = extractvalue [3 x <8 x i8>] %b.coerce, 0
  %src1 = extractvalue [3 x <8 x i8>] %b.coerce, 1
  %src2 = extractvalue [3 x <8 x i8>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v8i8(i8* %a, <8 x i8> %src0, <8 x i8> %src1, <8 x i8> %src2, i32 7, i32 1)
  ret void
}
1929
; Three-vector lane store on <4 x i16>: unpack %b.coerce and hand the vectors
; to the vst3lane intrinsic (trailing i32 operands 3 and 2).
define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_s16
; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i16* %a to i8*
  %src0 = extractvalue [3 x <4 x i16>] %b.coerce, 0
  %src1 = extractvalue [3 x <4 x i16>] %b.coerce, 1
  %src2 = extractvalue [3 x <4 x i16>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %addr, <4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2, i32 3, i32 2)
  ret void
}
1941
; Three-vector lane store on <2 x i32>: unpack %b.coerce and hand the vectors
; to the vst3lane intrinsic (trailing i32 operands 1 and 4).
define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_s32
; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i32* %a to i8*
  %src0 = extractvalue [3 x <2 x i32>] %b.coerce, 0
  %src1 = extractvalue [3 x <2 x i32>] %b.coerce, 1
  %src2 = extractvalue [3 x <2 x i32>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v2i32(i8* %addr, <2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2, i32 1, i32 4)
  ret void
}
1953
; Three-vector lane store on <1 x i64>: unpack %b.coerce and hand the vectors
; to the vst3lane intrinsic (trailing i32 operands 0 and 8).
define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_s64
; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i64* %a to i8*
  %src0 = extractvalue [3 x <1 x i64>] %b.coerce, 0
  %src1 = extractvalue [3 x <1 x i64>] %b.coerce, 1
  %src2 = extractvalue [3 x <1 x i64>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %addr, <1 x i64> %src0, <1 x i64> %src1, <1 x i64> %src2, i32 0, i32 8)
  ret void
}
1965
; Three-vector lane store on <2 x float>: unpack %b.coerce and hand the vectors
; to the vst3lane intrinsic (trailing i32 operands 1 and 4).
define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_f32
; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast float* %a to i8*
  %src0 = extractvalue [3 x <2 x float>] %b.coerce, 0
  %src1 = extractvalue [3 x <2 x float>] %b.coerce, 1
  %src2 = extractvalue [3 x <2 x float>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v2f32(i8* %addr, <2 x float> %src0, <2 x float> %src1, <2 x float> %src2, i32 1, i32 4)
  ret void
}
1977
; Three-vector lane store on <1 x double>: unpack %b.coerce and hand the vectors
; to the vst3lane intrinsic (trailing i32 operands 0 and 8).
define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_f64
; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast double* %a to i8*
  %src0 = extractvalue [3 x <1 x double>] %b.coerce, 0
  %src1 = extractvalue [3 x <1 x double>] %b.coerce, 1
  %src2 = extractvalue [3 x <1 x double>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v1f64(i8* %addr, <1 x double> %src0, <1 x double> %src1, <1 x double> %src2, i32 0, i32 8)
  ret void
}
1989
; Four-vector lane store on <16 x i8>: unpack %b.coerce and hand the vectors
; to the vst4lane intrinsic (trailing i32 operands 15 and 1).
; %a is an i8* passed straight to the intrinsic, and the final operand is 1,
; matching the other s8 lane tests in this file.
define void @test_vst4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
; CHECK-LABEL: test_vst4q_lane_s8
; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %src0 = extractvalue [4 x <16 x i8>] %b.coerce, 0
  %src1 = extractvalue [4 x <16 x i8>] %b.coerce, 1
  %src2 = extractvalue [4 x <16 x i8>] %b.coerce, 2
  %src3 = extractvalue [4 x <16 x i8>] %b.coerce, 3
  tail call void @llvm.arm.neon.vst4lane.v16i8(i8* %a, <16 x i8> %src0, <16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3, i32 15, i32 1)
  ret void
}
2002
; Four-vector lane store on <8 x i16>: unpack %b.coerce and hand the vectors
; to the vst4lane intrinsic (trailing i32 operands 7 and 2).
define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
; CHECK-LABEL: test_vst4q_lane_s16
; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i16* %a to i8*
  %src0 = extractvalue [4 x <8 x i16>] %b.coerce, 0
  %src1 = extractvalue [4 x <8 x i16>] %b.coerce, 1
  %src2 = extractvalue [4 x <8 x i16>] %b.coerce, 2
  %src3 = extractvalue [4 x <8 x i16>] %b.coerce, 3
  tail call void @llvm.arm.neon.vst4lane.v8i16(i8* %addr, <8 x i16> %src0, <8 x i16> %src1, <8 x i16> %src2, <8 x i16> %src3, i32 7, i32 2)
  ret void
}
2015
; Four-vector lane store on <4 x i32>: unpack %b.coerce and hand the vectors
; to the vst4lane intrinsic (trailing i32 operands 3 and 4).
define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
; CHECK-LABEL: test_vst4q_lane_s32
; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i32* %a to i8*
  %src0 = extractvalue [4 x <4 x i32>] %b.coerce, 0
  %src1 = extractvalue [4 x <4 x i32>] %b.coerce, 1
  %src2 = extractvalue [4 x <4 x i32>] %b.coerce, 2
  %src3 = extractvalue [4 x <4 x i32>] %b.coerce, 3
  tail call void @llvm.arm.neon.vst4lane.v4i32(i8* %addr, <4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2, <4 x i32> %src3, i32 3, i32 4)
  ret void
}
2028
; Four-vector lane store on <2 x i64>: unpack %b.coerce and hand the vectors
; to the vst4lane intrinsic (trailing i32 operands 1 and 8).
define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
; CHECK-LABEL: test_vst4q_lane_s64
; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast i64* %a to i8*
  %src0 = extractvalue [4 x <2 x i64>] %b.coerce, 0
  %src1 = extractvalue [4 x <2 x i64>] %b.coerce, 1
  %src2 = extractvalue [4 x <2 x i64>] %b.coerce, 2
  %src3 = extractvalue [4 x <2 x i64>] %b.coerce, 3
  tail call void @llvm.arm.neon.vst4lane.v2i64(i8* %addr, <2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2, <2 x i64> %src3, i32 1, i32 8)
  ret void
}
2041
; Four-vector lane store on <4 x float>: unpack %b.coerce and hand the vectors
; to the vst4lane intrinsic (trailing i32 operands 3 and 4).
define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
; CHECK-LABEL: test_vst4q_lane_f32
; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  ; The intrinsic takes an i8* address.
  %addr = bitcast float* %a to i8*
  %src0 = extractvalue [4 x <4 x float>] %b.coerce, 0
  %src1 = extractvalue [4 x <4 x float>] %b.coerce, 1
  %src2 = extractvalue [4 x <4 x float>] %b.coerce, 2
  %src3 = extractvalue [4 x <4 x float>] %b.coerce, 3
  tail call void @llvm.arm.neon.vst4lane.v4f32(i8* %addr, <4 x float> %src0, <4 x float> %src1, <4 x float> %src2, <4 x float> %src3, i32 3, i32 4)
  ret void
}
2054
; Four-register lane store of <2 x double> vectors.
; The [4 x <2 x double>] argument is unpacked with extractvalue, %a is bitcast
; to i8*, and llvm.arm.neon.vst4lane.v2f64 is called with trailing i32
; operands 1 and 8 (presumably lane index and alignment -- confirm against the
; intrinsic definition).
; The expected output is one st4 over four .d lane operands addressed through
; x0; the pattern accepts any decimal lane number.
2055 define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
2056 ; CHECK-LABEL: test_vst4q_lane_f64
2057 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2058 entry:
2059   %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
2060   %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
2061   %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
2062   %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
2063   %0 = bitcast double* %a to i8*
2064   tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
2065   ret void
2066 }
2067
; Four-register lane store of <8 x i8> vectors.
; The [4 x <8 x i8>] argument is unpacked with extractvalue and %a, already an
; i8*, is passed straight to llvm.arm.neon.vst4lane.v8i8 with trailing i32
; operands 7 and 1 (presumably lane index and alignment -- confirm against the
; intrinsic definition).
; The expected output is one st4 over four .b lane operands addressed through
; x0; the pattern accepts any decimal lane number.
2068 define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
2069 ; CHECK-LABEL: test_vst4_lane_s8
2070 ; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
2071 entry:
2072   %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
2073   %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
2074   %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
2075   %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
2076   tail call void @llvm.arm.neon.vst4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
2077   ret void
2078 }
2079
; Four-register lane store of <4 x i16> vectors.
; The [4 x <4 x i16>] argument is unpacked with extractvalue, %a is bitcast to
; i8*, and llvm.arm.neon.vst4lane.v4i16 is called with trailing i32 operands
; 3 and 2 (presumably lane index and alignment -- confirm against the
; intrinsic definition).
; The expected output is one st4 over four .h lane operands addressed through
; x0; the pattern accepts any decimal lane number.
2080 define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
2081 ; CHECK-LABEL: test_vst4_lane_s16
2082 ; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
2083 entry:
2084   %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
2085   %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
2086   %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
2087   %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
2088   %0 = bitcast i16* %a to i8*
2089   tail call void @llvm.arm.neon.vst4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
2090   ret void
2091 }
2092
; Four-register lane store of <2 x i32> vectors.
; The [4 x <2 x i32>] argument is unpacked with extractvalue, %a is bitcast to
; i8*, and llvm.arm.neon.vst4lane.v2i32 is called with trailing i32 operands
; 1 and 4 (presumably lane index and alignment -- confirm against the
; intrinsic definition).
; The expected output is one st4 over four .s lane operands addressed through
; x0; the pattern accepts any decimal lane number.
2093 define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
2094 ; CHECK-LABEL: test_vst4_lane_s32
2095 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
2096 entry:
2097   %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
2098   %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
2099   %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
2100   %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
2101   %0 = bitcast i32* %a to i8*
2102   tail call void @llvm.arm.neon.vst4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
2103   ret void
2104 }
2105
; Four-register lane store of single-element <1 x i64> vectors.
; The [4 x <1 x i64>] argument is unpacked with extractvalue, %a is bitcast to
; i8*, and llvm.arm.neon.vst4lane.v1i64 is called with trailing i32 operands
; 0 and 8 (presumably lane index and alignment -- confirm against the
; intrinsic definition).
; The expected output is one st4 over four .d lane operands addressed through
; x0; the pattern accepts any decimal lane number.
2106 define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
2107 ; CHECK-LABEL: test_vst4_lane_s64
2108 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2109 entry:
2110   %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
2111   %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
2112   %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
2113   %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
2114   %0 = bitcast i64* %a to i8*
2115   tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
2116   ret void
2117 }
2118
; Four-register lane store of <2 x float> vectors.
; The [4 x <2 x float>] argument is unpacked with extractvalue, %a is bitcast
; to i8*, and llvm.arm.neon.vst4lane.v2f32 is called with trailing i32
; operands 1 and 4 (presumably lane index and alignment -- confirm against the
; intrinsic definition).
; The expected output is one st4 over four .s lane operands addressed through
; x0; the pattern accepts any decimal lane number.
2119 define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
2120 ; CHECK-LABEL: test_vst4_lane_f32
2121 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
2122 entry:
2123   %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
2124   %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
2125   %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
2126   %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
2127   %0 = bitcast float* %a to i8*
2128   tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
2129   ret void
2130 }
2131
; Four-register lane store of single-element <1 x double> vectors.
; The [4 x <1 x double>] argument is unpacked with extractvalue, %a is bitcast
; to i8*, and llvm.arm.neon.vst4lane.v1f64 is called with trailing i32
; operands 0 and 8 (presumably lane index and alignment -- confirm against the
; intrinsic definition).
; The expected output is one st4 over four .d lane operands addressed through
; x0; the pattern accepts any decimal lane number.
2132 define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
2133 ; CHECK-LABEL: test_vst4_lane_f64
2134 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2135 entry:
2136   %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
2137   %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
2138   %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
2139   %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
2140   %0 = bitcast double* %a to i8*
2141   tail call void @llvm.arm.neon.vst4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
2142   ret void
2143 }
2144
2145 declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
2146 declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
2147 declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
2148 declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
2149 declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
2150 declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
2151 declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
2152 declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
2153 declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
2154 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
2155 declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
2156 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
2157 declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2158 declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2159 declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2160 declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2161 declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2162 declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2163 declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2164 declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2165 declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2166 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
2167 declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2168 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
2169 declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2170 declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2171 declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2172 declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2173 declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2174 declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2175 declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2176 declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2177 declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2178 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
2179 declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2180 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
2181 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
2182 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
2183 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2184 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
2185 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2186 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
2187 declare void @llvm.arm.neon.vst2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
2188 declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
2189 declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
2190 declare void @llvm.arm.neon.vst2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
2191 declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
2192 declare void @llvm.arm.neon.vst2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
2193 declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
2194 declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
2195 declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
2196 declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
2197 declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
2198 declare void @llvm.arm.neon.vst2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
2199 declare void @llvm.arm.neon.vst3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2200 declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2201 declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2202 declare void @llvm.arm.neon.vst3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2203 declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2204 declare void @llvm.arm.neon.vst3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2205 declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2206 declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2207 declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2208 declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2209 declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2210 declare void @llvm.arm.neon.vst3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
2211 declare void @llvm.arm.neon.vst4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2212 declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2213 declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2214 declare void @llvm.arm.neon.vst4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2215 declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2216 declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2217 declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2218 declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2219 declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2220 declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2221 declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2222 declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
2223
; Two-register lane load of <16 x i8> vectors, returned as %struct.int8x16x2_t.
; The [2 x <16 x i8>] argument is unpacked with extractvalue and passed, with
; %ptr, to llvm.arm.neon.vld2lane.v16i8 using trailing i32 operands 15 and 1.
; The two result vectors are then repacked into the return struct with
; insertvalue.
; The expected output is one ld2 over two .b lane operands at lane 15,
; addressed through x0.
2224 define %struct.int8x16x2_t @test_vld2q_lane_s8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
2225 ; CHECK-LABEL: test_vld2q_lane_s8
2226 ; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
2227 entry:
2228   %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
2229   %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
2230   %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
2231   %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
2232   %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
2233   %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
2234   %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
2235   ret %struct.int8x16x2_t %.fca.0.1.insert
2236 }
2237
; Two-register lane load of <16 x i8> vectors, returned as
; %struct.uint8x16x2_t.
; The [2 x <16 x i8>] argument is unpacked with extractvalue and passed, with
; %ptr, to llvm.arm.neon.vld2lane.v16i8 using trailing i32 operands 15 and 1.
; The two result vectors are then repacked into the return struct with
; insertvalue.
; The expected output is one ld2 over two .b lane operands at lane 15,
; addressed through x0.
2238 define %struct.uint8x16x2_t @test_vld2q_lane_u8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
2239 ; CHECK-LABEL: test_vld2q_lane_u8
2240 ; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
2241 entry:
2242   %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
2243   %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
2244   %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
2245   %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
2246   %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
2247   %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
2248   %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
2249   ret %struct.uint8x16x2_t %.fca.0.1.insert
2250 }
2251
; Two-register lane load of <16 x i8> vectors, returned as
; %struct.poly8x16x2_t.
; The [2 x <16 x i8>] argument is unpacked with extractvalue and passed, with
; %ptr, to llvm.arm.neon.vld2lane.v16i8 using trailing i32 operands 15 and 1.
; The two result vectors are then repacked into the return struct with
; insertvalue.
; The expected output is one ld2 over two .b lane operands at lane 15,
; addressed through x0.
2252 define %struct.poly8x16x2_t @test_vld2q_lane_p8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
2253 ; CHECK-LABEL: test_vld2q_lane_p8
2254 ; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
2255 entry:
2256   %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
2257   %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
2258   %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
2259   %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
2260   %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
2261   %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
2262   %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
2263   ret %struct.poly8x16x2_t %.fca.0.1.insert
2264 }
2265
; Three-register lane load of <16 x i8> vectors, returned as
; %struct.int8x16x3_t.
; The [3 x <16 x i8>] argument is unpacked with extractvalue and passed, with
; %ptr, to llvm.arm.neon.vld3lane.v16i8 using trailing i32 operands 15 and 1.
; The three result vectors are then repacked into the return struct with
; insertvalue.
; The expected output is one ld3 over three .b lane operands at lane 15,
; addressed through x0.
2266 define %struct.int8x16x3_t @test_vld3q_lane_s8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) {
2267 ; CHECK-LABEL: test_vld3q_lane_s8
2268 ; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
2269 entry:
2270   %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0
2271   %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1
2272   %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2
2273   %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1)
2274   %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
2275   %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
2276   %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
2277   %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
2278   %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
2279   %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
2280   ret %struct.int8x16x3_t %.fca.0.2.insert
2281 }
2282
; Three-register lane load of <16 x i8> vectors, returned as
; %struct.uint8x16x3_t.
; The [3 x <16 x i8>] argument is unpacked with extractvalue and passed, with
; %ptr, to llvm.arm.neon.vld3lane.v16i8 using trailing i32 operands 15 and 1.
; The three result vectors are then repacked into the return struct with
; insertvalue.
; The expected output is one ld3 over three .b lane operands at lane 15,
; addressed through x0.
2283 define %struct.uint8x16x3_t @test_vld3q_lane_u8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) {
2284 ; CHECK-LABEL: test_vld3q_lane_u8
2285 ; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
2286 entry:
2287   %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0
2288   %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1
2289   %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2
2290   %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1)
2291   %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
2292   %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
2293   %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
2294   %.fca.0.0.insert = insertvalue %struct.uint8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
2295   %.fca.0.1.insert = insertvalue %struct.uint8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
2296   %.fca.0.2.insert = insertvalue %struct.uint8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
2297   ret %struct.uint8x16x3_t %.fca.0.2.insert
2298 }
2299