test/CodeGen/AArch64/neon-simd-ldst-one.ll

   1 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
   2
    ; Named aggregate types, each wrapping an array of 2, 3 or 4 NEON vectors.
    ; The x2/x3 types are the return types of the vld2*/vld3* dup tests below;
    ; the x4 types are presumably used by tests further down the file.
    3 %struct.int8x16x2_t = type { [2 x <16 x i8>] }
    4 %struct.int16x8x2_t = type { [2 x <8 x i16>] }
    5 %struct.int32x4x2_t = type { [2 x <4 x i32>] }
    6 %struct.int64x2x2_t = type { [2 x <2 x i64>] }
    7 %struct.float32x4x2_t = type { [2 x <4 x float>] }
    8 %struct.float64x2x2_t = type { [2 x <2 x double>] }
    9 %struct.int8x8x2_t = type { [2 x <8 x i8>] }
   10 %struct.int16x4x2_t = type { [2 x <4 x i16>] }
   11 %struct.int32x2x2_t = type { [2 x <2 x i32>] }
   12 %struct.int64x1x2_t = type { [2 x <1 x i64>] }
   13 %struct.float32x2x2_t = type { [2 x <2 x float>] }
   14 %struct.float64x1x2_t = type { [2 x <1 x double>] }
   15 %struct.int8x16x3_t = type { [3 x <16 x i8>] }
   16 %struct.int16x8x3_t = type { [3 x <8 x i16>] }
   17 %struct.int32x4x3_t = type { [3 x <4 x i32>] }
   18 %struct.int64x2x3_t = type { [3 x <2 x i64>] }
   19 %struct.float32x4x3_t = type { [3 x <4 x float>] }
   20 %struct.float64x2x3_t = type { [3 x <2 x double>] }
   21 %struct.int8x8x3_t = type { [3 x <8 x i8>] }
   22 %struct.int16x4x3_t = type { [3 x <4 x i16>] }
   23 %struct.int32x2x3_t = type { [3 x <2 x i32>] }
   24 %struct.int64x1x3_t = type { [3 x <1 x i64>] }
   25 %struct.float32x2x3_t = type { [3 x <2 x float>] }
   26 %struct.float64x1x3_t = type { [3 x <1 x double>] }
   27 %struct.int8x16x4_t = type { [4 x <16 x i8>] }
   28 %struct.int16x8x4_t = type { [4 x <8 x i16>] }
   29 %struct.int32x4x4_t = type { [4 x <4 x i32>] }
   30 %struct.int64x2x4_t = type { [4 x <2 x i64>] }
   31 %struct.float32x4x4_t = type { [4 x <4 x float>] }
   32 %struct.float64x2x4_t = type { [4 x <2 x double>] }
   33 %struct.int8x8x4_t = type { [4 x <8 x i8>] }
   34 %struct.int16x4x4_t = type { [4 x <4 x i16>] }
   35 %struct.int32x2x4_t = type { [4 x <2 x i32>] }
   36 %struct.int64x1x4_t = type { [4 x <1 x i64>] }
   37 %struct.float32x2x4_t = type { [4 x <2 x float>] }
   38 %struct.float64x1x4_t = type { [4 x <1 x double>] }
  39
  40 define <16 x i8> @test_ld_from_poll_v16i8(<16 x i8> %a) {
  41 ; CHECK-LABEL: test_ld_from_poll_v16i8
  42 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
  43 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
  44 entry:
  45   %b = add <16 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 2, i8 13, i8 14, i8 15, i8 16>
  46   ret <16 x i8> %b
  47 }
  48
   49 define <8 x i16> @test_ld_from_poll_v8i16(<8 x i16> %a) {
      ; Adds a non-splat <8 x i16> constant to %a; the checks expect the constant
      ; to be loaded from a local label via adrp + ldr into a q register.
   50 ; CHECK-LABEL: test_ld_from_poll_v8i16
   51 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
   52 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
   53 entry:
   54   %b = add <8 x i16> %a, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
   55   ret <8 x i16> %b
   56 }
  57
   58 define <4 x i32> @test_ld_from_poll_v4i32(<4 x i32> %a) {
      ; Adds a non-splat <4 x i32> constant to %a; the checks expect the constant
      ; to be loaded from a local label via adrp + ldr into a q register.
   59 ; CHECK-LABEL: test_ld_from_poll_v4i32
   60 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
   61 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
   62 entry:
   63   %b = add <4 x i32> %a, <i32 1, i32 2, i32 3, i32 4>
   64   ret <4 x i32> %b
   65 }
  66
   67 define <2 x i64> @test_ld_from_poll_v2i64(<2 x i64> %a) {
      ; Adds a non-splat <2 x i64> constant to %a; the checks expect the constant
      ; to be loaded from a local label via adrp + ldr into a q register.
   68 ; CHECK-LABEL: test_ld_from_poll_v2i64
   69 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
   70 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
   71 entry:
   72   %b = add <2 x i64> %a, <i64 1, i64 2>
   73   ret <2 x i64> %b
   74 }
  75
   76 define <4 x float> @test_ld_from_poll_v4f32(<4 x float> %a) {
      ; Floating-point variant: fadd with a non-splat <4 x float> constant; the
      ; checks expect the constant to be loaded via adrp + ldr into a q register.
   77 ; CHECK-LABEL: test_ld_from_poll_v4f32
   78 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
   79 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
   80 entry:
   81   %b = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
   82   ret <4 x float> %b
   83 }
  84
   85 define <2 x double> @test_ld_from_poll_v2f64(<2 x double> %a) {
      ; Floating-point variant: fadd with a non-splat <2 x double> constant; the
      ; checks expect the constant to be loaded via adrp + ldr into a q register.
   86 ; CHECK-LABEL: test_ld_from_poll_v2f64
   87 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
   88 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
   89 entry:
   90   %b = fadd <2 x double> %a, <double 1.0, double 2.0>
   91   ret <2 x double> %b
   92 }
  93
   94 define <8 x i8> @test_ld_from_poll_v8i8(<8 x i8> %a) {
      ; 64-bit vector variant: the non-splat <8 x i8> constant is expected to be
      ; loaded from a local label via adrp + ldr into a d register.
   95 ; CHECK-LABEL: test_ld_from_poll_v8i8
   96 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
   97 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
   98 entry:
   99   %b = add <8 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
  100   ret <8 x i8> %b
  101 }
 102
  103 define <4 x i16> @test_ld_from_poll_v4i16(<4 x i16> %a) {
      ; 64-bit vector variant: the non-splat <4 x i16> constant is expected to be
      ; loaded from a local label via adrp + ldr into a d register.
  104 ; CHECK-LABEL: test_ld_from_poll_v4i16
  105 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
  106 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
  107 entry:
  108   %b = add <4 x i16> %a, <i16 1, i16 2, i16 3, i16 4>
  109   ret <4 x i16> %b
  110 }
 111
  112 define <2 x i32> @test_ld_from_poll_v2i32(<2 x i32> %a) {
      ; 64-bit vector variant: the non-splat <2 x i32> constant is expected to be
      ; loaded from a local label via adrp + ldr into a d register.
  113 ; CHECK-LABEL: test_ld_from_poll_v2i32
  114 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
  115 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
  116 entry:
  117   %b = add <2 x i32> %a, <i32 1, i32 2>
  118   ret <2 x i32> %b
  119 }
 120
  121 define <16 x i8> @test_vld1q_dup_s8(i8* %a) {
      ; Loads one i8, inserts it into lane 0 and splats it with an all-zero
      ; shuffle mask; a single ld1r into a .16b register is expected.
  122 ; CHECK-LABEL: test_vld1q_dup_s8
  123 ; CHECK: ld1r {{{v[0-9]+}}.16b}, [x0]
  124 entry:
  125   %0 = load i8* %a, align 1
  126   %1 = insertelement <16 x i8> undef, i8 %0, i32 0
  127   %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
  128   ret <16 x i8> %lane
  129 }
 130
  131 define <8 x i16> @test_vld1q_dup_s16(i16* %a) {
      ; Loads one i16, inserts it into lane 0 and splats it with an all-zero
      ; shuffle mask; a single ld1r into a .8h register is expected.
  132 ; CHECK-LABEL: test_vld1q_dup_s16
  133 ; CHECK: ld1r {{{v[0-9]+}}.8h}, [x0]
  134 entry:
  135   %0 = load i16* %a, align 2
  136   %1 = insertelement <8 x i16> undef, i16 %0, i32 0
  137   %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
  138   ret <8 x i16> %lane
  139 }
 140
  141 define <4 x i32> @test_vld1q_dup_s32(i32* %a) {
      ; Loads one i32, inserts it into lane 0 and splats it with an all-zero
      ; shuffle mask; a single ld1r into a .4s register is expected.
  142 ; CHECK-LABEL: test_vld1q_dup_s32
  143 ; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
  144 entry:
  145   %0 = load i32* %a, align 4
  146   %1 = insertelement <4 x i32> undef, i32 %0, i32 0
  147   %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
  148   ret <4 x i32> %lane
  149 }
 150
  151 define <2 x i64> @test_vld1q_dup_s64(i64* %a) {
      ; Loads one i64, inserts it into lane 0 and splats it with an all-zero
      ; shuffle mask; a single ld1r into a .2d register is expected.
  152 ; CHECK-LABEL: test_vld1q_dup_s64
  153 ; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
  154 entry:
  155   %0 = load i64* %a, align 8
  156   %1 = insertelement <2 x i64> undef, i64 %0, i32 0
  157   %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
  158   ret <2 x i64> %lane
  159 }
 160
  161 define <4 x float> @test_vld1q_dup_f32(float* %a) {
      ; Loads one float, inserts it into lane 0 and splats it with an all-zero
      ; shuffle mask; a single ld1r into a .4s register is expected.
  162 ; CHECK-LABEL: test_vld1q_dup_f32
  163 ; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
  164 entry:
  165   %0 = load float* %a, align 4
  166   %1 = insertelement <4 x float> undef, float %0, i32 0
  167   %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
  168   ret <4 x float> %lane
  169 }
 170
  171 define <2 x double> @test_vld1q_dup_f64(double* %a) {
      ; Loads one double, inserts it into lane 0 and splats it with an all-zero
      ; shuffle mask; a single ld1r into a .2d register is expected.
  172 ; CHECK-LABEL: test_vld1q_dup_f64
  173 ; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
  174 entry:
  175   %0 = load double* %a, align 8
  176   %1 = insertelement <2 x double> undef, double %0, i32 0
  177   %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
  178   ret <2 x double> %lane
  179 }
 180
  181 define <8 x i8> @test_vld1_dup_s8(i8* %a) {
      ; 64-bit vector variant: scalar i8 load splatted across <8 x i8>; a single
      ; ld1r into a .8b register is expected.
  182 ; CHECK-LABEL: test_vld1_dup_s8
  183 ; CHECK: ld1r {{{v[0-9]+}}.8b}, [x0]
  184 entry:
  185   %0 = load i8* %a, align 1
  186   %1 = insertelement <8 x i8> undef, i8 %0, i32 0
  187   %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  188   ret <8 x i8> %lane
  189 }
 190
  191 define <4 x i16> @test_vld1_dup_s16(i16* %a) {
      ; 64-bit vector variant: scalar i16 load splatted across <4 x i16>; a single
      ; ld1r into a .4h register is expected.
  192 ; CHECK-LABEL: test_vld1_dup_s16
  193 ; CHECK: ld1r {{{v[0-9]+}}.4h}, [x0]
  194 entry:
  195   %0 = load i16* %a, align 2
  196   %1 = insertelement <4 x i16> undef, i16 %0, i32 0
  197   %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
  198   ret <4 x i16> %lane
  199 }
 200
  201 define <2 x i32> @test_vld1_dup_s32(i32* %a) {
      ; 64-bit vector variant: scalar i32 load splatted across <2 x i32>; a single
      ; ld1r into a .2s register is expected.
  202 ; CHECK-LABEL: test_vld1_dup_s32
  203 ; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
  204 entry:
  205   %0 = load i32* %a, align 4
  206   %1 = insertelement <2 x i32> undef, i32 %0, i32 0
  207   %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
  208   ret <2 x i32> %lane
  209 }
 210
  211 define <1 x i64> @test_vld1_dup_s64(i64* %a) {
      ; <1 x i64> has a single lane, so there is no shufflevector: the loaded
      ; scalar is only inserted at index 0. An ld1r into a .1d register is expected.
  212 ; CHECK-LABEL: test_vld1_dup_s64
  213 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
  214 entry:
  215   %0 = load i64* %a, align 8
  216   %1 = insertelement <1 x i64> undef, i64 %0, i32 0
  217   ret <1 x i64> %1
  218 }
 219
  220 define <2 x float> @test_vld1_dup_f32(float* %a) {
      ; 64-bit vector variant: scalar float load splatted across <2 x float>; a
      ; single ld1r into a .2s register is expected.
  221 ; CHECK-LABEL: test_vld1_dup_f32
  222 ; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
  223 entry:
  224   %0 = load float* %a, align 4
  225   %1 = insertelement <2 x float> undef, float %0, i32 0
  226   %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
  227   ret <2 x float> %lane
  228 }
 229
  230 define <1 x double> @test_vld1_dup_f64(double* %a) {
      ; <1 x double> has a single lane, so there is no shufflevector: the loaded
      ; scalar is only inserted at index 0. An ld1r into a .1d register is expected.
  231 ; CHECK-LABEL: test_vld1_dup_f64
  232 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
  233 entry:
  234   %0 = load double* %a, align 8
  235   %1 = insertelement <1 x double> undef, double %0, i32 0
  236   ret <1 x double> %1
  237 }
 238
  239 define <1 x i64> @testDUP.v1i64(i64* %a, i64* %b) #0 {
  240 ; Because a store also uses the loaded value %1, the LD1R pattern can't be selected.
  241 ; So an LDR into an x register, an FMOV to a d register and an STR should be emitted.
  242 ; CHECK-LABEL: testDUP.v1i64
  243 ; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}]
  244 ; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
  245 ; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}]
  246   %1 = load i64* %a, align 8
  247   store i64 %1, i64* %b, align 8
  248   %vecinit.i = insertelement <1 x i64> undef, i64 %1, i32 0
  249   ret <1 x i64> %vecinit.i
  250 }
 251
  252 define <1 x double> @testDUP.v1f64(double* %a, double* %b) #0 {
  253 ; Because a store also uses the loaded value %1, the LD1R pattern can't be selected.
  254 ; So a plain LDR into a d register followed by an STR should be emitted (no FMOV is checked here).
  255 ; CHECK-LABEL: testDUP.v1f64
  256 ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}]
  257 ; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}]
  258   %1 = load double* %a, align 8
  259   store double %1, double* %b, align 8
  260   %vecinit.i = insertelement <1 x double> undef, double %1, i32 0
  261   ret <1 x double> %vecinit.i
  262 }
 263
  264 define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) {
      ; vld2lane with lane argument 0 on undef vectors, then each of the two
      ; results is splatted with an all-zero shuffle mask and packed into the
      ; struct; a single ld2r of two .16b registers is expected.
  265 ; CHECK-LABEL: test_vld2q_dup_s8
  266 ; CHECK: ld2r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
  267 entry:
  268   %vld_dup = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
  269   %0 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 0
  270   %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  271   %1 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 1
  272   %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
  273   %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %lane, 0, 0
  274   %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
  275   ret %struct.int8x16x2_t %.fca.0.1.insert
  276 }
 277
  278 define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) {
      ; vld2lane with lane argument 0 on undef vectors, both results splatted;
      ; a single ld2r of two .8h registers is expected.
  279 ; CHECK-LABEL: test_vld2q_dup_s16
  280 ; CHECK: ld2r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
  281 entry:
  282   %0 = bitcast i16* %a to i8*
  283   %vld_dup = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
  284   %1 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 0
  285   %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
  286   %2 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 1
  287   %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
  288   %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %lane, 0, 0
  289   %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
  290   ret %struct.int16x8x2_t %.fca.0.1.insert
  291 }
 292
  293 define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) {
      ; vld2lane with lane argument 0 on undef vectors, both results splatted;
      ; a single ld2r of two .4s registers is expected.
  294 ; CHECK-LABEL: test_vld2q_dup_s32
  295 ; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
  296 entry:
  297   %0 = bitcast i32* %a to i8*
  298   %vld_dup = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
  299   %1 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 0
  300   %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
  301   %2 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 1
  302   %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
  303   %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %lane, 0, 0
  304   %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
  305   ret %struct.int32x4x2_t %.fca.0.1.insert
  306 }
 307
  308 define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) {
      ; vld2lane with lane argument 0 on undef vectors, both results splatted;
      ; a single ld2r of two .2d registers is expected.
  309 ; CHECK-LABEL: test_vld2q_dup_s64
  310 ; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
  311 entry:
  312   %0 = bitcast i64* %a to i8*
  313   %vld_dup = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
  314   %1 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 0
  315   %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
  316   %2 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 1
  317   %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
  318   %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %lane, 0, 0
  319   %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
  320   ret %struct.int64x2x2_t %.fca.0.1.insert
  321 }
 322
  323 define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) {
      ; vld2lane with lane argument 0 on undef vectors, both results splatted;
      ; a single ld2r of two .4s registers is expected.
  324 ; CHECK-LABEL: test_vld2q_dup_f32
  325 ; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
  326 entry:
  327   %0 = bitcast float* %a to i8*
  328   %vld_dup = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
  329   %1 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 0
  330   %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
  331   %2 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 1
  332   %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
  333   %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %lane, 0, 0
  334   %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
  335   ret %struct.float32x4x2_t %.fca.0.1.insert
  336 }
 337
  338 define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) {
      ; vld2lane with lane argument 0 on undef vectors, both results splatted;
      ; a single ld2r of two .2d registers is expected.
  339 ; CHECK-LABEL: test_vld2q_dup_f64
  340 ; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
  341 entry:
  342   %0 = bitcast double* %a to i8*
  343   %vld_dup = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
  344   %1 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 0
  345   %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
  346   %2 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 1
  347   %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
  348   %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %lane, 0, 0
  349   %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
  350   ret %struct.float64x2x2_t %.fca.0.1.insert
  351 }
 352
  353 define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) {
      ; 64-bit vector variant: vld2lane with lane argument 0, both results
      ; splatted; a single ld2r of two .8b registers is expected.
  354 ; CHECK-LABEL: test_vld2_dup_s8
  355 ; CHECK: ld2r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
  356 entry:
  357   %vld_dup = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
  358   %0 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 0
  359   %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
  360   %1 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 1
  361   %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  362   %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %lane, 0, 0
  363   %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
  364   ret %struct.int8x8x2_t %.fca.0.1.insert
  365 }
 366
  367 define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) {
      ; 64-bit vector variant: vld2lane with lane argument 0, both results
      ; splatted; a single ld2r of two .4h registers is expected.
  368 ; CHECK-LABEL: test_vld2_dup_s16
  369 ; CHECK: ld2r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
  370 entry:
  371   %0 = bitcast i16* %a to i8*
  372   %vld_dup = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
  373   %1 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 0
  374   %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
  375   %2 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 1
  376   %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
  377   %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %lane, 0, 0
  378   %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
  379   ret %struct.int16x4x2_t %.fca.0.1.insert
  380 }
 381
  382 define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) {
      ; 64-bit vector variant: vld2lane with lane argument 0, both results
      ; splatted; a single ld2r of two .2s registers is expected.
  383 ; CHECK-LABEL: test_vld2_dup_s32
  384 ; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
  385 entry:
  386   %0 = bitcast i32* %a to i8*
  387   %vld_dup = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
  388   %1 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 0
  389   %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
  390   %2 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 1
  391   %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
  392   %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %lane, 0, 0
  393   %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
  394   ret %struct.int32x2x2_t %.fca.0.1.insert
  395 }
 396
  397 define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) {
      ; Unlike the other vld2_dup tests, this one calls the plain vld2 intrinsic
      ; on <1 x i64> with no splat shuffles; the check expects a two-register
      ; ld1 of .1d vectors rather than ld2r.
  398 ; CHECK-LABEL: test_vld2_dup_s64
  399 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
  400 entry:
  401   %0 = bitcast i64* %a to i8*
  402   %vld_dup = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %0, i32 8)
  403   %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 0
  404   %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 1
  405   %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
  406   %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
  407   ret %struct.int64x1x2_t %.fca.0.1.insert
  408 }
 409
  410 define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) {
      ; 64-bit vector variant: vld2lane with lane argument 0, both results
      ; splatted; a single ld2r of two .2s registers is expected.
  411 ; CHECK-LABEL: test_vld2_dup_f32
  412 ; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
  413 entry:
  414   %0 = bitcast float* %a to i8*
  415   %vld_dup = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
  416   %1 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 0
  417   %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
  418   %2 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 1
  419   %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
  420   %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %lane, 0, 0
  421   %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
  422   ret %struct.float32x2x2_t %.fca.0.1.insert
  423 }
 424
  425 define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) {
      ; Calls the plain vld2 intrinsic on <1 x double> with no splat shuffles;
      ; the check expects a two-register ld1 of .1d vectors rather than ld2r.
  426 ; CHECK-LABEL: test_vld2_dup_f64
  427 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
  428 entry:
  429   %0 = bitcast double* %a to i8*
  430   %vld_dup = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %0, i32 8)
  431   %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 0
  432   %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 1
  433   %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
  434   %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
  435   ret %struct.float64x1x2_t %.fca.0.1.insert
  436 }
 437
  438 define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) {
      ; vld3lane with lane argument 0 on undef vectors, then each of the three
      ; results is splatted with an all-zero shuffle mask and packed into the
      ; struct; a single ld3r of three .16b registers is expected.
  439 ; CHECK-LABEL: test_vld3q_dup_s8
  440 ; CHECK: ld3r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
  441 entry:
  442   %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
  443   %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
  444   %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  445   %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
  446   %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
  447   %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
  448   %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
  449   %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %lane, 0, 0
  450   %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
  451   %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
  452   ret %struct.int8x16x3_t %.fca.0.2.insert
  453 }
 454
  455 define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) {
      ; vld3lane with lane argument 0 on undef vectors, all three results
      ; splatted; a single ld3r of three .8h registers is expected.
  456 ; CHECK-LABEL: test_vld3q_dup_s16
  457 ; CHECK: ld3r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
  458 entry:
  459   %0 = bitcast i16* %a to i8*
  460   %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
  461   %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
  462   %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
  463   %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
  464   %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
  465   %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
  466   %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
  467   %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %lane, 0, 0
  468   %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
  469   %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
  470   ret %struct.int16x8x3_t %.fca.0.2.insert
  471 }
 472
  473 define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) {
      ; vld3lane with lane argument 0 on undef vectors, all three results
      ; splatted; a single ld3r of three .4s registers is expected.
  474 ; CHECK-LABEL: test_vld3q_dup_s32
  475 ; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
  476 entry:
  477   %0 = bitcast i32* %a to i8*
  478   %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
  479   %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
  480   %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
  481   %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
  482   %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
  483   %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
  484   %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
  485   %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %lane, 0, 0
  486   %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
  487   %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
  488   ret %struct.int32x4x3_t %.fca.0.2.insert
  489 }
 490
  491 define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) {
      ; vld3lane with lane argument 0 on undef vectors, all three results
      ; splatted; a single ld3r of three .2d registers is expected.
  492 ; CHECK-LABEL: test_vld3q_dup_s64
  493 ; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
  494 entry:
  495   %0 = bitcast i64* %a to i8*
  496   %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
  497   %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
  498   %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
  499   %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
  500   %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
  501   %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
  502   %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
  503   %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %lane, 0, 0
  504   %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
  505   %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
  506   ret %struct.int64x2x3_t %.fca.0.2.insert
  507 }
 508
  509 define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) {
      ; vld3lane with lane argument 0 on undef vectors, all three results
      ; splatted; a single ld3r of three .4s registers is expected.
  510 ; CHECK-LABEL: test_vld3q_dup_f32
  511 ; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
  512 entry:
  513   %0 = bitcast float* %a to i8*
  514   %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
  515   %1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
  516   %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
  517   %2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
  518   %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
  519   %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
  520   %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
  521   %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %lane, 0, 0
  522   %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
  523   %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
  524   ret %struct.float32x4x3_t %.fca.0.2.insert
  525 }
 526
  527 define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) {
      ; vld3lane with lane argument 0 on undef vectors, all three results
      ; splatted; a single ld3r of three .2d registers is expected.
  528 ; CHECK-LABEL: test_vld3q_dup_f64
  529 ; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
  530 entry:
  531   %0 = bitcast double* %a to i8*
  532   %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
  533   %1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
  534   %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
  535   %2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
  536   %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
  537   %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
  538   %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
  539   %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %lane, 0, 0
  540   %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
  541   %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
  542   ret %struct.float64x2x3_t %.fca.0.2.insert
  543 }
 544
  545 define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) {
      ; 64-bit vector variant: vld3lane with lane argument 0, all three results
      ; splatted; a single ld3r of three .8b registers is expected.
  546 ; CHECK-LABEL: test_vld3_dup_s8
  547 ; CHECK: ld3r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
  548 entry:
  549   %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
  550   %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
  551   %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
  552   %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
  553   %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  554   %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
  555   %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
  556   %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %lane, 0, 0
  557   %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
  558   %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
  559   ret %struct.int8x8x3_t %.fca.0.2.insert
  560 }
 561
  562 define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) {
      ; 64-bit vector variant: vld3lane with lane argument 0, all three results
      ; splatted; a single ld3r of three .4h registers is expected.
  563 ; CHECK-LABEL: test_vld3_dup_s16
  564 ; CHECK: ld3r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
  565 entry:
  566   %0 = bitcast i16* %a to i8*
  567   %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
  568   %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
  569   %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
  570   %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
  571   %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
  572   %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
  573   %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
  574   %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %lane, 0, 0
  575   %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
  576   %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
  577   ret %struct.int16x4x3_t %.fca.0.2.insert
  578 }
 579
  580 define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) {
      ; 64-bit vector variant: vld3lane with lane argument 0, all three results
      ; splatted; a single ld3r of three .2s registers is expected.
  581 ; CHECK-LABEL: test_vld3_dup_s32
  582 ; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
  583 entry:
  584   %0 = bitcast i32* %a to i8*
  585   %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
  586   %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
  587   %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
  588   %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
  589   %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
  590   %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
  591   %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
  592   %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %lane, 0, 0
  593   %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
  594   %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
  595   ret %struct.int32x2x3_t %.fca.0.2.insert
  596 }
 597
  598 define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) {
      ; Unlike the other vld3_dup tests, this one calls the plain vld3 intrinsic
      ; on <1 x i64> with no splat shuffles; the check expects a three-register
      ; ld1 of .1d vectors rather than ld3r.
  599 ; CHECK-LABEL: test_vld3_dup_s64
  600 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
  601 entry:
  602   %0 = bitcast i64* %a to i8*
  603   %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %0, i32 8)
  604   %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
  605   %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
  606   %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
  607   %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
  608   %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
  609   %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
  610   ret %struct.int64x1x3_t %.fca.0.2.insert
  611 }
 612
 613 define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) {
     ; Replicating 3-vector load of float: vld3lane is called with undef input
     ; vectors and trailing operands i32 0, i32 4 (presumably lane index and
     ; alignment - confirm), then element 0 of each result is splatted.
     ; The pattern below expects one ld3r on .2s registers addressed through x0.
 614 ; CHECK-LABEL: test_vld3_dup_f32
 615 ; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
 616 entry:
 617   %0 = bitcast float* %a to i8*
 618   %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
     ; The all-zero shuffle mask copies element 0 into every element.
 619   %1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
 620   %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
 621   %2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
 622   %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
 623   %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
 624   %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
     ; Repack the three splatted vectors into the aggregate return type.
 625   %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %lane, 0, 0
 626   %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
 627   %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
 628   ret %struct.float32x2x3_t %.fca.0.2.insert
 629 }
 630
 631 define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) {
     ; <1 x double> variant of the vld3 dup tests. The IR here is a plain vld3
     ; call with no shuffles (presumably because a one-element vector needs no
     ; replication - confirm). The pattern below expects ld1 on three .1d
     ; registers addressed through x0 rather than ld3r.
 632 ; CHECK-LABEL: test_vld3_dup_f64
 633 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
 634 entry:
 635   %0 = bitcast double* %a to i8*
 636   %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %0, i32 8)
     ; Move the three results from the intrinsic's struct into the return type.
 637   %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
 638   %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
 639   %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
 640   %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
 641   %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
 642   %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
 643   ret %struct.float64x1x3_t %.fca.0.2.insert
 644 }
 645
 646 define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) {
     ; Replicating 4-vector load of i8 into <16 x i8> vectors: vld4lane is
     ; called with undef input vectors and trailing operands i32 0, i32 1
     ; (presumably lane index and alignment - confirm), then element 0 of each
     ; result is splatted. %a is already i8*, so no pointer bitcast is needed.
     ; The pattern below expects one ld4r on .16b registers addressed through x0.
 647 ; CHECK-LABEL: test_vld4q_dup_s8
 648 ; CHECK: ld4r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
 649 entry:
 650   %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
     ; The all-zero shuffle mask copies element 0 into every element.
 651   %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
 652   %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
 653   %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
 654   %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
 655   %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
 656   %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
 657   %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 3
 658   %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> zeroinitializer
     ; Repack the four splatted vectors into the aggregate return type.
 659   %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %lane, 0, 0
 660   %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
 661   %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
 662   %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %lane3, 0, 3
 663   ret %struct.int8x16x4_t %.fca.0.3.insert
 664 }
 665
 666 define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) {
     ; Replicating 4-vector load of i16 into <8 x i16> vectors: vld4lane is
     ; called with undef input vectors and trailing operands i32 0, i32 2
     ; (presumably lane index and alignment - confirm), then element 0 of each
     ; result is splatted.
     ; The pattern below expects one ld4r on .8h registers addressed through x0.
 667 ; CHECK-LABEL: test_vld4q_dup_s16
 668 ; CHECK: ld4r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
 669 entry:
 670   %0 = bitcast i16* %a to i8*
 671   %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
     ; The all-zero shuffle mask copies element 0 into every element.
 672   %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
 673   %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
 674   %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
 675   %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
 676   %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
 677   %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
 678   %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 3
 679   %lane3 = shufflevector <8 x i16> %4, <8 x i16> undef, <8 x i32> zeroinitializer
     ; Repack the four splatted vectors into the aggregate return type.
 680   %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %lane, 0, 0
 681   %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
 682   %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
 683   %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %lane3, 0, 3
 684   ret %struct.int16x8x4_t %.fca.0.3.insert
 685 }
 686
 687 define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) {
     ; Replicating 4-vector load of i32 into <4 x i32> vectors: vld4lane is
     ; called with undef input vectors and trailing operands i32 0, i32 4
     ; (presumably lane index and alignment - confirm), then element 0 of each
     ; result is splatted.
     ; The pattern below expects one ld4r on .4s registers addressed through x0.
 688 ; CHECK-LABEL: test_vld4q_dup_s32
 689 ; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
 690 entry:
 691   %0 = bitcast i32* %a to i8*
 692   %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
     ; The all-zero shuffle mask copies element 0 into every element.
 693   %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
 694   %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
 695   %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
 696   %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
 697   %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
 698   %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
 699   %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 3
 700   %lane3 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer
     ; Repack the four splatted vectors into the aggregate return type.
 701   %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %lane, 0, 0
 702   %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
 703   %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
 704   %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %lane3, 0, 3
 705   ret %struct.int32x4x4_t %.fca.0.3.insert
 706 }
 707
 708 define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) {
     ; Replicating 4-vector load of i64 into <2 x i64> vectors: vld4lane is
     ; called with undef input vectors and trailing operands i32 0, i32 8
     ; (presumably lane index and alignment - confirm), then element 0 of each
     ; result is splatted.
     ; The pattern below expects one ld4r on .2d registers addressed through x0.
 709 ; CHECK-LABEL: test_vld4q_dup_s64
 710 ; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
 711 entry:
 712   %0 = bitcast i64* %a to i8*
 713   %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
     ; The all-zero shuffle mask copies element 0 into every element.
 714   %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
 715   %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
 716   %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
 717   %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
 718   %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
 719   %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
 720   %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 3
 721   %lane3 = shufflevector <2 x i64> %4, <2 x i64> undef, <2 x i32> zeroinitializer
     ; Repack the four splatted vectors into the aggregate return type.
 722   %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %lane, 0, 0
 723   %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
 724   %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
 725   %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %lane3, 0, 3
 726   ret %struct.int64x2x4_t %.fca.0.3.insert
 727 }
 728
 729 define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) {
     ; Replicating 4-vector load of float into <4 x float> vectors: vld4lane is
     ; called with undef input vectors and trailing operands i32 0, i32 4
     ; (presumably lane index and alignment - confirm), then element 0 of each
     ; result is splatted.
     ; The pattern below expects one ld4r on .4s registers addressed through x0.
 730 ; CHECK-LABEL: test_vld4q_dup_f32
 731 ; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
 732 entry:
 733   %0 = bitcast float* %a to i8*
 734   %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
     ; The all-zero shuffle mask copies element 0 into every element.
 735   %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
 736   %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
 737   %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
 738   %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
 739   %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
 740   %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
 741   %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 3
 742   %lane3 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> zeroinitializer
     ; Repack the four splatted vectors into the aggregate return type.
 743   %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %lane, 0, 0
 744   %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
 745   %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
 746   %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %lane3, 0, 3
 747   ret %struct.float32x4x4_t %.fca.0.3.insert
 748 }
 749
 750 define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) {
     ; Replicating 4-vector load of double into <2 x double> vectors: vld4lane
     ; is called with undef input vectors and trailing operands i32 0, i32 8
     ; (presumably lane index and alignment - confirm), then element 0 of each
     ; result is splatted.
     ; The pattern below expects one ld4r on .2d registers addressed through x0.
 751 ; CHECK-LABEL: test_vld4q_dup_f64
 752 ; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
 753 entry:
 754   %0 = bitcast double* %a to i8*
 755   %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
     ; The all-zero shuffle mask copies element 0 into every element.
 756   %1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
 757   %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
 758   %2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
 759   %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
 760   %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
 761   %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
 762   %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 3
 763   %lane3 = shufflevector <2 x double> %4, <2 x double> undef, <2 x i32> zeroinitializer
     ; Repack the four splatted vectors into the aggregate return type.
 764   %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %lane, 0, 0
 765   %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
 766   %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
 767   %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %lane3, 0, 3
 768   ret %struct.float64x2x4_t %.fca.0.3.insert
 769 }
 770
 771 define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) {
     ; Replicating 4-vector load of i8 into <8 x i8> vectors: vld4lane is
     ; called with undef input vectors and trailing operands i32 0, i32 1
     ; (presumably lane index and alignment - confirm), then element 0 of each
     ; result is splatted. %a is already i8*, so no pointer bitcast is needed.
     ; The pattern below expects one ld4r on .8b registers addressed through x0.
 772 ; CHECK-LABEL: test_vld4_dup_s8
 773 ; CHECK: ld4r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
 774 entry:
 775   %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
     ; The all-zero shuffle mask copies element 0 into every element.
 776   %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
 777   %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
 778   %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
 779   %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
 780   %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
 781   %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
 782   %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 3
 783   %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer
     ; Repack the four splatted vectors into the aggregate return type.
 784   %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %lane, 0, 0
 785   %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
 786   %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
 787   %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %lane3, 0, 3
 788   ret %struct.int8x8x4_t %.fca.0.3.insert
 789 }
 790
 791 define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) {
     ; Replicating 4-vector load of i16 into <4 x i16> vectors: vld4lane is
     ; called with undef input vectors and trailing operands i32 0, i32 2
     ; (presumably lane index and alignment - confirm), then element 0 of each
     ; result is splatted.
     ; The pattern below expects one ld4r on .4h registers addressed through x0.
 792 ; CHECK-LABEL: test_vld4_dup_s16
 793 ; CHECK: ld4r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
 794 entry:
 795   %0 = bitcast i16* %a to i8*
 796   %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
     ; The all-zero shuffle mask copies element 0 into every element.
 797   %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
 798   %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
 799   %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
 800   %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
 801   %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
 802   %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
 803   %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 3
 804   %lane3 = shufflevector <4 x i16> %4, <4 x i16> undef, <4 x i32> zeroinitializer
     ; Repack the four splatted vectors into the aggregate return type.
 805   %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %lane, 0, 0
 806   %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
 807   %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
 808   %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %lane3, 0, 3
 809   ret %struct.int16x4x4_t %.fca.0.3.insert
 810 }
 811
 812 define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) {
     ; Replicating 4-vector load of i32 into <2 x i32> vectors: vld4lane is
     ; called with undef input vectors and trailing operands i32 0, i32 4
     ; (presumably lane index and alignment - confirm), then element 0 of each
     ; result is splatted.
     ; The pattern below expects one ld4r on .2s registers addressed through x0.
 813 ; CHECK-LABEL: test_vld4_dup_s32
 814 ; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
 815 entry:
 816   %0 = bitcast i32* %a to i8*
 817   %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
     ; The all-zero shuffle mask copies element 0 into every element.
 818   %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
 819   %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
 820   %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
 821   %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
 822   %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
 823   %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
 824   %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 3
 825   %lane3 = shufflevector <2 x i32> %4, <2 x i32> undef, <2 x i32> zeroinitializer
     ; Repack the four splatted vectors into the aggregate return type.
 826   %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %lane, 0, 0
 827   %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
 828   %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
 829   %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %lane3, 0, 3
 830   ret %struct.int32x2x4_t %.fca.0.3.insert
 831 }
 832
 833 define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) {
     ; <1 x i64> variant of the vld4 dup tests. The IR here is a plain vld4
     ; call with no shuffles (presumably because a one-element vector needs no
     ; replication - confirm). The pattern below expects ld1 on four .1d
     ; registers addressed through x0 rather than ld4r.
 834 ; CHECK-LABEL: test_vld4_dup_s64
 835 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
 836 entry:
 837   %0 = bitcast i64* %a to i8*
 838   %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %0, i32 8)
     ; Move the four results from the intrinsic's struct into the return type.
 839   %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
 840   %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
 841   %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
 842   %vld_dup.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 3
 843   %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
 844   %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
 845   %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
 846   %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld_dup.fca.3.extract, 0, 3
 847   ret %struct.int64x1x4_t %.fca.0.3.insert
 848 }
 849
 850 define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) {
     ; Replicating 4-vector load of float into <2 x float> vectors: vld4lane is
     ; called with undef input vectors and trailing operands i32 0, i32 4
     ; (presumably lane index and alignment - confirm), then element 0 of each
     ; result is splatted.
     ; The pattern below expects one ld4r on .2s registers addressed through x0.
 851 ; CHECK-LABEL: test_vld4_dup_f32
 852 ; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
 853 entry:
 854   %0 = bitcast float* %a to i8*
 855   %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
     ; The all-zero shuffle mask copies element 0 into every element.
 856   %1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
 857   %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
 858   %2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
 859   %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
 860   %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
 861   %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
 862   %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 3
 863   %lane3 = shufflevector <2 x float> %4, <2 x float> undef, <2 x i32> zeroinitializer
     ; Repack the four splatted vectors into the aggregate return type.
 864   %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %lane, 0, 0
 865   %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
 866   %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
 867   %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %lane3, 0, 3
 868   ret %struct.float32x2x4_t %.fca.0.3.insert
 869 }
 870
 871 define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) {
     ; <1 x double> variant of the vld4 dup tests. The IR here is a plain vld4
     ; call with no shuffles (presumably because a one-element vector needs no
     ; replication - confirm). The pattern below expects ld1 on four .1d
     ; registers addressed through x0 rather than ld4r.
 872 ; CHECK-LABEL: test_vld4_dup_f64
 873 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
 874 entry:
 875   %0 = bitcast double* %a to i8*
 876   %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %0, i32 8)
     ; Move the four results from the intrinsic's struct into the return type.
 877   %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
 878   %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
 879   %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
 880   %vld_dup.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 3
 881   %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
 882   %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
 883   %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
 884   %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld_dup.fca.3.extract, 0, 3
 885   ret %struct.float64x1x4_t %.fca.0.3.insert
 886 }
 887
 888 define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) {
     ; Single-lane load: an i8 loaded from %a replaces element 15 of %b.
     ; The pattern below expects a lane-indexed ld1 on a .b element addressed
     ; through x0; it accepts any decimal lane index, not specifically 15.
 889 ; CHECK-LABEL: test_vld1q_lane_s8
 890 ; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
 891 entry:
 892   %0 = load i8* %a, align 1
 893   %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15
 894   ret <16 x i8> %vld1_lane
 895 }
 896
 897 define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) {
     ; Single-lane load: an i16 loaded from %a replaces element 7 of %b.
     ; The pattern below expects a lane-indexed ld1 on a .h element addressed
     ; through x0; it accepts any decimal lane index, not specifically 7.
 898 ; CHECK-LABEL: test_vld1q_lane_s16
 899 ; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
 900 entry:
 901   %0 = load i16* %a, align 2
 902   %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7
 903   ret <8 x i16> %vld1_lane
 904 }
 905
 906 define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) {
     ; Single-lane load: an i32 loaded from %a replaces element 3 of %b.
     ; The pattern below expects a lane-indexed ld1 on a .s element addressed
     ; through x0; it accepts any decimal lane index, not specifically 3.
 907 ; CHECK-LABEL: test_vld1q_lane_s32
 908 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
 909 entry:
 910   %0 = load i32* %a, align 4
 911   %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3
 912   ret <4 x i32> %vld1_lane
 913 }
 914
 915 define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) {
     ; Single-lane load: an i64 loaded from %a replaces element 1 of %b.
     ; The pattern below expects a lane-indexed ld1 on a .d element addressed
     ; through x0; it accepts any decimal lane index, not specifically 1.
 916 ; CHECK-LABEL: test_vld1q_lane_s64
 917 ; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
 918 entry:
 919   %0 = load i64* %a, align 8
 920   %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1
 921   ret <2 x i64> %vld1_lane
 922 }
 923
 924 define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) {
     ; Single-lane load: a float loaded from %a replaces element 3 of %b.
     ; The pattern below expects a lane-indexed ld1 on a .s element addressed
     ; through x0; it accepts any decimal lane index, not specifically 3.
 925 ; CHECK-LABEL: test_vld1q_lane_f32
 926 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
 927 entry:
 928   %0 = load float* %a, align 4
 929   %vld1_lane = insertelement <4 x float> %b, float %0, i32 3
 930   ret <4 x float> %vld1_lane
 931 }
 932
 933 define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) {
     ; Single-lane load: a double loaded from %a replaces element 1 of %b.
     ; The pattern below expects a lane-indexed ld1 on a .d element addressed
     ; through x0; it accepts any decimal lane index, not specifically 1.
 934 ; CHECK-LABEL: test_vld1q_lane_f64
 935 ; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
 936 entry:
 937   %0 = load double* %a, align 8
 938   %vld1_lane = insertelement <2 x double> %b, double %0, i32 1
 939   ret <2 x double> %vld1_lane
 940 }
 941
 942 define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) {
     ; Single-lane load on an <8 x i8> vector: an i8 loaded from %a replaces
     ; element 7 of %b.
     ; The pattern below expects a lane-indexed ld1 on a .b element addressed
     ; through x0; it accepts any decimal lane index, not specifically 7.
 943 ; CHECK-LABEL: test_vld1_lane_s8
 944 ; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
 945 entry:
 946   %0 = load i8* %a, align 1
 947   %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7
 948   ret <8 x i8> %vld1_lane
 949 }
 950
 951 define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) {
     ; Single-lane load on a <4 x i16> vector: an i16 loaded from %a replaces
     ; element 3 of %b.
     ; The pattern below expects a lane-indexed ld1 on a .h element addressed
     ; through x0; it accepts any decimal lane index, not specifically 3.
 952 ; CHECK-LABEL: test_vld1_lane_s16
 953 ; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
 954 entry:
 955   %0 = load i16* %a, align 2
 956   %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3
 957   ret <4 x i16> %vld1_lane
 958 }
 959
 960 define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) {
     ; Single-lane load on a <2 x i32> vector: an i32 loaded from %a replaces
     ; element 1 of %b.
     ; The pattern below expects a lane-indexed ld1 on a .s element addressed
     ; through x0; it accepts any decimal lane index, not specifically 1.
 961 ; CHECK-LABEL: test_vld1_lane_s32
 962 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
 963 entry:
 964   %0 = load i32* %a, align 4
 965   %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1
 966   ret <2 x i32> %vld1_lane
 967 }
 968
 969 define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) {
     ; <1 x i64> variant of the vld1 lane tests. %b is not used: the loaded i64
     ; is inserted into element 0 of an undef vector, which fully defines the
     ; one-element result.
     ; The pattern below expects ld1r on a .1d register addressed through x0,
     ; not the lane-indexed ld1 form the wider variants expect.
 970 ; CHECK-LABEL: test_vld1_lane_s64
 971 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
 972 entry:
 973   %0 = load i64* %a, align 8
 974   %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0
 975   ret <1 x i64> %vld1_lane
 976 }
 977
 978 define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) {
     ; Single-lane load on a <2 x float> vector: a float loaded from %a replaces
     ; element 1 of %b.
     ; The pattern below expects a lane-indexed ld1 on a .s element addressed
     ; through x0; it accepts any decimal lane index, not specifically 1.
 979 ; CHECK-LABEL: test_vld1_lane_f32
 980 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
 981 entry:
 982   %0 = load float* %a, align 4
 983   %vld1_lane = insertelement <2 x float> %b, float %0, i32 1
 984   ret <2 x float> %vld1_lane
 985 }
 986
 987 define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) {
     ; <1 x double> variant of the vld1 lane tests. %b is not used: the loaded
     ; double is inserted into element 0 of an undef vector, which fully
     ; defines the one-element result.
     ; The pattern below expects ld1r on a .1d register addressed through x0,
     ; not the lane-indexed ld1 form the wider variants expect.
 988 ; CHECK-LABEL: test_vld1_lane_f64
 989 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
 990 entry:
 991   %0 = load double* %a, align 8
 992   %vld1_lane = insertelement <1 x double> undef, double %0, i32 0
 993   ret <1 x double> %vld1_lane
 994 }
 995
 996 define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
     ; Two-vector single-lane load on <8 x i16>: both vectors of %b.coerce are
     ; passed to vld2lane with trailing operands i32 7, i32 2 (presumably lane
     ; index and alignment - confirm), and the two results are returned.
     ; The pattern below expects a lane-indexed ld2 on .h elements addressed
     ; through x0; it accepts any decimal lane index.
 997 ; CHECK-LABEL: test_vld2q_lane_s16
 998 ; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
 999 entry:
1000   %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
1001   %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
1002   %0 = bitcast i16* %a to i8*
1003   %vld2_lane = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
     ; Move both results from the intrinsic's struct into the return type.
1004   %vld2_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 0
1005   %vld2_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 1
1006   %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2_lane.fca.0.extract, 0, 0
1007   %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2_lane.fca.1.extract, 0, 1
1008   ret %struct.int16x8x2_t %.fca.0.1.insert
1009 }
1010
1011 define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
     ; Two-vector single-lane load on <4 x i32>: both vectors of %b.coerce are
     ; passed to vld2lane with trailing operands i32 3, i32 4 (presumably lane
     ; index and alignment - confirm), and the two results are returned.
     ; The pattern below expects a lane-indexed ld2 on .s elements addressed
     ; through x0; it accepts any decimal lane index.
1012 ; CHECK-LABEL: test_vld2q_lane_s32
1013 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1014 entry:
1015   %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
1016   %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
1017   %0 = bitcast i32* %a to i8*
1018   %vld2_lane = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
     ; Move both results from the intrinsic's struct into the return type.
1019   %vld2_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 0
1020   %vld2_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 1
1021   %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2_lane.fca.0.extract, 0, 0
1022   %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2_lane.fca.1.extract, 0, 1
1023   ret %struct.int32x4x2_t %.fca.0.1.insert
1024 }
1025
1026 define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
     ; Two-vector single-lane load on <2 x i64>: both vectors of %b.coerce are
     ; passed to vld2lane with trailing operands i32 1, i32 8 (presumably lane
     ; index and alignment - confirm), and the two results are returned.
     ; The pattern below expects a lane-indexed ld2 on .d elements addressed
     ; through x0; it accepts any decimal lane index.
1027 ; CHECK-LABEL: test_vld2q_lane_s64
1028 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1029 entry:
1030   %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
1031   %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
1032   %0 = bitcast i64* %a to i8*
1033   %vld2_lane = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
     ; Move both results from the intrinsic's struct into the return type.
1034   %vld2_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 0
1035   %vld2_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 1
1036   %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2_lane.fca.0.extract, 0, 0
1037   %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2_lane.fca.1.extract, 0, 1
1038   ret %struct.int64x2x2_t %.fca.0.1.insert
1039 }
1040
1041 define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
     ; Two-vector single-lane load on <4 x float>: both vectors of %b.coerce
     ; are passed to vld2lane with trailing operands i32 3, i32 4 (presumably
     ; lane index and alignment - confirm), and the two results are returned.
     ; The pattern below expects a lane-indexed ld2 on .s elements addressed
     ; through x0; it accepts any decimal lane index.
1042 ; CHECK-LABEL: test_vld2q_lane_f32
1043 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1044 entry:
1045   %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
1046   %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
1047   %0 = bitcast float* %a to i8*
1048   %vld2_lane = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
     ; Move both results from the intrinsic's struct into the return type.
1049   %vld2_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 0
1050   %vld2_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 1
1051   %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2_lane.fca.0.extract, 0, 0
1052   %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2_lane.fca.1.extract, 0, 1
1053   ret %struct.float32x4x2_t %.fca.0.1.insert
1054 }
1055
1056 define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
     ; Two-vector single-lane load on <2 x double>: both vectors of %b.coerce
     ; are passed to vld2lane with trailing operands i32 1, i32 8 (presumably
     ; lane index and alignment - confirm), and the two results are returned.
     ; The pattern below expects a lane-indexed ld2 on .d elements addressed
     ; through x0; it accepts any decimal lane index.
1057 ; CHECK-LABEL: test_vld2q_lane_f64
1058 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1059 entry:
1060   %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
1061   %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
1062   %0 = bitcast double* %a to i8*
1063   %vld2_lane = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
     ; Move both results from the intrinsic's struct into the return type.
1064   %vld2_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 0
1065   %vld2_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 1
1066   %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2_lane.fca.0.extract, 0, 0
1067   %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2_lane.fca.1.extract, 0, 1
1068   ret %struct.float64x2x2_t %.fca.0.1.insert
1069 }
1070
1071 define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
     ; Two-vector single-lane load on <8 x i8>: both vectors of %b.coerce are
     ; passed to vld2lane with trailing operands i32 7, i32 1 (presumably lane
     ; index and alignment - confirm), and the two results are returned.
     ; %a is already i8*, so no pointer bitcast is needed.
     ; The pattern below expects a lane-indexed ld2 on .b elements addressed
     ; through x0; it accepts any decimal lane index.
1072 ; CHECK-LABEL: test_vld2_lane_s8
1073 ; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1074 entry:
1075   %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
1076   %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
1077   %vld2_lane = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
     ; Move both results from the intrinsic's struct into the return type.
1078   %vld2_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 0
1079   %vld2_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 1
1080   %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2_lane.fca.0.extract, 0, 0
1081   %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2_lane.fca.1.extract, 0, 1
1082   ret %struct.int8x8x2_t %.fca.0.1.insert
1083 }
1084
1085 define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
     ; Two-vector single-lane load on <4 x i16>: both vectors of %b.coerce are
     ; passed to vld2lane with trailing operands i32 3, i32 2 (presumably lane
     ; index and alignment - confirm), and the two results are returned.
     ; The pattern below expects a lane-indexed ld2 on .h elements addressed
     ; through x0; it accepts any decimal lane index.
1086 ; CHECK-LABEL: test_vld2_lane_s16
1087 ; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1088 entry:
1089   %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
1090   %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
1091   %0 = bitcast i16* %a to i8*
1092   %vld2_lane = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
     ; Move both results from the intrinsic's struct into the return type.
1093   %vld2_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 0
1094   %vld2_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 1
1095   %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2_lane.fca.0.extract, 0, 0
1096   %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2_lane.fca.1.extract, 0, 1
1097   ret %struct.int16x4x2_t %.fca.0.1.insert
1098 }
1099
1100 define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
1101 ; CHECK-LABEL: test_vld2_lane_s32
1102 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1103 entry: ; vld2lane over two <2 x i32> inputs (lane operand 1): expect one lane-indexed ld2 addressed by x0
1104   %in0 = extractvalue [2 x <2 x i32>] %b.coerce, 0
1105   %in1 = extractvalue [2 x <2 x i32>] %b.coerce, 1
1106   %addr = bitcast i32* %a to i8* ; the intrinsic takes its address operand as i8*
1107   %ld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %addr, <2 x i32> %in0, <2 x i32> %in1, i32 1, i32 4)
1108   %out0 = extractvalue { <2 x i32>, <2 x i32> } %ld2, 0
1109   %out1 = extractvalue { <2 x i32>, <2 x i32> } %ld2, 1
1110   %ret0 = insertvalue %struct.int32x2x2_t undef, <2 x i32> %out0, 0, 0
1111   %ret1 = insertvalue %struct.int32x2x2_t %ret0, <2 x i32> %out1, 0, 1
1112   ret %struct.int32x2x2_t %ret1
1113 }
1114
1115 define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
1116 ; CHECK-LABEL: test_vld2_lane_s64
1117 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1118 entry: ; vld2lane over two <1 x i64> inputs (lane operand 0): expect one lane-indexed ld2 addressed by x0
1119   %in0 = extractvalue [2 x <1 x i64>] %b.coerce, 0
1120   %in1 = extractvalue [2 x <1 x i64>] %b.coerce, 1
1121   %addr = bitcast i64* %a to i8* ; the intrinsic takes its address operand as i8*
1122   %ld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8* %addr, <1 x i64> %in0, <1 x i64> %in1, i32 0, i32 8)
1123   %out0 = extractvalue { <1 x i64>, <1 x i64> } %ld2, 0
1124   %out1 = extractvalue { <1 x i64>, <1 x i64> } %ld2, 1
1125   %ret0 = insertvalue %struct.int64x1x2_t undef, <1 x i64> %out0, 0, 0
1126   %ret1 = insertvalue %struct.int64x1x2_t %ret0, <1 x i64> %out1, 0, 1
1127   ret %struct.int64x1x2_t %ret1
1128 }
1129
1130 define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
1131 ; CHECK-LABEL: test_vld2_lane_f32
1132 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1133 entry: ; vld2lane over two <2 x float> inputs (lane operand 1): expect one lane-indexed ld2 addressed by x0
1134   %in0 = extractvalue [2 x <2 x float>] %b.coerce, 0
1135   %in1 = extractvalue [2 x <2 x float>] %b.coerce, 1
1136   %addr = bitcast float* %a to i8* ; the intrinsic takes its address operand as i8*
1137   %ld2 = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %addr, <2 x float> %in0, <2 x float> %in1, i32 1, i32 4)
1138   %out0 = extractvalue { <2 x float>, <2 x float> } %ld2, 0
1139   %out1 = extractvalue { <2 x float>, <2 x float> } %ld2, 1
1140   %ret0 = insertvalue %struct.float32x2x2_t undef, <2 x float> %out0, 0, 0
1141   %ret1 = insertvalue %struct.float32x2x2_t %ret0, <2 x float> %out1, 0, 1
1142   ret %struct.float32x2x2_t %ret1
1143 }
1144
1145 define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
1146 ; CHECK-LABEL: test_vld2_lane_f64
1147 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1148 entry: ; vld2lane over two <1 x double> inputs (lane operand 0): expect one lane-indexed ld2 addressed by x0
1149   %in0 = extractvalue [2 x <1 x double>] %b.coerce, 0
1150   %in1 = extractvalue [2 x <1 x double>] %b.coerce, 1
1151   %addr = bitcast double* %a to i8* ; the intrinsic takes its address operand as i8*
1152   %ld2 = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8* %addr, <1 x double> %in0, <1 x double> %in1, i32 0, i32 8)
1153   %out0 = extractvalue { <1 x double>, <1 x double> } %ld2, 0
1154   %out1 = extractvalue { <1 x double>, <1 x double> } %ld2, 1
1155   %ret0 = insertvalue %struct.float64x1x2_t undef, <1 x double> %out0, 0, 0
1156   %ret1 = insertvalue %struct.float64x1x2_t %ret0, <1 x double> %out1, 0, 1
1157   ret %struct.float64x1x2_t %ret1
1158 }
1159
1160 define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
1161 ; CHECK-LABEL: test_vld3q_lane_s16
1162 ; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1163 entry: ; vld3lane over three <8 x i16> inputs (lane operand 7): expect one lane-indexed ld3 addressed by x0
1164   %in0 = extractvalue [3 x <8 x i16>] %b.coerce, 0
1165   %in1 = extractvalue [3 x <8 x i16>] %b.coerce, 1
1166   %in2 = extractvalue [3 x <8 x i16>] %b.coerce, 2
1167   %addr = bitcast i16* %a to i8* ; the intrinsic takes its address operand as i8*
1168   %ld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %addr, <8 x i16> %in0, <8 x i16> %in1, <8 x i16> %in2, i32 7, i32 2)
1169   %out0 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %ld3, 0
1170   %out1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %ld3, 1
1171   %out2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %ld3, 2
1172   %ret0 = insertvalue %struct.int16x8x3_t undef, <8 x i16> %out0, 0, 0
1173   %ret1 = insertvalue %struct.int16x8x3_t %ret0, <8 x i16> %out1, 0, 1
1174   %ret2 = insertvalue %struct.int16x8x3_t %ret1, <8 x i16> %out2, 0, 2
1175   ret %struct.int16x8x3_t %ret2
1176 }
1177
1178 define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
1179 ; CHECK-LABEL: test_vld3q_lane_s32
1180 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1181 entry: ; vld3lane over three <4 x i32> inputs (lane operand 3): expect one lane-indexed ld3 addressed by x0
1182   %in0 = extractvalue [3 x <4 x i32>] %b.coerce, 0
1183   %in1 = extractvalue [3 x <4 x i32>] %b.coerce, 1
1184   %in2 = extractvalue [3 x <4 x i32>] %b.coerce, 2
1185   %addr = bitcast i32* %a to i8* ; the intrinsic takes its address operand as i8*
1186   %ld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %addr, <4 x i32> %in0, <4 x i32> %in1, <4 x i32> %in2, i32 3, i32 4)
1187   %out0 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %ld3, 0
1188   %out1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %ld3, 1
1189   %out2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %ld3, 2
1190   %ret0 = insertvalue %struct.int32x4x3_t undef, <4 x i32> %out0, 0, 0
1191   %ret1 = insertvalue %struct.int32x4x3_t %ret0, <4 x i32> %out1, 0, 1
1192   %ret2 = insertvalue %struct.int32x4x3_t %ret1, <4 x i32> %out2, 0, 2
1193   ret %struct.int32x4x3_t %ret2
1194 }
1195
1196 define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
1197 ; CHECK-LABEL: test_vld3q_lane_s64
1198 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1199 entry: ; vld3lane over three <2 x i64> inputs (lane operand 1): expect one lane-indexed ld3 addressed by x0
1200   %in0 = extractvalue [3 x <2 x i64>] %b.coerce, 0
1201   %in1 = extractvalue [3 x <2 x i64>] %b.coerce, 1
1202   %in2 = extractvalue [3 x <2 x i64>] %b.coerce, 2
1203   %addr = bitcast i64* %a to i8* ; the intrinsic takes its address operand as i8*
1204   %ld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %addr, <2 x i64> %in0, <2 x i64> %in1, <2 x i64> %in2, i32 1, i32 8)
1205   %out0 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %ld3, 0
1206   %out1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %ld3, 1
1207   %out2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %ld3, 2
1208   %ret0 = insertvalue %struct.int64x2x3_t undef, <2 x i64> %out0, 0, 0
1209   %ret1 = insertvalue %struct.int64x2x3_t %ret0, <2 x i64> %out1, 0, 1
1210   %ret2 = insertvalue %struct.int64x2x3_t %ret1, <2 x i64> %out2, 0, 2
1211   ret %struct.int64x2x3_t %ret2
1212 }
1213
1214 define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
1215 ; CHECK-LABEL: test_vld3q_lane_f32
1216 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1217 entry: ; vld3lane over three <4 x float> inputs (lane operand 3): expect one lane-indexed ld3 addressed by x0
1218   %in0 = extractvalue [3 x <4 x float>] %b.coerce, 0
1219   %in1 = extractvalue [3 x <4 x float>] %b.coerce, 1
1220   %in2 = extractvalue [3 x <4 x float>] %b.coerce, 2
1221   %addr = bitcast float* %a to i8* ; the intrinsic takes its address operand as i8*
1222   %ld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %addr, <4 x float> %in0, <4 x float> %in1, <4 x float> %in2, i32 3, i32 4)
1223   %out0 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %ld3, 0
1224   %out1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %ld3, 1
1225   %out2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %ld3, 2
1226   %ret0 = insertvalue %struct.float32x4x3_t undef, <4 x float> %out0, 0, 0
1227   %ret1 = insertvalue %struct.float32x4x3_t %ret0, <4 x float> %out1, 0, 1
1228   %ret2 = insertvalue %struct.float32x4x3_t %ret1, <4 x float> %out2, 0, 2
1229   ret %struct.float32x4x3_t %ret2
1230 }
1231
1232 define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
1233 ; CHECK-LABEL: test_vld3q_lane_f64
1234 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1235 entry: ; vld3lane over three <2 x double> inputs (lane operand 1): expect one lane-indexed ld3 addressed by x0
1236   %in0 = extractvalue [3 x <2 x double>] %b.coerce, 0
1237   %in1 = extractvalue [3 x <2 x double>] %b.coerce, 1
1238   %in2 = extractvalue [3 x <2 x double>] %b.coerce, 2
1239   %addr = bitcast double* %a to i8* ; the intrinsic takes its address operand as i8*
1240   %ld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %addr, <2 x double> %in0, <2 x double> %in1, <2 x double> %in2, i32 1, i32 8)
1241   %out0 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %ld3, 0
1242   %out1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %ld3, 1
1243   %out2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %ld3, 2
1244   %ret0 = insertvalue %struct.float64x2x3_t undef, <2 x double> %out0, 0, 0
1245   %ret1 = insertvalue %struct.float64x2x3_t %ret0, <2 x double> %out1, 0, 1
1246   %ret2 = insertvalue %struct.float64x2x3_t %ret1, <2 x double> %out2, 0, 2
1247   ret %struct.float64x2x3_t %ret2
1248 }
1249
1250 define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
1251 ; CHECK-LABEL: test_vld3_lane_s8
1252 ; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1253 entry: ; vld3lane over three <8 x i8> inputs (lane operand 7): expect one lane-indexed ld3 addressed by x0
1254   %in0 = extractvalue [3 x <8 x i8>] %b.coerce, 0
1255   %in1 = extractvalue [3 x <8 x i8>] %b.coerce, 1
1256   %in2 = extractvalue [3 x <8 x i8>] %b.coerce, 2
1257   %ld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> %in0, <8 x i8> %in1, <8 x i8> %in2, i32 7, i32 1)
1258   %out0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %ld3, 0
1259   %out1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %ld3, 1
1260   %out2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %ld3, 2
1261   %ret0 = insertvalue %struct.int8x8x3_t undef, <8 x i8> %out0, 0, 0
1262   %ret1 = insertvalue %struct.int8x8x3_t %ret0, <8 x i8> %out1, 0, 1
1263   %ret2 = insertvalue %struct.int8x8x3_t %ret1, <8 x i8> %out2, 0, 2
1264   ret %struct.int8x8x3_t %ret2
1265 }
1266
1267 define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
1268 ; CHECK-LABEL: test_vld3_lane_s16
1269 ; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1270 entry: ; vld3lane over three <4 x i16> inputs (lane operand 3): expect one lane-indexed ld3 addressed by x0
1271   %in0 = extractvalue [3 x <4 x i16>] %b.coerce, 0
1272   %in1 = extractvalue [3 x <4 x i16>] %b.coerce, 1
1273   %in2 = extractvalue [3 x <4 x i16>] %b.coerce, 2
1274   %addr = bitcast i16* %a to i8* ; the intrinsic takes its address operand as i8*
1275   %ld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %addr, <4 x i16> %in0, <4 x i16> %in1, <4 x i16> %in2, i32 3, i32 2)
1276   %out0 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %ld3, 0
1277   %out1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %ld3, 1
1278   %out2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %ld3, 2
1279   %ret0 = insertvalue %struct.int16x4x3_t undef, <4 x i16> %out0, 0, 0
1280   %ret1 = insertvalue %struct.int16x4x3_t %ret0, <4 x i16> %out1, 0, 1
1281   %ret2 = insertvalue %struct.int16x4x3_t %ret1, <4 x i16> %out2, 0, 2
1282   ret %struct.int16x4x3_t %ret2
1283 }
1284
1285 define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
1286 ; CHECK-LABEL: test_vld3_lane_s32
1287 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1288 entry: ; vld3lane over three <2 x i32> inputs (lane operand 1): expect one lane-indexed ld3 addressed by x0
1289   %in0 = extractvalue [3 x <2 x i32>] %b.coerce, 0
1290   %in1 = extractvalue [3 x <2 x i32>] %b.coerce, 1
1291   %in2 = extractvalue [3 x <2 x i32>] %b.coerce, 2
1292   %addr = bitcast i32* %a to i8* ; the intrinsic takes its address operand as i8*
1293   %ld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %addr, <2 x i32> %in0, <2 x i32> %in1, <2 x i32> %in2, i32 1, i32 4)
1294   %out0 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %ld3, 0
1295   %out1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %ld3, 1
1296   %out2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %ld3, 2
1297   %ret0 = insertvalue %struct.int32x2x3_t undef, <2 x i32> %out0, 0, 0
1298   %ret1 = insertvalue %struct.int32x2x3_t %ret0, <2 x i32> %out1, 0, 1
1299   %ret2 = insertvalue %struct.int32x2x3_t %ret1, <2 x i32> %out2, 0, 2
1300   ret %struct.int32x2x3_t %ret2
1301 }
1302
1303 define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
1304 ; CHECK-LABEL: test_vld3_lane_s64
1305 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1306 entry: ; vld3lane over three <1 x i64> inputs (lane operand 0): expect one lane-indexed ld3 addressed by x0
1307   %in0 = extractvalue [3 x <1 x i64>] %b.coerce, 0
1308   %in1 = extractvalue [3 x <1 x i64>] %b.coerce, 1
1309   %in2 = extractvalue [3 x <1 x i64>] %b.coerce, 2
1310   %addr = bitcast i64* %a to i8* ; the intrinsic takes its address operand as i8*
1311   %ld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8* %addr, <1 x i64> %in0, <1 x i64> %in1, <1 x i64> %in2, i32 0, i32 8)
1312   %out0 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %ld3, 0
1313   %out1 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %ld3, 1
1314   %out2 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %ld3, 2
1315   %ret0 = insertvalue %struct.int64x1x3_t undef, <1 x i64> %out0, 0, 0
1316   %ret1 = insertvalue %struct.int64x1x3_t %ret0, <1 x i64> %out1, 0, 1
1317   %ret2 = insertvalue %struct.int64x1x3_t %ret1, <1 x i64> %out2, 0, 2
1318   ret %struct.int64x1x3_t %ret2
1319 }
1320
1321 define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
1322 ; CHECK-LABEL: test_vld3_lane_f32
1323 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1324 entry: ; vld3lane over three <2 x float> inputs (lane operand 1): expect one lane-indexed ld3 addressed by x0
1325   %in0 = extractvalue [3 x <2 x float>] %b.coerce, 0
1326   %in1 = extractvalue [3 x <2 x float>] %b.coerce, 1
1327   %in2 = extractvalue [3 x <2 x float>] %b.coerce, 2
1328   %addr = bitcast float* %a to i8* ; the intrinsic takes its address operand as i8*
1329   %ld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %addr, <2 x float> %in0, <2 x float> %in1, <2 x float> %in2, i32 1, i32 4)
1330   %out0 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %ld3, 0
1331   %out1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %ld3, 1
1332   %out2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %ld3, 2
1333   %ret0 = insertvalue %struct.float32x2x3_t undef, <2 x float> %out0, 0, 0
1334   %ret1 = insertvalue %struct.float32x2x3_t %ret0, <2 x float> %out1, 0, 1
1335   %ret2 = insertvalue %struct.float32x2x3_t %ret1, <2 x float> %out2, 0, 2
1336   ret %struct.float32x2x3_t %ret2
1337 }
1338
1339 define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
1340 ; CHECK-LABEL: test_vld3_lane_f64
1341 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1342 entry: ; vld3lane over three <1 x double> inputs (lane operand 0): expect one lane-indexed ld3 addressed by x0
1343   %in0 = extractvalue [3 x <1 x double>] %b.coerce, 0
1344   %in1 = extractvalue [3 x <1 x double>] %b.coerce, 1
1345   %in2 = extractvalue [3 x <1 x double>] %b.coerce, 2
1346   %addr = bitcast double* %a to i8* ; the intrinsic takes its address operand as i8*
1347   %ld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8* %addr, <1 x double> %in0, <1 x double> %in1, <1 x double> %in2, i32 0, i32 8)
1348   %out0 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %ld3, 0
1349   %out1 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %ld3, 1
1350   %out2 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %ld3, 2
1351   %ret0 = insertvalue %struct.float64x1x3_t undef, <1 x double> %out0, 0, 0
1352   %ret1 = insertvalue %struct.float64x1x3_t %ret0, <1 x double> %out1, 0, 1
1353   %ret2 = insertvalue %struct.float64x1x3_t %ret1, <1 x double> %out2, 0, 2
1354   ret %struct.float64x1x3_t %ret2
1355 }
1356
1357 define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
1358 ; CHECK-LABEL: test_vld4q_lane_s8
1359 ; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1360 entry: ; vld4lane over four <16 x i8> inputs (lane operand 15): expect one lane-indexed ld4 addressed by x0
1361   %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
1362   %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
1363   %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
1364   %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
1365   %vld4_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1)
1366   %vld4_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 0
1367   %vld4_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 1
1368   %vld4_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 2
1369   %vld4_lane.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 3
1370   %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4_lane.fca.0.extract, 0, 0
1371   %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4_lane.fca.1.extract, 0, 1
1372   %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4_lane.fca.2.extract, 0, 2
1373   %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4_lane.fca.3.extract, 0, 3
1374   ret %struct.int8x16x4_t %.fca.0.3.insert
1375 }
1376
1377 define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
1378 ; CHECK-LABEL: test_vld4q_lane_s16
1379 ; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1380 entry: ; vld4lane over four <8 x i16> inputs (lane operand 7): expect one lane-indexed ld4 addressed by x0
1381   %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
1382   %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
1383   %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
1384   %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
1385   %0 = bitcast i16* %a to i8* ; the intrinsic takes its address operand as i8*
1386   %vld4_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
1387   %vld4_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 0
1388   %vld4_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 1
1389   %vld4_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 2
1390   %vld4_lane.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 3
1391   %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4_lane.fca.0.extract, 0, 0
1392   %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4_lane.fca.1.extract, 0, 1
1393   %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4_lane.fca.2.extract, 0, 2
1394   %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4_lane.fca.3.extract, 0, 3
1395   ret %struct.int16x8x4_t %.fca.0.3.insert
1396 }
1397
1398 define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
1399 ; CHECK-LABEL: test_vld4q_lane_s32
1400 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1401 entry: ; vld4lane over four <4 x i32> inputs (lane operand 3): expect one lane-indexed ld4 addressed by x0
1402   %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
1403   %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
1404   %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
1405   %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
1406   %0 = bitcast i32* %a to i8* ; the intrinsic takes its address operand as i8*
1407   %vld4_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
1408   %vld4_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 0
1409   %vld4_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 1
1410   %vld4_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 2
1411   %vld4_lane.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 3
1412   %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4_lane.fca.0.extract, 0, 0
1413   %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4_lane.fca.1.extract, 0, 1
1414   %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4_lane.fca.2.extract, 0, 2
1415   %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4_lane.fca.3.extract, 0, 3
1416   ret %struct.int32x4x4_t %.fca.0.3.insert
1417 }
1418
1419 define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
1420 ; CHECK-LABEL: test_vld4q_lane_s64
1421 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1422 entry: ; vld4lane over four <2 x i64> inputs (lane operand 1): expect one lane-indexed ld4 addressed by x0
1423   %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
1424   %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
1425   %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
1426   %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
1427   %0 = bitcast i64* %a to i8* ; the intrinsic takes its address operand as i8*
1428   %vld4_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
1429   %vld4_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 0
1430   %vld4_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 1
1431   %vld4_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 2
1432   %vld4_lane.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 3
1433   %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4_lane.fca.0.extract, 0, 0
1434   %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4_lane.fca.1.extract, 0, 1
1435   %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4_lane.fca.2.extract, 0, 2
1436   %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4_lane.fca.3.extract, 0, 3
1437   ret %struct.int64x2x4_t %.fca.0.3.insert
1438 }
1439
1440 define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
1441 ; CHECK-LABEL: test_vld4q_lane_f32
1442 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1443 entry: ; vld4lane over four <4 x float> inputs (lane operand 3): expect one lane-indexed ld4 addressed by x0
1444   %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
1445   %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
1446   %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
1447   %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
1448   %0 = bitcast float* %a to i8* ; the intrinsic takes its address operand as i8*
1449   %vld4_lane = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
1450   %vld4_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 0
1451   %vld4_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 1
1452   %vld4_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 2
1453   %vld4_lane.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 3
1454   %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4_lane.fca.0.extract, 0, 0
1455   %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4_lane.fca.1.extract, 0, 1
1456   %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4_lane.fca.2.extract, 0, 2
1457   %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4_lane.fca.3.extract, 0, 3
1458   ret %struct.float32x4x4_t %.fca.0.3.insert
1459 }
1460
1461 define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
1462 ; CHECK-LABEL: test_vld4q_lane_f64
1463 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1464 entry: ; vld4lane over four <2 x double> inputs (lane operand 1): expect one lane-indexed ld4 addressed by x0
1465   %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
1466   %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
1467   %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
1468   %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
1469   %0 = bitcast double* %a to i8* ; the intrinsic takes its address operand as i8*
1470   %vld4_lane = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
1471   %vld4_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 0
1472   %vld4_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 1
1473   %vld4_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 2
1474   %vld4_lane.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 3
1475   %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4_lane.fca.0.extract, 0, 0
1476   %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4_lane.fca.1.extract, 0, 1
1477   %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4_lane.fca.2.extract, 0, 2
1478   %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4_lane.fca.3.extract, 0, 3
1479   ret %struct.float64x2x4_t %.fca.0.3.insert
1480 }
1481
1482 define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
1483 ; CHECK-LABEL: test_vld4_lane_s8
1484 ; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1485 entry: ; vld4lane over four <8 x i8> inputs (lane operand 7): expect one lane-indexed ld4 addressed by x0
1486   %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
1487   %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
1488   %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
1489   %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
1490   %vld4_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
1491   %vld4_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 0
1492   %vld4_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 1
1493   %vld4_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 2
1494   %vld4_lane.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 3
1495   %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4_lane.fca.0.extract, 0, 0
1496   %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4_lane.fca.1.extract, 0, 1
1497   %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4_lane.fca.2.extract, 0, 2
1498   %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4_lane.fca.3.extract, 0, 3
1499   ret %struct.int8x8x4_t %.fca.0.3.insert
1500 }
1501
; Four-vector lane load on <4 x i16> vectors: %a is cast to i8* and the
; elements of %b.coerce are passed to @llvm.arm.neon.vld4lane.v4i16 (lane
; index 3, alignment 2); the four results are repacked into %struct.int16x4x4_t.
; FileCheck expects a halfword-lane ld4 based on x0.
define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
; CHECK-LABEL: test_vld4_lane_s16
; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i16* %a to i8*
  %in.0 = extractvalue [4 x <4 x i16>] %b.coerce, 0
  %in.1 = extractvalue [4 x <4 x i16>] %b.coerce, 1
  %in.2 = extractvalue [4 x <4 x i16>] %b.coerce, 2
  %in.3 = extractvalue [4 x <4 x i16>] %b.coerce, 3
  %vld4_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %ptr, <4 x i16> %in.0, <4 x i16> %in.1, <4 x i16> %in.2, <4 x i16> %in.3, i32 3, i32 2)
  ; Move each loaded vector from the call result into the return struct.
  %out.0 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 0
  %ret.0 = insertvalue %struct.int16x4x4_t undef, <4 x i16> %out.0, 0, 0
  %out.1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 1
  %ret.1 = insertvalue %struct.int16x4x4_t %ret.0, <4 x i16> %out.1, 0, 1
  %out.2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 2
  %ret.2 = insertvalue %struct.int16x4x4_t %ret.1, <4 x i16> %out.2, 0, 2
  %out.3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 3
  %ret.3 = insertvalue %struct.int16x4x4_t %ret.2, <4 x i16> %out.3, 0, 3
  ret %struct.int16x4x4_t %ret.3
}
1522
; Four-vector lane load on <2 x i32> vectors: %a is cast to i8* and the
; elements of %b.coerce are passed to @llvm.arm.neon.vld4lane.v2i32 (lane
; index 1, alignment 4); the four results are repacked into %struct.int32x2x4_t.
; FileCheck expects a word-lane ld4 based on x0.
define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
; CHECK-LABEL: test_vld4_lane_s32
; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i32* %a to i8*
  %in.0 = extractvalue [4 x <2 x i32>] %b.coerce, 0
  %in.1 = extractvalue [4 x <2 x i32>] %b.coerce, 1
  %in.2 = extractvalue [4 x <2 x i32>] %b.coerce, 2
  %in.3 = extractvalue [4 x <2 x i32>] %b.coerce, 3
  %vld4_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %ptr, <2 x i32> %in.0, <2 x i32> %in.1, <2 x i32> %in.2, <2 x i32> %in.3, i32 1, i32 4)
  ; Move each loaded vector from the call result into the return struct.
  %out.0 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 0
  %ret.0 = insertvalue %struct.int32x2x4_t undef, <2 x i32> %out.0, 0, 0
  %out.1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 1
  %ret.1 = insertvalue %struct.int32x2x4_t %ret.0, <2 x i32> %out.1, 0, 1
  %out.2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 2
  %ret.2 = insertvalue %struct.int32x2x4_t %ret.1, <2 x i32> %out.2, 0, 2
  %out.3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 3
  %ret.3 = insertvalue %struct.int32x2x4_t %ret.2, <2 x i32> %out.3, 0, 3
  ret %struct.int32x2x4_t %ret.3
}
1543
; Four-vector lane load on <1 x i64> vectors: %a is cast to i8* and the
; elements of %b.coerce are passed to @llvm.arm.neon.vld4lane.v1i64 (lane
; index 0, alignment 8); the four results are repacked into %struct.int64x1x4_t.
; FileCheck expects a doubleword-lane ld4 based on x0.
define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
; CHECK-LABEL: test_vld4_lane_s64
; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i64* %a to i8*
  %in.0 = extractvalue [4 x <1 x i64>] %b.coerce, 0
  %in.1 = extractvalue [4 x <1 x i64>] %b.coerce, 1
  %in.2 = extractvalue [4 x <1 x i64>] %b.coerce, 2
  %in.3 = extractvalue [4 x <1 x i64>] %b.coerce, 3
  %vld4_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8* %ptr, <1 x i64> %in.0, <1 x i64> %in.1, <1 x i64> %in.2, <1 x i64> %in.3, i32 0, i32 8)
  ; Move each loaded vector from the call result into the return struct.
  %out.0 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 0
  %ret.0 = insertvalue %struct.int64x1x4_t undef, <1 x i64> %out.0, 0, 0
  %out.1 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 1
  %ret.1 = insertvalue %struct.int64x1x4_t %ret.0, <1 x i64> %out.1, 0, 1
  %out.2 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 2
  %ret.2 = insertvalue %struct.int64x1x4_t %ret.1, <1 x i64> %out.2, 0, 2
  %out.3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 3
  %ret.3 = insertvalue %struct.int64x1x4_t %ret.2, <1 x i64> %out.3, 0, 3
  ret %struct.int64x1x4_t %ret.3
}
1564
; Four-vector lane load on <2 x float> vectors: %a is cast to i8* and the
; elements of %b.coerce are passed to @llvm.arm.neon.vld4lane.v2f32 (lane
; index 1, alignment 4); the four results are repacked into
; %struct.float32x2x4_t. FileCheck expects a word-lane ld4 based on x0.
define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
; CHECK-LABEL: test_vld4_lane_f32
; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast float* %a to i8*
  %in.0 = extractvalue [4 x <2 x float>] %b.coerce, 0
  %in.1 = extractvalue [4 x <2 x float>] %b.coerce, 1
  %in.2 = extractvalue [4 x <2 x float>] %b.coerce, 2
  %in.3 = extractvalue [4 x <2 x float>] %b.coerce, 3
  %vld4_lane = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %ptr, <2 x float> %in.0, <2 x float> %in.1, <2 x float> %in.2, <2 x float> %in.3, i32 1, i32 4)
  ; Move each loaded vector from the call result into the return struct.
  %out.0 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 0
  %ret.0 = insertvalue %struct.float32x2x4_t undef, <2 x float> %out.0, 0, 0
  %out.1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 1
  %ret.1 = insertvalue %struct.float32x2x4_t %ret.0, <2 x float> %out.1, 0, 1
  %out.2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 2
  %ret.2 = insertvalue %struct.float32x2x4_t %ret.1, <2 x float> %out.2, 0, 2
  %out.3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 3
  %ret.3 = insertvalue %struct.float32x2x4_t %ret.2, <2 x float> %out.3, 0, 3
  ret %struct.float32x2x4_t %ret.3
}
1585
; Four-vector lane load on <1 x double> vectors: %a is cast to i8* and the
; elements of %b.coerce are passed to @llvm.arm.neon.vld4lane.v1f64 (lane
; index 0, alignment 8); the four results are repacked into
; %struct.float64x1x4_t. FileCheck expects a doubleword-lane ld4 based on x0.
define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
; CHECK-LABEL: test_vld4_lane_f64
; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast double* %a to i8*
  %in.0 = extractvalue [4 x <1 x double>] %b.coerce, 0
  %in.1 = extractvalue [4 x <1 x double>] %b.coerce, 1
  %in.2 = extractvalue [4 x <1 x double>] %b.coerce, 2
  %in.3 = extractvalue [4 x <1 x double>] %b.coerce, 3
  %vld4_lane = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8* %ptr, <1 x double> %in.0, <1 x double> %in.1, <1 x double> %in.2, <1 x double> %in.3, i32 0, i32 8)
  ; Move each loaded vector from the call result into the return struct.
  %out.0 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 0
  %ret.0 = insertvalue %struct.float64x1x4_t undef, <1 x double> %out.0, 0, 0
  %out.1 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 1
  %ret.1 = insertvalue %struct.float64x1x4_t %ret.0, <1 x double> %out.1, 0, 1
  %out.2 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 2
  %ret.2 = insertvalue %struct.float64x1x4_t %ret.1, <1 x double> %out.2, 0, 2
  %out.3 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 3
  %ret.3 = insertvalue %struct.float64x1x4_t %ret.2, <1 x double> %out.3, 0, 3
  ret %struct.float64x1x4_t %ret.3
}
1606
; Single-lane store from a <16 x i8> vector: element 15 of %b is extracted and
; stored to %a with align 1. FileCheck expects a byte-lane st1 based on x0.
define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) {
; CHECK-LABEL: test_vst1q_lane_s8
; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <16 x i8> %b, i32 15
  store i8 %lane, i8* %a, align 1
  ret void
}
1615
; Single-lane store from an <8 x i16> vector: element 7 of %b is extracted and
; stored to %a with align 2. FileCheck expects a halfword-lane st1 based on x0.
define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) {
; CHECK-LABEL: test_vst1q_lane_s16
; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <8 x i16> %b, i32 7
  store i16 %lane, i16* %a, align 2
  ret void
}
1624
; Single-lane store from a <4 x i32> vector: element 3 of %b is extracted and
; stored to %a with align 4. FileCheck expects a word-lane st1 based on x0.
define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
; CHECK-LABEL: test_vst1q_lane_s32
; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <4 x i32> %b, i32 3
  store i32 %lane, i32* %a, align 4
  ret void
}
1633
; Single-lane store from a <2 x i64> vector: element 1 of %b is extracted and
; stored to %a with align 8. FileCheck expects a doubleword-lane st1 based on x0.
define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
; CHECK-LABEL: test_vst1q_lane_s64
; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <2 x i64> %b, i32 1
  store i64 %lane, i64* %a, align 8
  ret void
}
1642
; Single-lane store from a <4 x float> vector: element 3 of %b is extracted and
; stored to %a with align 4. FileCheck expects a word-lane st1 based on x0.
define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
; CHECK-LABEL: test_vst1q_lane_f32
; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <4 x float> %b, i32 3
  store float %lane, float* %a, align 4
  ret void
}
1651
; Single-lane store from a <2 x double> vector: element 1 of %b is extracted and
; stored to %a with align 8. FileCheck expects a doubleword-lane st1 based on x0.
define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
; CHECK-LABEL: test_vst1q_lane_f64
; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <2 x double> %b, i32 1
  store double %lane, double* %a, align 8
  ret void
}
1660
; Single-lane store from an <8 x i8> vector: element 7 of %b is extracted and
; stored to %a with align 1. FileCheck expects a byte-lane st1 based on x0.
define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
; CHECK-LABEL: test_vst1_lane_s8
; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <8 x i8> %b, i32 7
  store i8 %lane, i8* %a, align 1
  ret void
}
1669
; Single-lane store from a <4 x i16> vector: element 3 of %b is extracted and
; stored to %a with align 2. FileCheck expects a halfword-lane st1 based on x0.
define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) {
; CHECK-LABEL: test_vst1_lane_s16
; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <4 x i16> %b, i32 3
  store i16 %lane, i16* %a, align 2
  ret void
}
1678
; Single-lane store from a <2 x i32> vector: element 1 of %b is extracted and
; stored to %a with align 4. FileCheck expects a word-lane st1 based on x0.
define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
; CHECK-LABEL: test_vst1_lane_s32
; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <2 x i32> %b, i32 1
  store i32 %lane, i32* %a, align 4
  ret void
}
1687
; Single-lane store from a <1 x i64> vector: element 0 of %b is extracted and
; stored to %a with align 8. FileCheck expects a doubleword-lane st1 based on x0.
define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
; CHECK-LABEL: test_vst1_lane_s64
; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <1 x i64> %b, i32 0
  store i64 %lane, i64* %a, align 8
  ret void
}
1696
; Single-lane store from a <2 x float> vector: element 1 of %b is extracted and
; stored to %a with align 4. FileCheck expects a word-lane st1 based on x0.
define void @test_vst1_lane_f32(float* %a, <2 x float> %b) {
; CHECK-LABEL: test_vst1_lane_f32
; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <2 x float> %b, i32 1
  store float %lane, float* %a, align 4
  ret void
}
1705
; Single-lane store from a <1 x double> vector: element 0 of %b is extracted and
; stored to %a with align 8. FileCheck expects a doubleword-lane st1 based on x0.
define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
; CHECK-LABEL: test_vst1_lane_f64
; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %lane = extractelement <1 x double> %b, i32 0
  store double %lane, double* %a, align 8
  ret void
}
1714
; Two-vector lane store on <16 x i8> vectors: both elements of %b.coerce are
; passed to @llvm.arm.neon.vst2lane.v16i8 with lane index 15 and alignment 1.
; FileCheck expects a byte-lane st2 based on x0.
define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_s8
; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %vec.0 = extractvalue [2 x <16 x i8>] %b.coerce, 0
  %vec.1 = extractvalue [2 x <16 x i8>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v16i8(i8* %a, <16 x i8> %vec.0, <16 x i8> %vec.1, i32 15, i32 1)
  ret void
}
1724
; Two-vector lane store on <8 x i16> vectors: %a is cast to i8* and both
; elements of %b.coerce are passed to @llvm.arm.neon.vst2lane.v8i16 with lane
; index 7 and alignment 2. FileCheck expects a halfword-lane st2 based on x0.
define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_s16
; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i16* %a to i8*
  %vec.0 = extractvalue [2 x <8 x i16>] %b.coerce, 0
  %vec.1 = extractvalue [2 x <8 x i16>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v8i16(i8* %ptr, <8 x i16> %vec.0, <8 x i16> %vec.1, i32 7, i32 2)
  ret void
}
1735
; Two-vector lane store on <4 x i32> vectors: %a is cast to i8* and both
; elements of %b.coerce are passed to @llvm.arm.neon.vst2lane.v4i32 with lane
; index 3 and alignment 4. FileCheck expects a word-lane st2 based on x0.
define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_s32
; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i32* %a to i8*
  %vec.0 = extractvalue [2 x <4 x i32>] %b.coerce, 0
  %vec.1 = extractvalue [2 x <4 x i32>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v4i32(i8* %ptr, <4 x i32> %vec.0, <4 x i32> %vec.1, i32 3, i32 4)
  ret void
}
1746
; Two-vector lane store on <2 x i64> vectors: %a is cast to i8* and both
; elements of %b.coerce are passed to @llvm.arm.neon.vst2lane.v2i64 with lane
; index 1 and alignment 8. FileCheck expects a doubleword-lane st2 based on x0.
define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_s64
; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i64* %a to i8*
  %vec.0 = extractvalue [2 x <2 x i64>] %b.coerce, 0
  %vec.1 = extractvalue [2 x <2 x i64>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v2i64(i8* %ptr, <2 x i64> %vec.0, <2 x i64> %vec.1, i32 1, i32 8)
  ret void
}
1757
; Two-vector lane store on <4 x float> vectors: %a is cast to i8* and both
; elements of %b.coerce are passed to @llvm.arm.neon.vst2lane.v4f32 with lane
; index 3 and alignment 4. FileCheck expects a word-lane st2 based on x0.
define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_f32
; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast float* %a to i8*
  %vec.0 = extractvalue [2 x <4 x float>] %b.coerce, 0
  %vec.1 = extractvalue [2 x <4 x float>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v4f32(i8* %ptr, <4 x float> %vec.0, <4 x float> %vec.1, i32 3, i32 4)
  ret void
}
1768
; Two-vector lane store on <2 x double> vectors: %a is cast to i8* and both
; elements of %b.coerce are passed to @llvm.arm.neon.vst2lane.v2f64 with lane
; index 1 and alignment 8. FileCheck expects a doubleword-lane st2 based on x0.
define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
; CHECK-LABEL: test_vst2q_lane_f64
; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast double* %a to i8*
  %vec.0 = extractvalue [2 x <2 x double>] %b.coerce, 0
  %vec.1 = extractvalue [2 x <2 x double>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v2f64(i8* %ptr, <2 x double> %vec.0, <2 x double> %vec.1, i32 1, i32 8)
  ret void
}
1779
; Two-vector lane store on <8 x i8> vectors: both elements of %b.coerce are
; passed to @llvm.arm.neon.vst2lane.v8i8 with lane index 7 and alignment 1.
; FileCheck expects a byte-lane st2 based on x0.
define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_s8
; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %vec.0 = extractvalue [2 x <8 x i8>] %b.coerce, 0
  %vec.1 = extractvalue [2 x <8 x i8>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %vec.0, <8 x i8> %vec.1, i32 7, i32 1)
  ret void
}
1789
; Two-vector lane store on <4 x i16> vectors: %a is cast to i8* and both
; elements of %b.coerce are passed to @llvm.arm.neon.vst2lane.v4i16 with lane
; index 3 and alignment 2. FileCheck expects a halfword-lane st2 based on x0.
define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_s16
; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i16* %a to i8*
  %vec.0 = extractvalue [2 x <4 x i16>] %b.coerce, 0
  %vec.1 = extractvalue [2 x <4 x i16>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v4i16(i8* %ptr, <4 x i16> %vec.0, <4 x i16> %vec.1, i32 3, i32 2)
  ret void
}
1800
; Two-vector lane store on <2 x i32> vectors: %a is cast to i8* and both
; elements of %b.coerce are passed to @llvm.arm.neon.vst2lane.v2i32 with lane
; index 1 and alignment 4. FileCheck expects a word-lane st2 based on x0.
define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_s32
; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i32* %a to i8*
  %vec.0 = extractvalue [2 x <2 x i32>] %b.coerce, 0
  %vec.1 = extractvalue [2 x <2 x i32>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %ptr, <2 x i32> %vec.0, <2 x i32> %vec.1, i32 1, i32 4)
  ret void
}
1811
; Two-vector lane store on <1 x i64> vectors: %a is cast to i8* and both
; elements of %b.coerce are passed to @llvm.arm.neon.vst2lane.v1i64 with lane
; index 0 and alignment 8. FileCheck expects a doubleword-lane st2 based on x0.
define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_s64
; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i64* %a to i8*
  %vec.0 = extractvalue [2 x <1 x i64>] %b.coerce, 0
  %vec.1 = extractvalue [2 x <1 x i64>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %ptr, <1 x i64> %vec.0, <1 x i64> %vec.1, i32 0, i32 8)
  ret void
}
1822
; Two-vector lane store on <2 x float> vectors: %a is cast to i8* and both
; elements of %b.coerce are passed to @llvm.arm.neon.vst2lane.v2f32 with lane
; index 1 and alignment 4. FileCheck expects a word-lane st2 based on x0.
define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_f32
; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast float* %a to i8*
  %vec.0 = extractvalue [2 x <2 x float>] %b.coerce, 0
  %vec.1 = extractvalue [2 x <2 x float>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v2f32(i8* %ptr, <2 x float> %vec.0, <2 x float> %vec.1, i32 1, i32 4)
  ret void
}
1833
; Two-vector lane store on <1 x double> vectors: %a is cast to i8* and both
; elements of %b.coerce are passed to @llvm.arm.neon.vst2lane.v1f64 with lane
; index 0 and alignment 8. FileCheck expects a doubleword-lane st2 based on x0.
define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
; CHECK-LABEL: test_vst2_lane_f64
; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast double* %a to i8*
  %vec.0 = extractvalue [2 x <1 x double>] %b.coerce, 0
  %vec.1 = extractvalue [2 x <1 x double>] %b.coerce, 1
  tail call void @llvm.arm.neon.vst2lane.v1f64(i8* %ptr, <1 x double> %vec.0, <1 x double> %vec.1, i32 0, i32 8)
  ret void
}
1844
; Three-vector lane store on <16 x i8> vectors: the three elements of %b.coerce
; are passed to @llvm.arm.neon.vst3lane.v16i8 with lane index 15 and
; alignment 1. FileCheck expects a byte-lane st3 based on x0.
define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_s8
; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %vec.0 = extractvalue [3 x <16 x i8>] %b.coerce, 0
  %vec.1 = extractvalue [3 x <16 x i8>] %b.coerce, 1
  %vec.2 = extractvalue [3 x <16 x i8>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v16i8(i8* %a, <16 x i8> %vec.0, <16 x i8> %vec.1, <16 x i8> %vec.2, i32 15, i32 1)
  ret void
}
1855
; Three-vector lane store on <8 x i16> vectors: %a is cast to i8* and the three
; elements of %b.coerce are passed to @llvm.arm.neon.vst3lane.v8i16 with lane
; index 7 and alignment 2. FileCheck expects a halfword-lane st3 based on x0.
define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_s16
; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i16* %a to i8*
  %vec.0 = extractvalue [3 x <8 x i16>] %b.coerce, 0
  %vec.1 = extractvalue [3 x <8 x i16>] %b.coerce, 1
  %vec.2 = extractvalue [3 x <8 x i16>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v8i16(i8* %ptr, <8 x i16> %vec.0, <8 x i16> %vec.1, <8 x i16> %vec.2, i32 7, i32 2)
  ret void
}
1867
; Three-vector lane store on <4 x i32> vectors: %a is cast to i8* and the three
; elements of %b.coerce are passed to @llvm.arm.neon.vst3lane.v4i32 with lane
; index 3 and alignment 4. FileCheck expects a word-lane st3 based on x0.
define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_s32
; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i32* %a to i8*
  %vec.0 = extractvalue [3 x <4 x i32>] %b.coerce, 0
  %vec.1 = extractvalue [3 x <4 x i32>] %b.coerce, 1
  %vec.2 = extractvalue [3 x <4 x i32>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v4i32(i8* %ptr, <4 x i32> %vec.0, <4 x i32> %vec.1, <4 x i32> %vec.2, i32 3, i32 4)
  ret void
}
1879
; Three-vector lane store on <2 x i64> vectors: %a is cast to i8* and the three
; elements of %b.coerce are passed to @llvm.arm.neon.vst3lane.v2i64 with lane
; index 1 and alignment 8. FileCheck expects a doubleword-lane st3 based on x0.
define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_s64
; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i64* %a to i8*
  %vec.0 = extractvalue [3 x <2 x i64>] %b.coerce, 0
  %vec.1 = extractvalue [3 x <2 x i64>] %b.coerce, 1
  %vec.2 = extractvalue [3 x <2 x i64>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v2i64(i8* %ptr, <2 x i64> %vec.0, <2 x i64> %vec.1, <2 x i64> %vec.2, i32 1, i32 8)
  ret void
}
1891
; Three-vector lane store on <4 x float> vectors: %a is cast to i8* and the
; three elements of %b.coerce are passed to @llvm.arm.neon.vst3lane.v4f32 with
; lane index 3 and alignment 4. FileCheck expects a word-lane st3 based on x0.
define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_f32
; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast float* %a to i8*
  %vec.0 = extractvalue [3 x <4 x float>] %b.coerce, 0
  %vec.1 = extractvalue [3 x <4 x float>] %b.coerce, 1
  %vec.2 = extractvalue [3 x <4 x float>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v4f32(i8* %ptr, <4 x float> %vec.0, <4 x float> %vec.1, <4 x float> %vec.2, i32 3, i32 4)
  ret void
}
1903
; Three-vector lane store on <2 x double> vectors: %a is cast to i8* and the
; three elements of %b.coerce are passed to @llvm.arm.neon.vst3lane.v2f64 with
; lane index 1 and alignment 8. FileCheck expects a doubleword-lane st3 based
; on x0.
define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
; CHECK-LABEL: test_vst3q_lane_f64
; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast double* %a to i8*
  %vec.0 = extractvalue [3 x <2 x double>] %b.coerce, 0
  %vec.1 = extractvalue [3 x <2 x double>] %b.coerce, 1
  %vec.2 = extractvalue [3 x <2 x double>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v2f64(i8* %ptr, <2 x double> %vec.0, <2 x double> %vec.1, <2 x double> %vec.2, i32 1, i32 8)
  ret void
}
1915
; Three-vector lane store on <8 x i8> vectors: the three elements of %b.coerce
; are passed to @llvm.arm.neon.vst3lane.v8i8 with lane index 7 and alignment 1.
; FileCheck expects a byte-lane st3 based on x0.
define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_s8
; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %vec.0 = extractvalue [3 x <8 x i8>] %b.coerce, 0
  %vec.1 = extractvalue [3 x <8 x i8>] %b.coerce, 1
  %vec.2 = extractvalue [3 x <8 x i8>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v8i8(i8* %a, <8 x i8> %vec.0, <8 x i8> %vec.1, <8 x i8> %vec.2, i32 7, i32 1)
  ret void
}
1926
; Three-vector lane store on <4 x i16> vectors: %a is cast to i8* and the three
; elements of %b.coerce are passed to @llvm.arm.neon.vst3lane.v4i16 with lane
; index 3 and alignment 2. FileCheck expects a halfword-lane st3 based on x0.
define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_s16
; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i16* %a to i8*
  %vec.0 = extractvalue [3 x <4 x i16>] %b.coerce, 0
  %vec.1 = extractvalue [3 x <4 x i16>] %b.coerce, 1
  %vec.2 = extractvalue [3 x <4 x i16>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %ptr, <4 x i16> %vec.0, <4 x i16> %vec.1, <4 x i16> %vec.2, i32 3, i32 2)
  ret void
}
1938
; Three-vector lane store on <2 x i32> vectors: %a is cast to i8* and the three
; elements of %b.coerce are passed to @llvm.arm.neon.vst3lane.v2i32 with lane
; index 1 and alignment 4. FileCheck expects a word-lane st3 based on x0.
define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_s32
; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i32* %a to i8*
  %vec.0 = extractvalue [3 x <2 x i32>] %b.coerce, 0
  %vec.1 = extractvalue [3 x <2 x i32>] %b.coerce, 1
  %vec.2 = extractvalue [3 x <2 x i32>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v2i32(i8* %ptr, <2 x i32> %vec.0, <2 x i32> %vec.1, <2 x i32> %vec.2, i32 1, i32 4)
  ret void
}
1950
; Three-vector lane store on <1 x i64> vectors: %a is cast to i8* and the three
; elements of %b.coerce are passed to @llvm.arm.neon.vst3lane.v1i64 with lane
; index 0 and alignment 8. FileCheck expects a doubleword-lane st3 based on x0.
define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_s64
; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i64* %a to i8*
  %vec.0 = extractvalue [3 x <1 x i64>] %b.coerce, 0
  %vec.1 = extractvalue [3 x <1 x i64>] %b.coerce, 1
  %vec.2 = extractvalue [3 x <1 x i64>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %ptr, <1 x i64> %vec.0, <1 x i64> %vec.1, <1 x i64> %vec.2, i32 0, i32 8)
  ret void
}
1962
; Three-vector lane store on <2 x float> vectors: %a is cast to i8* and the
; three elements of %b.coerce are passed to @llvm.arm.neon.vst3lane.v2f32 with
; lane index 1 and alignment 4. FileCheck expects a word-lane st3 based on x0.
define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_f32
; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast float* %a to i8*
  %vec.0 = extractvalue [3 x <2 x float>] %b.coerce, 0
  %vec.1 = extractvalue [3 x <2 x float>] %b.coerce, 1
  %vec.2 = extractvalue [3 x <2 x float>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v2f32(i8* %ptr, <2 x float> %vec.0, <2 x float> %vec.1, <2 x float> %vec.2, i32 1, i32 4)
  ret void
}
1974
; Three-vector lane store on <1 x double> vectors: %a is cast to i8* and the
; three elements of %b.coerce are passed to @llvm.arm.neon.vst3lane.v1f64 with
; lane index 0 and alignment 8. FileCheck expects a doubleword-lane st3 based
; on x0.
define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
; CHECK-LABEL: test_vst3_lane_f64
; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast double* %a to i8*
  %vec.0 = extractvalue [3 x <1 x double>] %b.coerce, 0
  %vec.1 = extractvalue [3 x <1 x double>] %b.coerce, 1
  %vec.2 = extractvalue [3 x <1 x double>] %b.coerce, 2
  tail call void @llvm.arm.neon.vst3lane.v1f64(i8* %ptr, <1 x double> %vec.0, <1 x double> %vec.1, <1 x double> %vec.2, i32 0, i32 8)
  ret void
}
1986
; Four-vector lane store on <16 x i8> vectors: the four elements of %b.coerce
; are passed to @llvm.arm.neon.vst4lane.v16i8 with lane index 15 and
; alignment 1. FileCheck expects a byte-lane st4 based on x0.
;
; The pointer is taken directly as i8* with byte alignment, like the other
; byte-lane store tests in this file (e.g. test_vst3q_lane_s8); the previous
; i16* parameter, bitcast and alignment of 2 did not match the i8 element type.
define void @test_vst4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
; CHECK-LABEL: test_vst4q_lane_s8
; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
entry:
  %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
  %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
  %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
  %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
  tail call void @llvm.arm.neon.vst4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1)
  ret void
}
1999
; Four-vector lane store on <8 x i16> vectors: %a is cast to i8* and the four
; elements of %b.coerce are passed to @llvm.arm.neon.vst4lane.v8i16 with lane
; index 7 and alignment 2. FileCheck expects a halfword-lane st4 based on x0.
define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
; CHECK-LABEL: test_vst4q_lane_s16
; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i16* %a to i8*
  %vec.0 = extractvalue [4 x <8 x i16>] %b.coerce, 0
  %vec.1 = extractvalue [4 x <8 x i16>] %b.coerce, 1
  %vec.2 = extractvalue [4 x <8 x i16>] %b.coerce, 2
  %vec.3 = extractvalue [4 x <8 x i16>] %b.coerce, 3
  tail call void @llvm.arm.neon.vst4lane.v8i16(i8* %ptr, <8 x i16> %vec.0, <8 x i16> %vec.1, <8 x i16> %vec.2, <8 x i16> %vec.3, i32 7, i32 2)
  ret void
}
2012
; Four-vector lane store on <4 x i32> vectors: %a is cast to i8* and the four
; elements of %b.coerce are passed to @llvm.arm.neon.vst4lane.v4i32 with lane
; index 3 and alignment 4. FileCheck expects a word-lane st4 based on x0.
define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
; CHECK-LABEL: test_vst4q_lane_s32
; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i32* %a to i8*
  %vec.0 = extractvalue [4 x <4 x i32>] %b.coerce, 0
  %vec.1 = extractvalue [4 x <4 x i32>] %b.coerce, 1
  %vec.2 = extractvalue [4 x <4 x i32>] %b.coerce, 2
  %vec.3 = extractvalue [4 x <4 x i32>] %b.coerce, 3
  tail call void @llvm.arm.neon.vst4lane.v4i32(i8* %ptr, <4 x i32> %vec.0, <4 x i32> %vec.1, <4 x i32> %vec.2, <4 x i32> %vec.3, i32 3, i32 4)
  ret void
}
2025
; Four-vector lane store on <2 x i64> vectors: %a is cast to i8* and the four
; elements of %b.coerce are passed to @llvm.arm.neon.vst4lane.v2i64 with lane
; index 1 and alignment 8. FileCheck expects a doubleword-lane st4 based on x0.
define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
; CHECK-LABEL: test_vst4q_lane_s64
; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast i64* %a to i8*
  %vec.0 = extractvalue [4 x <2 x i64>] %b.coerce, 0
  %vec.1 = extractvalue [4 x <2 x i64>] %b.coerce, 1
  %vec.2 = extractvalue [4 x <2 x i64>] %b.coerce, 2
  %vec.3 = extractvalue [4 x <2 x i64>] %b.coerce, 3
  tail call void @llvm.arm.neon.vst4lane.v2i64(i8* %ptr, <2 x i64> %vec.0, <2 x i64> %vec.1, <2 x i64> %vec.2, <2 x i64> %vec.3, i32 1, i32 8)
  ret void
}
2038
; Four-vector lane store on <4 x float> vectors: %a is cast to i8* and the four
; elements of %b.coerce are passed to @llvm.arm.neon.vst4lane.v4f32 with lane
; index 3 and alignment 4. FileCheck expects a word-lane st4 based on x0.
define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
; CHECK-LABEL: test_vst4q_lane_f32
; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
entry:
  %ptr = bitcast float* %a to i8*
  %vec.0 = extractvalue [4 x <4 x float>] %b.coerce, 0
  %vec.1 = extractvalue [4 x <4 x float>] %b.coerce, 1
  %vec.2 = extractvalue [4 x <4 x float>] %b.coerce, 2
  %vec.3 = extractvalue [4 x <4 x float>] %b.coerce, 3
  tail call void @llvm.arm.neon.vst4lane.v4f32(i8* %ptr, <4 x float> %vec.0, <4 x float> %vec.1, <4 x float> %vec.2, <4 x float> %vec.3, i32 3, i32 4)
  ret void
}
2051
; Four-vector lane store, <2 x double> elements. The [4 x <2 x double>]
; argument is split into its members and handed to
; llvm.arm.neon.vst4lane.v2f64 together with the trailing i32 operands 1 and 8
; (presumably lane index and alignment; confirm against the intrinsic
; definition). The FileCheck pattern below expects a single-lane st4 on .d
; elements addressed through x0.
2052 define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
2053 ; CHECK-LABEL: test_vst4q_lane_f64
2054 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2055 entry:
2056   %vec0 = extractvalue [4 x <2 x double>] %b.coerce, 0
2057   %vec1 = extractvalue [4 x <2 x double>] %b.coerce, 1
2058   %vec2 = extractvalue [4 x <2 x double>] %b.coerce, 2
2059   %vec3 = extractvalue [4 x <2 x double>] %b.coerce, 3
2060   %addr = bitcast double* %a to i8*
2061   tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %addr, <2 x double> %vec0, <2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, i32 1, i32 8)
2062   ret void
2063 }
2064
; Four-vector lane store, <8 x i8> elements. The [4 x <8 x i8>] argument is
; split into its members and handed to llvm.arm.neon.vst4lane.v8i8 together
; with the trailing i32 operands 7 and 1 (presumably lane index and alignment;
; confirm against the intrinsic definition). %a is already an i8*, so it is
; passed straight through without a cast. The FileCheck pattern below expects
; a single-lane st4 on .b elements addressed through x0.
2065 define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
2066 ; CHECK-LABEL: test_vst4_lane_s8
2067 ; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
2068 entry:
2069   %vec0 = extractvalue [4 x <8 x i8>] %b.coerce, 0
2070   %vec1 = extractvalue [4 x <8 x i8>] %b.coerce, 1
2071   %vec2 = extractvalue [4 x <8 x i8>] %b.coerce, 2
2072   %vec3 = extractvalue [4 x <8 x i8>] %b.coerce, 3
2073   tail call void @llvm.arm.neon.vst4lane.v8i8(i8* %a, <8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, <8 x i8> %vec3, i32 7, i32 1)
2074   ret void
2075 }
2076
; Four-vector lane store, <4 x i16> elements. The [4 x <4 x i16>] argument is
; split into its members and handed to llvm.arm.neon.vst4lane.v4i16 together
; with the trailing i32 operands 3 and 2 (presumably lane index and alignment;
; confirm against the intrinsic definition). The FileCheck pattern below
; expects a single-lane st4 on .h elements addressed through x0.
2077 define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
2078 ; CHECK-LABEL: test_vst4_lane_s16
2079 ; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
2080 entry:
2081   %vec0 = extractvalue [4 x <4 x i16>] %b.coerce, 0
2082   %vec1 = extractvalue [4 x <4 x i16>] %b.coerce, 1
2083   %vec2 = extractvalue [4 x <4 x i16>] %b.coerce, 2
2084   %vec3 = extractvalue [4 x <4 x i16>] %b.coerce, 3
2085   %addr = bitcast i16* %a to i8*
2086   tail call void @llvm.arm.neon.vst4lane.v4i16(i8* %addr, <4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2, <4 x i16> %vec3, i32 3, i32 2)
2087   ret void
2088 }
2089
; Four-vector lane store, <2 x i32> elements. The [4 x <2 x i32>] argument is
; split into its members and handed to llvm.arm.neon.vst4lane.v2i32 together
; with the trailing i32 operands 1 and 4 (presumably lane index and alignment;
; confirm against the intrinsic definition). The FileCheck pattern below
; expects a single-lane st4 on .s elements addressed through x0.
2090 define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
2091 ; CHECK-LABEL: test_vst4_lane_s32
2092 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
2093 entry:
2094   %vec0 = extractvalue [4 x <2 x i32>] %b.coerce, 0
2095   %vec1 = extractvalue [4 x <2 x i32>] %b.coerce, 1
2096   %vec2 = extractvalue [4 x <2 x i32>] %b.coerce, 2
2097   %vec3 = extractvalue [4 x <2 x i32>] %b.coerce, 3
2098   %addr = bitcast i32* %a to i8*
2099   tail call void @llvm.arm.neon.vst4lane.v2i32(i8* %addr, <2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> %vec3, i32 1, i32 4)
2100   ret void
2101 }
2102
; Four-vector lane store, <1 x i64> elements. The [4 x <1 x i64>] argument is
; split into its members and handed to llvm.arm.neon.vst4lane.v1i64 together
; with the trailing i32 operands 0 and 8 (presumably lane index and alignment;
; confirm against the intrinsic definition). The FileCheck pattern below
; expects a single-lane st4 on .d elements addressed through x0.
2103 define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
2104 ; CHECK-LABEL: test_vst4_lane_s64
2105 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2106 entry:
2107   %vec0 = extractvalue [4 x <1 x i64>] %b.coerce, 0
2108   %vec1 = extractvalue [4 x <1 x i64>] %b.coerce, 1
2109   %vec2 = extractvalue [4 x <1 x i64>] %b.coerce, 2
2110   %vec3 = extractvalue [4 x <1 x i64>] %b.coerce, 3
2111   %addr = bitcast i64* %a to i8*
2112   tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %addr, <1 x i64> %vec0, <1 x i64> %vec1, <1 x i64> %vec2, <1 x i64> %vec3, i32 0, i32 8)
2113   ret void
2114 }
2115
; Four-vector lane store, <2 x float> elements. The [4 x <2 x float>] argument
; is split into its members and handed to llvm.arm.neon.vst4lane.v2f32 together
; with the trailing i32 operands 1 and 4 (presumably lane index and alignment;
; confirm against the intrinsic definition). The FileCheck pattern below
; expects a single-lane st4 on .s elements addressed through x0.
2116 define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
2117 ; CHECK-LABEL: test_vst4_lane_f32
2118 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
2119 entry:
2120   %vec0 = extractvalue [4 x <2 x float>] %b.coerce, 0
2121   %vec1 = extractvalue [4 x <2 x float>] %b.coerce, 1
2122   %vec2 = extractvalue [4 x <2 x float>] %b.coerce, 2
2123   %vec3 = extractvalue [4 x <2 x float>] %b.coerce, 3
2124   %addr = bitcast float* %a to i8*
2125   tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %addr, <2 x float> %vec0, <2 x float> %vec1, <2 x float> %vec2, <2 x float> %vec3, i32 1, i32 4)
2126   ret void
2127 }
2128
; Four-vector lane store, <1 x double> elements. The [4 x <1 x double>]
; argument is split into its members and handed to
; llvm.arm.neon.vst4lane.v1f64 together with the trailing i32 operands 0 and 8
; (presumably lane index and alignment; confirm against the intrinsic
; definition). The FileCheck pattern below expects a single-lane st4 on .d
; elements addressed through x0.
2129 define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
2130 ; CHECK-LABEL: test_vst4_lane_f64
2131 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2132 entry:
2133   %vec0 = extractvalue [4 x <1 x double>] %b.coerce, 0
2134   %vec1 = extractvalue [4 x <1 x double>] %b.coerce, 1
2135   %vec2 = extractvalue [4 x <1 x double>] %b.coerce, 2
2136   %vec3 = extractvalue [4 x <1 x double>] %b.coerce, 3
2137   %addr = bitcast double* %a to i8*
2138   tail call void @llvm.arm.neon.vst4lane.v1f64(i8* %addr, <1 x double> %vec0, <1 x double> %vec1, <1 x double> %vec2, <1 x double> %vec3, i32 0, i32 8)
2139   ret void
2140 }
2141
; External declarations for the ARM NEON load/store intrinsics referenced by
; this test. Each group below covers one intrinsic family across the element
; types; the signatures must stay exactly as written.

; Two-vector lane loads. The v1i64 and v1f64 entries in this group are the
; non-lane vld2 form; their vld2lane forms are declared in the v1i64/v1f64
; group further down.
2142 declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
2143 declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
2144 declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
2145 declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
2146 declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
2147 declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
2148 declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
2149 declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
2150 declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
2151 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
2152 declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
2153 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
; Three-vector lane loads. As above, the v1i64 and v1f64 entries here are the
; non-lane vld3 form.
2154 declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2155 declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2156 declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2157 declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2158 declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2159 declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2160 declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2161 declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2162 declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2163 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
2164 declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2165 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
; Four-vector lane loads. As above, the v1i64 and v1f64 entries here are the
; non-lane vld4 form.
2166 declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2167 declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2168 declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2169 declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2170 declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2171 declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2172 declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2173 declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2174 declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2175 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
2176 declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2177 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
; Lane-load forms for the single-element vector types (v1i64 / v1f64), for
; two, three and four vectors.
2178 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
2179 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
2180 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2181 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
2182 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2183 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
; Two-vector lane stores.
2184 declare void @llvm.arm.neon.vst2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
2185 declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
2186 declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
2187 declare void @llvm.arm.neon.vst2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
2188 declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
2189 declare void @llvm.arm.neon.vst2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
2190 declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
2191 declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
2192 declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
2193 declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
2194 declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
2195 declare void @llvm.arm.neon.vst2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
; Three-vector lane stores.
2196 declare void @llvm.arm.neon.vst3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2197 declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2198 declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2199 declare void @llvm.arm.neon.vst3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2200 declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2201 declare void @llvm.arm.neon.vst3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2202 declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2203 declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2204 declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2205 declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2206 declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2207 declare void @llvm.arm.neon.vst3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
; Four-vector lane stores.
2208 declare void @llvm.arm.neon.vst4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2209 declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2210 declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2211 declare void @llvm.arm.neon.vst4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2212 declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2213 declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2214 declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2215 declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2216 declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2217 declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2218 declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2219 declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)