test/CodeGen/AArch64/neon-simd-post-ldst-one.ll

   1 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
   2
   3 define { [2 x <16 x i8>] } @test_vld2q_dup_fx_update(i8* %a, i8** %ptr) {
     ; Loads one 2-element i8 structure through lane 0 of vld2lane, broadcasts
     ; each element across a <16 x i8>, and stores %a advanced by 2 i8 elements
     ; to %ptr. Expected selection: post-indexed ld2r with immediate #2.
   4 ; CHECK-LABEL: test_vld2q_dup_fx_update
   5 ; CHECK: ld2r  { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, [x{{[0-9]+|sp}}], #2
   6   %ld = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
   7   %elt0 = extractvalue { <16 x i8>, <16 x i8> } %ld, 0
   8   %dup0 = shufflevector <16 x i8> %elt0, <16 x i8> undef, <16 x i32> zeroinitializer
   9   %elt1 = extractvalue { <16 x i8>, <16 x i8> } %ld, 1
  10   %dup1 = shufflevector <16 x i8> %elt1, <16 x i8> undef, <16 x i32> zeroinitializer
  11   %agg0 = insertvalue { [2 x <16 x i8>] } undef, <16 x i8> %dup0, 0, 0
  12   %agg1 = insertvalue { [2 x <16 x i8>] } %agg0, <16 x i8> %dup1, 0, 1
  13   %next = getelementptr i8* %a, i32 2
  14   store i8* %next, i8** %ptr
  15   ret { [2 x <16 x i8>] } %agg1
  16 }
  17
  18 define { [2 x <4 x i32>] } @test_vld2q_dup_reg_update(i32* %a, i32** %ptr, i32 %inc) {
     ; Loads one 2-element i32 structure through lane 0 of vld2lane, broadcasts
     ; each element across a <4 x i32>, and stores %a advanced by %inc i32
     ; elements to %ptr. Expected selection: ld2r with a register post-index.
  19 ; CHECK-LABEL: test_vld2q_dup_reg_update
  20 ; CHECK: ld2r  { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [x{{[0-9]+|sp}}], x{{[0-9]+}}
  21   %addr = bitcast i32* %a to i8*
  22   %ld = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %addr, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
  23   %elt0 = extractvalue { <4 x i32>, <4 x i32> } %ld, 0
  24   %dup0 = shufflevector <4 x i32> %elt0, <4 x i32> undef, <4 x i32> zeroinitializer
  25   %elt1 = extractvalue { <4 x i32>, <4 x i32> } %ld, 1
  26   %dup1 = shufflevector <4 x i32> %elt1, <4 x i32> undef, <4 x i32> zeroinitializer
  27   %agg0 = insertvalue { [2 x <4 x i32>] } undef, <4 x i32> %dup0, 0, 0
  28   %agg1 = insertvalue { [2 x <4 x i32>] } %agg0, <4 x i32> %dup1, 0, 1
  29   %next = getelementptr i32* %a, i32 %inc
  30   store i32* %next, i32** %ptr
  31   ret { [2 x <4 x i32>] } %agg1
  32 }
  33
  34 define { [3 x <4 x i16>] } @test_vld3_dup_fx_update(i16* %a, i16** %ptr) {
     ; Loads one 3-element i16 structure through lane 0 of vld3lane, broadcasts
     ; each element across a <4 x i16>, and stores %a advanced by 3 i16 elements
     ; to %ptr. Expected selection: post-indexed ld3r with immediate #6.
  35 ; CHECK-LABEL: test_vld3_dup_fx_update
  36 ; CHECK: ld3r  { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, [x{{[0-9]+|sp}}], #6
  37   %addr = bitcast i16* %a to i8*
  38   %ld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %addr, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
  39   %elt0 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %ld, 0
  40   %dup0 = shufflevector <4 x i16> %elt0, <4 x i16> undef, <4 x i32> zeroinitializer
  41   %elt1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %ld, 1
  42   %dup1 = shufflevector <4 x i16> %elt1, <4 x i16> undef, <4 x i32> zeroinitializer
  43   %elt2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %ld, 2
  44   %dup2 = shufflevector <4 x i16> %elt2, <4 x i16> undef, <4 x i32> zeroinitializer
  45   %agg0 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %dup0, 0, 0
  46   %agg1 = insertvalue { [3 x <4 x i16>] } %agg0, <4 x i16> %dup1, 0, 1
  47   %agg2 = insertvalue { [3 x <4 x i16>] } %agg1, <4 x i16> %dup2, 0, 2
  48   %next = getelementptr i16* %a, i32 3
  49   store i16* %next, i16** %ptr
  50   ret { [3 x <4 x i16>] } %agg2
  51 }
  52
  53 define { [3 x <8 x i8>] } @test_vld3_dup_reg_update(i8* %a, i8** %ptr, i32 %inc) {
     ; Loads one 3-element i8 structure through lane 0 of vld3lane, broadcasts
     ; each element across an <8 x i8>, and stores %a advanced by %inc i8
     ; elements to %ptr. Expected selection: ld3r with a register post-index.
  54 ; CHECK-LABEL: test_vld3_dup_reg_update
  55 ; CHECK: ld3r  { v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b }, [x{{[0-9]+|sp}}], x{{[0-9]+}}
  56   %ld = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
  57   %elt0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %ld, 0
  58   %dup0 = shufflevector <8 x i8> %elt0, <8 x i8> undef, <8 x i32> zeroinitializer
  59   %elt1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %ld, 1
  60   %dup1 = shufflevector <8 x i8> %elt1, <8 x i8> undef, <8 x i32> zeroinitializer
  61   %elt2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %ld, 2
  62   %dup2 = shufflevector <8 x i8> %elt2, <8 x i8> undef, <8 x i32> zeroinitializer
  63   %agg0 = insertvalue { [3 x <8 x i8>] } undef, <8 x i8> %dup0, 0, 0
  64   %agg1 = insertvalue { [3 x <8 x i8>] } %agg0, <8 x i8> %dup1, 0, 1
  65   %agg2 = insertvalue { [3 x <8 x i8>] } %agg1, <8 x i8> %dup2, 0, 2
  66   %next = getelementptr i8* %a, i32 %inc
  67   store i8* %next, i8** %ptr
  68   ret { [3 x <8 x i8>] } %agg2
  69 }
  70
  71 define { [4 x <2 x i32>] } @test_vld4_dup_fx_update(i32* %a, i32** %ptr) #0 {
     ; Loads one 4-element i32 structure through lane 0 of vld4lane, broadcasts
     ; each element across a <2 x i32>, and stores %a advanced by 4 i32 elements
     ; to %ptr. Expected selection: post-indexed ld4r with immediate #16.
     ; NOTE(review): attribute group #0 is not defined in the visible part of
     ; this file - confirm whether the reference is intentional.
  72 ; CHECK-LABEL: test_vld4_dup_fx_update
  73 ; CHECK: ld4r  { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [x{{[0-9]+|sp}}], #16
  74   %addr = bitcast i32* %a to i8*
  75   %ld = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %addr, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
  76   %elt0 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld, 0
  77   %dup0 = shufflevector <2 x i32> %elt0, <2 x i32> undef, <2 x i32> zeroinitializer
  78   %elt1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld, 1
  79   %dup1 = shufflevector <2 x i32> %elt1, <2 x i32> undef, <2 x i32> zeroinitializer
  80   %elt2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld, 2
  81   %dup2 = shufflevector <2 x i32> %elt2, <2 x i32> undef, <2 x i32> zeroinitializer
  82   %elt3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld, 3
  83   %dup3 = shufflevector <2 x i32> %elt3, <2 x i32> undef, <2 x i32> zeroinitializer
  84   %agg0 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %dup0, 0, 0
  85   %agg1 = insertvalue { [4 x <2 x i32>] } %agg0, <2 x i32> %dup1, 0, 1
  86   %agg2 = insertvalue { [4 x <2 x i32>] } %agg1, <2 x i32> %dup2, 0, 2
  87   %agg3 = insertvalue { [4 x <2 x i32>] } %agg2, <2 x i32> %dup3, 0, 3
  88   %next = getelementptr i32* %a, i32 4
  89   store i32* %next, i32** %ptr
  90   ret { [4 x <2 x i32>] } %agg3
  91 }
  92
  93 define { [4 x <2 x double>] } @test_vld4_dup_reg_update(double* %a, double** %ptr, i32 %inc) {
     ; Loads one 4-element double structure through lane 0 of vld4lane,
     ; broadcasts each element across a <2 x double>, and stores %a advanced by
     ; %inc double elements to %ptr. Expected selection: ld4r with a register
     ; post-index.
  94 ; CHECK-LABEL: test_vld4_dup_reg_update
  95 ; CHECK: ld4r  { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [x{{[0-9]+|sp}}], x{{[0-9]+}}
  96   %addr = bitcast double* %a to i8*
  97   %ld = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %addr, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
  98   %elt0 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld, 0
  99   %dup0 = shufflevector <2 x double> %elt0, <2 x double> undef, <2 x i32> zeroinitializer
 100   %elt1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld, 1
 101   %dup1 = shufflevector <2 x double> %elt1, <2 x double> undef, <2 x i32> zeroinitializer
 102   %elt2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld, 2
 103   %dup2 = shufflevector <2 x double> %elt2, <2 x double> undef, <2 x i32> zeroinitializer
 104   %elt3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld, 3
 105   %dup3 = shufflevector <2 x double> %elt3, <2 x double> undef, <2 x i32> zeroinitializer
 106   %agg0 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %dup0, 0, 0
 107   %agg1 = insertvalue { [4 x <2 x double>] } %agg0, <2 x double> %dup1, 0, 1
 108   %agg2 = insertvalue { [4 x <2 x double>] } %agg1, <2 x double> %dup2, 0, 2
 109   %agg3 = insertvalue { [4 x <2 x double>] } %agg2, <2 x double> %dup3, 0, 3
 110   %next = getelementptr double* %a, i32 %inc
 111   store double* %next, double** %ptr
 112   ret { [4 x <2 x double>] } %agg3
 113 }
 114
 115 define { [2 x <8 x i8>] } @test_vld2_lane_fx_update(i8*  %a, [2 x <8 x i8>] %b, i8** %ptr) {
     ; Loads a 2-element i8 structure from %a into lane 7 of the two vectors
     ; taken from %b, and stores %a advanced by 2 i8 elements to %ptr.
     ; Expected selection: post-indexed single-lane ld2 with immediate #2.
 116 ; CHECK-LABEL: test_vld2_lane_fx_update
 117 ; CHECK: ld2  { v{{[0-9]+}}.b, v{{[0-9]+}}.b }[7], [x{{[0-9]+|sp}}], #2
 118   %b0 = extractvalue [2 x <8 x i8>] %b, 0
 119   %b1 = extractvalue [2 x <8 x i8>] %b, 1
 120   %ld = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b0, <8 x i8> %b1, i32 7, i32 1)
 121   %v0 = extractvalue { <8 x i8>, <8 x i8> } %ld, 0
 122   %v1 = extractvalue { <8 x i8>, <8 x i8> } %ld, 1
 123   %agg0 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %v0, 0, 0
 124   %agg1 = insertvalue { [2 x <8 x i8>] } %agg0, <8 x i8> %v1, 0, 1
 125   %next = getelementptr i8* %a, i32 2
 126   store i8* %next, i8** %ptr
 127   ret { [2 x <8 x i8>] } %agg1
 128 }
 129
 130 define { [2 x <8 x i8>] } @test_vld2_lane_reg_update(i8*  %a, [2 x <8 x i8>] %b, i8** %ptr, i32 %inc) {
     ; Loads a 2-element i8 structure from %a into lane 6 of the two vectors
     ; taken from %b, and stores %a advanced by %inc i8 elements to %ptr.
     ; Expected selection: single-lane ld2 with a register post-index.
 131 ; CHECK-LABEL: test_vld2_lane_reg_update
 132 ; CHECK: ld2  { v{{[0-9]+}}.b, v{{[0-9]+}}.b }[6], [x{{[0-9]+|sp}}], x{{[0-9]+}}
 133   %b0 = extractvalue [2 x <8 x i8>] %b, 0
 134   %b1 = extractvalue [2 x <8 x i8>] %b, 1
 135   %ld = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b0, <8 x i8> %b1, i32 6, i32 1)
 136   %v0 = extractvalue { <8 x i8>, <8 x i8> } %ld, 0
 137   %v1 = extractvalue { <8 x i8>, <8 x i8> } %ld, 1
 138   %agg0 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %v0, 0, 0
 139   %agg1 = insertvalue { [2 x <8 x i8>] } %agg0, <8 x i8> %v1, 0, 1
 140   %next = getelementptr i8* %a, i32 %inc
 141   store i8* %next, i8** %ptr
 142   ret { [2 x <8 x i8>] } %agg1
 143 }
 144
 145 define { [3 x <2 x float>] } @test_vld3_lane_fx_update(float* %a, [3 x <2 x float>] %b, float** %ptr) {
     ; Loads a 3-element float structure from %a into lane 1 of the three
     ; vectors taken from %b, and stores %a advanced by 3 float elements to
     ; %ptr. Expected selection: post-indexed single-lane ld3 with immediate #12.
 146 ; CHECK-LABEL: test_vld3_lane_fx_update
 147 ; CHECK: ld3  { v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s }[1], [x{{[0-9]+|sp}}], #12
 148   %b0 = extractvalue [3 x <2 x float>] %b, 0
 149   %b1 = extractvalue [3 x <2 x float>] %b, 1
 150   %b2 = extractvalue [3 x <2 x float>] %b, 2
 151   %addr = bitcast float* %a to i8*
 152   %ld = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %addr, <2 x float> %b0, <2 x float> %b1, <2 x float> %b2, i32 1, i32 4)
 153   %v0 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %ld, 0
 154   %v1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %ld, 1
 155   %v2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %ld, 2
 156   %agg0 = insertvalue { [3 x <2 x float>] } undef, <2 x float> %v0, 0, 0
 157   %agg1 = insertvalue { [3 x <2 x float>] } %agg0, <2 x float> %v1, 0, 1
 158   %agg2 = insertvalue { [3 x <2 x float>] } %agg1, <2 x float> %v2, 0, 2
 159   %next = getelementptr float* %a, i32 3
 160   store float* %next, float** %ptr
 161   ret { [3 x <2 x float>] } %agg2
 162 }
 163
 164 define { [3 x <4 x i16>] } @test_vld3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) {
     ; Loads a 3-element i16 structure from %a into lane 3 of the three vectors
     ; taken from %b, and stores %a advanced by %inc i16 elements to %ptr.
     ; Expected selection: single-lane ld3 with a register post-index.
 165 ; CHECK-LABEL: test_vld3_lane_reg_update
 166 ; CHECK: ld3  { v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h }[3], [x{{[0-9]+|sp}}], x{{[0-9]+}}
 167   %b0 = extractvalue [3 x <4 x i16>] %b, 0
 168   %b1 = extractvalue [3 x <4 x i16>] %b, 1
 169   %b2 = extractvalue [3 x <4 x i16>] %b, 2
 170   %addr = bitcast i16* %a to i8*
 171   %ld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %addr, <4 x i16> %b0, <4 x i16> %b1, <4 x i16> %b2, i32 3, i32 2)
 172   %v0 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %ld, 0
 173   %v1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %ld, 1
 174   %v2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %ld, 2
 175   %agg0 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %v0, 0, 0
 176   %agg1 = insertvalue { [3 x <4 x i16>] } %agg0, <4 x i16> %v1, 0, 1
 177   %agg2 = insertvalue { [3 x <4 x i16>] } %agg1, <4 x i16> %v2, 0, 2
 178   %next = getelementptr i16* %a, i32 %inc
 179   store i16* %next, i16** %ptr
 180   ret { [3 x <4 x i16>] } %agg2
 181 }
 182
 183 define { [4 x <2 x i32>] } @test_vld4_lane_fx_update(i32* readonly %a, [4 x <2 x i32>] %b, i32** %ptr) {
     ; Loads a 4-element i32 structure from %a into lane 1 of the four vectors
     ; taken from %b, and stores %a advanced by 4 i32 elements to %ptr.
     ; Expected selection: post-indexed single-lane ld4 with immediate #16.
 184 ; CHECK-LABEL: test_vld4_lane_fx_update
 185 ; CHECK: ld4  { v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s }[1], [x{{[0-9]+|sp}}], #16
 186   %b0 = extractvalue [4 x <2 x i32>] %b, 0
 187   %b1 = extractvalue [4 x <2 x i32>] %b, 1
 188   %b2 = extractvalue [4 x <2 x i32>] %b, 2
 189   %b3 = extractvalue [4 x <2 x i32>] %b, 3
 190   %addr = bitcast i32* %a to i8*
 191   %ld = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %addr, <2 x i32> %b0, <2 x i32> %b1, <2 x i32> %b2, <2 x i32> %b3, i32 1, i32 4)
 192   %v0 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld, 0
 193   %v1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld, 1
 194   %v2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld, 2
 195   %v3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld, 3
 196   %agg0 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %v0, 0, 0
 197   %agg1 = insertvalue { [4 x <2 x i32>] } %agg0, <2 x i32> %v1, 0, 1
 198   %agg2 = insertvalue { [4 x <2 x i32>] } %agg1, <2 x i32> %v2, 0, 2
 199   %agg3 = insertvalue { [4 x <2 x i32>] } %agg2, <2 x i32> %v3, 0, 3
 200   %next = getelementptr i32* %a, i32 4
 201   store i32* %next, i32** %ptr
 202   ret { [4 x <2 x i32>] } %agg3
 203 }
 204
 205 define { [4 x <2 x double>] } @test_vld4_lane_reg_update(double* readonly %a, [4 x <2 x double>] %b, double** %ptr, i32 %inc) {
     ; Loads a 4-element double structure from %a into lane 1 of the four
     ; vectors taken from %b, and stores %a advanced by %inc double elements to
     ; %ptr. Expected selection: single-lane ld4 with a register post-index.
 206 ; CHECK-LABEL: test_vld4_lane_reg_update
 207 ; CHECK: ld4  { v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d }[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
 208   %b0 = extractvalue [4 x <2 x double>] %b, 0
 209   %b1 = extractvalue [4 x <2 x double>] %b, 1
 210   %b2 = extractvalue [4 x <2 x double>] %b, 2
 211   %b3 = extractvalue [4 x <2 x double>] %b, 3
 212   %addr = bitcast double* %a to i8*
 213   %ld = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %addr, <2 x double> %b0, <2 x double> %b1, <2 x double> %b2, <2 x double> %b3, i32 1, i32 8)
 214   %v0 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld, 0
 215   %v1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld, 1
 216   %v2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld, 2
 217   %v3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld, 3
 218   %agg0 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %v0, 0, 0
 219   %agg1 = insertvalue { [4 x <2 x double>] } %agg0, <2 x double> %v1, 0, 1
 220   %agg2 = insertvalue { [4 x <2 x double>] } %agg1, <2 x double> %v2, 0, 2
 221   %agg3 = insertvalue { [4 x <2 x double>] } %agg2, <2 x double> %v3, 0, 3
 222   %next = getelementptr double* %a, i32 %inc
 223   store double* %next, double** %ptr
 224   ret { [4 x <2 x double>] } %agg3
 225 }
 226
 227 define void @test_vst2_lane_fx_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr) {
     ; Stores lane 7 of the two <8 x i8> vectors taken from %b to %a, then
     ; stores %a advanced by 2 i8 elements to %ptr.
     ; Expected selection: post-indexed single-lane st2 with immediate #2.
 228 ; CHECK-LABEL: test_vst2_lane_fx_update
 229 ; CHECK: st2  { v{{[0-9]+}}.b, v{{[0-9]+}}.b }[7], [x{{[0-9]+|sp}}], #2
 230   %b0 = extractvalue [2 x <8 x i8>] %b, 0
 231   %b1 = extractvalue [2 x <8 x i8>] %b, 1
 232   call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %b0, <8 x i8> %b1, i32 7, i32 1)
 233   %next = getelementptr i8* %a, i32 2
 234   store i8* %next, i8** %ptr
 235   ret void
 236 }
 237
 238 define void @test_vst2_lane_reg_update(i32* %a, [2 x <2 x i32>] %b.coerce, i32** %ptr, i32 %inc) {
     ; Stores lane 1 of the two <2 x i32> vectors taken from %b.coerce to %a,
     ; then stores %a advanced by %inc i32 elements to %ptr.
     ; Expected selection: single-lane st2 with a register post-index.
 239 ; CHECK-LABEL: test_vst2_lane_reg_update
 240 ; CHECK: st2  { v{{[0-9]+}}.s, v{{[0-9]+}}.s }[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
 241   %b0 = extractvalue [2 x <2 x i32>] %b.coerce, 0
 242   %b1 = extractvalue [2 x <2 x i32>] %b.coerce, 1
 243   %addr = bitcast i32* %a to i8*
 244   tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %addr, <2 x i32> %b0, <2 x i32> %b1, i32 1, i32 4)
 245   %next = getelementptr i32* %a, i32 %inc
 246   store i32* %next, i32** %ptr
 247   ret void
 248 }
 249
 250 define void @test_vst3_lane_fx_update(float* %a, [3 x <4 x float>] %b, float** %ptr) {
     ; Stores lane 3 of the three <4 x float> vectors taken from %b to %a, then
     ; stores %a advanced by 3 float elements to %ptr.
     ; Expected selection: post-indexed single-lane st3 with immediate #12.
 251 ; CHECK-LABEL: test_vst3_lane_fx_update
 252 ; CHECK: st3  { v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s }[3], [x{{[0-9]+|sp}}], #12
 253   %b0 = extractvalue [3 x <4 x float>] %b, 0
 254   %b1 = extractvalue [3 x <4 x float>] %b, 1
 255   %b2 = extractvalue [3 x <4 x float>] %b, 2
 256   %addr = bitcast float* %a to i8*
 257   call void @llvm.arm.neon.vst3lane.v4f32(i8* %addr, <4 x float> %b0, <4 x float> %b1, <4 x float> %b2, i32 3, i32 4)
 258   %next = getelementptr float* %a, i32 3
 259   store float* %next, float** %ptr
 260   ret void
 261 }
 262
 263 ; Function Attrs: nounwind
 264 define void @test_vst3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) {
     ; Stores lane 3 of the three <4 x i16> vectors taken from %b to %a, then
     ; stores %a advanced by %inc i16 elements to %ptr.
     ; Expected selection: single-lane st3 with a register post-index.
 265 ; CHECK-LABEL: test_vst3_lane_reg_update
 266 ; CHECK: st3  { v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h }[3], [x{{[0-9]+|sp}}], x{{[0-9]+}}
 267   %b0 = extractvalue [3 x <4 x i16>] %b, 0
 268   %b1 = extractvalue [3 x <4 x i16>] %b, 1
 269   %b2 = extractvalue [3 x <4 x i16>] %b, 2
 270   %addr = bitcast i16* %a to i8*
 271   tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %addr, <4 x i16> %b0, <4 x i16> %b1, <4 x i16> %b2, i32 3, i32 2)
 272   %next = getelementptr i16* %a, i32 %inc
 273   store i16* %next, i16** %ptr
 274   ret void
 275 }
 276
 277 define void @test_vst4_lane_fx_update(double* %a, [4 x <2 x double>] %b.coerce, double** %ptr) {
     ; Stores lane 1 of the four <2 x double> vectors taken from %b.coerce to
     ; %a, then stores %a advanced by 4 double elements to %ptr.
     ; Expected selection: post-indexed single-lane st4 with immediate #32.
 278 ; CHECK-LABEL: test_vst4_lane_fx_update
 279 ; CHECK: st4  { v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d }[1], [x{{[0-9]+|sp}}], #32
 280   %b0 = extractvalue [4 x <2 x double>] %b.coerce, 0
 281   %b1 = extractvalue [4 x <2 x double>] %b.coerce, 1
 282   %b2 = extractvalue [4 x <2 x double>] %b.coerce, 2
 283   %b3 = extractvalue [4 x <2 x double>] %b.coerce, 3
 284   %addr = bitcast double* %a to i8*
 285   tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %addr, <2 x double> %b0, <2 x double> %b1, <2 x double> %b2, <2 x double> %b3, i32 1, i32 8)
 286   %next = getelementptr double* %a, i32 4
 287   store double* %next, double** %ptr
 288   ret void
 289 }
 290
 291
 292 define void @test_vst4_lane_reg_update(float* %a, [4 x <2 x float>] %b.coerce, float** %ptr, i32 %inc) {
     ; Stores lane 1 of the four <2 x float> vectors taken from %b.coerce to
     ; %a, then stores %a advanced by %inc float elements to %ptr.
     ; Expected selection: single-lane st4 with a register post-index.
 293 ; CHECK-LABEL: test_vst4_lane_reg_update
 294 ; CHECK: st4  { v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s }[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
 295   %b0 = extractvalue [4 x <2 x float>] %b.coerce, 0
 296   %b1 = extractvalue [4 x <2 x float>] %b.coerce, 1
 297   %b2 = extractvalue [4 x <2 x float>] %b.coerce, 2
 298   %b3 = extractvalue [4 x <2 x float>] %b.coerce, 3
 299   %addr = bitcast float* %a to i8*
 300   tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %addr, <2 x float> %b0, <2 x float> %b1, <2 x float> %b2, <2 x float> %b3, i32 1, i32 4)
 301   %next = getelementptr float* %a, i32 %inc
 302   store float* %next, float** %ptr
 303   ret void
 304 }
 305
 306 declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) ; used by test_vld2_lane_fx_update, test_vld2_lane_reg_update
 307 declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32) ; used by test_vld2q_dup_fx_update
 308 declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) ; used by test_vld2q_dup_reg_update
 309 declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) ; used by test_vld3_dup_fx_update, test_vld3_lane_reg_update
 310 declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) ; used by test_vld3_dup_reg_update
 311 declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) ; used by test_vld3_lane_fx_update
 312 declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32) ; used by test_vld4_dup_reg_update, test_vld4_lane_reg_update
 313 declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) ; used by test_vld4_dup_fx_update, test_vld4_lane_fx_update
 314 declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) ; used by test_vst2_lane_fx_update
 315 declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) ; used by test_vst2_lane_reg_update
 316 declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) ; used by test_vst3_lane_fx_update
 317 declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) ; used by test_vst3_lane_reg_update
 318 declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) ; used by test_vst4_lane_reg_update
 319 declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32) ; used by test_vst4_lane_fx_update