1 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
; Two-element aggregate types used as the return types of the
; llvm.arm.neon.vld2lane.* intrinsics declared in this file. Each struct holds
; two vectors of the same type: the first four use 64-bit-wide vectors, the
; last three use 128-bit-wide vectors.
3 %struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
4 %struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
5 %struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
6 %struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
8 %struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
9 %struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
10 %struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }
; vld2lanei8: loads a <8 x i8> from %B and passes it as both vector operands
; of llvm.arm.neon.vld2lane.v8i8, together with the pointer %A and a trailing
; i32 1 (presumably the lane index -- confirm against the intrinsic's
; definition). The two vectors extracted from the returned struct are added.
12 define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind {
15 %tmp1 = load <8 x i8>* %B
16 %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
17 %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
18 %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
; Both struct elements feed the add, so both results of the call are used.
19 %tmp5 = add <8 x i8> %tmp3, %tmp4
; vld2lanei16: loads a <4 x i16> from %B and passes it as both vector operands
; of llvm.arm.neon.vld2lane.v4i16, together with the pointer %A and a trailing
; i32 1 (presumably the lane index -- confirm against the intrinsic's
; definition). The two vectors extracted from the returned struct are added.
; NOTE(review): the call passes %A as i16*, while the declaration of this
; intrinsic in this file takes i8* as its first parameter -- confirm whether
; a bitcast to i8* is required here.
23 define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind {
26 %tmp1 = load <4 x i16>* %B
27 %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
28 %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
29 %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
; Both struct elements feed the add, so both results of the call are used.
30 %tmp5 = add <4 x i16> %tmp3, %tmp4
; vld2lanei32: loads a <2 x i32> from %B and passes it as both vector operands
; of llvm.arm.neon.vld2lane.v2i32, together with the pointer %A and a trailing
; i32 1 (presumably the lane index -- confirm against the intrinsic's
; definition). The two vectors extracted from the returned struct are added.
; NOTE(review): the call passes %A as i32*, while the declaration of this
; intrinsic in this file takes i8* as its first parameter -- confirm whether
; a bitcast to i8* is required here.
34 define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
37 %tmp1 = load <2 x i32>* %B
38 %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
39 %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
40 %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
; Both struct elements feed the add, so both results of the call are used.
41 %tmp5 = add <2 x i32> %tmp3, %tmp4
; vld2lanef: loads a <2 x float> from %B and passes it as both vector operands
; of llvm.arm.neon.vld2lane.v2f32, together with the pointer %A and a trailing
; i32 1 (presumably the lane index -- confirm against the intrinsic's
; definition). The two vectors extracted from the returned struct are added.
; NOTE(review): the call passes %A as float*, while the declaration of this
; intrinsic in this file takes i8* as its first parameter -- confirm whether
; a bitcast to i8* is required here.
45 define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
48 %tmp1 = load <2 x float>* %B
49 %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
50 %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
51 %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
; Both struct elements feed the add, so both results of the call are used.
52 %tmp5 = add <2 x float> %tmp3, %tmp4
; vld2laneQi16: 128-bit-vector variant. Loads a <8 x i16> from %B and passes it
; as both vector operands of llvm.arm.neon.vld2lane.v8i16, together with the
; pointer %A and a trailing i32 1 (presumably the lane index -- confirm against
; the intrinsic's definition). The two extracted result vectors are added.
; NOTE(review): the call passes %A as i16*, while the declaration of this
; intrinsic in this file takes i8* as its first parameter -- confirm whether
; a bitcast to i8* is required here.
56 define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind {
59 %tmp1 = load <8 x i16>* %B
60 %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
61 %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
62 %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
; Both struct elements feed the add, so both results of the call are used.
63 %tmp5 = add <8 x i16> %tmp3, %tmp4
; vld2laneQi32: 128-bit-vector variant. Loads a <4 x i32> from %B and passes it
; as both vector operands of llvm.arm.neon.vld2lane.v4i32, together with the
; pointer %A and a trailing i32 2. Unlike most functions in this file, the
; trailing operand is 2 rather than 1 (presumably selecting a lane in the
; upper half of the 4-element vector -- confirm the intent).
; The two extracted result vectors are added.
; NOTE(review): the call passes %A as i32*, while the declaration of this
; intrinsic in this file takes i8* as its first parameter -- confirm whether
; a bitcast to i8* is required here.
67 define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind {
70 %tmp1 = load <4 x i32>* %B
71 %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2)
72 %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
73 %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
; Both struct elements feed the add, so both results of the call are used.
74 %tmp5 = add <4 x i32> %tmp3, %tmp4
; vld2laneQf: 128-bit-vector variant. Loads a <4 x float> from %B and passes it
; as both vector operands of llvm.arm.neon.vld2lane.v4f32, together with the
; pointer %A and a trailing i32 1 (presumably the lane index -- confirm against
; the intrinsic's definition). The two extracted result vectors are added.
; NOTE(review): the call passes %A as float*, while the declaration of this
; intrinsic in this file takes i8* as its first parameter -- confirm whether
; a bitcast to i8* is required here.
78 define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind {
81 %tmp1 = load <4 x float>* %B
82 %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
83 %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
84 %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
; Both struct elements feed the add, so both results of the call are used.
85 %tmp5 = add <4 x float> %tmp3, %tmp4
; Declarations of the llvm.arm.neon.vld2lane.* intrinsics called by the
; vld2lane* functions in this file. Every variant takes an i8* pointer, two
; vectors of the suffix's type and an i32, returns the matching two-vector
; struct, and is marked nounwind readonly.
89 declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind readonly
90 declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32) nounwind readonly
91 declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32) nounwind readonly
92 declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32) nounwind readonly
94 declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind readonly
95 declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32) nounwind readonly
96 declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind readonly
; Three-element aggregate types used as the return types of the
; llvm.arm.neon.vld3lane.* intrinsics declared in this file. Each struct holds
; three vectors of the same type: the first four use 64-bit-wide vectors, the
; last three use 128-bit-wide vectors.
98 %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
99 %struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
100 %struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
101 %struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
103 %struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
104 %struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
105 %struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }
; vld3lanei8: loads a <8 x i8> from %B and passes it as all three vector
; operands of llvm.arm.neon.vld3lane.v8i8, together with the pointer %A and a
; trailing i32 1 (presumably the lane index -- confirm against the intrinsic's
; definition). The three vectors extracted from the returned struct are summed.
107 define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind {
110 %tmp1 = load <8 x i8>* %B
111 %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
112 %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
113 %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
114 %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
; Elements 0 and 1 are added first, then element 2 is added to that sum.
115 %tmp6 = add <8 x i8> %tmp3, %tmp4
116 %tmp7 = add <8 x i8> %tmp5, %tmp6
; vld3lanei16: loads a <4 x i16> from %B and passes it as all three vector
; operands of llvm.arm.neon.vld3lane.v4i16, together with the pointer %A and a
; trailing i32 1 (presumably the lane index -- confirm against the intrinsic's
; definition). The three vectors extracted from the returned struct are summed.
; NOTE(review): the call passes %A as i16*, while the declaration of this
; intrinsic in this file takes i8* as its first parameter -- confirm whether
; a bitcast to i8* is required here.
120 define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind {
123 %tmp1 = load <4 x i16>* %B
124 %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
125 %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
126 %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
127 %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
; Elements 0 and 1 are added first, then element 2 is added to that sum.
128 %tmp6 = add <4 x i16> %tmp3, %tmp4
129 %tmp7 = add <4 x i16> %tmp5, %tmp6
; vld3lanei32: loads a <2 x i32> from %B and passes it as all three vector
; operands of llvm.arm.neon.vld3lane.v2i32, together with the pointer %A and a
; trailing i32 1 (presumably the lane index -- confirm against the intrinsic's
; definition). The three vectors extracted from the returned struct are summed.
; NOTE(review): the call passes %A as i32*, while the declaration of this
; intrinsic in this file takes i8* as its first parameter -- confirm whether
; a bitcast to i8* is required here.
133 define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind {
136 %tmp1 = load <2 x i32>* %B
137 %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
138 %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
139 %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
140 %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
; Elements 0 and 1 are added first, then element 2 is added to that sum.
141 %tmp6 = add <2 x i32> %tmp3, %tmp4
142 %tmp7 = add <2 x i32> %tmp5, %tmp6
; vld3lanef: loads a <2 x float> from %B and passes it as all three vector
; operands of llvm.arm.neon.vld3lane.v2f32, together with the pointer %A and a
; trailing i32 1 (presumably the lane index -- confirm against the intrinsic's
; definition). The three vectors extracted from the returned struct are summed
; and the sum is returned.
; NOTE(review): the call passes %A as float*, while the declaration of this
; intrinsic in this file takes i8* as its first parameter -- confirm whether
; a bitcast to i8* is required here.
146 define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind {
149 %tmp1 = load <2 x float>* %B
150 %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
151 %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
152 %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
153 %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
; Elements 0 and 1 are added first, then element 2 is added to that sum.
154 %tmp6 = add <2 x float> %tmp3, %tmp4
155 %tmp7 = add <2 x float> %tmp5, %tmp6
156 ret <2 x float> %tmp7
; vld3laneQi16: 128-bit-vector variant. Loads a <8 x i16> from %B and passes it
; as all three vector operands of llvm.arm.neon.vld3lane.v8i16, together with
; the pointer %A and a trailing i32 1 (presumably the lane index -- confirm
; against the intrinsic's definition). The three extracted vectors are summed.
; The existing FileCheck directive below matches the function's label text.
; NOTE(review): the call passes %A as i16*, while the declaration of this
; intrinsic in this file takes i8* as its first parameter -- confirm whether
; a bitcast to i8* is required here.
159 define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
160 ;CHECK: vld3laneQi16:
162 %tmp1 = load <8 x i16>* %B
163 %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
164 %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
165 %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
166 %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
; Elements 0 and 1 are added first, then element 2 is added to that sum.
167 %tmp6 = add <8 x i16> %tmp3, %tmp4
168 %tmp7 = add <8 x i16> %tmp5, %tmp6
; vld3laneQi32: 128-bit-vector variant. Loads a <4 x i32> from %B and passes it
; as all three vector operands of llvm.arm.neon.vld3lane.v4i32, together with
; the pointer %A and a trailing i32 3. Unlike most functions in this file, the
; trailing operand is 3 rather than 1 (presumably selecting the last lane of
; the 4-element vector -- confirm the intent).
; The three extracted vectors are summed.
; The existing FileCheck directive below matches the function's label text.
; NOTE(review): the call passes %A as i32*, while the declaration of this
; intrinsic in this file takes i8* as its first parameter -- confirm whether
; a bitcast to i8* is required here.
172 define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
173 ;CHECK: vld3laneQi32:
175 %tmp1 = load <4 x i32>* %B
176 %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3)
177 %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
178 %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
179 %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
; Elements 0 and 1 are added first, then element 2 is added to that sum.
180 %tmp6 = add <4 x i32> %tmp3, %tmp4
181 %tmp7 = add <4 x i32> %tmp5, %tmp6
; vld3laneQf: 128-bit-vector variant. Loads a <4 x float> from %B and passes it
; as all three vector operands of llvm.arm.neon.vld3lane.v4f32, together with
; the pointer %A and a trailing i32 1 (presumably the lane index -- confirm
; against the intrinsic's definition). The three extracted vectors are summed
; and the sum is returned.
; NOTE(review): the call passes %A as float*, while the declaration of this
; intrinsic in this file takes i8* as its first parameter -- confirm whether
; a bitcast to i8* is required here.
185 define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind {
188 %tmp1 = load <4 x float>* %B
189 %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
190 %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
191 %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
192 %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
; Elements 0 and 1 are added first, then element 2 is added to that sum.
193 %tmp6 = add <4 x float> %tmp3, %tmp4
194 %tmp7 = add <4 x float> %tmp5, %tmp6
195 ret <4 x float> %tmp7
; Declarations of the llvm.arm.neon.vld3lane.* intrinsics called by the
; vld3lane* functions in this file. Every variant takes an i8* pointer, three
; vectors of the suffix's type and an i32, returns the matching three-vector
; struct, and is marked nounwind readonly.
198 declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind readonly
199 declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind readonly
200 declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind readonly
201 declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32) nounwind readonly
203 declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind readonly
204 declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind readonly
205 declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32) nounwind readonly
; Four-element aggregate types used as the return types of the
; llvm.arm.neon.vld4lane.* intrinsics declared in this file. Each struct holds
; four 64-bit-wide vectors of the same type.
207 %struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
208 %struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
209 %struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
210 %struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
; vld4lanei8: loads a <8 x i8> from %B and passes it as all four vector
; operands of llvm.arm.neon.vld4lane.v8i8, together with the pointer %A and a
; trailing i32 1 (presumably the lane index -- confirm against the intrinsic's
; definition). The four vectors extracted from the returned struct are summed.
212 define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
215 %tmp1 = load <8 x i8>* %B
216 %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1)
217 %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
218 %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
219 %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
220 %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
; Pairwise sums (0+1 and 2+3) are combined into a single total.
221 %tmp7 = add <8 x i8> %tmp3, %tmp4
222 %tmp8 = add <8 x i8> %tmp5, %tmp6
223 %tmp9 = add <8 x i8> %tmp7, %tmp8
; vld4lanei16: loads a <4 x i16> from %B and passes it as all four vector
; operands of llvm.arm.neon.vld4lane.v4i16, together with the pointer %A and a
; trailing i32 1 (presumably the lane index -- confirm against the intrinsic's
; definition). The four vectors extracted from the returned struct are summed.
; NOTE(review): the call passes %A as i16*, while the declaration of this
; intrinsic in this file takes i8* as its first parameter -- confirm whether
; a bitcast to i8* is required here.
227 define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
230 %tmp1 = load <4 x i16>* %B
231 %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1)
232 %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
233 %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
234 %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
235 %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
; Pairwise sums (0+1 and 2+3) are combined into a single total.
236 %tmp7 = add <4 x i16> %tmp3, %tmp4
237 %tmp8 = add <4 x i16> %tmp5, %tmp6
238 %tmp9 = add <4 x i16> %tmp7, %tmp8
; vld4lanei32: loads a <2 x i32> from %B and passes it as all four vector
; operands of llvm.arm.neon.vld4lane.v2i32, together with the pointer %A and a
; trailing i32 1 (presumably the lane index -- confirm against the intrinsic's
; definition). The four vectors extracted from the returned struct are summed.
; NOTE(review): the call passes %A as i32*, while the declaration of this
; intrinsic in this file takes i8* as its first parameter -- confirm whether
; a bitcast to i8* is required here.
242 define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind {
245 %tmp1 = load <2 x i32>* %B
246 %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
247 %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
248 %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
249 %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
250 %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
; Pairwise sums (0+1 and 2+3) are combined into a single total.
251 %tmp7 = add <2 x i32> %tmp3, %tmp4
252 %tmp8 = add <2 x i32> %tmp5, %tmp6
253 %tmp9 = add <2 x i32> %tmp7, %tmp8
; vld4lanef: loads a <2 x float> from %B and passes it as all four vector
; operands of llvm.arm.neon.vld4lane.v2f32, together with the pointer %A and a
; trailing i32 1 (presumably the lane index -- confirm against the intrinsic's
; definition). The four vectors extracted from the returned struct are summed
; and the sum is returned.
; NOTE(review): the call passes %A as float*, while the declaration of this
; intrinsic in this file takes i8* as its first parameter -- confirm whether
; a bitcast to i8* is required here.
257 define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind {
260 %tmp1 = load <2 x float>* %B
261 %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1)
262 %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
263 %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
264 %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
265 %tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
; Pairwise sums (0+1 and 2+3) are combined into a single total.
266 %tmp7 = add <2 x float> %tmp3, %tmp4
267 %tmp8 = add <2 x float> %tmp5, %tmp6
268 %tmp9 = add <2 x float> %tmp7, %tmp8
269 ret <2 x float> %tmp9
; Declarations of the llvm.arm.neon.vld4lane.* intrinsics called by the
; vld4lane* functions in this file. Every variant takes an i8* pointer, four
; vectors of the suffix's type and an i32, returns the matching four-vector
; struct, and is marked nounwind readonly.
272 declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind readonly
273 declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind readonly
274 declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind readonly
275 declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32) nounwind readonly