1 ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
; D-register byte unzip: two shuffles over the same pair of <8 x i8> inputs,
; one taking the even lanes (0,2,...,14) and one taking the odd lanes
; (1,3,...,15), whose results are then added. The expected output is a single
; vuzp.8 feeding a vadd.i8.
3 define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; Anchor the expectations below to this function's label, as every other
; function in this file does; without it the first directive in the file
; would be a "next line" check with nothing preceding it.
; CHECK-LABEL: vuzpi8:
6 ; CHECK-NEXT: vldr d16, [r1]
7 ; CHECK-NEXT: vldr d17, [r0]
8 ; CHECK-NEXT: vuzp.8 d17, d16
9 ; CHECK-NEXT: vadd.i8 d16, d17, d16
10 ; CHECK-NEXT: vmov r0, r1, d16
11 ; CHECK-NEXT: mov pc, lr
12 %tmp1 = load <8 x i8>, <8 x i8>* %A
13 %tmp2 = load <8 x i8>, <8 x i8>* %B
; Mask indices 0-7 select from %tmp1 and 8-15 from %tmp2.
; Even lanes of the concatenated inputs:
14 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; Odd lanes of the concatenated inputs:
15 %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; Using both shuffle results keeps both halves of the unzip live.
16 %tmp5 = add <8 x i8> %tmp3, %tmp4
; D-register byte unzip producing a <16 x i8> result: a single shuffle whose
; 16-entry mask lists the even lanes (0,2,...,14) followed by the odd lanes
; (1,3,...,15) of the two <8 x i8> inputs. The expected output is one vuzp.8
; with each 64-bit half of the result moved out by its own vmov.
20 define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
21 ; CHECK-LABEL: vuzpi8_Qres:
23 ; CHECK-NEXT: vldr d17, [r1]
24 ; CHECK-NEXT: vldr d16, [r0]
25 ; CHECK-NEXT: vuzp.8 d16, d17
26 ; CHECK-NEXT: vmov r0, r1, d16
27 ; CHECK-NEXT: vmov r2, r3, d17
28 ; CHECK-NEXT: mov pc, lr
29 %tmp1 = load <8 x i8>, <8 x i8>* %A
30 %tmp2 = load <8 x i8>, <8 x i8>* %B
; First eight mask entries are the even lanes, last eight the odd lanes.
31 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; D-register halfword unzip: even-lane and odd-lane shuffles of the same two
; <4 x i16> inputs are added together. The expected output is a single vuzp.16
; feeding a vadd.i16.
35 define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
36 ; CHECK-LABEL: vuzpi16:
38 ; CHECK-NEXT: vldr d16, [r1]
39 ; CHECK-NEXT: vldr d17, [r0]
40 ; CHECK-NEXT: vuzp.16 d17, d16
41 ; CHECK-NEXT: vadd.i16 d16, d17, d16
42 ; CHECK-NEXT: vmov r0, r1, d16
43 ; CHECK-NEXT: mov pc, lr
44 %tmp1 = load <4 x i16>, <4 x i16>* %A
45 %tmp2 = load <4 x i16>, <4 x i16>* %B
; Mask indices 0-3 select from %tmp1 and 4-7 from %tmp2.
; Even lanes:
46 %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; Odd lanes:
47 %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
48 %tmp5 = add <4 x i16> %tmp3, %tmp4
; D-register halfword unzip producing an <8 x i16> result: one shuffle whose
; mask lists the even lanes (0,2,4,6) followed by the odd lanes (1,3,5,7) of
; the two <4 x i16> inputs. The expected output is one vuzp.16 with each
; 64-bit half of the result moved out by its own vmov.
52 define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
53 ; CHECK-LABEL: vuzpi16_Qres:
55 ; CHECK-NEXT: vldr d17, [r1]
56 ; CHECK-NEXT: vldr d16, [r0]
57 ; CHECK-NEXT: vuzp.16 d16, d17
58 ; CHECK-NEXT: vmov r0, r1, d16
59 ; CHECK-NEXT: vmov r2, r3, d17
60 ; CHECK-NEXT: mov pc, lr
61 %tmp1 = load <4 x i16>, <4 x i16>* %A
62 %tmp2 = load <4 x i16>, <4 x i16>* %B
; First four mask entries are the even lanes, last four the odd lanes.
63 %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
67 ; VUZP.32 is equivalent to VTRN.32 for 64-bit vectors.
; Q-register byte unzip: even-lane and odd-lane shuffles of the same two
; <16 x i8> inputs are added together. The expected output is a single vuzp.8
; on q registers feeding a vadd.i8.
69 define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
70 ; CHECK-LABEL: vuzpQi8:
72 ; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
73 ; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
74 ; CHECK-NEXT: vuzp.8 q9, q8
75 ; CHECK-NEXT: vadd.i8 q8, q9, q8
76 ; CHECK-NEXT: vmov r0, r1, d16
77 ; CHECK-NEXT: vmov r2, r3, d17
78 ; CHECK-NEXT: mov pc, lr
79 %tmp1 = load <16 x i8>, <16 x i8>* %A
80 %tmp2 = load <16 x i8>, <16 x i8>* %B
; Mask indices 0-15 select from %tmp1 and 16-31 from %tmp2.
; Even lanes:
81 %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
; Odd lanes:
82 %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
83 %tmp5 = add <16 x i8> %tmp3, %tmp4
; Q-register byte unzip producing a <32 x i8> result: one shuffle whose mask
; lists all even lanes (0,2,...,30) followed by all odd lanes (1,3,...,31) of
; the two <16 x i8> inputs. The expected output loads the inputs through r1
; and r2, performs one vuzp.8, and stores both q registers through r0.
87 define <32 x i8> @vuzpQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
88 ; CHECK-LABEL: vuzpQi8_QQres:
90 ; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
91 ; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
92 ; CHECK-NEXT: vuzp.8 q9, q8
93 ; CHECK-NEXT: vst1.8 {d18, d19}, [r0:128]!
94 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
95 ; CHECK-NEXT: mov pc, lr
96 %tmp1 = load <16 x i8>, <16 x i8>* %A
97 %tmp2 = load <16 x i8>, <16 x i8>* %B
; First sixteen mask entries are the even lanes, last sixteen the odd lanes.
98 %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
; Q-register halfword unzip: even-lane and odd-lane shuffles of the same two
; <8 x i16> inputs are added together. The expected output is a single vuzp.16
; on q registers feeding a vadd.i16.
102 define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
103 ; CHECK-LABEL: vuzpQi16:
105 ; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
106 ; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
107 ; CHECK-NEXT: vuzp.16 q9, q8
108 ; CHECK-NEXT: vadd.i16 q8, q9, q8
109 ; CHECK-NEXT: vmov r0, r1, d16
110 ; CHECK-NEXT: vmov r2, r3, d17
111 ; CHECK-NEXT: mov pc, lr
112 %tmp1 = load <8 x i16>, <8 x i16>* %A
113 %tmp2 = load <8 x i16>, <8 x i16>* %B
; Mask indices 0-7 select from %tmp1 and 8-15 from %tmp2.
; Even lanes:
114 %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; Odd lanes:
115 %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
116 %tmp5 = add <8 x i16> %tmp3, %tmp4
; Q-register halfword unzip producing a <16 x i16> result: one shuffle whose
; mask lists the even lanes (0,2,...,14) followed by the odd lanes
; (1,3,...,15) of the two <8 x i16> inputs. The expected output loads the
; inputs through r1 and r2, performs one vuzp.16, and stores both q registers
; through r0.
120 define <16 x i16> @vuzpQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
121 ; CHECK-LABEL: vuzpQi16_QQres:
123 ; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
124 ; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
125 ; CHECK-NEXT: vuzp.16 q9, q8
126 ; CHECK-NEXT: vst1.16 {d18, d19}, [r0:128]!
127 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
128 ; CHECK-NEXT: mov pc, lr
129 %tmp1 = load <8 x i16>, <8 x i16>* %A
130 %tmp2 = load <8 x i16>, <8 x i16>* %B
; First eight mask entries are the even lanes, last eight the odd lanes.
131 %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; Q-register word unzip: even-lane and odd-lane shuffles of the same two
; <4 x i32> inputs are added together. The expected output is a single vuzp.32
; on q registers feeding a vadd.i32.
135 define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
136 ; CHECK-LABEL: vuzpQi32:
138 ; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
139 ; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
140 ; CHECK-NEXT: vuzp.32 q9, q8
141 ; CHECK-NEXT: vadd.i32 q8, q9, q8
142 ; CHECK-NEXT: vmov r0, r1, d16
143 ; CHECK-NEXT: vmov r2, r3, d17
144 ; CHECK-NEXT: mov pc, lr
145 %tmp1 = load <4 x i32>, <4 x i32>* %A
146 %tmp2 = load <4 x i32>, <4 x i32>* %B
; Mask indices 0-3 select from %tmp1 and 4-7 from %tmp2.
; Even lanes:
147 %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; Odd lanes:
148 %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
149 %tmp5 = add <4 x i32> %tmp3, %tmp4
; Q-register word unzip producing an <8 x i32> result: one shuffle whose mask
; lists the even lanes (0,2,4,6) followed by the odd lanes (1,3,5,7) of the
; two <4 x i32> inputs. The expected output loads the inputs through r1 and
; r2, performs one vuzp.32, and stores both q registers through r0.
153 define <8 x i32> @vuzpQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind {
154 ; CHECK-LABEL: vuzpQi32_QQres:
156 ; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
157 ; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
158 ; CHECK-NEXT: vuzp.32 q9, q8
159 ; CHECK-NEXT: vst1.32 {d18, d19}, [r0:128]!
160 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
161 ; CHECK-NEXT: mov pc, lr
162 %tmp1 = load <4 x i32>, <4 x i32>* %A
163 %tmp2 = load <4 x i32>, <4 x i32>* %B
; First four mask entries are the even lanes, last four the odd lanes.
164 %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
; Q-register float unzip: even-lane and odd-lane shuffles of the same two
; <4 x float> inputs are combined with fadd and the sum is returned. The
; expected output is a single vuzp.32 on q registers feeding a vadd.f32.
168 define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind {
169 ; CHECK-LABEL: vuzpQf:
171 ; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
172 ; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
173 ; CHECK-NEXT: vuzp.32 q9, q8
174 ; CHECK-NEXT: vadd.f32 q8, q9, q8
175 ; CHECK-NEXT: vmov r0, r1, d16
176 ; CHECK-NEXT: vmov r2, r3, d17
177 ; CHECK-NEXT: mov pc, lr
178 %tmp1 = load <4 x float>, <4 x float>* %A
179 %tmp2 = load <4 x float>, <4 x float>* %B
; Mask indices 0-3 select from %tmp1 and 4-7 from %tmp2.
; Even lanes:
180 %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; Odd lanes:
181 %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
182 %tmp5 = fadd <4 x float> %tmp3, %tmp4
183 ret <4 x float> %tmp5
; Q-register float unzip producing an <8 x float> result: one shuffle whose
; mask lists the even lanes (0,2,4,6) followed by the odd lanes (1,3,5,7) of
; the two <4 x float> inputs, returned directly. The expected output loads the
; inputs through r1 and r2, performs one vuzp.32, and stores both q registers
; through r0.
186 define <8 x float> @vuzpQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind {
187 ; CHECK-LABEL: vuzpQf_QQres:
189 ; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
190 ; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
191 ; CHECK-NEXT: vuzp.32 q9, q8
192 ; CHECK-NEXT: vst1.32 {d18, d19}, [r0:128]!
193 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
194 ; CHECK-NEXT: mov pc, lr
195 %tmp1 = load <4 x float>, <4 x float>* %A
196 %tmp2 = load <4 x float>, <4 x float>* %B
; First four mask entries are the even lanes, last four the odd lanes.
197 %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
198 ret <8 x float> %tmp3
201 ; Undef shuffle indices should not prevent matching to VUZP:
; Same shape as the plain D-register byte unzip, but two entries in each
; shuffle mask are undef. The expected output is unchanged: one vuzp.8 feeding
; a vadd.i8.
203 define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
204 ; CHECK-LABEL: vuzpi8_undef:
206 ; CHECK-NEXT: vldr d16, [r1]
207 ; CHECK-NEXT: vldr d17, [r0]
208 ; CHECK-NEXT: vuzp.8 d17, d16
209 ; CHECK-NEXT: vadd.i8 d16, d17, d16
210 ; CHECK-NEXT: vmov r0, r1, d16
211 ; CHECK-NEXT: mov pc, lr
212 %tmp1 = load <8 x i8>, <8 x i8>* %A
213 %tmp2 = load <8 x i8>, <8 x i8>* %B
; Even-lane mask with entries 2 and 3 (which would be 4 and 6) left undef.
214 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>
; Odd-lane mask with entries 4 and 5 (which would be 9 and 11) left undef.
215 %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
216 %tmp5 = add <8 x i8> %tmp3, %tmp4
; D-register byte unzip with a <16 x i8> result and undef mask entries: the
; single 16-entry mask is the even lanes followed by the odd lanes, with four
; entries left undef. The expected output is still one vuzp.8 with each half
; moved out by its own vmov.
220 define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
221 ; CHECK-LABEL: vuzpi8_undef_Qres:
223 ; CHECK-NEXT: vldr d17, [r1]
224 ; CHECK-NEXT: vldr d16, [r0]
225 ; CHECK-NEXT: vuzp.8 d16, d17
226 ; CHECK-NEXT: vmov r0, r1, d16
227 ; CHECK-NEXT: vmov r2, r3, d17
228 ; CHECK-NEXT: mov pc, lr
229 %tmp1 = load <8 x i8>, <8 x i8>* %A
230 %tmp2 = load <8 x i8>, <8 x i8>* %B
; Undef entries sit at mask positions 2, 3, 12 and 13.
231 %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
; Same shape as the plain Q-register halfword unzip, but two entries in each
; shuffle mask are undef. The expected output is unchanged: one vuzp.16 on q
; registers feeding a vadd.i16.
235 define <8 x i16> @vuzpQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
236 ; CHECK-LABEL: vuzpQi16_undef:
238 ; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
239 ; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
240 ; CHECK-NEXT: vuzp.16 q9, q8
241 ; CHECK-NEXT: vadd.i16 q8, q9, q8
242 ; CHECK-NEXT: vmov r0, r1, d16
243 ; CHECK-NEXT: vmov r2, r3, d17
244 ; CHECK-NEXT: mov pc, lr
245 %tmp1 = load <8 x i16>, <8 x i16>* %A
246 %tmp2 = load <8 x i16>, <8 x i16>* %B
; Even-lane mask with entries 1 and 3 (which would be 2 and 6) left undef.
247 %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14>
; Odd-lane mask with entries 3 and 4 (which would be 7 and 9) left undef.
248 %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
249 %tmp5 = add <8 x i16> %tmp3, %tmp4
; Q-register halfword unzip with a <16 x i16> result and undef mask entries:
; the single 16-entry mask is the even lanes followed by the odd lanes, with
; four entries left undef. The expected output still loads through r1 and r2,
; performs one vuzp.16, and stores both q registers through r0.
253 define <16 x i16> @vuzpQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
254 ; CHECK-LABEL: vuzpQi16_undef_QQres:
256 ; CHECK-NEXT: vld1.64 {d16, d17}, [r2]
257 ; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
258 ; CHECK-NEXT: vuzp.16 q9, q8
259 ; CHECK-NEXT: vst1.16 {d18, d19}, [r0:128]!
260 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
261 ; CHECK-NEXT: mov pc, lr
262 %tmp1 = load <8 x i16>, <8 x i16>* %A
263 %tmp2 = load <8 x i16>, <8 x i16>* %B
; Undef entries sit at mask positions 1, 3, 11 and 12.
264 %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
; Shuffle of two <4 x i16> inputs into an <8 x i16> result where the entire
; lower half of the mask is undef and the upper half selects the odd lanes
; (1,3,5,7). Only the function label expectation is visible in this excerpt.
268 define <8 x i16> @vuzp_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
270 ; CHECK-LABEL: vuzp_lower_shufflemask_undef
272 %tmp1 = load <4 x i16>, <4 x i16>* %A
273 %tmp2 = load <4 x i16>, <4 x i16>* %B
; Lower four mask entries are undef; upper four are the odd lanes.
274 %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
; Shuffle of two <2 x i32> inputs into a <4 x i32> result where both lower
; mask entries are 0 (lane 0 of %tmp1 repeated) and the upper half selects the
; odd lanes (1,3). Only the function label expectation is visible in this
; excerpt.
278 define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) {
280 ; CHECK-LABEL: vuzp_lower_shufflemask_zeroed
283 %tmp1 = load <2 x i32>, <2 x i32>* %A
284 %tmp2 = load <2 x i32>, <2 x i32>* %B
; Lower two mask entries are both 0; upper two are the odd lanes.
285 %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
; Selects lane-wise between two <8 x i8> values using the result of an
; unsigned less-than compare on <8 x i32> operands, so the compare result is
; four times wider per lane than the data being selected.
289 define <8 x i8> @vuzp_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
290 ; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8.
291 ; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to
292 ; truncate from i32 to i16 and one vuzp to perform the final truncation for i8.
293 ; CHECK-LABEL: vuzp_trunc
; Lane-wise unsigned compare yielding an <8 x i1> mask.
298 %c = icmp ult <8 x i32> %cmp0, %cmp1
; The mask picks %in0 where the compare is true and %in1 otherwise.
299 %res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1
303 ; Shuffle the result from the compare with a <4 x i8>.
304 ; We need to extend the loaded <4 x i8> to <4 x i16>. Otherwise we wouldn't be able
305 ; to perform the vuzp and get the vbsl mask.
; The <8 x i1> select mask is the concatenation of a <4 x i32> compare result
; (lanes 0-3) and a loaded <4 x i8> truncated to <4 x i1> (lanes 4-7).
306 define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1,
307 <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
308 ; CHECK-LABEL: vuzp_trunc_and_shuffle
312 %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
; Keep only the low bit of each loaded byte.
313 %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
314 %c0 = icmp ult <4 x i32> %cmp0, %cmp1
; Identity mask 0-7: all of %c0 followed by all of %cmp2.
315 %c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
316 %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
320 ; Use an undef value for the <4 x i8> that is being shuffled with the compare result.
321 ; This produces a build_vector with some of the operands undefs.
; The <8 x i1> select mask takes lanes 0-3 from the compare result and lanes
; 4-7 from an undef second shuffle operand.
322 define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1,
323 <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
324 ; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right
327 %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
; %cmp2 is computed here but the shuffle below uses undef in its place.
328 %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
329 %c0 = icmp ult <4 x i32> %cmp0, %cmp1
; Identity mask 0-7: %c0 supplies lanes 0-3, the undef operand lanes 4-7.
330 %c = shufflevector <4 x i1> %c0, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
331 %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
; Mirror of the undef_right case: the <8 x i1> select mask takes lanes 0-3
; from an undef first shuffle operand and lanes 4-7 from the compare result.
335 define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1,
336 <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
337 ; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left
340 %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
; %cmp2 is computed here but is not an operand of the shuffle below.
341 %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
342 %c0 = icmp ult <4 x i32> %cmp0, %cmp1
; Identity mask 0-7: the undef operand supplies lanes 0-3, %c0 lanes 4-7.
343 %c = shufflevector <4 x i1> undef, <4 x i1> %c0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
344 %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
348 ; We're using large data types here, and we have to fill with undef values until we
349 ; get some vector size that we can represent.
; Same compare-and-shuffle-into-select pattern as vuzp_trunc_and_shuffle, but
; with 5-lane and 10-lane vector types instead of 4-lane and 8-lane ones.
350 define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1,
351 <5 x i32> %cmp0, <5 x i32> %cmp1, <5 x i8> *%cmp2_ptr) {
352 ; CHECK-LABEL: vuzp_wide_type
354 %cmp2_load = load <5 x i8>, <5 x i8> * %cmp2_ptr, align 4
; Keep only the low bit of each loaded byte.
355 %cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1>
356 %c0 = icmp ult <5 x i32> %cmp0, %cmp1
; Identity mask 0-9: all of %c0 followed by all of %cmp2.
357 %c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
358 %rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1