1 ; RUN: llc -mtriple=aarch64 -lower-interleaved-accesses=true < %s | FileCheck %s -check-prefix=NEON
2 ; RUN: llc -mtriple=aarch64 -lower-interleaved-accesses=true -mattr=-neon < %s | FileCheck %s -check-prefix=NONEON
4 ; NEON-LABEL: load_factor2:
5 ; NEON: ld2 { v0.8b, v1.8b }, [x0]
6 ; NONEON-LABEL: load_factor2:
8 define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
9 %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
10 %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
11 %strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
12 %add = add nsw <8 x i8> %strided.v0, %strided.v1
16 ; NEON-LABEL: load_factor3:
17 ; NEON: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
18 ; NONEON-LABEL: load_factor3:
20 define <4 x i32> @load_factor3(i32* %ptr) {
21 %base = bitcast i32* %ptr to <12 x i32>*
22 %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
23 %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
24 %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
25 %add = add nsw <4 x i32> %strided.v2, %strided.v1
29 ; NEON-LABEL: load_factor4:
30 ; NEON: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
31 ; NONEON-LABEL: load_factor4:
33 define <4 x i32> @load_factor4(i32* %ptr) {
34 %base = bitcast i32* %ptr to <16 x i32>*
35 %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
36 %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
37 %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
38 %add = add nsw <4 x i32> %strided.v0, %strided.v2
42 ; NEON-LABEL: store_factor2:
43 ; NEON: st2 { v0.8b, v1.8b }, [x0]
44 ; NONEON-LABEL: store_factor2:
46 define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
47 %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
48 store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
52 ; NEON-LABEL: store_factor3:
53 ; NEON: st3 { v0.4s, v1.4s, v2.4s }, [x0]
54 ; NONEON-LABEL: store_factor3:
56 define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
57 %base = bitcast i32* %ptr to <12 x i32>*
58 %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
59 %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
60 %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
61 store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
65 ; NEON-LABEL: store_factor4:
66 ; NEON: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
67 ; NONEON-LABEL: store_factor4:
69 define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
70 %base = bitcast i32* %ptr to <16 x i32>*
71 %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
72 %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
73 %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
74 store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
78 ; The following cases test that interleaved accesses of pointer vectors can be
79 ; matched to ldN/stN instructions.
81 ; NEON-LABEL: load_ptrvec_factor2:
82 ; NEON: ld2 { v0.2d, v1.2d }, [x0]
83 ; NONEON-LABEL: load_ptrvec_factor2:
85 define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
86 %base = bitcast i32** %ptr to <4 x i32*>*
87 %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
88 %strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
89 ret <2 x i32*> %strided.v0
92 ; NEON-LABEL: load_ptrvec_factor3:
93 ; NEON: ld3 { v0.2d, v1.2d, v2.2d }, [x0]
94 ; NONEON-LABEL: load_ptrvec_factor3:
96 define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
97 %base = bitcast i32** %ptr to <6 x i32*>*
98 %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
99 %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
100 store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1
101 %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
102 store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2
106 ; NEON-LABEL: load_ptrvec_factor4:
107 ; NEON: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
108 ; NONEON-LABEL: load_ptrvec_factor4:
110 define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
111 %base = bitcast i32** %ptr to <8 x i32*>*
112 %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
113 %strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
114 %strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
115 store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1
116 store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2
120 ; NEON-LABEL: store_ptrvec_factor2:
121 ; NEON: st2 { v0.2d, v1.2d }, [x0]
122 ; NONEON-LABEL: store_ptrvec_factor2:
124 define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
125 %base = bitcast i32** %ptr to <4 x i32*>*
126 %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
127 store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4
131 ; NEON-LABEL: store_ptrvec_factor3:
132 ; NEON: st3 { v0.2d, v1.2d, v2.2d }, [x0]
133 ; NONEON-LABEL: store_ptrvec_factor3:
135 define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
136 %base = bitcast i32** %ptr to <6 x i32*>*
137 %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
138 %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
139 %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
140 store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4
144 ; NEON-LABEL: store_ptrvec_factor4:
145 ; NEON: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
146 ; NONEON-LABEL: store_ptrvec_factor4:
148 define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
149 %base = bitcast i32* %ptr to <8 x i32*>*
150 %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
151 %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
152 %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
153 store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4
157 ; The following cases check that shuffle masks with undef indices can still be
158 ; matched to ldN/stN instructions.
160 ; NEON-LABEL: load_undef_mask_factor2:
161 ; NEON: ld2 { v0.4s, v1.4s }, [x0]
162 ; NONEON-LABEL: load_undef_mask_factor2:
164 define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
165 %base = bitcast i32* %ptr to <8 x i32>*
166 %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
167 %strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
168 %strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
169 %add = add nsw <4 x i32> %strided.v0, %strided.v1
173 ; NEON-LABEL: load_undef_mask_factor3:
174 ; NEON: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
175 ; NONEON-LABEL: load_undef_mask_factor3:
177 define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
178 %base = bitcast i32* %ptr to <12 x i32>*
179 %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
180 %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
181 %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
182 %add = add nsw <4 x i32> %strided.v2, %strided.v1
186 ; NEON-LABEL: load_undef_mask_factor4:
187 ; NEON: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
188 ; NONEON-LABEL: load_undef_mask_factor4:
190 define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
191 %base = bitcast i32* %ptr to <16 x i32>*
192 %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
193 %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
194 %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
195 %add = add nsw <4 x i32> %strided.v0, %strided.v2
199 ; NEON-LABEL: store_undef_mask_factor2:
200 ; NEON: st2 { v0.4s, v1.4s }, [x0]
201 ; NONEON-LABEL: store_undef_mask_factor2:
203 define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
204 %base = bitcast i32* %ptr to <8 x i32>*
205 %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
206 store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4
210 ; NEON-LABEL: store_undef_mask_factor3:
211 ; NEON: st3 { v0.4s, v1.4s, v2.4s }, [x0]
212 ; NONEON-LABEL: store_undef_mask_factor3:
214 define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
215 %base = bitcast i32* %ptr to <12 x i32>*
216 %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
217 %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
218 %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
219 store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
223 ; NEON-LABEL: store_undef_mask_factor4:
224 ; NEON: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
225 ; NONEON-LABEL: store_undef_mask_factor4:
227 define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
228 %base = bitcast i32* %ptr to <16 x i32>*
229 %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
230 %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
231 %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
232 store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
236 ; Check that we do something sane with illegal types.
238 ; NEON-LABEL: load_illegal_factor2:
240 ; NEON-NEXT: ldr q[[V:[0-9]+]], [x0]
241 ; NEON-NEXT: uzp1 v0.4s, v[[V]].4s, v{{.*}}.4s
243 ; NONEON-LABEL: load_illegal_factor2:
245 ; NONEON-NEXT: ldr s0, [x0]
246 ; NONEON-NEXT: ldr s1, [x0, #8]
248 define <3 x float> @load_illegal_factor2(<3 x float>* %p) nounwind {
249 %tmp1 = load <3 x float>, <3 x float>* %p, align 16
250 %tmp2 = shufflevector <3 x float> %tmp1, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
251 ret <3 x float> %tmp2
254 ; NEON-LABEL: store_illegal_factor2:
256 ; NEON-NEXT: uzp1 v0.4s, v0.4s, v{{.*}}.4s
257 ; NEON-NEXT: st1 { v0.d }[0], [x0]
259 ; NONEON-LABEL: store_illegal_factor2:
261 ; NONEON-NEXT: fmov w[[ELT2:[0-9]+]], s2
262 ; NONEON-NEXT: fmov w[[RES:[0-9]+]], s0
263 ; NONEON-NEXT: bfi x[[RES]], x[[ELT2]], #32, #32
264 ; NONEON-NEXT: str x[[RES]], [x0]
266 define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind {
267 %tmp1 = shufflevector <3 x float> %v, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
268 store <3 x float> %tmp1, <3 x float>* %p, align 16