1 ; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic -lower-interleaved-accesses=true < %s | FileCheck %s
; Factor-2 interleaved load: a single wide <16 x i8> load is split by two
; shufflevectors into its even-indexed and odd-indexed elements. The FileCheck
; pattern below expects this to be selected as one ld2 of two 8b registers.
3 ; CHECK-LABEL: load_factor2:
4 ; CHECK: ld2 { v0.8b, v1.8b }, [x0]
5 define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
6 %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
; Elements 0, 2, ..., 14 of the wide vector (stride 2, offset 0).
7 %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; Elements 1, 3, ..., 15 of the wide vector (stride 2, offset 1).
8 %strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; Element-wise sum of the two de-interleaved vectors.
9 %add = add nsw <8 x i8> %strided.v0, %strided.v1
; Factor-3 interleaved load: a wide <12 x i32> load is de-interleaved with
; stride-3 shuffle masks. Only the sub-vectors at offsets 2 and 1 are
; extracted; the FileCheck pattern below still expects a full three-register
; ld3.
13 ; CHECK-LABEL: load_factor3:
14 ; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
15 define <4 x i32> @load_factor3(i32* %ptr) {
; Reinterpret the scalar pointer as a pointer to the 12-element wide vector.
16 %base = bitcast i32* %ptr to <12 x i32>*
17 %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
; Elements 2, 5, 8, 11 (stride 3, offset 2).
18 %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; Elements 1, 4, 7, 10 (stride 3, offset 1).
19 %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; Element-wise sum of the two extracted sub-vectors.
20 %add = add nsw <4 x i32> %strided.v2, %strided.v1
; Factor-4 interleaved load: a wide <16 x i32> load is de-interleaved with
; stride-4 shuffle masks. Only the sub-vectors at offsets 0 and 2 are
; extracted; the FileCheck pattern below expects a four-register ld4.
24 ; CHECK-LABEL: load_factor4:
25 ; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
26 define <4 x i32> @load_factor4(i32* %ptr) {
; Reinterpret the scalar pointer as a pointer to the 16-element wide vector.
27 %base = bitcast i32* %ptr to <16 x i32>*
28 %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
; Elements 0, 4, 8, 12 (stride 4, offset 0).
29 %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; Elements 2, 6, 10, 14 (stride 4, offset 2).
30 %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; Element-wise sum of the two extracted sub-vectors.
31 %add = add nsw <4 x i32> %strided.v0, %strided.v2
; Factor-2 interleaved store: %v0 and %v1 are interleaved lane by lane into a
; single <16 x i8> vector that is then stored. The FileCheck pattern below
; expects this to be selected as one st2 of two 8b registers.
35 ; CHECK-LABEL: store_factor2:
36 ; CHECK: st2 { v0.8b, v1.8b }, [x0]
37 define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
; Mask indices 0-7 select from %v0 and 8-15 from %v1, so the result is
; v0[0], v1[0], v0[1], v1[1], ..., v0[7], v1[7].
38 %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
39 store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
; Factor-3 interleaved store: %v0, %v1 and %v2 are combined into one
; <12 x i32> vector in the order v0[i], v1[i], v2[i] and stored. The FileCheck
; pattern below expects a three-register st3.
43 ; CHECK-LABEL: store_factor3:
44 ; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
45 define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
; Reinterpret the scalar pointer as a pointer to the 12-element wide vector.
46 %base = bitcast i32* %ptr to <12 x i32>*
; Concatenate %v0 and %v1 into an 8-element vector.
47 %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Widen %v2 to 8 elements (upper four lanes undef) so it has the same type as
; %v0_v1 for the final shuffle.
48 %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; Indices 0-3 are %v0, 4-7 are %v1 and 8-11 are %v2; the mask interleaves them
; with stride 3.
49 %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
50 store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
; Factor-4 interleaved store: %v0..%v3 are combined into one <16 x i32> vector
; in the order v0[i], v1[i], v2[i], v3[i] and stored. The FileCheck pattern
; below expects a four-register st4.
54 ; CHECK-LABEL: store_factor4:
55 ; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
56 define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
; Reinterpret the scalar pointer as a pointer to the 16-element wide vector.
57 %base = bitcast i32* %ptr to <16 x i32>*
; Concatenate the inputs pairwise into two 8-element vectors.
58 %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
59 %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Indices 0-3 are %v0, 4-7 are %v1, 8-11 are %v2 and 12-15 are %v3; the mask
; interleaves them with stride 4.
60 %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
61 store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
65 ; The following cases test that interleaved accesses of pointer vectors can be
66 ; matched to ldN/stN instructions.
; Factor-2 interleaved load of a pointer vector: a wide <4 x i32*> load is
; de-interleaved and only the offset-0 sub-vector is used. The FileCheck
; pattern below expects an ld2 on 2d registers.
68 ; CHECK-LABEL: load_ptrvec_factor2:
69 ; CHECK: ld2 { v0.2d, v1.2d }, [x0]
70 define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
; Reinterpret the pointer as a pointer to the 4-element wide pointer vector.
71 %base = bitcast i32** %ptr to <4 x i32*>*
72 %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
; Elements 0 and 2 (stride 2, offset 0).
73 %strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
74 ret <2 x i32*> %strided.v0
; Factor-3 interleaved load of a pointer vector: a wide <6 x i32*> load is
; de-interleaved with stride-3 masks, and the offset-2 and offset-1 sub-vectors
; are stored to %ptr1 and %ptr2 respectively. The FileCheck pattern below
; expects an ld3 on 2d registers.
77 ; CHECK-LABEL: load_ptrvec_factor3:
78 ; CHECK: ld3 { v0.2d, v1.2d, v2.2d }, [x0]
79 define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
; Reinterpret the pointer as a pointer to the 6-element wide pointer vector.
80 %base = bitcast i32** %ptr to <6 x i32*>*
81 %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
; Elements 2 and 5 (stride 3, offset 2).
82 %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
83 store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1
; Elements 1 and 4 (stride 3, offset 1).
84 %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
85 store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2
; Factor-4 interleaved load of a pointer vector: a wide <8 x i32*> load is
; de-interleaved with stride-4 masks, and the offset-1 and offset-3 sub-vectors
; are stored to %ptr1 and %ptr2 respectively. The FileCheck pattern below
; expects an ld4 on 2d registers.
89 ; CHECK-LABEL: load_ptrvec_factor4:
90 ; CHECK: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
91 define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
; Reinterpret the pointer as a pointer to the 8-element wide pointer vector.
92 %base = bitcast i32** %ptr to <8 x i32*>*
93 %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
; Elements 1 and 5 (stride 4, offset 1).
94 %strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
; Elements 3 and 7 (stride 4, offset 3).
95 %strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
96 store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1
97 store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2
; Factor-2 interleaved store of pointer vectors: %v0 and %v1 are interleaved
; into one <4 x i32*> vector and stored. The FileCheck pattern below expects an
; st2 on 2d registers.
101 ; CHECK-LABEL: store_ptrvec_factor2:
102 ; CHECK: st2 { v0.2d, v1.2d }, [x0]
103 define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
; Reinterpret the pointer as a pointer to the 4-element wide pointer vector.
104 %base = bitcast i32** %ptr to <4 x i32*>*
; Indices 0-1 are %v0 and 2-3 are %v1, giving v0[0], v1[0], v0[1], v1[1].
105 %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
106 store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4
; Factor-3 interleaved store of pointer vectors: %v0, %v1 and %v2 are combined
; into one <6 x i32*> vector in the order v0[i], v1[i], v2[i] and stored. The
; FileCheck pattern below expects an st3 on 2d registers.
110 ; CHECK-LABEL: store_ptrvec_factor3:
111 ; CHECK: st3 { v0.2d, v1.2d, v2.2d }, [x0]
112 define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
; Reinterpret the pointer as a pointer to the 6-element wide pointer vector.
113 %base = bitcast i32** %ptr to <6 x i32*>*
; Concatenate %v0 and %v1 into a 4-element vector.
114 %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Widen %v2 to 4 elements (upper two lanes undef) to match the type of %v0_v1.
115 %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; Indices 0-1 are %v0, 2-3 are %v1 and 4-5 are %v2; the mask interleaves them
; with stride 3.
116 %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
117 store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4
; Factor-4 interleaved store of pointer vectors: %v0..%v3 are combined into one
; <8 x i32*> vector in the order v0[i], v1[i], v2[i], v3[i] and stored. The
; FileCheck pattern below expects an st4 on 2d registers.
; NOTE(review): %ptr is declared as i32* here, whereas store_ptrvec_factor2 and
; store_ptrvec_factor3 declare it as i32**. The bitcast makes either form work;
; confirm whether the difference is intentional.
121 ; CHECK-LABEL: store_ptrvec_factor4:
122 ; CHECK: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
123 define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
; Reinterpret the pointer as a pointer to the 8-element wide pointer vector.
124 %base = bitcast i32* %ptr to <8 x i32*>*
; Concatenate the inputs pairwise into two 4-element vectors.
125 %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
126 %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Indices 0-1 are %v0, 2-3 are %v1, 4-5 are %v2 and 6-7 are %v3; the mask
; interleaves them with stride 4.
127 %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
128 store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4
132 ; The following cases check that shuffle masks with undef indices can be matched
133 ; to ldN/stN instructions.
; Factor-2 interleaved load whose shuffle masks contain undef indices: lanes 0
; and 2 of both masks are undef, while the defined lanes follow the stride-2
; pattern. The FileCheck pattern below still expects an ld2.
135 ; CHECK-LABEL: load_undef_mask_factor2:
136 ; CHECK: ld2 { v0.4s, v1.4s }, [x0]
137 define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
; Reinterpret the scalar pointer as a pointer to the 8-element wide vector.
138 %base = bitcast i32* %ptr to <8 x i32>*
139 %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
; Defined lanes 1 and 3 pick elements 2 and 6 (stride 2, offset 0).
140 %strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
; Defined lanes 1 and 3 pick elements 3 and 7 (stride 2, offset 1).
141 %strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
; Element-wise sum of the two extracted sub-vectors.
142 %add = add nsw <4 x i32> %strided.v0, %strided.v1
; Factor-3 interleaved load where one shuffle mask is mostly undef: the
; offset-2 mask defines only its first lane, while the offset-1 mask is fully
; defined. The FileCheck pattern below still expects an ld3.
146 ; CHECK-LABEL: load_undef_mask_factor3:
147 ; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
148 define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
; Reinterpret the scalar pointer as a pointer to the 12-element wide vector.
149 %base = bitcast i32* %ptr to <12 x i32>*
150 %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
; Only lane 0 is defined (element 2); the remaining lanes are undef.
151 %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
; Elements 1, 4, 7, 10 (stride 3, offset 1).
152 %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
; Element-wise sum of the two extracted sub-vectors.
153 %add = add nsw <4 x i32> %strided.v2, %strided.v1
; Factor-4 interleaved load whose shuffle masks have undef tails: both masks
; define only their first two lanes (stride 4, offsets 0 and 2). The FileCheck
; pattern below still expects an ld4.
157 ; CHECK-LABEL: load_undef_mask_factor4:
158 ; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
159 define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
; Reinterpret the scalar pointer as a pointer to the 16-element wide vector.
160 %base = bitcast i32* %ptr to <16 x i32>*
161 %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
; Lanes 0 and 1 pick elements 0 and 4; lanes 2 and 3 are undef.
162 %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
; Lanes 0 and 1 pick elements 2 and 6; lanes 2 and 3 are undef.
163 %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
; Element-wise sum of the two extracted sub-vectors.
164 %add = add nsw <4 x i32> %strided.v0, %strided.v2
; Factor-2 interleaved store whose shuffle mask has undef indices: the first
; four lanes of the interleave mask are undef and the last four follow the
; factor-2 pattern. The FileCheck pattern below still expects an st2.
168 ; CHECK-LABEL: store_undef_mask_factor2:
169 ; CHECK: st2 { v0.4s, v1.4s }, [x0]
170 define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
; Reinterpret the scalar pointer as a pointer to the 8-element wide vector.
171 %base = bitcast i32* %ptr to <8 x i32>*
; Indices 0-3 are %v0 and 4-7 are %v1; the defined lanes are
; v0[2], v1[2], v0[3], v1[3].
172 %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
173 store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4
; Factor-3 interleaved store whose interleave mask has undef indices: lanes 2
; and 4 of the final mask are undef, and the remaining lanes follow the
; factor-3 pattern. The FileCheck pattern below still expects an st3.
177 ; CHECK-LABEL: store_undef_mask_factor3:
178 ; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
179 define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
; Reinterpret the scalar pointer as a pointer to the 12-element wide vector.
180 %base = bitcast i32* %ptr to <12 x i32>*
; Concatenate %v0 and %v1 into an 8-element vector.
181 %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Widen %v2 to 8 elements (upper four lanes undef) to match the type of %v0_v1.
182 %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; Same stride-3 interleave as a fully defined mask (0, 4, 8, 1, 5, 9, ...),
; but with the entries at lanes 2 and 4 replaced by undef.
183 %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
184 store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
; Factor-4 interleaved store whose interleave mask has undef indices: lanes 3
; and 4 of the final mask are undef, and the remaining lanes follow the
; factor-4 pattern. The FileCheck pattern below still expects an st4.
188 ; CHECK-LABEL: store_undef_mask_factor4:
189 ; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
190 define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
; Reinterpret the scalar pointer as a pointer to the 16-element wide vector.
191 %base = bitcast i32* %ptr to <16 x i32>*
; Concatenate the inputs pairwise into two 8-element vectors.
192 %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
193 %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Same stride-4 interleave as a fully defined mask (0, 4, 8, 12, 1, 5, ...),
; but with the entries at lanes 3 and 4 replaced by undef.
194 %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
195 store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4