; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -early-cse | FileCheck %s
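; This test exercises Early CSE's handling of the AArch64 NEON structured
; load/store intrinsics (ld2/ld3/st2/st3): an ld2 that reads back exactly what
; a matching st2 just wrote, a fully overwritten st2, and a repeated ld2 from
; the same pointer are all redundant, while an intervening may-alias store or
; an element-count mismatch (st2 vs. ld3/st3) must block the optimization.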
define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE,
; since the second st2 to the same address fully overwrites it.
; CHECK-LABEL: @test_cse2
; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
entry:
; Check that the second (redundant) @llvm.aarch64.neon.ld2 is optimized away
; by Early CSE; the CHECK/CHECK-NOT pair below verifies exactly one ld2 remains.
; CHECK-LABEL: @test_cse3
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %1 = bitcast i32* %a to i8*
  %vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %1)
  %vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
  %vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse(i32* %a, i32* %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the intervening store to %b (which may alias %a) prevents
; @llvm.aarch64.neon.ld2 from being optimized away by Early CSE.
; CHECK-LABEL: @test_nocse
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  store i32 0, i32* %b, align 4
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due
; to the element-count mismatch between st2 and ld3.
; CHECK-LABEL: @test_nocse2
; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
; the element-count mismatch between st2 and st3.
; CHECK-LABEL: @test_nocse3
; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32>, <4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8*)

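; Local helper standing in for the C-level vaddq_s32 NEON intrinsic: it simply
; adds its two <4 x i32> operands element-wise.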
define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
entry:
  %add = add <4 x i32> %__p0, %__p1
  ret <4 x i32> %add
}

; Attribute group referenced by @test_cse3 above; required for the module to parse.
attributes #0 = { nounwind }