1 ; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core2 -mtriple=x86_64-apple-darwin | FileCheck %s
2 ; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=SSE3
3 ; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7-avx -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX
4 ; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core-avx2 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX2
; Reduces the four lanes of %rdx into lane 0: the upper lanes are shuffled down
; and added to the source vector twice (masks <2,3,..> then <1,..>), and lane 0
; is extracted. Only the core2 run (default check prefix) pins the cost here.
; The label spells out the full function name so that it cannot also match the
; header printed for reduction_cost_int.
6 define fastcc float @reduction_cost_float(<4 x float> %rdx) {
7 %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
8 %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
9 %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
10 %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
12 ; Check that we recognize the tree starting at the extractelement as a reduction.
14 ; CHECK-LABEL: reduction_cost_float
15 ; CHECK: cost of 9 {{.*}} extractelement
17 %r = extractelement <4 x float> %bin.rdx8, i32 0
; Integer variant on <8 x i32>: three fold steps (masks <4,5,6,7,..>, <2,3,..>,
; <1,..>) each add the shuffled vector to its source, then lane 0 is extracted.
; Only the default-prefix (core2) run has an expectation for this function.
21 define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
22 %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef,
23 <8 x i32> <i32 4 , i32 5, i32 6, i32 7,
24 i32 undef, i32 undef, i32 undef, i32 undef>
25 %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
26 %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,
27 <8 x i32> <i32 2 , i32 3, i32 undef, i32 undef,
28 i32 undef, i32 undef, i32 undef, i32 undef>
29 %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
30 %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef,
31 <8 x i32> <i32 1 , i32 undef, i32 undef, i32 undef,
32 i32 undef, i32 undef, i32 undef, i32 undef>
33 %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
35 ; CHECK-LABEL: reduction_cost_int
36 ; CHECK: cost of 17 {{.*}} extractelement
38 %r = extractelement <8 x i32> %bin.rdx.3, i32 0
; Pairwise form on <4 x float>: each level builds one shuffle of the even live
; lanes and one of the odd live lanes (<0,2,..>/<1,3,..>, then <0,..>/<1,..>)
; and adds the two. Lane 0 is extracted and then added to the scalar %f1.
; The label is quoted so that it matches only this function's 'name' in the
; analysis header and not pairwise_hadd_assoc or pairwise_hadd_skip_first.
42 define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
43 %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
44 <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
45 %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
46 <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
47 %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
48 %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
49 <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
50 %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
51 <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
52 %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
54 ; CHECK-LABEL: 'pairwise_hadd'
55 ; CHECK: cost of 11 {{.*}} extractelement
57 %r = extractelement <4 x float> %bin.rdx.1, i32 0
58 %r2 = fadd float %r, %f1
; Same shape as pairwise_hadd, except that the first fadd takes its operands in
; swapped order (odd-lane shuffle first, even-lane shuffle second). The expected
; cost on the extractelement is the same value as for pairwise_hadd.
62 define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
63 %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
64 <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
65 %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
66 <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
67 %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
68 %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
69 <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
70 %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
71 <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
72 %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
74 ; CHECK-LABEL: pairwise_hadd_assoc
75 ; CHECK: cost of 11 {{.*}} extractelement
77 %r = extractelement <4 x float> %bin.rdx.1, i32 0
78 %r2 = fadd float %r, %f1
; Variant of pairwise_hadd whose second level omits the <0,..> shuffle: the
; final fadd uses %bin.rdx.0 directly as its first operand and only the <1,..>
; shuffle as its second. The expected cost is the same value as pairwise_hadd.
82 define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
83 %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
84 <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
85 %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
86 <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
87 %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
88 %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
89 <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
90 %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
92 ; CHECK-LABEL: pairwise_hadd_skip_first
93 ; CHECK: cost of 11 {{.*}} extractelement
95 %r = extractelement <4 x float> %bin.rdx.1, i32 0
96 %r2 = fadd float %r, %f1
; Single fold step on <2 x double>: lane 1 is shuffled into lane 0 and added to
; the source vector, then lane 0 is extracted. Expectations exist for the
; corei7, corei7-avx and core-avx2 runs (prefixes SSE3, AVX, AVX2).
100 define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1) {
101 %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
102 %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
104 ; SSE3: cost of 2 {{.*}} extractelement
105 ; AVX: cost of 2 {{.*}} extractelement
106 ; AVX2: cost of 2 {{.*}} extractelement
108 %r = extractelement <2 x double> %bin.rdx, i32 0
; Two fold steps on <4 x float> (masks <2,3,..> then <1,..>), each adding the
; shuffled vector to its source; lane 0 is extracted. The SSE3 expectation
; differs from the AVX and AVX2 ones for this function.
112 define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
113 %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
114 %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
115 %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
116 %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
118 ; SSE3: cost of 4 {{.*}} extractelement
119 ; AVX: cost of 3 {{.*}} extractelement
120 ; AVX2: cost of 3 {{.*}} extractelement
122 %r = extractelement <4 x float> %bin.rdx8, i32 0
; Two fold steps on <4 x double> (masks <2,3,..> then <1,..>), each adding the
; shuffled vector to its source; lane 0 is extracted. Only the AVX and AVX2
; runs carry an expectation for this function.
126 define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1) {
127 %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
128 %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
129 %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
130 %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
132 ; AVX: cost of 3 {{.*}} extractelement
133 ; AVX2: cost of 3 {{.*}} extractelement
135 %r = extractelement <4 x double> %bin.rdx8, i32 0
; Three fold steps on <8 x float> (masks <4,5,6,7,..>, <2,3,..>, <1,..>), each
; adding the shuffled vector to its source; lane 0 is extracted. Only the AVX
; and AVX2 runs carry an expectation for this function.
139 define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
140 %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
141 %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
142 %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
143 %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
144 %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
145 %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
147 ; AVX: cost of 4 {{.*}} extractelement
148 ; AVX2: cost of 4 {{.*}} extractelement
150 %r = extractelement <8 x float> %bin.rdx8, i32 0
; Single fold step on <2 x i64>: lane 1 is shuffled into lane 0 and added to
; the source vector, then lane 0 is extracted. The SSE3 expectation differs
; from the AVX and AVX2 ones for this function.
154 define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
155 %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
156 %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
158 ; SSE3: cost of 2 {{.*}} extractelement
159 ; AVX: cost of 1 {{.*}} extractelement
160 ; AVX2: cost of 1 {{.*}} extractelement
162 %r = extractelement <2 x i64> %bin.rdx, i32 0
; Two fold steps on <4 x i32> (masks <2,3,..> then <1,..>), each adding the
; shuffled vector to its source; lane 0 is extracted. All three prefixed runs
; (SSE3, AVX, AVX2) expect the same cost here.
166 define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
167 %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
168 %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
169 %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
170 %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7
172 ; SSE3: cost of 3 {{.*}} extractelement
173 ; AVX: cost of 3 {{.*}} extractelement
174 ; AVX2: cost of 3 {{.*}} extractelement
176 %r = extractelement <4 x i32> %bin.rdx8, i32 0
; Two fold steps on <4 x i64> (masks <2,3,..> then <1,..>), each adding the
; shuffled vector to its source; lane 0 is extracted. Only the AVX and AVX2
; runs carry an expectation for this function.
180 define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
181 %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
182 %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
183 %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
184 %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
186 ; AVX: cost of 3 {{.*}} extractelement
187 ; AVX2: cost of 3 {{.*}} extractelement
189 %r = extractelement <4 x i64> %bin.rdx8, i32 0
; Three fold steps on <8 x i16> (masks <4,5,6,7,..>, <2,3,..>, <1,..>), each
; adding the shuffled vector to its source; lane 0 is extracted. All three
; prefixed runs (SSE3, AVX, AVX2) expect the same cost here.
193 define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
194 %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
195 %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
196 %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
197 %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
198 %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
199 %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
201 ; SSE3: cost of 4 {{.*}} extractelement
202 ; AVX: cost of 4 {{.*}} extractelement
203 ; AVX2: cost of 4 {{.*}} extractelement
205 %r = extractelement <8 x i16> %bin.rdx8, i32 0
; Three fold steps on <8 x i32> (masks <4,5,6,7,..>, <2,3,..>, <1,..>), each
; adding the shuffled vector to its source; lane 0 is extracted. Only the AVX
; and AVX2 runs carry an expectation for this function.
209 define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
210 %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
211 %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
212 %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
213 %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
214 %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
215 %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
217 ; AVX: cost of 5 {{.*}} extractelement
218 ; AVX2: cost of 5 {{.*}} extractelement
220 %r = extractelement <8 x i32> %bin.rdx8, i32 0
; Pairwise form on <2 x double>: one shuffle selects lane 0, another selects
; lane 1, and the two shuffles are added; lane 0 is then extracted. All three
; prefixed runs (SSE3, AVX, AVX2) expect the same cost here.
224 define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) {
225 %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
226 %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
227 %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
229 ; SSE3: cost of 2 {{.*}} extractelement
230 ; AVX: cost of 2 {{.*}} extractelement
231 ; AVX2: cost of 2 {{.*}} extractelement
233 %r = extractelement <2 x double> %bin.rdx8, i32 0
; Pairwise form on <4 x float>: two levels, each adding an even-lane shuffle to
; an odd-lane shuffle (<0,2,..>/<1,3,..>, then <0,..>/<1,..>); lane 0 is then
; extracted. All three prefixed runs (SSE3, AVX, AVX2) expect the same cost.
237 define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
238 %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
239 %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
240 %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
241 %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
242 %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
243 %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
245 ; SSE3: cost of 4 {{.*}} extractelement
246 ; AVX: cost of 4 {{.*}} extractelement
247 ; AVX2: cost of 4 {{.*}} extractelement
249 %r = extractelement <4 x float> %bin.rdx8, i32 0
; Pairwise form on <4 x double>: two levels, each adding an even-lane shuffle
; to an odd-lane shuffle (<0,2,..>/<1,3,..>, then <0,..>/<1,..>); lane 0 is
; then extracted. Only the AVX and AVX2 runs carry an expectation here.
253 define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
254 %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
255 %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
256 %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
257 %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
258 %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
259 %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
261 ; AVX: cost of 5 {{.*}} extractelement
262 ; AVX2: cost of 5 {{.*}} extractelement
264 %r = extractelement <4 x double> %bin.rdx8, i32 0
; Pairwise form on <8 x float>: three levels, each adding an even-lane shuffle
; to an odd-lane shuffle (<0,2,4,6,..>/<1,3,5,7,..>, <0,2,..>/<1,3,..>, then
; <0,..>/<1,..>); lane 0 is then extracted. Only the AVX and AVX2 runs carry
; an expectation here.
268 define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
269 %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
270 %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
271 %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
272 %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
273 %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
274 %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
275 %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
276 %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
277 %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
279 ; AVX: cost of 7 {{.*}} extractelement
280 ; AVX2: cost of 7 {{.*}} extractelement
282 %r = extractelement <8 x float> %bin.rdx9, i32 0
; Pairwise form on <2 x i64>: one shuffle selects lane 0, another selects
; lane 1, and the two shuffles are added; lane 0 is then extracted. The SSE3
; expectation differs from the AVX and AVX2 ones for this function.
286 define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
287 %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
288 %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
289 %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
291 ; SSE3: cost of 2 {{.*}} extractelement
292 ; AVX: cost of 1 {{.*}} extractelement
293 ; AVX2: cost of 1 {{.*}} extractelement
295 %r = extractelement <2 x i64> %bin.rdx8, i32 0
; Pairwise form on <4 x i32>: two levels, each adding an even-lane shuffle to
; an odd-lane shuffle (<0,2,..>/<1,3,..>, then <0,..>/<1,..>); lane 0 is then
; extracted. All three prefixed runs (SSE3, AVX, AVX2) expect the same cost.
299 define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
300 %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
301 %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
302 %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
303 %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
304 %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
305 %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
307 ; SSE3: cost of 3 {{.*}} extractelement
308 ; AVX: cost of 3 {{.*}} extractelement
309 ; AVX2: cost of 3 {{.*}} extractelement
311 %r = extractelement <4 x i32> %bin.rdx8, i32 0
; Pairwise form on <4 x i64>: two levels, each adding an even-lane shuffle to
; an odd-lane shuffle (<0,2,..>/<1,3,..>, then <0,..>/<1,..>); lane 0 is then
; extracted. Only the AVX and AVX2 runs carry an expectation here.
315 define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
316 %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
317 %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
318 %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
319 %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
320 %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
321 %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
323 ; AVX: cost of 5 {{.*}} extractelement
324 ; AVX2: cost of 5 {{.*}} extractelement
326 %r = extractelement <4 x i64> %bin.rdx8, i32 0
; Pairwise form on <8 x i16>: three levels, each adding an even-lane shuffle to
; an odd-lane shuffle (<0,2,4,6,..>/<1,3,5,7,..>, <0,2,..>/<1,3,..>, then
; <0,..>/<1,..>); lane 0 is then extracted. All three prefixed runs (SSE3, AVX,
; AVX2) expect the same cost here.
330 define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
331 %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
332 %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
333 %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
334 %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
335 %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
336 %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
337 %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
338 %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
339 %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
341 ; SSE3: cost of 5 {{.*}} extractelement
342 ; AVX: cost of 5 {{.*}} extractelement
343 ; AVX2: cost of 5 {{.*}} extractelement
345 %r = extractelement <8 x i16> %bin.rdx9, i32 0
; Pairwise form on <8 x i32>: three levels, each adding an even-lane shuffle to
; an odd-lane shuffle (<0,2,4,6,..>/<1,3,5,7,..>, <0,2,..>/<1,3,..>, then
; <0,..>/<1,..>); lane 0 is then extracted. Only the AVX and AVX2 runs carry
; an expectation here.
349 define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
350 %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
351 %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
352 %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
353 %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
354 %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
355 %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
356 %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
357 %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
358 %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
360 ; AVX: cost of 5 {{.*}} extractelement
361 ; AVX2: cost of 5 {{.*}} extractelement
363 %r = extractelement <8 x i32> %bin.rdx9, i32 0