1 ; RUN: opt -S -slp-vectorizer -slp-threshold=-10000 < %s | FileCheck %s
2 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-n8:16:32:64-S128"
4 target triple = "x86_64-apple-macosx10.8.0"
; Baseline case: a per-lane select written out in scalar form. Each lane of %c
; is compared against 0, the result picks between the matching lanes of %a and
; %b, and the four scalars are inserted into a <4 x float> in index order.
; The check lines expect one vector icmp feeding one vector select.
6 define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
7 ; CHECK-LABEL: @simple_select(
8 ; CHECK-NEXT: %1 = icmp ne <4 x i32> %c, zeroinitializer
9 ; CHECK-NEXT: select <4 x i1> %1, <4 x float> %a, <4 x float> %b
; Scalarize the condition vector and both value vectors.
10 %c0 = extractelement <4 x i32> %c, i32 0
11 %c1 = extractelement <4 x i32> %c, i32 1
12 %c2 = extractelement <4 x i32> %c, i32 2
13 %c3 = extractelement <4 x i32> %c, i32 3
14 %a0 = extractelement <4 x float> %a, i32 0
15 %a1 = extractelement <4 x float> %a, i32 1
16 %a2 = extractelement <4 x float> %a, i32 2
17 %a3 = extractelement <4 x float> %a, i32 3
18 %b0 = extractelement <4 x float> %b, i32 0
19 %b1 = extractelement <4 x float> %b, i32 1
20 %b2 = extractelement <4 x float> %b, i32 2
21 %b3 = extractelement <4 x float> %b, i32 3
; Per-lane "c != 0" condition.
22 %cmp0 = icmp ne i32 %c0, 0
23 %cmp1 = icmp ne i32 %c1, 0
24 %cmp2 = icmp ne i32 %c2, 0
25 %cmp3 = icmp ne i32 %c3, 0
; Per-lane choice between the %a lane and the %b lane.
26 %s0 = select i1 %cmp0, float %a0, float %b0
27 %s1 = select i1 %cmp1, float %a1, float %b1
28 %s2 = select i1 %cmp2, float %a2, float %b2
29 %s3 = select i1 %cmp3, float %a3, float %b3
; Rebuild the result vector; lane i receives %s<i>.
30 %ra = insertelement <4 x float> undef, float %s0, i32 0
31 %rb = insertelement <4 x float> %ra, float %s1, i32 1
32 %rc = insertelement <4 x float> %rb, float %s2, i32 2
33 %rd = insertelement <4 x float> %rc, float %s3, i32 3
37 ; Insert in an order different from the vector indices to make sure it
39 define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
40 ; CHECK-LABEL: @simple_select_insert_out_of_order(
41 ; CHECK-NEXT: %1 = icmp ne <4 x i32> %c, zeroinitializer
42 ; CHECK-NEXT: select <4 x i1> %1, <4 x float> %a, <4 x float> %b
; Same scalar compare/select pattern as @simple_select; only the insertion
; indices at the end differ.
43 %c0 = extractelement <4 x i32> %c, i32 0
44 %c1 = extractelement <4 x i32> %c, i32 1
45 %c2 = extractelement <4 x i32> %c, i32 2
46 %c3 = extractelement <4 x i32> %c, i32 3
47 %a0 = extractelement <4 x float> %a, i32 0
48 %a1 = extractelement <4 x float> %a, i32 1
49 %a2 = extractelement <4 x float> %a, i32 2
50 %a3 = extractelement <4 x float> %a, i32 3
51 %b0 = extractelement <4 x float> %b, i32 0
52 %b1 = extractelement <4 x float> %b, i32 1
53 %b2 = extractelement <4 x float> %b, i32 2
54 %b3 = extractelement <4 x float> %b, i32 3
55 %cmp0 = icmp ne i32 %c0, 0
56 %cmp1 = icmp ne i32 %c1, 0
57 %cmp2 = icmp ne i32 %c2, 0
58 %cmp3 = icmp ne i32 %c3, 0
59 %s0 = select i1 %cmp0, float %a0, float %b0
60 %s1 = select i1 %cmp1, float %a1, float %b1
61 %s2 = select i1 %cmp2, float %a2, float %b2
62 %s3 = select i1 %cmp3, float %a3, float %b3
; The chain visits indices 2, 1, 0, 3: %s0 lands in lane 2 and %s2 in lane 0.
; NOTE(review): the scalar code therefore swaps lanes 0 and 2 relative to a
; plain vector select of %a/%b, yet the check lines above only match the
; unpermuted vector icmp/select. Confirm that the expected output accounts
; for the permutation (e.g. via a shuffle) - TODO verify.
63 %ra = insertelement <4 x float> undef, float %s0, i32 2
64 %rb = insertelement <4 x float> %ra, float %s1, i32 1
65 %rc = insertelement <4 x float> %rb, float %s2, i32 0
66 %rd = insertelement <4 x float> %rc, float %s3, i32 3
70 declare void @v4f32_user(<4 x float>) #0
71 declare void @f32_user(float) #0
73 ; Multiple users of the final constructed vector
; Same compare/select/insert pattern as @simple_select, but the fully built
; vector %rd is additionally passed to the external @v4f32_user. The check
; lines still expect a single vector icmp and vector select.
74 define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
75 ; CHECK-LABEL: @simple_select_users(
76 ; CHECK-NEXT: %1 = icmp ne <4 x i32> %c, zeroinitializer
77 ; CHECK-NEXT: select <4 x i1> %1, <4 x float> %a, <4 x float> %b
78 %c0 = extractelement <4 x i32> %c, i32 0
79 %c1 = extractelement <4 x i32> %c, i32 1
80 %c2 = extractelement <4 x i32> %c, i32 2
81 %c3 = extractelement <4 x i32> %c, i32 3
82 %a0 = extractelement <4 x float> %a, i32 0
83 %a1 = extractelement <4 x float> %a, i32 1
84 %a2 = extractelement <4 x float> %a, i32 2
85 %a3 = extractelement <4 x float> %a, i32 3
86 %b0 = extractelement <4 x float> %b, i32 0
87 %b1 = extractelement <4 x float> %b, i32 1
88 %b2 = extractelement <4 x float> %b, i32 2
89 %b3 = extractelement <4 x float> %b, i32 3
90 %cmp0 = icmp ne i32 %c0, 0
91 %cmp1 = icmp ne i32 %c1, 0
92 %cmp2 = icmp ne i32 %c2, 0
93 %cmp3 = icmp ne i32 %c3, 0
94 %s0 = select i1 %cmp0, float %a0, float %b0
95 %s1 = select i1 %cmp1, float %a1, float %b1
96 %s2 = select i1 %cmp2, float %a2, float %b2
97 %s3 = select i1 %cmp3, float %a3, float %b3
98 %ra = insertelement <4 x float> undef, float %s0, i32 0
99 %rb = insertelement <4 x float> %ra, float %s1, i32 1
100 %rc = insertelement <4 x float> %rb, float %s2, i32 2
101 %rd = insertelement <4 x float> %rc, float %s3, i32 3
; Extra use of the completed vector (presumably besides the return value,
; which is not visible in this excerpt - TODO confirm).
102 call void @v4f32_user(<4 x float> %rd) #0
106 ; Unused insertelement
; The insertelement chain is deliberately broken in the middle: %rc starts
; again from undef instead of continuing from %rb, so %ra and %rb have no
; user among the instructions shown here. The NOT directives below require
; that no <4 x i32> icmp and no <4 x i1> select are produced for this function.
107 define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
108 ; CHECK-LABEL: @simple_select_no_users(
109 ; CHECK-NOT: icmp ne <4 x i32>
110 ; CHECK-NOT: select <4 x i1>
111 %c0 = extractelement <4 x i32> %c, i32 0
112 %c1 = extractelement <4 x i32> %c, i32 1
113 %c2 = extractelement <4 x i32> %c, i32 2
114 %c3 = extractelement <4 x i32> %c, i32 3
115 %a0 = extractelement <4 x float> %a, i32 0
116 %a1 = extractelement <4 x float> %a, i32 1
117 %a2 = extractelement <4 x float> %a, i32 2
118 %a3 = extractelement <4 x float> %a, i32 3
119 %b0 = extractelement <4 x float> %b, i32 0
120 %b1 = extractelement <4 x float> %b, i32 1
121 %b2 = extractelement <4 x float> %b, i32 2
122 %b3 = extractelement <4 x float> %b, i32 3
123 %cmp0 = icmp ne i32 %c0, 0
124 %cmp1 = icmp ne i32 %c1, 0
125 %cmp2 = icmp ne i32 %c2, 0
126 %cmp3 = icmp ne i32 %c3, 0
127 %s0 = select i1 %cmp0, float %a0, float %b0
128 %s1 = select i1 %cmp1, float %a1, float %b1
129 %s2 = select i1 %cmp2, float %a2, float %b2
130 %s3 = select i1 %cmp3, float %a3, float %b3
; First half-chain (lanes 0 and 1): nothing shown here consumes %rb.
131 %ra = insertelement <4 x float> undef, float %s0, i32 0
132 %rb = insertelement <4 x float> %ra, float %s1, i32 1
; Second half-chain restarts from undef and fills only lanes 2 and 3.
133 %rc = insertelement <4 x float> undef, float %s2, i32 2
134 %rd = insertelement <4 x float> %rc, float %s3, i32 3
138 ; Make sure we don't hit the infinite loop that I ran into when trying
139 ; to do this backwards.
140 define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
141 ; CHECK-LABEL: @reconstruct(
; Pure extract/insert round trip: every lane of %c is extracted and then
; inserted at the same index, with no arithmetic in between. Only the
; function label is matched, so no particular output shape is required.
142 %c0 = extractelement <4 x i32> %c, i32 0
143 %c1 = extractelement <4 x i32> %c, i32 1
144 %c2 = extractelement <4 x i32> %c, i32 2
145 %c3 = extractelement <4 x i32> %c, i32 3
146 %ra = insertelement <4 x i32> undef, i32 %c0, i32 0
147 %rb = insertelement <4 x i32> %ra, i32 %c1, i32 1
148 %rc = insertelement <4 x i32> %rb, i32 %c2, i32 2
149 %rd = insertelement <4 x i32> %rc, i32 %c3, i32 3
; Two-lane variant of the scalarized compare/select pattern, operating on
; <2 x float> / <2 x i32>. The check lines use plain (non-NEXT) matches and
; expect a <2 x i32> icmp followed by a <2 x i1> select.
153 define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) #0 {
154 ; CHECK-LABEL: @simple_select_v2(
155 ; CHECK: icmp ne <2 x i32>
156 ; CHECK: select <2 x i1>
157 %c0 = extractelement <2 x i32> %c, i32 0
158 %c1 = extractelement <2 x i32> %c, i32 1
159 %a0 = extractelement <2 x float> %a, i32 0
160 %a1 = extractelement <2 x float> %a, i32 1
161 %b0 = extractelement <2 x float> %b, i32 0
162 %b1 = extractelement <2 x float> %b, i32 1
163 %cmp0 = icmp ne i32 %c0, 0
164 %cmp1 = icmp ne i32 %c1, 0
165 %s0 = select i1 %cmp0, float %a0, float %b0
166 %s1 = select i1 %cmp1, float %a1, float %b1
; Rebuild the 2-lane result in index order.
167 %ra = insertelement <2 x float> undef, float %s0, i32 0
168 %rb = insertelement <2 x float> %ra, float %s1, i32 1
172 ; Make sure when we construct partial vectors, we don't keep
173 ; re-visiting the insertelement chains starting with undef
174 ; (low cost threshold needed to force this to happen)
; The input is already half-vectorized: lanes 0 and 1 of %c, %a and %b are
; packed into <2 x ...> vectors, compared and selected as 2-wide operations,
; and the two results are then extracted and inserted into lanes 0 and 1 of a
; <4 x float> chain that starts from undef. No FileCheck directives for this
; function are visible in this excerpt. The low threshold presumably refers
; to -slp-threshold=-10000 on the RUN line - TODO confirm.
175 define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
176 %c0 = extractelement <4 x i32> %c, i32 0
177 %c1 = extractelement <4 x i32> %c, i32 1
178 %a0 = extractelement <4 x float> %a, i32 0
179 %a1 = extractelement <4 x float> %a, i32 1
180 %b0 = extractelement <4 x float> %b, i32 0
181 %b1 = extractelement <4 x float> %b, i32 1
; Pack the two condition lanes and compare them as a 2-wide vector.
182 %1 = insertelement <2 x i32> undef, i32 %c0, i32 0
183 %2 = insertelement <2 x i32> %1, i32 %c1, i32 1
184 %3 = icmp ne <2 x i32> %2, zeroinitializer
; Pack the matching %a and %b lanes and select between them 2-wide.
185 %4 = insertelement <2 x float> undef, float %a0, i32 0
186 %5 = insertelement <2 x float> %4, float %a1, i32 1
187 %6 = insertelement <2 x float> undef, float %b0, i32 0
188 %7 = insertelement <2 x float> %6, float %b1, i32 1
189 %8 = select <2 x i1> %3, <2 x float> %5, <2 x float> %7
; Unpack the 2-wide result into lanes 0 and 1 of a 4-wide vector; lanes 2 and
; 3 are never written by the instructions shown here.
190 %9 = extractelement <2 x float> %8, i32 0
191 %ra = insertelement <4 x float> undef, float %9, i32 0
192 %10 = extractelement <2 x float> %8, i32 1
193 %rb = insertelement <4 x float> %ra, float %10, i32 1
197 ; Make sure that vectorization happens even if insertelement operations
198 ; must be rescheduled. The case here is from compiling Julia.
199 define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
200 ; CHECK-LABEL: @reschedule_extract(
201 ; CHECK: %1 = fadd <4 x float> %a, %b
; Unlike the select tests, the work is interleaved lane by lane: each lane's
; two extracts, its fadd and its insertelement appear together before the next
; lane starts, so every insertelement sits between scalar operations of
; different lanes. The check line expects a single <4 x float> fadd of %a and %b.
; Lane 0.
202 %a0 = extractelement <4 x float> %a, i32 0
203 %b0 = extractelement <4 x float> %b, i32 0
204 %c0 = fadd float %a0, %b0
205 %v0 = insertelement <4 x float> undef, float %c0, i32 0
; Lane 1.
206 %a1 = extractelement <4 x float> %a, i32 1
207 %b1 = extractelement <4 x float> %b, i32 1
208 %c1 = fadd float %a1, %b1
209 %v1 = insertelement <4 x float> %v0, float %c1, i32 1
; Lane 2.
210 %a2 = extractelement <4 x float> %a, i32 2
211 %b2 = extractelement <4 x float> %b, i32 2
212 %c2 = fadd float %a2, %b2
213 %v2 = insertelement <4 x float> %v1, float %c2, i32 2
; Lane 3.
214 %a3 = extractelement <4 x float> %a, i32 3
215 %b3 = extractelement <4 x float> %b, i32 3
216 %c3 = fadd float %a3, %b3
217 %v3 = insertelement <4 x float> %v2, float %c3, i32 3
221 attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }