1 ; RUN: opt < %s -instcombine -S | FileCheck %s
2 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; test1: %f is placed in lane 0 of a <4 x float> whose lanes 1-3 are then
; filled with 0.0. The vector flows through the scalar SSE intrinsics
; sub.ss, mul.ss, min.ss and max.ss, is converted with cvttss2si, and the
; i32 result is truncated to i16.
; The FileCheck directives below forbid any remaining insertelement of a
; 0.00 constant and any remaining call to the sse.mul / sse.sub intrinsics.
4 define i16 @test1(float %f) {
8 ; CHECK-NOT: insertelement {{.*}} 0.00
9 ; CHECK-NOT: call {{.*}} @llvm.x86.sse.mul
10 ; CHECK-NOT: call {{.*}} @llvm.x86.sse.sub
; Build <%f, 0.0, 0.0, 0.0>.
12 %tmp = insertelement <4 x float> undef, float %f, i32 0 ; <<4 x float>> [#uses=1]
13 %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
14 %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
15 %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
; Chain of scalar intrinsics; every constant operand has its payload in lane 0.
16 %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
17 %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
18 %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
19 %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer ) ; <<4 x float>> [#uses=1]
20 %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; <i32> [#uses=1]
21 %tmp69 = trunc i32 %tmp.upgrd.1 to i16 ; <i16> [#uses=1]
; test2: %f * %f is inserted into lane 0 of a <4 x float>, lanes 1-3 are
; set to 0.0, the vector is bitcast to <4 x i32>, and lane 0 is extracted.
; Only lane 0 is ever read back; the FileCheck directives below require that
; neither insertelement nor extractelement remains in the output.
25 define i32 @test2(float %f) {
26 ; CHECK-LABEL: @test2(
27 ; CHECK-NOT: insertelement
28 ; CHECK-NOT: extractelement
30 %tmp5 = fmul float %f, %f
31 %tmp9 = insertelement <4 x float> undef, float %tmp5, i32 0
32 %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
33 %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
34 %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
35 %tmp19 = bitcast <4 x float> %tmp12 to <4 x i32>
36 %tmp21 = extractelement <4 x i32> %tmp19, i32 0
; test3: exercises eight scalar float-to-integer conversion intrinsics.
; Each call receives a freshly built vector with the scalar argument in
; lane 0 and 0.0 in every other lane:
;   %f -> cvtss2si, cvtss2si64, cvttss2si, cvttss2si64   (<4 x float>)
;   %d -> cvtsd2si, cvtsd2si64, cvttsd2si, cvttsd2si64   (<2 x double>)
; The four i32 results are summed and sign-extended, the four i64 results
; are summed, and both sums are added into %tmp15.
; The FileCheck directive below forbids any remaining insertelement of a
; 0.00 constant.
40 define i64 @test3(float %f, double %d) {
41 ; CHECK-LABEL: @test3(
42 ; CHECK-NOT: insertelement {{.*}} 0.00
; Single-precision conversions of %f.
45 %v00 = insertelement <4 x float> undef, float %f, i32 0
46 %v01 = insertelement <4 x float> %v00, float 0.000000e+00, i32 1
47 %v02 = insertelement <4 x float> %v01, float 0.000000e+00, i32 2
48 %v03 = insertelement <4 x float> %v02, float 0.000000e+00, i32 3
49 %tmp0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %v03)
50 %v10 = insertelement <4 x float> undef, float %f, i32 0
51 %v11 = insertelement <4 x float> %v10, float 0.000000e+00, i32 1
52 %v12 = insertelement <4 x float> %v11, float 0.000000e+00, i32 2
53 %v13 = insertelement <4 x float> %v12, float 0.000000e+00, i32 3
54 %tmp1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %v13)
55 %v20 = insertelement <4 x float> undef, float %f, i32 0
56 %v21 = insertelement <4 x float> %v20, float 0.000000e+00, i32 1
57 %v22 = insertelement <4 x float> %v21, float 0.000000e+00, i32 2
58 %v23 = insertelement <4 x float> %v22, float 0.000000e+00, i32 3
59 %tmp2 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %v23)
60 %v30 = insertelement <4 x float> undef, float %f, i32 0
61 %v31 = insertelement <4 x float> %v30, float 0.000000e+00, i32 1
62 %v32 = insertelement <4 x float> %v31, float 0.000000e+00, i32 2
63 %v33 = insertelement <4 x float> %v32, float 0.000000e+00, i32 3
64 %tmp3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %v33)
; Double-precision conversions of %d.
65 %v40 = insertelement <2 x double> undef, double %d, i32 0
66 %v41 = insertelement <2 x double> %v40, double 0.000000e+00, i32 1
67 %tmp4 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %v41)
68 %v50 = insertelement <2 x double> undef, double %d, i32 0
69 %v51 = insertelement <2 x double> %v50, double 0.000000e+00, i32 1
70 %tmp5 = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %v51)
71 %v60 = insertelement <2 x double> undef, double %d, i32 0
72 %v61 = insertelement <2 x double> %v60, double 0.000000e+00, i32 1
73 %tmp6 = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %v61)
74 %v70 = insertelement <2 x double> undef, double %d, i32 0
75 %v71 = insertelement <2 x double> %v70, double 0.000000e+00, i32 1
76 %tmp7 = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %v71)
; Combine all eight results so that every conversion stays live.
77 %tmp8 = add i32 %tmp0, %tmp2
78 %tmp9 = add i32 %tmp4, %tmp6
79 %tmp10 = add i32 %tmp8, %tmp9
80 %tmp11 = sext i32 %tmp10 to i64
81 %tmp12 = add i64 %tmp1, %tmp3
82 %tmp13 = add i64 %tmp5, %tmp7
83 %tmp14 = add i64 %tmp12, %tmp13
84 %tmp15 = add i64 %tmp11, %tmp14
; get_image: the truncated result of @fgetc(null) is inserted into lane 1 of
; an all-zero <100 x i8>, but lane 0 (a different lane) is what gets
; extracted and compared against 80 to choose between %bb2 and %bb3.
; The FileCheck directive below requires that no extractelement remains.
88 define void @get_image() nounwind {
89 ; CHECK-LABEL: @get_image(
90 ; CHECK-NOT: extractelement
93 %0 = call i32 @fgetc(i8* null) nounwind ; <i32> [#uses=1]
94 %1 = trunc i32 %0 to i8 ; <i8> [#uses=1]
; Insert at index 1, then read index 0: the extracted lane is untouched by the insert.
95 %tmp2 = insertelement <100 x i8> zeroinitializer, i8 %1, i32 1 ; <<100 x i8>> [#uses=1]
96 %tmp1 = extractelement <100 x i8> %tmp2, i32 0 ; <i8> [#uses=1]
97 %2 = icmp eq i8 %tmp1, 80 ; <i1> [#uses=1]
98 br i1 %2, label %bb2, label %bb3
100 bb2: ; preds = %entry
103 bb3: ; preds = %bb2, %entry
; vac: loads a <4 x float> from %a, overwrites each of its four lanes with
; 0.0 via successive insertelement instructions, and stores the result back
; to %a. Every lane of the loaded value is replaced before the store.
108 define void @vac(<4 x float>* nocapture %a) nounwind {
113 %tmp1 = load <4 x float>, <4 x float>* %a ; <<4 x float>> [#uses=1]
114 %vecins = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 0 ; <<4 x float>> [#uses=1]
115 %vecins4 = insertelement <4 x float> %vecins, float 0.000000e+00, i32 1; <<4 x float>> [#uses=1]
116 %vecins6 = insertelement <4 x float> %vecins4, float 0.000000e+00, i32 2; <<4 x float>> [#uses=1]
117 %vecins8 = insertelement <4 x float> %vecins6, float 0.000000e+00, i32 3; <<4 x float>> [#uses=1]
118 store <4 x float> %vecins8, <4 x float>* %a
; External declarations referenced by @test1, @test3 and @get_image:
; @fgetc, the scalar SSE arithmetic intrinsics (sub/mul/min/max .ss), and
; the SSE/SSE2 scalar float-to-integer conversion intrinsics in both i32 and
; i64 result widths.
122 declare i32 @fgetc(i8*)
124 declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)
126 declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>)
128 declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)
130 declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)
132 declare i32 @llvm.x86.sse.cvtss2si(<4 x float>)
133 declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>)
134 declare i32 @llvm.x86.sse.cvttss2si(<4 x float>)
135 declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>)
136 declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>)
137 declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>)
138 declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>)
139 declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>)
141 ; <rdar://problem/6945110>
; kernel3_vertical: %tmp (4 x i16 loaded from %src) is widened to 8 lanes
; with undef upper lanes (%tmp2). A second shuffle then builds %tmp3 from
; lanes 0-3 of %tmp2 (mask indices 8-11) and lanes 4-7 of %tmp1 (loaded from
; %foo), and %tmp3 is passed to the pmovzxwd intrinsic.
; The FileCheck directives expect the %tmp2 shuffle to survive, no further
; shufflevector after it, and the pmovzxwd call on the very next line.
142 define <4 x i32> @kernel3_vertical(<4 x i16> * %src, <8 x i16> * %foo) nounwind {
144 %tmp = load <4 x i16>, <4 x i16>* %src
145 %tmp1 = load <8 x i16>, <8 x i16>* %foo
146 ; CHECK: %tmp2 = shufflevector
147 %tmp2 = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
148 ; pmovzxwd ignores the upper 64-bits of its input; -instcombine should remove this shuffle:
149 ; CHECK-NOT: shufflevector
150 %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
151 ; CHECK-NEXT: pmovzxwd
152 %0 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp3)
; Declaration of the SSE4.1 pmovzxwd intrinsic called by @kernel3_vertical;
; it takes an <8 x i16> and yields a <4 x i32>.
155 declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
; dead_shuffle_elt: %shuffle.i repeats the two lanes of %y twice
; (<0, 1, 0, 1>), but %shuffle9.i reads only its lanes 0-1 (mask indices 4
; and 5) and takes lanes 2-3 from %x. The upper two lanes of %shuffle.i are
; therefore never used.
; The FileCheck directive expects the first shuffle to be rewritten with an
; undef second operand and undef in the two unused mask positions.
157 define <4 x float> @dead_shuffle_elt(<4 x float> %x, <2 x float> %y) nounwind {
159 ; CHECK-LABEL: define <4 x float> @dead_shuffle_elt(
160 ; CHECK: shufflevector <2 x float> %y, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
161 %shuffle.i = shufflevector <2 x float> %y, <2 x float> %y, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
162 %shuffle9.i = shufflevector <4 x float> %x, <4 x float> %shuffle.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
163 ret <4 x float> %shuffle9.i
; test_fptrunc: builds a <4 x double> with %f in lane 0 and 0.0 in lanes
; 1-3, truncates it to <4 x float>, and keeps only lanes 0-1 via a shuffle.
; The FileCheck directives match two insertelement instructions and then
; forbid any further one, i.e. the inserts feeding the discarded lanes 2-3
; are expected to disappear.
166 define <2 x float> @test_fptrunc(double %f) {
167 ; CHECK-LABEL: @test_fptrunc(
168 ; CHECK: insertelement
169 ; CHECK: insertelement
170 ; CHECK-NOT: insertelement
171 %tmp9 = insertelement <4 x double> undef, double %f, i32 0
172 %tmp10 = insertelement <4 x double> %tmp9, double 0.000000e+00, i32 1
173 %tmp11 = insertelement <4 x double> %tmp10, double 0.000000e+00, i32 2
174 %tmp12 = insertelement <4 x double> %tmp11, double 0.000000e+00, i32 3
175 %tmp5 = fptrunc <4 x double> %tmp12 to <4 x float>
176 %ret = shufflevector <4 x float> %tmp5, <4 x float> undef, <2 x i32> <i32 0, i32 1>
; test_fpext: builds a <4 x float> with %f in lane 0 and 0.0 in lanes 1-3,
; extends it to <4 x double>, and returns only lanes 0-1 via a shuffle.
; The FileCheck directives match two insertelement instructions and then
; forbid any further one, i.e. the inserts feeding the discarded lanes 2-3
; are expected to disappear.
180 define <2 x double> @test_fpext(float %f) {
181 ; CHECK-LABEL: @test_fpext(
182 ; CHECK: insertelement
183 ; CHECK: insertelement
184 ; CHECK-NOT: insertelement
185 %tmp9 = insertelement <4 x float> undef, float %f, i32 0
186 %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
187 %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
188 %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
189 %tmp5 = fpext <4 x float> %tmp12 to <4 x double>
190 %ret = shufflevector <4 x double> %tmp5, <4 x double> undef, <2 x i32> <i32 0, i32 1>
191 ret <2 x double> %ret
; test_select: %a3 = <%f, 1.0, 2.0, 3.0> and %b3 = <%g, 4.0, 5.0, 6.0> are
; combined by a select whose constant mask <true, false, false, true> takes
; lanes 0 and 3 from %a3 and lanes 1 and 2 from %b3.
; The FileCheck directives expect that only the %a0 and %a3 inserts remain
; on the true side (with %a3 inserting directly into %a0), and that the
; false operand becomes the constant <undef, 4.0, 5.0, undef>.
194 define <4 x float> @test_select(float %f, float %g) {
195 ; CHECK-LABEL: @test_select(
196 ; CHECK: %a0 = insertelement <4 x float> undef, float %f, i32 0
197 ; CHECK-NOT: insertelement
198 ; CHECK: %a3 = insertelement <4 x float> %a0, float 3.000000e+00, i32 3
199 ; CHECK-NOT: insertelement
200 ; CHECK: %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>
201 %a0 = insertelement <4 x float> undef, float %f, i32 0
202 %a1 = insertelement <4 x float> %a0, float 1.000000e+00, i32 1
203 %a2 = insertelement <4 x float> %a1, float 2.000000e+00, i32 2
204 %a3 = insertelement <4 x float> %a2, float 3.000000e+00, i32 3
205 %b0 = insertelement <4 x float> undef, float %g, i32 0
206 %b1 = insertelement <4 x float> %b0, float 4.000000e+00, i32 1
207 %b2 = insertelement <4 x float> %b1, float 5.000000e+00, i32 2
208 %b3 = insertelement <4 x float> %b2, float 6.000000e+00, i32 3
209 %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> %b3
; test_vpermilvar_ps: calls the 128-bit vpermilvar.ps intrinsic with the
; constant control vector <3, 2, 1, 0>. The FileCheck directive expects the
; call to be replaced by a shufflevector of %v with that same mask.
213 declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
214 define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
215 ; CHECK-LABEL: @test_vpermilvar_ps(
216 ; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
217 %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
; test_vpermilvar_ps_256: calls the 256-bit vpermilvar.ps intrinsic with the
; constant control vector <7, 6, 5, 4, 3, 2, 1, 0>. The FileCheck directive
; expects a shufflevector with mask <3, 2, 1, 0, 7, 6, 5, 4>.
; NOTE(review): the expected mask is consistent with each 128-bit half being
; permuted independently using only the low two bits of every control
; element - confirm against the instruction reference.
221 declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
222 define <8 x float> @test_vpermilvar_ps_256(<8 x float> %v) {
223 ; CHECK-LABEL: @test_vpermilvar_ps_256(
224 ; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
225 %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
; test_vpermilvar_pd: calls the 128-bit vpermilvar.pd intrinsic with the
; constant control vector <2, 0>. The FileCheck directive expects a
; shufflevector with mask <1, 0>.
; NOTE(review): the expected mask is consistent with bit 1 of each i64
; control element being the selector - confirm against the instruction
; reference.
229 declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>)
230 define <2 x double> @test_vpermilvar_pd(<2 x double> %v) {
231 ; CHECK-LABEL: @test_vpermilvar_pd(
232 ; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 0>
233 %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> <i64 2, i64 0>)
; test_vpermilvar_pd_256: calls the 256-bit vpermilvar.pd intrinsic with the
; constant control vector <3, 1, 2, 0>. The FileCheck directive expects a
; shufflevector with mask <1, 0, 3, 2>.
; NOTE(review): the expected mask is consistent with bit 1 of each control
; element selecting within its own 128-bit half - confirm against the
; instruction reference.
237 declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>)
238 define <4 x double> @test_vpermilvar_pd_256(<4 x double> %v) {
239 ; CHECK-LABEL: @test_vpermilvar_pd_256(
240 ; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
241 %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> <i64 3, i64 1, i64 2, i64 0>)
; test_vpermilvar_ps_zero: 128-bit vpermilvar.ps with an all-zero control
; vector. The FileCheck directive expects a shufflevector of %v with a
; zeroinitializer mask (every result lane reads lane 0).
245 define <4 x float> @test_vpermilvar_ps_zero(<4 x float> %v) {
246 ; CHECK-LABEL: @test_vpermilvar_ps_zero(
247 ; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
248 %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> zeroinitializer)
; test_vpermilvar_ps_256_zero: 256-bit vpermilvar.ps with an all-zero
; control vector. The FileCheck directive expects the shuffle mask
; <0, 0, 0, 0, 4, 4, 4, 4>, i.e. each group of four result lanes reads the
; first lane of its own group rather than lane 0 of the whole vector.
252 define <8 x float> @test_vpermilvar_ps_256_zero(<8 x float> %v) {
253 ; CHECK-LABEL: @test_vpermilvar_ps_256_zero(
254 ; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
255 %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> zeroinitializer)
; test_vpermilvar_pd_zero: 128-bit vpermilvar.pd with an all-zero control
; vector. The FileCheck directive expects a shufflevector of %v with a
; zeroinitializer mask (both result lanes read lane 0).
259 define <2 x double> @test_vpermilvar_pd_zero(<2 x double> %v) {
260 ; CHECK-LABEL: @test_vpermilvar_pd_zero(
261 ; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
262 %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> zeroinitializer)
; test_vpermilvar_pd_256_zero: 256-bit vpermilvar.pd with an all-zero
; control vector. The FileCheck directive expects the shuffle mask
; <0, 0, 2, 2>, i.e. each pair of result lanes reads the first lane of its
; own pair rather than lane 0 of the whole vector.
266 define <4 x double> @test_vpermilvar_pd_256_zero(<4 x double> %v) {
267 ; CHECK-LABEL: @test_vpermilvar_pd_256_zero(
268 ; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
269 %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> zeroinitializer)
; test_sse2_1: constant-folding test for the SSE2 left-shift intrinsics with
; a shift count of 1. %S is the scalar count and %3 = <i64 1, i64 0> is the
; vector count operand. The constant <1, 2, ..., 8> (i16 lanes) goes through
; psll.w, psll.d, psll.q (vector count) and then pslli.w, pslli.d, pslli.q
; (scalar count), with bitcasts between element widths.
; The FileCheck directive expects the whole chain to fold to one constant.
273 define <2 x i64> @test_sse2_1() nounwind readnone uwtable {
274 %S = bitcast i32 1 to i32
275 %1 = zext i32 %S to i64
276 %2 = insertelement <2 x i64> undef, i64 %1, i32 0
277 %3 = insertelement <2 x i64> %2, i64 0, i32 1
278 %4 = bitcast <2 x i64> %3 to <8 x i16>
279 %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
280 %6 = bitcast <8 x i16> %5 to <4 x i32>
281 %7 = bitcast <2 x i64> %3 to <4 x i32>
282 %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
283 %9 = bitcast <4 x i32> %8 to <2 x i64>
284 %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
285 %11 = bitcast <2 x i64> %10 to <8 x i16>
286 %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
287 %13 = bitcast <8 x i16> %12 to <4 x i32>
288 %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
289 %15 = bitcast <4 x i32> %14 to <2 x i64>
290 %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
293 ; CHECK: ret <2 x i64> <i64 72058418680037440, i64 144117112246370624>
; test_avx2_1: AVX2 (256-bit) counterpart of the left-shift folding test,
; with a shift count of 1. The input holds 1, 2, 3, 4 in every fourth i16
; lane (zeros elsewhere), so each 64-bit element starts as 1, 2, 3, 4.
; Six shifts by one bit (psll.w/d/q then pslli.w/d/q) multiply each element
; by 64, matching the expected constant <64, 128, 192, 256>.
296 define <4 x i64> @test_avx2_1() nounwind readnone uwtable {
297 %S = bitcast i32 1 to i32
298 %1 = zext i32 %S to i64
299 %2 = insertelement <2 x i64> undef, i64 %1, i32 0
300 %3 = insertelement <2 x i64> %2, i64 0, i32 1
301 %4 = bitcast <2 x i64> %3 to <8 x i16>
302 %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
303 %6 = bitcast <16 x i16> %5 to <8 x i32>
304 %7 = bitcast <2 x i64> %3 to <4 x i32>
305 %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
306 %9 = bitcast <8 x i32> %8 to <4 x i64>
307 %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
308 %11 = bitcast <4 x i64> %10 to <16 x i16>
309 %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
310 %13 = bitcast <16 x i16> %12 to <8 x i32>
311 %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
312 %15 = bitcast <8 x i32> %14 to <4 x i64>
313 %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
316 ; CHECK: ret <4 x i64> <i64 64, i64 128, i64 192, i64 256>
; test_sse2_0: same SSE2 left-shift chain as the count-1 variant, but with a
; shift count of 128 (%S, and lane 0 of the vector count %3).
; The FileCheck directive expects the result to fold to zeroinitializer.
; NOTE(review): 128 is larger than every element width used here (16/32/64
; bits), which is presumably why an all-zero result is expected - confirm
; against the intrinsic semantics.
319 define <2 x i64> @test_sse2_0() nounwind readnone uwtable {
320 %S = bitcast i32 128 to i32
321 %1 = zext i32 %S to i64
322 %2 = insertelement <2 x i64> undef, i64 %1, i32 0
323 %3 = insertelement <2 x i64> %2, i64 0, i32 1
324 %4 = bitcast <2 x i64> %3 to <8 x i16>
325 %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
326 %6 = bitcast <8 x i16> %5 to <4 x i32>
327 %7 = bitcast <2 x i64> %3 to <4 x i32>
328 %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
329 %9 = bitcast <4 x i32> %8 to <2 x i64>
330 %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
331 %11 = bitcast <2 x i64> %10 to <8 x i16>
332 %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
333 %13 = bitcast <8 x i16> %12 to <4 x i32>
334 %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
335 %15 = bitcast <4 x i32> %14 to <2 x i64>
336 %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
339 ; CHECK: ret <2 x i64> zeroinitializer
; test_avx2_0: AVX2 (256-bit) left-shift chain with a shift count of 128
; (%S, and lane 0 of the vector count %3).
; The FileCheck directive expects the result to fold to zeroinitializer.
; NOTE(review): 128 is larger than every element width used here (16/32/64
; bits), which is presumably why an all-zero result is expected - confirm
; against the intrinsic semantics.
342 define <4 x i64> @test_avx2_0() nounwind readnone uwtable {
343 %S = bitcast i32 128 to i32
344 %1 = zext i32 %S to i64
345 %2 = insertelement <2 x i64> undef, i64 %1, i32 0
346 %3 = insertelement <2 x i64> %2, i64 0, i32 1
347 %4 = bitcast <2 x i64> %3 to <8 x i16>
348 %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
349 %6 = bitcast <16 x i16> %5 to <8 x i32>
350 %7 = bitcast <2 x i64> %3 to <4 x i32>
351 %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
352 %9 = bitcast <8 x i32> %8 to <4 x i64>
353 %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
354 %11 = bitcast <4 x i64> %10 to <16 x i16>
355 %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
356 %13 = bitcast <16 x i16> %12 to <8 x i32>
357 %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
358 %15 = bitcast <8 x i32> %14 to <4 x i64>
359 %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
362 ; CHECK: ret <4 x i64> zeroinitializer
; test_sse2_psrl_1: constant-folding test for the SSE2 logical right-shift
; intrinsics with a shift count of 1. The constant <16, 32, ..., 2048> (i16
; lanes) goes through psrl.w, psrl.d, psrl.q (vector count %3) and then
; psrli.w, psrli.d, psrli.q (scalar count %S), with bitcasts between
; element widths.
; The FileCheck directives match the function name and then expect the
; chain to fold to a single constant return value.
364 define <2 x i64> @test_sse2_psrl_1() nounwind readnone uwtable {
365 %S = bitcast i32 1 to i32
366 %1 = zext i32 %S to i64
367 %2 = insertelement <2 x i64> undef, i64 %1, i32 0
368 %3 = insertelement <2 x i64> %2, i64 0, i32 1
369 %4 = bitcast <2 x i64> %3 to <8 x i16>
370 %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048>, <8 x i16> %4)
371 %6 = bitcast <8 x i16> %5 to <4 x i32>
372 %7 = bitcast <2 x i64> %3 to <4 x i32>
373 %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
374 %9 = bitcast <4 x i32> %8 to <2 x i64>
375 %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
376 %11 = bitcast <2 x i64> %10 to <8 x i16>
377 %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
378 %13 = bitcast <8 x i16> %12 to <4 x i32>
379 %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
380 %15 = bitcast <4 x i32> %14 to <2 x i64>
381 %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
383 ; CHECK: test_sse2_psrl_1
384 ; CHECK: ret <2 x i64> <i64 562954248421376, i64 9007267974742020>
; test_avx2_psrl_1: AVX2 (256-bit) counterpart of the right-shift folding
; test, with a shift count of 1. The input holds 1024, 2048, 4096, 8192 in
; every fourth i16 lane (zeros elsewhere), so each 64-bit element starts as
; one of those values. Six logical right shifts by one bit (psrl.w/d/q then
; psrli.w/d/q) divide each element by 64, matching the expected constant
; <16, 32, 64, 128>.
387 define <4 x i64> @test_avx2_psrl_1() nounwind readnone uwtable {
388 %S = bitcast i32 1 to i32
389 %1 = zext i32 %S to i64
390 %2 = insertelement <2 x i64> undef, i64 %1, i32 0
391 %3 = insertelement <2 x i64> %2, i64 0, i32 1
392 %4 = bitcast <2 x i64> %3 to <8 x i16>
393 %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
394 %6 = bitcast <16 x i16> %5 to <8 x i32>
395 %7 = bitcast <2 x i64> %3 to <4 x i32>
396 %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
397 %9 = bitcast <8 x i32> %8 to <4 x i64>
398 %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
399 %11 = bitcast <4 x i64> %10 to <16 x i16>
400 %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
401 %13 = bitcast <16 x i16> %12 to <8 x i32>
402 %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
403 %15 = bitcast <8 x i32> %14 to <4 x i64>
404 %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
406 ; CHECK: test_avx2_psrl_1
407 ; CHECK: ret <4 x i64> <i64 16, i64 32, i64 64, i64 128>
; test_sse2_psrl_0: SSE2 logical right-shift chain with a shift count of 128
; (%S, and lane 0 of the vector count %3) applied to the constant
; <32, 64, ..., 4096>.
; The FileCheck directives match the function name and then expect the
; result to fold to zeroinitializer.
; NOTE(review): 128 is larger than every element width used here (16/32/64
; bits), which is presumably why an all-zero result is expected - confirm
; against the intrinsic semantics.
410 define <2 x i64> @test_sse2_psrl_0() nounwind readnone uwtable {
411 %S = bitcast i32 128 to i32
412 %1 = zext i32 %S to i64
413 %2 = insertelement <2 x i64> undef, i64 %1, i32 0
414 %3 = insertelement <2 x i64> %2, i64 0, i32 1
415 %4 = bitcast <2 x i64> %3 to <8 x i16>
416 %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096>, <8 x i16> %4)
417 %6 = bitcast <8 x i16> %5 to <4 x i32>
418 %7 = bitcast <2 x i64> %3 to <4 x i32>
419 %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
420 %9 = bitcast <4 x i32> %8 to <2 x i64>
421 %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
422 %11 = bitcast <2 x i64> %10 to <8 x i16>
423 %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
424 %13 = bitcast <8 x i16> %12 to <4 x i32>
425 %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
426 %15 = bitcast <4 x i32> %14 to <2 x i64>
427 %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
429 ; CHECK: test_sse2_psrl_0
430 ; CHECK: ret <2 x i64> zeroinitializer
; test_avx2_psrl_0: AVX2 (256-bit) logical right-shift chain with a shift
; count of 128 (%S, and lane 0 of the vector count %3).
; The FileCheck directives match the function name and then expect the
; result to fold to zeroinitializer.
; NOTE(review): 128 is larger than every element width used here (16/32/64
; bits), which is presumably why an all-zero result is expected - confirm
; against the intrinsic semantics.
433 define <4 x i64> @test_avx2_psrl_0() nounwind readnone uwtable {
434 %S = bitcast i32 128 to i32
435 %1 = zext i32 %S to i64
436 %2 = insertelement <2 x i64> undef, i64 %1, i32 0
437 %3 = insertelement <2 x i64> %2, i64 0, i32 1
438 %4 = bitcast <2 x i64> %3 to <8 x i16>
439 %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
440 %6 = bitcast <16 x i16> %5 to <8 x i32>
441 %7 = bitcast <2 x i64> %3 to <4 x i32>
442 %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
443 %9 = bitcast <8 x i32> %8 to <4 x i64>
444 %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
445 %11 = bitcast <4 x i64> %10 to <16 x i16>
446 %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
447 %13 = bitcast <16 x i16> %12 to <8 x i32>
448 %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
449 %15 = bitcast <8 x i32> %14 to <4 x i64>
450 %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
452 ; CHECK: test_avx2_psrl_0
453 ; CHECK: ret <4 x i64> zeroinitializer
; Declarations of the SSE2 and AVX2 shift intrinsics used by the
; constant-folding shift tests. The "i"-suffixed forms (pslli/psrli) take
; the shift count as a scalar i32; the others (psll/psrl) take it as a
; 128-bit vector operand. All of them carry attribute group #1, defined on
; the last line of this block as { nounwind readnone }.
456 declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1
457 declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1
458 declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1
459 declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) #1
460 declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) #1
461 declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) #1
462 declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) #1
463 declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) #1
464 declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) #1
465 declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1
466 declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1
467 declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1
468 declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) #1
469 declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) #1
470 declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) #1
471 declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) #1
472 declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) #1
473 declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) #1
474 declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) #1
475 declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) #1
476 declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) #1
477 declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1
478 declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1
479 declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1
481 attributes #1 = { nounwind readnone }