1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; Tests for SSE2 and below, without SSE3+.
3 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s
; test1: overwrite lane 0 of the <2 x double> loaded from %A with the scalar %B
; and store the result to %r. The shuffle mask <2, 1> takes lane 0 from %tmp7
; (which holds %B) and lane 1 from %tmp3. The expected output folds the insert
; into a single movlpd from the stack argument.
5 define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
8 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
9 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
10 ; CHECK-NEXT: movapd (%ecx), %xmm0
11 ; CHECK-NEXT: movlpd {{[0-9]+}}(%esp), %xmm0
12 ; CHECK-NEXT: movapd %xmm0, (%eax)
14 %tmp3 = load <2 x double>, <2 x double>* %A, align 16
15 %tmp7 = insertelement <2 x double> undef, double %B, i32 0
16 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
17 store <2 x double> %tmp9, <2 x double>* %r, align 16
; test2: counterpart of test1 for the high lane. The shuffle mask <0, 2> keeps
; lane 0 of the vector loaded from %A and places %B (lane 0 of %tmp7) in lane 1.
; The expected output uses a single movhpd from the stack argument.
21 define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
24 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
25 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
26 ; CHECK-NEXT: movapd (%ecx), %xmm0
27 ; CHECK-NEXT: movhpd {{[0-9]+}}(%esp), %xmm0
28 ; CHECK-NEXT: movapd %xmm0, (%eax)
30 %tmp3 = load <2 x double>, <2 x double>* %A, align 16
31 %tmp7 = insertelement <2 x double> undef, double %B, i32 0
32 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
33 store <2 x double> %tmp9, <2 x double>* %r, align 16
; test3: build <A[0], B[0], A[1], B[1]> element by element with
; extractelement/insertelement and store it to %res. The expected output
; recognises the interleave and emits one unpcklps with a memory operand.
38 define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
41 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
42 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
43 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
44 ; CHECK-NEXT: movaps (%edx), %xmm0
45 ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
46 ; CHECK-NEXT: movaps %xmm0, (%eax)
48 %tmp = load <4 x float>, <4 x float>* %B ; <<4 x float>> [#uses=2]
49 %tmp3 = load <4 x float>, <4 x float>* %A ; <<4 x float>> [#uses=2]
50 %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0 ; <float> [#uses=1]
51 %tmp7 = extractelement <4 x float> %tmp, i32 0 ; <float> [#uses=1]
52 %tmp8 = extractelement <4 x float> %tmp3, i32 1 ; <float> [#uses=1]
53 %tmp9 = extractelement <4 x float> %tmp, i32 1 ; <float> [#uses=1]
54 %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0 ; <<4 x float>> [#uses=1]
55 %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1 ; <<4 x float>> [#uses=1]
56 %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2 ; <<4 x float>> [#uses=1]
57 %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3 ; <<4 x float>> [#uses=1]
58 store <4 x float> %tmp13, <4 x float>* %res
; test4: shuffle %X against an undef second operand with mask <2, 6, 3, 7>.
; Indices 6 and 7 select from the undef operand, so only result lanes 0 and 2
; (X[2] and X[3]) are constrained. The expected output is one shufps of xmm0
; with itself, followed by the store to %res.
62 define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
65 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
66 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
67 ; CHECK-NEXT: movaps %xmm0, (%eax)
69 %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1]
70 store <4 x float> %tmp5, <4 x float>* %res
; test5: load a float through the pointer stored at %ptr, place it in lane 0 of
; a vector whose other lanes are 0.0, then widen it in two steps:
;   1. as <16 x i8>, interleave its low 8 bytes with zero bytes;
;   2. as <8 x i16>, interleave the low 4 words of a zero vector with the low
;      4 words of the step-1 result (zero vector is the FIRST operand here).
; The result is bitcast to <4 x i32>. The expected output is a movss load,
; a pxor zero, then punpcklbw followed by punpcklwd.
74 define <4 x i32> @test5(i8** %ptr) nounwind {
77 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
78 ; CHECK-NEXT: movl (%eax), %eax
79 ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
80 ; CHECK-NEXT: pxor %xmm0, %xmm0
81 ; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
82 ; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
84 %tmp = load i8*, i8** %ptr ; <i8*> [#uses=1]
85 %tmp.upgrd.1 = bitcast i8* %tmp to float* ; <float*> [#uses=1]
86 %tmp.upgrd.2 = load float, float* %tmp.upgrd.1 ; <float> [#uses=1]
87 %tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0 ; <<4 x float>> [#uses=1]
88 %tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
89 %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
90 %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
91 %tmp21 = bitcast <4 x float> %tmp11 to <16 x i8> ; <<16 x i8>> [#uses=1]
92 %tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 > ; <<16 x i8>> [#uses=1]
93 %tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16> ; <<8 x i16>> [#uses=1]
94 %tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 > ; <<8 x i16>> [#uses=1]
95 %tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32> ; <<4 x i32>> [#uses=1]
; test6: shuffle the vector loaded from %A against undef with mask <0, 5, 6, 7>.
; Only lane 0 is taken from the loaded value; indices 5-7 select from the undef
; operand. The expected output contains no shuffle instruction at all, just a
; movaps load followed by a movaps store.
99 define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
100 ; CHECK-LABEL: test6:
102 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
103 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
104 ; CHECK-NEXT: movaps (%ecx), %xmm0
105 ; CHECK-NEXT: movaps %xmm0, (%eax)
107 %tmp1 = load <4 x float>, <4 x float>* %A ; <<4 x float>> [#uses=1]
108 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
109 store <4 x float> %tmp2, <4 x float>* %res
; test7: splat lane 0 of an all-zero vector (mask zeroinitializer) and store the
; result through a null pointer. The expected output materialises the zero
; vector with xorps and stores it to absolute address 0.
; The unnamed values %1 and %2 are the results of the bitcast and the shuffle.
113 define void @test7() nounwind {
114 ; CHECK-LABEL: test7:
116 ; CHECK-NEXT: xorps %xmm0, %xmm0
117 ; CHECK-NEXT: movaps %xmm0, 0
119 bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>>:1 [#uses=1]
120 shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1]
121 store <4 x float> %2, <4 x float>* null
; External array of four i32 values; test8 loads each element individually.
125 @x = external global [4 x i32]
; test8: load elements 0..3 of @x with four separate scalar loads, assemble
; them into a <4 x i32> in order, and bitcast to <2 x i64>. The expected output
; merges the four loads into one movups through the L_x$non_lazy_ptr
; indirection.
127 define <2 x i64> @test8() nounwind {
128 ; CHECK-LABEL: test8:
130 ; CHECK-NEXT: movl L_x$non_lazy_ptr, %eax
131 ; CHECK-NEXT: movups (%eax), %xmm0
133 %tmp = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 0) ; <i32> [#uses=1]
134 %tmp3 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 1) ; <i32> [#uses=1]
135 %tmp5 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 2) ; <i32> [#uses=1]
136 %tmp7 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 3) ; <i32> [#uses=1]
137 %tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0 ; <<4 x i32>> [#uses=1]
138 %tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1 ; <<4 x i32>> [#uses=1]
139 %tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2 ; <<4 x i32>> [#uses=1]
140 %tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3 ; <<4 x i32>> [#uses=1]
141 %tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1]
; test9: assemble a <4 x float> from the four float arguments %a..%d, which
; follow a leading i32 argument. The expected output reads all four with a
; single movups from the stack.
; NOTE(review): presumably %dummy pushes the floats off 16-byte alignment, which
; would explain movups here versus movaps in test10 - confirm against the ABI.
145 define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
146 ; CHECK-LABEL: test9:
148 ; CHECK-NEXT: movups {{[0-9]+}}(%esp), %xmm0
150 %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
151 %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
152 %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
153 %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
154 ret <4 x float> %tmp13
; test10: same vector build as test9 but without the leading i32 argument.
; The expected output reads the four float arguments with a single movaps
; from the stack.
157 define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
158 ; CHECK-LABEL: test10:
160 ; CHECK-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
162 %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
163 %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
164 %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1]
165 %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1]
166 ret <4 x float> %tmp13
; test11: assemble a <2 x double> from the two double arguments %a and %b.
; The expected output reads both with a single movaps from the stack.
169 define <2 x double> @test11(double %a, double %b) nounwind {
170 ; CHECK-LABEL: test11:
172 ; CHECK-NEXT: movaps {{[0-9]+}}(%esp), %xmm0
174 %tmp = insertelement <2 x double> undef, double %a, i32 0 ; <<2 x double>> [#uses=1]
175 %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1 ; <<2 x double>> [#uses=1]
176 ret <2 x double> %tmp7
; test12: load a <4 x float> from address 0 and form two shuffles of it:
;   %tmp2 = low two lanes of the load, then lanes 2-3 of the all-1.0 constant
;           (mask <0, 1, 6, 7>);
;   %tmp3 = high two lanes of the load, then lanes 2-3 of the zero vector
;           (mask <2, 3, 6, 7>).
; Their sum is stored back to address 0. The expected output treats each
; float pair as one 64-bit element: movsd for %tmp2 and unpckhpd against a
; zeroed register for %tmp3.
179 define void @test12() nounwind {
180 ; CHECK-LABEL: test12:
182 ; CHECK-NEXT: movapd 0, %xmm0
183 ; CHECK-NEXT: movapd {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
184 ; CHECK-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
185 ; CHECK-NEXT: xorpd %xmm2, %xmm2
186 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
187 ; CHECK-NEXT: addps %xmm1, %xmm0
188 ; CHECK-NEXT: movaps %xmm0, 0
190 %tmp1 = load <4 x float>, <4 x float>* null ; <<4 x float>> [#uses=2]
191 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
192 %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
193 %tmp4 = fadd <4 x float> %tmp2, %tmp3 ; <<4 x float>> [#uses=1]
194 store <4 x float> %tmp4, <4 x float>* null
; test13: shuffle the vectors loaded from %B and %C with mask <1, 4, 1, 5>,
; giving <B[1], C[0], B[1], C[1]>, and store it to %res. The expected output
; needs two shufps instructions: one to gather the four elements and one to
; reorder them. The %A argument is not referenced by the visible body.
198 define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
199 ; CHECK-LABEL: test13:
201 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
202 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
203 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
204 ; CHECK-NEXT: movaps (%edx), %xmm0
205 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
206 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
207 ; CHECK-NEXT: movaps %xmm0, (%eax)
209 %tmp3 = load <4 x float>, <4 x float>* %B ; <<4 x float>> [#uses=1]
210 %tmp5 = load <4 x float>, <4 x float>* %C ; <<4 x float>> [#uses=1]
211 %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1]
212 store <4 x float> %tmp11, <4 x float>* %res
; test14: compute x+y and x-y on the vectors loaded from %x and %y, then return
; the low two lanes of the sum followed by the low two lanes of the difference
; (mask <0, 1, 4, 5>). The expected output combines the two halves with one
; unpcklpd.
216 define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
217 ; CHECK-LABEL: test14:
219 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
220 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
221 ; CHECK-NEXT: movaps (%ecx), %xmm1
222 ; CHECK-NEXT: movaps (%eax), %xmm2
223 ; CHECK-NEXT: movaps %xmm2, %xmm0
224 ; CHECK-NEXT: addps %xmm1, %xmm0
225 ; CHECK-NEXT: subps %xmm1, %xmm2
226 ; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
228 %tmp = load <4 x float>, <4 x float>* %y ; <<4 x float>> [#uses=2]
229 %tmp5 = load <4 x float>, <4 x float>* %x ; <<4 x float>> [#uses=2]
230 %tmp9 = fadd <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
231 %tmp21 = fsub <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1]
232 %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 > ; <<4 x float>> [#uses=1]
233 ret <4 x float> %tmp27
; test15: return the high two lanes of the vector loaded from %x followed by
; the high two lanes of the vector loaded from %y (mask <2, 3, 6, 7>).
; The expected output is one unpckhpd with the second vector as a memory
; operand.
236 define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
237 ; CHECK-LABEL: test15:
238 ; CHECK: ## BB#0: ## %entry
239 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
240 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
241 ; CHECK-NEXT: movapd (%ecx), %xmm0
242 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
245 %tmp = load <4 x float>, <4 x float>* %y ; <<4 x float>> [#uses=1]
246 %tmp3 = load <4 x float>, <4 x float>* %x ; <<4 x float>> [#uses=1]
247 %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
248 ret <4 x float> %tmp4
; test16: load the <4 x double> at index 3 from %srcA and extract its elements
; 0 and 2 into a <2 x double> (mask <0, 2>). Index 3 of a 32-byte element type
; is byte offset 96, which matches the 96(%eax) operand in the expected output.
; The %dst argument is not referenced by the visible body.
253 define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
254 ; CHECK-LABEL: test16:
256 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
257 ; CHECK-NEXT: movapd 96(%eax), %xmm0
258 ; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
260 %i5 = getelementptr inbounds <4 x double>, <4 x double>* %srcA, i32 3
261 %i6 = load <4 x double>, <4 x double>* %i5, align 32
262 %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
; test17: shuffle the constant <undef, undef, 32768, 32768> with a vector built
; only from undef values. Mask <4, 5, 2, 3> takes lanes 0-1 from the undef
; vector and lanes 2-3 from the constant. The result is stored through an
; undef pointer. The expected output loads the constant <u,u,32768,32768>
; directly, with no shuffle instruction, and stores it.
267 define fastcc void @test17() nounwind {
268 ; CHECK-LABEL: test17:
269 ; CHECK: ## BB#0: ## %entry
270 ; CHECK-NEXT: movaps {{.*#+}} xmm0 = <u,u,32768,32768>
271 ; CHECK-NEXT: movaps %xmm0, (%eax)
274 %0 = insertelement <4 x i32> undef, i32 undef, i32 1
275 %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
276 %2 = bitcast <4 x i32> %1 to <4 x float>
277 store <4 x float> %2, <4 x float> * undef
; f: truncate the unnamed <4 x double> argument (%0) to <4 x float>.
; The expected output converts each 128-bit half with cvtpd2ps and joins the
; two results with unpcklpd.
282 define <4 x float> @f(<4 x double>) nounwind {
284 ; CHECK: ## BB#0: ## %entry
285 ; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1
286 ; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0
287 ; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
290 %double2float.i = fptrunc <4 x double> %0 to <4 x float>
291 ret <4 x float> %double2float.i
; test_insert_64_zext: keep lane 0 of %i and take lane 1 from element 0 of the
; constant <0, undef>, i.e. zero (mask <0, 2>). The expected output is a single
; movq that zeroes the upper half of xmm0.
294 define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
295 ; CHECK-LABEL: test_insert_64_zext:
297 ; CHECK-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
299 %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
; PR19721: view %i as an i128, AND it with -4294967296 (all bits set except the
; low 32, so the low 32 bits are cleared), and view the result as <4 x i32>
; again. The expected output is a single andps against a constant-pool mask.
303 define <4 x i32> @PR19721(<4 x i32> %i) {
304 ; CHECK-LABEL: PR19721:
306 ; CHECK-NEXT: andps LCPI19_0, %xmm0
308 %bc = bitcast <4 x i32> %i to i128
309 %insert = and i128 %bc, -4294967296
310 %bc2 = bitcast i128 %insert to <4 x i32>
314 define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
315 ; CHECK-LABEL: test_mul:
317 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
318 ; CHECK-NEXT: pmuludq %xmm1, %xmm0
319 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
320 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
321 ; CHECK-NEXT: pmuludq %xmm2, %xmm1
322 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
323 ; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
325 %m = mul <4 x i32> %x, %y