//===---------------------------------------------------------------------===//
-We should compile
+This:
#include <mmintrin.h>
-extern __m64 C;
-
-void baz(__v2si *A, __v2si *B)
-{
- *A = __builtin_ia32_psllq(*B, C);
- _mm_empty();
+__v2si qux(int A) {
+ return (__v2si){ 0, A };
}
-to:
+is compiled into:
+
+_qux:
+ subl $28, %esp
+ movl 32(%esp), %eax
+ movd %eax, %mm0
+ movq %mm0, (%esp)
+ movl (%esp), %eax
+ movl %eax, 20(%esp)
+ movq %mm0, 8(%esp)
+ movl 12(%esp), %eax
+ movl %eax, 16(%esp)
+ movq 16(%esp), %mm0
+ addl $28, %esp
+ ret
+
+Yuck!
+
+GCC gives us:
+
+_qux:
+ subl $12, %esp
+ movl 16(%esp), %eax
+ movl 20(%esp), %edx
+ movl $0, (%eax)
+ movl %edx, 4(%eax)
+ addl $12, %esp
+ ret $4
+
+//===---------------------------------------------------------------------===//
+
+We generate crappy code for this:
+
+__m64 t() {
+ return _mm_cvtsi32_si64(1);
+}
-.globl _baz
-_baz:
- call L3
-"L00000000001$pb":
-L3:
- popl %ecx
+_t:
subl $12, %esp
- movl 20(%esp), %eax
- movq (%eax), %mm0
- movl L_C$non_lazy_ptr-"L00000000001$pb"(%ecx), %eax
- movq (%eax), %mm1
- movl 16(%esp), %eax
- psllq %mm1, %mm0
- movq %mm0, (%eax)
- emms
+ movl $1, %eax
+ movd %eax, %mm0
+ movq %mm0, (%esp)
+ movl (%esp), %eax
+ movl 4(%esp), %edx
addl $12, %esp
ret
-not:
-
-_baz:
- subl $12, %esp
- call "L1$pb"
-"L1$pb":
- popl %eax
- movl L_C$non_lazy_ptr-"L1$pb"(%eax), %eax
- movl (%eax), %ecx
- movl %ecx, (%esp)
- movl 4(%eax), %eax
- movl %eax, 4(%esp)
- movl 20(%esp), %eax
- movq (%eax), %mm0
- movq (%esp), %mm1
- psllq %mm1, %mm0
- movl 16(%esp), %eax
- movq %mm0, (%eax)
- emms
- addl $12, %esp
+The extra stack traffic is covered in the previous entry. The other problem is
+that we are not smart about materializing constants in MMX registers. With -m64 we get:
+
+ movl $1, %eax
+ movd %eax, %mm0
+ movd %mm0, %rax
ret
+
+We should be using a constant pool load instead:
+ movq LC0(%rip), %rax