//===---------------------------------------------------------------------===//
-int main() {
- __m64 A[1] = { _mm_cvtsi32_si64(1) };
- __m64 B[1] = { _mm_cvtsi32_si64(10) };
- __m64 sum = _mm_cvtsi32_si64(0);
+We generate crappy code for this:
- sum = __builtin_ia32_paddq(__builtin_ia32_paddq(A[0], B[0]), sum);
-
- printf("Sum = %d\n", _mm_cvtsi64_si32(sum));
- return 0;
+__m64 t() {
+ return _mm_cvtsi32_si64(1);
}
-Generates:
-
- movl $11, %eax
-### movd %eax, %mm0
-### movq %mm0, 8(%esp)
-### movl 8(%esp), %eax
- movl %eax, 4(%esp)
- movl $_str, (%esp)
- call L_printf$stub
- xorl %eax, %eax
- addl $28, %esp
-
-These instructions are unnecessary.
+_t:
+ subl $12, %esp
+ movl $1, %eax
+ movd %eax, %mm0
+ movq %mm0, (%esp)
+ movl (%esp), %eax
+ movl 4(%esp), %edx
+ addl $12, %esp
+ ret
+
+The extra stack traffic is covered in the previous entry. The other problem is
+that we materialize constants in MMX registers poorly. With -m64 we generate:
+
+ movl $1, %eax
+ movd %eax, %mm0
+ movd %mm0, %rax
+ ret
+
+We should be using a constant-pool load instead:
+ movq LC0(%rip), %rax