some notes about suboptimal insertps's

author Chris Lattner <sabre@nondot.org>

Mon, 5 Jul 2010 05:48:41 +0000 (05:48 +0000)

committer Chris Lattner <sabre@nondot.org>

Mon, 5 Jul 2010 05:48:41 +0000 (05:48 +0000)
author Chris Lattner <sabre@nondot.org>
Mon, 5 Jul 2010 05:48:41 +0000 (05:48 +0000)
committer Chris Lattner <sabre@nondot.org>
Mon, 5 Jul 2010 05:48:41 +0000 (05:48 +0000)
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt

index d761bde646fb54fc742874615bd1d18da076ec89..2a8506fd4f8c5147d2783278a387540d00a0493f 100644 (file)
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -846,3 +846,34 @@ This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
  doing a shuffle from v[1] to v[0] then a float store.
  
  //===---------------------------------------------------------------------===//
+
+On SSE4 machines, we compile this code:
+
+define <2 x float> @test2(<2 x float> %Q, <2 x float> %R,
+       <2 x float> *%P) nounwind {
+  %Z = fadd <2 x float> %Q, %R
+
+  store <2 x float> %Z, <2 x float> *%P
+  ret <2 x float> %Z
+}
+
+into:
+
+_test2:                                 ## @test2
+## BB#0:
+       insertps        $0, %xmm2, %xmm2
+       insertps        $16, %xmm3, %xmm2
+       insertps        $0, %xmm0, %xmm3
+       insertps        $16, %xmm1, %xmm3
+       addps   %xmm2, %xmm3
+       movq    %xmm3, (%rdi)
+       movaps  %xmm3, %xmm0
+       pshufd  $1, %xmm3, %xmm1
+                                        ## kill: XMM1<def> XMM1<kill>
+       ret
+
+The insertps instructions with a $0 immediate are pointless, overly complex copies; the first one even uses %xmm2 as both source and destination.
+
+//===---------------------------------------------------------------------===//
+
+
author	Chris Lattner <sabre@nondot.org>
	Mon, 5 Jul 2010 05:48:41 +0000 (05:48 +0000)
committer	Chris Lattner <sabre@nondot.org>
	Mon, 5 Jul 2010 05:48:41 +0000 (05:48 +0000)