[X86, AVX] adjust tablegen patterns to generate better code for scalar insertion...

author Sanjay Patel <spatel@rotateright.com>

Thu, 2 Apr 2015 17:56:17 +0000 (17:56 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Thu, 2 Apr 2015 17:56:17 +0000 (17:56 +0000)
author Sanjay Patel <spatel@rotateright.com>
Thu, 2 Apr 2015 17:56:17 +0000 (17:56 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Thu, 2 Apr 2015 17:56:17 +0000 (17:56 +0000)
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td

index c2626808774c441ed8f369c892fca01e86a7ca08..65b155c0a9d1cfcf644091320ef4a8d330fc9bea 100644 (file)
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -7168,6 +7168,10 @@ let Predicates = [HasAVX2] in {
  }
  
  // Patterns
+// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
+// on targets where they have equal performance. These were changed to use
+// blends because blends have better throughput on SandyBridge and Haswell, but
+// movs[s/d] are 1-2 bytes shorter.
  let Predicates = [UseAVX] in {
    let AddedComplexity = 15 in {
    // Move scalar to XMM zero-extended, zeroing a VR128 then do a
@@ -7184,8 +7188,10 @@ let Predicates = [UseAVX] in {
    // Move low f32 and clear high bits.
    def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
              (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;
-  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
-            (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
+
+  // Move low f64 and clear high bits.
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+            (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
    }
  
    def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
@@ -7199,14 +7205,19 @@ let Predicates = [UseAVX] in {
                             (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
                             sub_xmm)>;
  
-  // Move low f64 and clear high bits.
-  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
-            (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
-
+  // These will incur an FP/int domain crossing penalty, but an FP blend may be
+  // the only way without AVX2. Do not add any complexity because we may be able
+  // to match better patterns defined earlier in this file.
+  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+            (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
    def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
              (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
  }
  
+// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
+// on targets where they have equal performance. These were changed to use
+// blends because blends have better throughput on SandyBridge and Haswell, but
+// movs[s/d] are 1-2 bytes shorter.
  let Predicates = [UseSSE41] in {
    // With SSE41 we can use blends for these patterns.
    def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll

index e4aedf188583e78f56009e164460193675c4d073..1b42a637907c701f9defc81fb8d7aba984d0f238 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -843,8 +843,9 @@ define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
  define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
  ; ALL-LABEL: insert_reg_and_zero_v4f64:
  ; ALL:       # BB#0:
-; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; ALL-NEXT:    # kill: XMM0<def> XMM0<kill> YMM0<def>
+; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
  ; ALL-NEXT:    retq
    %v = insertelement <4 x double> undef, double %a, i32 0
    %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll

index 6feffb83609c3cb727b52dcb891e0979a0d840aa..bb07077b5559c6e716e7dd6415e90bb6a082e827 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -133,8 +133,6 @@ define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
  ; AVX2:       # BB#0:
  ; AVX2-NEXT:    movl $7, %eax
  ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vxorps %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
  ; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
  ; AVX2-NEXT:    retq
    %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -962,8 +960,6 @@ define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) {
  ; AVX2:       # BB#0:
  ; AVX2-NEXT:    movl $7, %eax
  ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vxorps %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
  ; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
  ; AVX2-NEXT:    retq
    %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
author	Sanjay Patel <spatel@rotateright.com>
	Thu, 2 Apr 2015 17:56:17 +0000 (17:56 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Thu, 2 Apr 2015 17:56:17 +0000 (17:56 +0000)
lib/Target/X86/X86InstrSSE.td		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-256-v4.ll		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-256-v8.ll		patch \| blob \| history