[x86] Tweak the rules surrounding 0,0 and 1,1 v2f64 shuffles and add

author Chandler Carruth <chandlerc@gmail.com>

Sun, 7 Sep 2014 12:02:14 +0000 (12:02 +0000)

committer Chandler Carruth <chandlerc@gmail.com>

Sun, 7 Sep 2014 12:02:14 +0000 (12:02 +0000)
author Chandler Carruth <chandlerc@gmail.com>
Sun, 7 Sep 2014 12:02:14 +0000 (12:02 +0000)
committer Chandler Carruth <chandlerc@gmail.com>
Sun, 7 Sep 2014 12:02:14 +0000 (12:02 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index fab65738f707f7571e403e9fd3b6c487d34c8e58..c1abeaaa3f2804f0e8381268803ca3e816c55f8f 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -19320,26 +19320,42 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
    // Use the float domain if the operand type is a floating point type.
    bool FloatDomain = VT.isFloatingPoint();
  
-  // If we don't have access to VEX encodings, the generic PSHUF instructions
-  // are preferable to some of the specialized forms despite requiring one more
-  // byte to encode because they can implicitly copy.
+  // For floating point shuffles, we don't have free copies in the shuffle
+  // instructions, so this always makes sense to canonicalize.
    //
-  // IF we *do* have VEX encodings, than we can use shorter, more specific
+  // For integer shuffles, if we don't have access to VEX encodings, the generic
+  // PSHUF instructions are preferable to some of the specialized forms despite
+  // requiring one more byte to encode because they can implicitly copy.
+  //
+  // IF we *do* have VEX encodings, then we can use shorter, more specific
    // shuffle instructions freely as they can copy due to the extra register
    // operand.
-  if (Subtarget->hasAVX()) {
+  if (FloatDomain || Subtarget->hasAVX()) {
      // We have both floating point and integer variants of shuffles that dup
      // either the low or high half of the vector.
      if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
        bool Lo = Mask.equals(0, 0);
-      unsigned Shuffle = FloatDomain ? (Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS)
-                                     : (Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH);
+      unsigned Shuffle;
+      // If the input is a floating point, check if we have SSE3 which will let
+      // us use MOVDDUP. That instruction is no slower than UNPCKLPD but has the
+      // option to fold the input operand into even an unaligned memory load.
+      if (FloatDomain && Lo && Subtarget->hasSSE3()) {
+        Shuffle = X86ISD::MOVDDUP;
+      } else {
+        // We model everything else using UNPCK instructions. While MOVLHPS and
+        // MOVHLPS are shorter encodings they cannot accept a memory operand
+        // which overly constrains subsequent lowering.
+        Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+      }
        if (Depth == 1 && Root->getOpcode() == Shuffle)
          return false; // Nothing to do!
-      MVT ShuffleVT = FloatDomain ? MVT::v4f32 : MVT::v2i64;
+      MVT ShuffleVT = FloatDomain ? MVT::v2f64 : MVT::v2i64;
        Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
        DCI.AddToWorklist(Op.getNode());
-      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+      if (Shuffle == X86ISD::MOVDDUP)
+        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
+      else
+        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
        DCI.AddToWorklist(Op.getNode());
        DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
                      /*AddTo*/ true);
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll

index d0e8dfd242a481bf56b627238240334cc8f19f3c..e650d75e117e33811b6f26ca238e79fd887df328 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -1,4 +1,5 @@
  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE3
  
  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-unknown-unknown"
@@ -48,7 +49,7 @@ define <2 x i64> @shuffle_v2i64_33(<2 x i64> %a, <2 x i64> %b) {
  
  define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) {
  ; CHECK-SSE2-LABEL: @shuffle_v2f64_00
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE2:         unpcklpd {{.*}} # xmm0 = xmm0[0,0]
  ; CHECK-SSE2-NEXT:    retq
    %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
    ret <2 x double> %shuffle
@@ -62,17 +63,15 @@ define <2 x double> @shuffle_v2f64_10(<2 x double> %a, <2 x double> %b) {
  }
  define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) {
  ; CHECK-SSE2-LABEL: @shuffle_v2f64_11
-; CHECK-SSE2:         shufpd {{.*}} # xmm0 = xmm0[1,1]
+; CHECK-SSE2:         unpckhpd {{.*}} # xmm0 = xmm0[1,1]
  ; CHECK-SSE2-NEXT:    retq
    %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
    ret <2 x double> %shuffle
  }
  define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) {
-; FIXME: Should these use movapd + shufpd to remove a domain change at the cost
-;        of a mov?
-;
  ; CHECK-SSE2-LABEL: @shuffle_v2f64_22
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[0,1,0,1]
+; CHECK-SSE2:         unpcklpd {{.*}} # xmm1 = xmm1[0,0]
+; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
  ; CHECK-SSE2-NEXT:    retq
    %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 2>
    ret <2 x double> %shuffle
@@ -86,7 +85,8 @@ define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) {
  }
  define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
  ; CHECK-SSE2-LABEL: @shuffle_v2f64_33
-; CHECK-SSE2:         pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
+; CHECK-SSE2:         unpckhpd {{.*}} # xmm1 = xmm1[1,1]
+; CHECK-SSE2-NEXT:    movapd %xmm1, %xmm0
  ; CHECK-SSE2-NEXT:    retq
    %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3>
    ret <2 x double> %shuffle
@@ -217,3 +217,31 @@ define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
    %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
    ret <2 x i64> %shuffle
  }
+
+
+define <2 x double> @insert_dup_reg_v2f64(double %a) {
+; CHECK-SSE2-LABEL: @insert_dup_reg_v2f64
+; CHECK-SSE2:         unpcklpd {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-SSE3-LABEL: @insert_dup_reg_v2f64
+; CHECK-SSE3:         unpcklpd {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE3-NEXT:    retq
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+}
+define <2 x double> @insert_dup_mem_v2f64(double* %ptr) {
+; CHECK-SSE2-LABEL: @insert_dup_mem_v2f64
+; CHECK-SSE2:         movsd {{.*}}, %xmm0
+; CHECK-SSE2-NEXT:    unpcklpd {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE2-NEXT:    retq
+;
+; CHECK-SSE3-LABEL: @insert_dup_mem_v2f64
+; CHECK-SSE3:         movddup {{.*}}, %xmm0
+; CHECK-SSE3-NEXT:    retq
+  %a = load double* %ptr
+  %v = insertelement <2 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+  ret <2 x double> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll

index cde96dbb30f0e73c7228869a3a5ab279f3ead659..1c8bdf57686d9b69256bcfd30c3caaa36def68d3 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -98,7 +98,7 @@ define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
  define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
  ; AVX1-LABEL: @shuffle_v4f64_0001
  ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm1 = xmm0[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm1 = xmm0[0,0]
  ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
  ; AVX1-NEXT:    retq
    %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
@@ -109,7 +109,7 @@ define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
  ; AVX1:       # BB#0:
  ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
  ; AVX1-NEXT:    vunpcklpd {{.*}} # xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
  ; AVX1-NEXT:    retq
    %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
@@ -120,7 +120,7 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
  ; AVX1:       # BB#0:
  ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
  ; AVX1-NEXT:    vshufpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
  ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
  ; AVX1-NEXT:    retq
    %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
@@ -130,7 +130,7 @@ define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
  ; AVX1-LABEL: @shuffle_v4f64_1000
  ; AVX1:       # BB#0:
  ; AVX1-NEXT:    vshufpd {{.*}} # xmm1 = xmm0[1,0]
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
  ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
  ; AVX1-NEXT:    retq
    %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
@@ -140,8 +140,8 @@ define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
  ; AVX1-LABEL: @shuffle_v4f64_2200
  ; AVX1:       # BB#0:
  ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT:    vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm1 = xmm1[0,0]
+; AVX1-NEXT:    vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
  ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
  ; AVX1-NEXT:    retq
    %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
@@ -152,7 +152,7 @@ define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) {
  ; AVX1:       # BB#0:
  ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
  ; AVX1-NEXT:    vshufpd {{.*}} # xmm0 = xmm1[1],xmm0[0]
-; AVX1-NEXT:    vmovhlps {{.*}} # xmm1 = xmm1[1,1]
+; AVX1-NEXT:    vunpckhpd {{.*}} # xmm1 = xmm1[1,1]
  ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
  ; AVX1-NEXT:    retq
    %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
author	Chandler Carruth <chandlerc@gmail.com>
	Sun, 7 Sep 2014 12:02:14 +0000 (12:02 +0000)
committer	Chandler Carruth <chandlerc@gmail.com>
	Sun, 7 Sep 2014 12:02:14 +0000 (12:02 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-128-v2.ll		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-256-v4.ll		patch \| blob \| history