[x86] Teach the target shuffle mask extraction to recognize unary forms

author Chandler Carruth <chandlerc@gmail.com>

Sat, 2 Aug 2014 10:27:38 +0000 (10:27 +0000)

committer Chandler Carruth <chandlerc@gmail.com>

Sat, 2 Aug 2014 10:27:38 +0000 (10:27 +0000)
author Chandler Carruth <chandlerc@gmail.com>
Sat, 2 Aug 2014 10:27:38 +0000 (10:27 +0000)
committer Chandler Carruth <chandlerc@gmail.com>
Sat, 2 Aug 2014 10:27:38 +0000 (10:27 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index d2c410240da696443ba32fcb0b7bc971663d10be..26e0e2ea26b28969e6b3b93ffcc5ecc89d995223 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5133,30 +5133,38 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
  }
  
  /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
-/// target specific opcode. Returns true if the Mask could be calculated.
-/// Sets IsUnary to true if only uses one source.
+/// target specific opcode. Returns true if the Mask could be calculated. Sets
+/// IsUnary to true if only uses one source. Note that this will set IsUnary for
+/// shuffles which use a single input multiple times, and in those cases it will
+/// adjust the mask to only have indices within that single input.
  static bool getTargetShuffleMask(SDNode *N, MVT VT,
                                   SmallVectorImpl<int> &Mask, bool &IsUnary) {
    unsigned NumElems = VT.getVectorNumElements();
    SDValue ImmN;
  
    IsUnary = false;
+  bool IsFakeUnary = false;
    switch(N->getOpcode()) {
    case X86ISD::SHUFP:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
      break;
    case X86ISD::UNPCKH:
      DecodeUNPCKHMask(VT, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
      break;
    case X86ISD::UNPCKL:
      DecodeUNPCKLMask(VT, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
      break;
    case X86ISD::MOVHLPS:
      DecodeMOVHLPSMask(NumElems, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
      break;
    case X86ISD::MOVLHPS:
      DecodeMOVLHPSMask(NumElems, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
      break;
    case X86ISD::PALIGNR:
      ImmN = N->getOperand(N->getNumOperands()-1);
@@ -5210,6 +5218,14 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
    default: llvm_unreachable("unknown target shuffle node");
    }
  
+  // If we have a fake unary shuffle, the shuffle mask is spread across two
+  // inputs that are actually the same node. Re-map the mask to always point
+  // into the first input.
+  if (IsFakeUnary)
+    for (int &M : Mask)
+      if (M >= (int)Mask.size())
+        M -= Mask.size();
+
    return true;
  }
  
@@ -18735,6 +18751,8 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
        bool Lo = Mask.equals(0, 0);
        unsigned Shuffle = FloatDomain ? (Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS)
                                       : (Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH);
+      if (Depth == 1 && Root->getOpcode() == Shuffle)
+        return false; // Nothing to do!
        MVT ShuffleVT = FloatDomain ? MVT::v4f32 : MVT::v2i64;
        Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
        DCI.AddToWorklist(Op.getNode());
@@ -18757,16 +18775,18 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
           Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
                       15))) {
        bool Lo = Mask[0] == 0;
+      unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+      if (Depth == 1 && Root->getOpcode() == Shuffle)
+        return false; // Nothing to do!
        MVT ShuffleVT;
        switch (Mask.size()) {
        case 4: ShuffleVT = MVT::v4i32; break;
-      case 8: ShuffleVT = MVT::v8i32; break;
-      case 16: ShuffleVT = MVT::v16i32; break;
+      case 8: ShuffleVT = MVT::v8i16; break;
+      case 16: ShuffleVT = MVT::v16i8; break;
        };
        Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
        DCI.AddToWorklist(Op.getNode());
-      Op = DAG.getNode(Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, ShuffleVT, Op,
-                       Op);
+      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
        DCI.AddToWorklist(Op.getNode());
        DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
                      /*AddTo*/ true);
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll

index 6d9b11792c7d8e8302c10c2c5a6732fee80f14b6..ca540226ee7fe47886bee196e0d1f253c8d1f650 100644 (file)
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -72,9 +72,9 @@ entry:
    ret <4 x i64> %shuffle
  }
  
-; CHECK: movlhps
+; CHECK: vpunpcklqdq
  ; CHECK-NEXT: vextractf128  $1
-; CHECK-NEXT: movlhps
+; CHECK-NEXT: vpunpcklqdq
  ; CHECK-NEXT: vinsertf128 $1
  define <4 x i64> @C(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
  entry:
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll

index 8c1b7b63850f417c65fc011e27d1f8e9021b1991..3856aeac3fded81da74445bec854ef2ba00c2979 100644 (file)
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -1,9 +1,7 @@
  ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
  
  
-; CHECK: vpunpcklbw %xmm
-; CHECK-NEXT: vpunpckhbw %xmm
-; CHECK-NEXT: vpshufd $85
+; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
  ; CHECK-NEXT: vinsertf128 $1
  define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
  entry:
@@ -21,7 +19,7 @@ entry:
  }
  
  ; CHECK: vmovq
-; CHECK-NEXT: vmovlhps %xmm
+; CHECK-NEXT: vpunpcklqdq %xmm
  ; CHECK-NEXT: vinsertf128 $1
  define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
  entry:
diff --git a/test/CodeGen/X86/exedepsfix-broadcast.ll b/test/CodeGen/X86/exedepsfix-broadcast.ll

index a18f75195631ceb0723722858bc2258f1bb4cbfe..f4539c8969c1f33eeae036f31ba094470a5506cf 100644 (file)
--- a/test/CodeGen/X86/exedepsfix-broadcast.ll
+++ b/test/CodeGen/X86/exedepsfix-broadcast.ll
@@ -93,10 +93,10 @@ define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> %
  
  
  ; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg
-; ExeDepsFix works top down, thus it coalesces vmovlhps domain with
-; vandps and there is nothing more you can do to match vmaxpd.
-; CHECK: vmovlhps
-; CHECK: vandps
+; ExeDepsFix works top down, thus it coalesces vpunpcklqdq domain with
+; vpand and there is nothing more you can do to match vmaxpd.
+; CHECK: vpunpcklqdq
+; CHECK: vpand
  ; CHECK: vmaxpd
  ; CHECK: ret
  define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) {
diff --git a/test/CodeGen/X86/vec_splat-3.ll b/test/CodeGen/X86/vec_splat-3.ll

index 754cbf41867d1bfbbbdc7b401e7fd3598fd1dc89..3ace250380df0f4a83a8da1b6a563da3a99e86ef 100644 (file)
--- a/test/CodeGen/X86/vec_splat-3.ll
+++ b/test/CodeGen/X86/vec_splat-3.ll
@@ -75,9 +75,8 @@ define <16 x i8> @shuf_16i8_8(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
         ret <16 x i8> %tmp6
  
  ; CHECK-LABEL: shuf_16i8_8:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $0
+; CHECK: pxor %[[X:xmm[0-9]+]], %[[X]]
+; CHECK-NEXT: pshufb %[[X]], %xmm0
  }
  
  define <16 x i8> @shuf_16i8_9(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
@@ -85,9 +84,7 @@ define <16 x i8> @shuf_16i8_9(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
         ret <16 x i8> %tmp6
  
  ; CHECK-LABEL: shuf_16i8_9:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $85
+; CHECK: pshufb {{.*}} # xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
  }
  
  define <16 x i8> @shuf_16i8_10(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
@@ -95,9 +92,7 @@ define <16 x i8> @shuf_16i8_10(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
         ret <16 x i8> %tmp6
  
  ; CHECK-LABEL: shuf_16i8_10:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $-86
+; CHECK: pshufb {{.*}} # xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
  }
  
  define <16 x i8> @shuf_16i8_11(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
@@ -105,9 +100,7 @@ define <16 x i8> @shuf_16i8_11(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
         ret <16 x i8> %tmp6
  
  ; CHECK-LABEL: shuf_16i8_11:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $-1
+; CHECK: pshufb {{.*}} # xmm0 = xmm0[3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
  }
  
  
@@ -124,9 +117,7 @@ define <16 x i8> @shuf_16i8_13(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
         ret <16 x i8> %tmp6
  
  ; CHECK-LABEL: shuf_16i8_13:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $85
+; CHECK: pshufb {{.*}} # xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
  }
  
  define <16 x i8> @shuf_16i8_14(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
@@ -134,9 +125,7 @@ define <16 x i8> @shuf_16i8_14(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
         ret <16 x i8> %tmp6
  
  ; CHECK-LABEL: shuf_16i8_14:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $-86
+; CHECK: pshufb {{.*}} # xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
  }
  
  define <16 x i8> @shuf_16i8_15(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
@@ -144,9 +133,7 @@ define <16 x i8> @shuf_16i8_15(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
         ret <16 x i8> %tmp6
  
  ; CHECK-LABEL: shuf_16i8_15:
-; CHECK: punpcklbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $-1
+; CHECK: pshufb {{.*}} # xmm0 = xmm0[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
  }
  
  define <16 x i8> @shuf_16i8_16(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
@@ -154,9 +141,7 @@ define <16 x i8> @shuf_16i8_16(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
         ret <16 x i8> %tmp6
  
  ; CHECK-LABEL: shuf_16i8_16:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $0
+; CHECK: pshufb {{.*}} # xmm0 = xmm0[8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
  }
  
  define <16 x i8> @shuf_16i8_17(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
@@ -164,9 +149,7 @@ define <16 x i8> @shuf_16i8_17(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
         ret <16 x i8> %tmp6
  
  ; CHECK-LABEL: shuf_16i8_17:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $85
+; CHECK: pshufb {{.*}} # xmm0 = xmm0[9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9]
  }
  
  define <16 x i8> @shuf_16i8_18(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
@@ -174,9 +157,7 @@ define <16 x i8> @shuf_16i8_18(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
         ret <16 x i8> %tmp6
  
  ; CHECK-LABEL: shuf_16i8_18:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $-86
+; CHECK: pshufb {{.*}} # xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
  }
  
  define <16 x i8> @shuf_16i8_19(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
@@ -184,9 +165,7 @@ define <16 x i8> @shuf_16i8_19(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
         ret <16 x i8> %tmp6
  
  ; CHECK-LABEL: shuf_16i8_19:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpcklbw
-; CHECK-NEXT: pshufd $-1
+; CHECK: pshufb {{.*}} # xmm0 = xmm0[11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
  }
  
  define <16 x i8> @shuf_16i8_20(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
@@ -194,9 +173,7 @@ define <16 x i8> @shuf_16i8_20(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
         ret <16 x i8> %tmp6
  
  ; CHECK-LABEL: shuf_16i8_20:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $0
+; CHECK: pshufb {{.*}} # xmm0 = xmm0[12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12]
  }
  
  define <16 x i8> @shuf_16i8_21(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
@@ -204,9 +181,7 @@ define <16 x i8> @shuf_16i8_21(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
         ret <16 x i8> %tmp6
  
  ; CHECK-LABEL: shuf_16i8_21:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $85
+; CHECK: pshufb {{.*}} # xmm0 = xmm0[13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13]
  }
  
  define <16 x i8> @shuf_16i8_22(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
@@ -214,9 +189,7 @@ define <16 x i8> @shuf_16i8_22(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
         ret <16 x i8> %tmp6
  
  ; CHECK-LABEL: shuf_16i8_22:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $-86
+; CHECK: pshufb {{.*}} # xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
  }
  
  define <16 x i8> @shuf_16i8_23(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
@@ -224,7 +197,5 @@ define <16 x i8> @shuf_16i8_23(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone {
         ret <16 x i8> %tmp6
  
  ; CHECK-LABEL: shuf_16i8_23:
-; CHECK: punpckhbw
-; CHECK-NEXT: punpckhbw
-; CHECK-NEXT: pshufd $-1
+; CHECK: pshufb {{.*}} # xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
  }
author	Chandler Carruth <chandlerc@gmail.com>
	Sat, 2 Aug 2014 10:27:38 +0000 (10:27 +0000)
committer	Chandler Carruth <chandlerc@gmail.com>
	Sat, 2 Aug 2014 10:27:38 +0000 (10:27 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/avx-basic.ll		patch \| blob \| history
test/CodeGen/X86/avx-splat.ll		patch \| blob \| history
test/CodeGen/X86/exedepsfix-broadcast.ll		patch \| blob \| history
test/CodeGen/X86/vec_splat-3.ll		patch \| blob \| history