From 1596373671e4df54e53e79dc613545d5cf9d83bb Mon Sep 17 00:00:00 2001
From: Elena Demikhovsky
Date: Tue, 26 Jun 2012 08:04:10 +0000
Subject: [PATCH] Shuffle optimization for AVX/AVX2.

This patch optimizes frequently used shuffle patterns and yields the
following instruction sequence reduction.

Before:
      vshufps      $-35, %xmm1, %xmm0, %xmm2  ## xmm2 = xmm0[1,3],xmm1[1,3]
      vpermilps    $-40, %xmm2, %xmm2         ## xmm2 = xmm2[0,2,1,3]
      vextractf128 $1, %ymm1, %xmm1
      vextractf128 $1, %ymm0, %xmm0
      vshufps      $-35, %xmm1, %xmm0, %xmm0  ## xmm0 = xmm0[1,3],xmm1[1,3]
      vpermilps    $-40, %xmm0, %xmm0         ## xmm0 = xmm0[0,2,1,3]
      vinsertf128  $1, %xmm0, %ymm2, %ymm0

After:
      vshufps      $13, %ymm0, %ymm1, %ymm1   ## ymm1 = ymm1[1,3],ymm0[0,0],ymm1[5,7],ymm0[4,4]
      vshufps      $13, %ymm0, %ymm0, %ymm0   ## ymm0 = ymm0[1,3,0,0,5,7,4,4]
      vunpcklps    %ymm1, %ymm0, %ymm0        ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@159188 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 52 ++++++++++++++++++++++++++++++
 test/CodeGen/X86/avx-shuffle.ll    | 21 ++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ee7a6352732..e84ec2959cf 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3496,6 +3496,53 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
   return true;
 }
 
+//
+// Some special combinations that can be optimized.
+//
+static
+SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
+                               SelectionDAG &DAG) {
+  EVT VT = SVOp->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+  DebugLoc dl = SVOp->getDebugLoc();
+
+  if (VT != MVT::v8i32 && VT != MVT::v8f32)
+    return SDValue();
+
+  ArrayRef<int> Mask = SVOp->getMask();
+
+  // These are the special masks that may be optimized.
+  static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
+  static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
+  bool MatchEvenMask = true;
+  bool MatchOddMask  = true;
+  for (int i=0; i<8; ++i) {
+    if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
+      MatchEvenMask = false;
+    if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
+      MatchOddMask = false;
+  }
+  static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1};
+  static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1};
+
+  const int *CompactionMask;
+  if (MatchEvenMask)
+    CompactionMask = CompactionMaskEven;
+  else if (MatchOddMask)
+    CompactionMask = CompactionMaskOdd;
+  else
+    return SDValue();
+
+  SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
+
+  SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0),
+                                     UndefNode, CompactionMask);
+  SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1),
+                                     UndefNode, CompactionMask);
+  static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13};
+  return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask);
+}
+
 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
 static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
@@ -5982,6 +6029,11 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT,
 /// which could not be matched by any known target speficic shuffle
 static SDValue
 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+
+  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
+  if (NewOp.getNode())
+    return NewOp;
+
   EVT VT = SVOp->getValueType(0);
   unsigned NumElems = VT.getVectorNumElements();
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index a88b0b9cadf..9b41709a3b1 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -227,3 +227,24 @@ define <8 x float> @test17(<4 x float> %y) {
   %x = shufflevector <4 x float> %y, <4 x float> undef, <8 x i32>
   ret <8 x float> %x
 }
+
+; CHECK: test18
+; CHECK: vshufps
+; CHECK: vshufps
+; CHECK: vunpcklps
+; CHECK: ret
+define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
+  %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret <8 x float>%S
+}
+
+; CHECK: test19
+; CHECK: vshufps
+; CHECK: vshufps
+; CHECK: vunpcklps
+; CHECK: ret
+define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
+  %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  ret <8 x float>%S
+}
+
-- 
2.34.1
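
Why the rewrite preserves the original shuffle semantics: the minimal standalone C++ sketch below is not part of the patch; the mask values are copied from Compact8x32ShuffleNode above, while the composition loop is only an illustration. It composes the even compaction mask with the final unpack mask and prints the combined mask, which reproduces the even-lane pattern {0, 8, 2, 10, 4, 12, 6, 14} that the function matches.

  // Standalone sketch (illustration only, not part of the patch): compose the
  // even compaction mask with the unpack mask used by Compact8x32ShuffleNode
  // and check that the result is the even-lane mask {0,8,2,10,4,12,6,14}.
  #include <cstdio>

  int main() {
    // Mask applied to each operand against an undef vector (-1 = undef lane).
    const int CompactionMaskEven[8] = {0, 2, -1, -1, 4, 6, -1, -1};
    // Interleave mask applied to the two compacted operands; on v8 this is
    // the vunpcklps pattern (lanes 0,1 and 4,5 of each 128-bit half).
    const int UnpackMask[8] = {0, 8, 1, 9, 4, 12, 5, 13};

    // Compose the two shuffles: result lane i takes lane UnpackMask[i] of the
    // concatenated compacted operands; indices 0..7 refer to operand 0 and
    // 8..15 to operand 1.  UnpackMask only selects lanes 0, 1, 4 and 5, which
    // are the defined lanes of the compaction, so no -1 reaches the result.
    int Composed[8];
    for (int i = 0; i < 8; ++i) {
      int U = UnpackMask[i];
      if (U < 8)
        Composed[i] = CompactionMaskEven[U];          // lane of operand 0
      else
        Composed[i] = CompactionMaskEven[U - 8] + 8;  // lane of operand 1
    }

    for (int i = 0; i < 8; ++i)
      std::printf("%d ", Composed[i]);  // prints: 0 8 2 10 4 12 6 14
    std::printf("\n");
    return 0;
  }

Substituting CompactionMaskOdd in the same loop yields {1, 9, 3, 11, 5, 13, 7, 15}, the odd-lane mask, so one pair of in-lane vshufps compactions followed by a single vunpcklps covers both patterns, consistent with the "After" sequence in the commit message and the CHECK lines in test18 and test19.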