From 65b74e1d00eef81b596b4c207fba069aa1eb8214 Mon Sep 17 00:00:00 2001
From: Bruno Cardoso Lopes
Date: Thu, 21 Jul 2011 01:55:47 +0000
Subject: [PATCH] Add support for 256-bit versions of VPERMIL instruction.

This is a new instruction introduced in AVX, which can operate on
128- and 256-bit vectors. It treats a 256-bit vector as two independent
128-bit lanes. It can permute the 32- or 64-bit elements arbitrarily
within a lane, and restricts the second lane to the same permutation as
the first one. With the improved splat support introduced earlier today,
adding codegen for this instruction enables more efficient 256-bit code:

Instead of:
  vextractf128  $0, %ymm0, %xmm0
  punpcklbw     %xmm0, %xmm0
  punpckhbw     %xmm0, %xmm0
  vinsertf128   $0, %xmm0, %ymm0, %ymm1
  vinsertf128   $1, %xmm0, %ymm1, %ymm0
  vextractf128  $1, %ymm0, %xmm1
  shufps        $1, %xmm1, %xmm1
  movss         %xmm1, 28(%rsp)
  movss         %xmm1, 24(%rsp)
  movss         %xmm1, 20(%rsp)
  movss         %xmm1, 16(%rsp)
  vextractf128  $0, %ymm0, %xmm0
  shufps        $1, %xmm0, %xmm0
  movss         %xmm0, 12(%rsp)
  movss         %xmm0, 8(%rsp)
  movss         %xmm0, 4(%rsp)
  movss         %xmm0, (%rsp)
  vmovaps       (%rsp), %ymm0

We get:
  vextractf128  $0, %ymm0, %xmm0
  punpcklbw     %xmm0, %xmm0
  punpckhbw     %xmm0, %xmm0
  vinsertf128   $0, %xmm0, %ymm0, %ymm1
  vinsertf128   $1, %xmm0, %ymm1, %ymm0
  vpermilps     $85, %ymm0, %ymm0

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@135662 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../X86/InstPrinter/X86InstComments.cpp   | 10 ++++
 lib/Target/X86/Utils/X86ShuffleDecode.cpp | 29 +++++++++
 lib/Target/X86/Utils/X86ShuffleDecode.h   | 14 +++++
 lib/Target/X86/X86ISelLowering.cpp        | 66 ++++++++++++++++++++
 lib/Target/X86/X86ISelLowering.h          |  1 +
 lib/Target/X86/X86InstrFragmentsSIMD.td   |  2 +
 lib/Target/X86/X86InstrSSE.td             |  4 ++
 test/CodeGen/X86/avx-256-splat.ll         | 16 +++++
 8 files changed, 142 insertions(+)
 create mode 100644 test/CodeGen/X86/avx-256-splat.ll

diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index 4e28dfe7fa8..b2f246e4419 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -205,6 +205,16 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
     DecodeUNPCKHPMask(4, ShuffleMask);
     Src1Name = getRegName(MI->getOperand(0).getReg());
     break;
+  case X86::VPERMILPSYri:
+    DecodeVPERMILPSMask(8, MI->getOperand(2).getImm(),
+                        ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPDYri:
+    DecodeVPERMILPDMask(4, MI->getOperand(2).getImm(),
+                        ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(0).getReg());
+    break;
   }

diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index cd06060748b..c1ff0e5011e 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -187,4 +187,33 @@ void DecodeUNPCKLPMask(EVT VT,
   }
 }

+void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm,
+                         SmallVectorImpl<unsigned> &ShuffleMask) {
+  DecodeVPERMILMask(MVT::getVectorVT(MVT::i32, NElts), Imm, ShuffleMask);
+}
+
+void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm,
+                         SmallVectorImpl<unsigned> &ShuffleMask) {
+  DecodeVPERMILMask(MVT::getVectorVT(MVT::i64, NElts), Imm, ShuffleMask);
+}
+
+// DecodeVPERMILMask - Decodes a VPERMIL permute for any 128-bit vector
+// with 32/64-bit elements. A 256-bit vector is treated as two 128-bit
+// lanes, and the mask of the second lane must be identical to that of
+// the first one.
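+// For example, VT = v8f32 with Imm = 0x55 (binary 01010101) selects
+// element 1 in every position of each lane: mask <1,1,1,1, 5,5,5,5>.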
+void DecodeVPERMILMask(EVT VT, unsigned Imm,
+                       SmallVectorImpl<unsigned> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+
+  for (unsigned l = 0; l != NumLanes; ++l) {
+    for (unsigned i = 0; i != NumElts/NumLanes; ++i) {
+      unsigned Idx = (Imm >> (i*2)) & 0x3;
+      ShuffleMask.push_back(Idx+(l*NumElts/NumLanes));
+    }
+  }
+}
+
 } // llvm namespace

diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index b18f6703309..4a5214028fa 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -82,6 +82,20 @@ void DecodeUNPCKLPDMask(unsigned NElts,
 void DecodeUNPCKLPMask(EVT VT,
                        SmallVectorImpl<unsigned> &ShuffleMask);
+
+void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm,
+                         SmallVectorImpl<unsigned> &ShuffleMask);
+
+void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm,
+                         SmallVectorImpl<unsigned> &ShuffleMask);
+
+// DecodeVPERMILMask - Decodes a VPERMIL permute for any 128-bit vector
+// with 32/64-bit elements. A 256-bit vector is treated as two 128-bit
+// lanes, and the mask of the second lane must be identical to that of
+// the first one.
+void DecodeVPERMILMask(EVT VT, unsigned Imm,
+                       SmallVectorImpl<unsigned> &ShuffleMask);
+
 } // llvm namespace

 #endif

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c4e44f999d0..1c827f0dd30 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -2747,6 +2747,7 @@ static bool isTargetShuffle(unsigned Opcode) {
   case X86ISD::PUNPCKHBW:
   case X86ISD::PUNPCKHDQ:
   case X86ISD::PUNPCKHQDQ:
+  case X86ISD::VPERMIL:
     return true;
   }
   return false;
@@ -2772,6 +2773,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   case X86ISD::PSHUFD:
   case X86ISD::PSHUFHW:
   case X86ISD::PSHUFLW:
+  case X86ISD::VPERMIL:
    return DAG.getNode(Opc, dl, VT, V1,
                       DAG.getConstant(TargetMask, MVT::i8));
   }
@@ -3422,6 +3424,56 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
   return ::isMOVLMask(M, N->getValueType(0));
 }

+/// isVPERMILMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMIL*.
+static bool isVPERMILMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+
+  // Match any permutation of a 128-bit vector with 32/64-bit types.
+  if (NumLanes == 1) {
+    if (NumElts == 4 || NumElts == 2)
+      return true;
+    return false;
+  }
+
+  // Only match 256-bit vectors with 32/64-bit types.
+  if (NumElts != 8 && NumElts != 4)
+    return false;
+
+  // The mask on the high lane should be the same as the low one. They may
+  // differ only where one of the corresponding indices in a lane is undef.
+  int LaneSize = NumElts/NumLanes;
+  for (int i = 0; i < LaneSize; ++i) {
+    int HighElt = i+LaneSize;
+    if (Mask[i] < 0 || Mask[HighElt] < 0)
+      continue;
+
+    if (Mask[HighElt]-Mask[i] != LaneSize)
+      return false;
+  }
+
+  return true;
+}
+
+/// getShuffleVPERMILImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with VPERMIL* instructions.
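+/// For example, the lane-replicated splat mask <1,1,1,1,5,5,5,5> encodes
+/// its low lane as Imm = 1 | (1<<2) | (1<<4) | (1<<6) = 0x55 (85).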
+static unsigned getShuffleVPERMILImmediate(SDNode *N) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  EVT VT = SVOp->getValueType(0);
+
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits()/128;
+
+  unsigned Mask = 0;
+  for (int i = 0; i < NumElts/NumLanes /* lane size */; ++i)
+    Mask |= SVOp->getMaskElt(i) << (i*2);
+
+  return Mask;
+}
+
 /// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
 /// of what x86 movss want. X86 movs requires the lowest element to be lowest
 /// element of vector 2 and the other elements to come from vector 1 in order.
@@ -4097,6 +4149,11 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
     return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
                                Depth+1);
   }
+  case X86ISD::VPERMIL:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeVPERMILMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+                      ShuffleMask);
+    break;
   default:
     assert("not implemented for target shuffle node");
     return SDValue();
@@ -6043,6 +6100,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   if (NumElems == 4)
     return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);

+  // Handle VPERMIL permutations
+  if (isVPERMILMask(M, VT)) {
+    unsigned TargetMask = getShuffleVPERMILImmediate(SVOp);
+    if (VT == MVT::v8f32)
+      return getTargetShuffleNode(X86ISD::VPERMIL, dl, VT, V1, TargetMask, DAG);
+  }
+
   return SDValue();
 }

@@ -9660,6 +9724,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::PUNPCKHWD:          return "X86ISD::PUNPCKHWD";
   case X86ISD::PUNPCKHDQ:          return "X86ISD::PUNPCKHDQ";
   case X86ISD::PUNPCKHQDQ:         return "X86ISD::PUNPCKHQDQ";
+  case X86ISD::VPERMIL:            return "X86ISD::VPERMIL";
   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
@@ -12465,6 +12530,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::PSHUFLW:
   case X86ISD::MOVSS:
   case X86ISD::MOVSD:
+  case X86ISD::VPERMIL:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
   }

diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 376aa8a4409..298d45110ff 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -271,6 +271,7 @@ namespace llvm {
       PUNPCKHWD,
       PUNPCKHDQ,
       PUNPCKHQDQ,
+      VPERMIL,

       // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
       // according to %al. An operator is needed so that this can be expanded

diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index b00109c9fa4..f792f53b7ba 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -150,6 +150,8 @@ def X86Punpckhwd  : SDNode<"X86ISD::PUNPCKHWD", SDTShuff2Op>;
 def X86Punpckhdq  : SDNode<"X86ISD::PUNPCKHDQ", SDTShuff2Op>;
 def X86Punpckhqdq : SDNode<"X86ISD::PUNPCKHQDQ", SDTShuff2Op>;

+def X86VPermil : SDNode<"X86ISD::VPERMIL", SDTShuff2OpI>;
+
 //===----------------------------------------------------------------------===//
 // SSE Complex Patterns
 //===----------------------------------------------------------------------===//

diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index ae26a80fe2d..73e465fe29e 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -5529,6 +5529,10 @@ def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
 // The AVX version of some but not all of them are described here, and more
 // should come in a near future.

+// Shuffle with VPERMIL instructions
+def : Pat<(v8f32 (X86VPermil VR256:$src1, (i8 imm:$imm))),
+          (VPERMILPSYri VR256:$src1, imm:$imm)>;
+
 // Shuffle with PSHUFD instruction folding loads. The first two patterns match
 // SSE2 loads, which are always promoted to v2i64. The last one should match
 // the SSE1 case, where the only legal load is v4f32, but there is no PSHUFD

diff --git a/test/CodeGen/X86/avx-256-splat.ll b/test/CodeGen/X86/avx-256-splat.ll
new file mode 100644
index 00000000000..27ff9268b40
--- /dev/null
+++ b/test/CodeGen/X86/avx-256-splat.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+; FIXME: use avx versions for punpcklbw and punpckhbw
+
+; CHECK: vextractf128 $0
+; CHECK-NEXT: punpcklbw
+; CHECK-NEXT: punpckhbw
+; CHECK-NEXT: vinsertf128 $0
+; CHECK-NEXT: vinsertf128 $1
+; CHECK-NEXT: vpermilps $85
+define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
+entry:
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <32 x i8> %shuffle
+}
+
-- 
2.34.1
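
For reference, the immediate encoding used by this patch can be modeled with
a small standalone program. The following is an illustrative sketch, not code
from the tree: the file and function names are made up, and only the 32-bit
(PS) form is modeled. It round-trips the vpermilps $85 case from the new test.

// vpermil_demo.cpp - standalone model of the VPERMILPS immediate encoding.
// Build: c++ -std=c++11 vpermil_demo.cpp && ./a.out
#include <cassert>
#include <cstdio>
#include <vector>

// Mirrors DecodeVPERMILMask for 32-bit elements: each 2-bit field of the
// immediate picks an element within a 128-bit lane, and every lane reuses
// the same pattern.
static std::vector<unsigned> decodeVPermilPS(unsigned NumElts, unsigned Imm) {
  unsigned NumLanes = (NumElts * 32) / 128;
  unsigned LaneSize = NumElts / NumLanes;
  std::vector<unsigned> Mask;
  for (unsigned l = 0; l != NumLanes; ++l)
    for (unsigned i = 0; i != LaneSize; ++i)
      Mask.push_back(((Imm >> (i * 2)) & 0x3) + l * LaneSize);
  return Mask;
}

// Mirrors getShuffleVPERMILImmediate: pack the low-lane indices of the
// shuffle mask into the immediate, two bits per element.
static unsigned encodeVPermilPS(const std::vector<unsigned> &Mask,
                                unsigned LaneSize) {
  unsigned Imm = 0;
  for (unsigned i = 0; i != LaneSize; ++i)
    Imm |= (Mask[i] & 0x3) << (i * 2);
  return Imm;
}

int main() {
  // The new test's splat: vpermilps $85 on v8f32 <-> mask <1,1,1,1,5,5,5,5>.
  std::vector<unsigned> Expected = {1, 1, 1, 1, 5, 5, 5, 5};
  assert(decodeVPermilPS(8, 85) == Expected);
  assert(encodeVPermilPS(Expected, 4) == 85);
  std::printf("imm 85 <-> <1,1,1,1,5,5,5,5> round-trips\n");
  return 0;
}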