AVX-512: shufflevector for i1 vectors <2 x i1> .. <64 x i1>

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 6f337984336ec2d26f1dde18d385e6ef491d1167..dc5c0d1e352bd31c4517e400d089c4d91471f3d1 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1382,6 +1382,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
      setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
      setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i1,  Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v16i1, Custom);
      if (Subtarget->hasDQI()) {
        setOperationAction(ISD::TRUNCATE,         MVT::v2i1, Custom);
        setOperationAction(ISD::TRUNCATE,         MVT::v4i1, Custom);
@@ -1601,6 +1603,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::TRUNCATE,           MVT::v32i1, Custom);
      setOperationAction(ISD::TRUNCATE,           MVT::v64i1, Custom);
      setOperationAction(ISD::TRUNCATE,           MVT::v32i8, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i1, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i1, Custom);
  
      setOperationAction(ISD::SMAX,               MVT::v64i8, Legal);
      setOperationAction(ISD::SMAX,               MVT::v32i16, Legal);
@@ -1646,6 +1650,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
      setOperationAction(ISD::SELECT,             MVT::v2i1, Custom);
      setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i1, Custom);
      setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i1, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i1, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i1, Custom);
  
      setOperationAction(ISD::AND,                MVT::v8i32, Legal);
      setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
@@ -4417,14 +4423,18 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
  /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
  /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
  /// Then bitcast to their original type, ensuring they get CSE'd.
-static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
-                             SDLoc dl) {
+static SDValue getOnesVector(EVT VT, const X86Subtarget *Subtarget,
+                             SelectionDAG &DAG, SDLoc dl) {
    assert(VT.isVector() && "Expected a vector type");
  
    SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32);
    SDValue Vec;
-  if (VT.is256BitVector()) {
-    if (HasInt256) { // AVX2
+  if (VT.is512BitVector()) {
+    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
+  } else if (VT.is256BitVector()) {
+    if (Subtarget->hasInt256()) { // AVX2
        SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
        Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
      } else { // AVX
@@ -6047,7 +6057,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
        return Op;
  
      if (!VT.is512BitVector())
-      return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
+      return getOnesVector(VT, Subtarget, DAG, dl);
    }
  
    BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
@@ -10762,6 +10772,61 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
  }
  
+/// Lower vXi1 vector shuffles.
+/// There is no dedicated instruction on AVX-512 that shuffles the masks.
+/// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
+/// vector, shuffle it and then truncate the result back to \p VT.
+static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+                                      MVT VT, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  ArrayRef<int> Mask = SVOp->getMask();
+  assert(Subtarget->hasAVX512() &&
+         "Cannot lower vXi1 shuffles w/o AVX-512!");
+  EVT ExtVT;
+  switch (VT.SimpleTy) {
+  default:
+    // Unreachable rather than assert+break, so ExtVT is never used unset.
+    llvm_unreachable("Expected a vector of i1 elements");
+  case MVT::v2i1:
+    ExtVT = MVT::v2i64;
+    break;
+  case MVT::v4i1:
+    ExtVT = MVT::v4i32;
+    break;
+  case MVT::v8i1:
+    ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
+    break;
+  case MVT::v16i1:
+    ExtVT = MVT::v16i32;
+    break;
+  case MVT::v32i1:
+    ExtVT = MVT::v32i16;
+    break;
+  case MVT::v64i1:
+    ExtVT = MVT::v64i8;
+    break;
+  }
+
+  if (ISD::isBuildVectorAllZeros(V1.getNode()))
+    V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
+  else if (ISD::isBuildVectorAllOnes(V1.getNode()))
+    V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+  else
+    V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
+
+  if (V2.isUndef())
+    V2 = DAG.getUNDEF(ExtVT);
+  else if (ISD::isBuildVectorAllZeros(V2.getNode()))
+    V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
+  else if (ISD::isBuildVectorAllOnes(V2.getNode()))
+    V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+  else
+    V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
+  return DAG.getNode(ISD::TRUNCATE, DL, VT,
+                     DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask));
+}
  /// \brief Top-level lowering for x86 vector shuffles.
  ///
  /// This handles decomposition, canonicalization, and lowering of all x86
@@ -10778,8 +10843,10 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
    MVT VT = Op.getSimpleValueType();
    int NumElements = VT.getVectorNumElements();
    SDLoc dl(Op);
+  bool Is1BitVector = (VT.getScalarType() == MVT::i1);
  
-  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
+  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
+         "Can't lower MMX shuffles");
  
    bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
    bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
@@ -10817,7 +10884,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
    // elements wider than 64 bits, but it might be interesting to form i128
    // integers to handle flipping the low and high halves of AVX 256-bit vectors.
    SmallVector<int, 16> WidenedMask;
-  if (VT.getScalarSizeInBits() < 64 &&
+  if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
        canWidenShuffleElements(Mask, WidenedMask)) {
      MVT NewEltVT = VT.isFloatingPoint()
                         ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
@@ -10894,6 +10961,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
    if (VT.getSizeInBits() == 512)
      return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
  
+  if (Is1BitVector)
+    return lower1BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
    llvm_unreachable("Unimplemented!");
  }