[SKX] Enabling mask instructions: encoding, lowering

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 140b2a830ce029f64529244364f1ed042ba2af04..cba145208da63a4ff91e9bde0550dd7c376bb261 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -519,10 +519,21 @@ void X86TargetLowering::resetOperationActions() {
    // If we don't have F16C support, then lower half float conversions
    // into library calls.
    if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
-    setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
-    setOperationAction(ISD::FP32_TO_FP16, MVT::i16, Expand);
+    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
    }
  
+  // There's never any support for operations beyond MVT::f32.
+  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
+
    if (Subtarget->hasPOPCNT()) {
      setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
    } else {
@@ -878,7 +889,12 @@ void X86TargetLowering::resetOperationActions() {
                            (MVT::SimpleValueType)InnerVT, Expand);
      setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
-    setLoadExtAction(ISD::EXTLOAD, VT, Expand);
+
+    // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types,
+    // we have to deal with them whether we ask for Expansion or not. Setting
+    // Expand causes its own optimisation problems though, so leave them legal.
+    if (VT.getVectorElementType() == MVT::i1)
+      setLoadExtAction(ISD::EXTLOAD, VT, Expand);
    }
  
    // FIXME: In order to prevent SSE instructions being expanded to MMX ones
@@ -1489,6 +1505,11 @@ void X86TargetLowering::resetOperationActions() {
      }
    }// has  AVX-512
  
+  if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
+    addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
+    addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
+  }
+
    // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
    // of this type with custom code.
    for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
@@ -2296,6 +2317,10 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
          RC = &X86::VK8RegClass;
        else if (RegVT == MVT::v16i1)
          RC = &X86::VK16RegClass;
+      else if (RegVT == MVT::v32i1)
+        RC = &X86::VK32RegClass;
+      else if (RegVT == MVT::v64i1)
+        RC = &X86::VK64RegClass;
        else
          llvm_unreachable("Unknown argument type!");
  
@@ -3043,7 +3068,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  //  If a tail called function callee has more arguments than the caller the
  //  caller needs to make sure that there is room to move the RETADDR to. This is
  //  achieved by reserving an area the size of the argument delta right after the
-//  original REtADDR, but before the saved framepointer or the spilled registers
+//  original RETADDR, but before the saved framepointer or the spilled registers
  //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
  //  stack layout:
  //    arg1
@@ -4751,28 +4776,6 @@ bool X86::isZeroNode(SDValue Elt) {
    return false;
  }
  
-/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
-/// their permute mask.
-static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
-                                    SelectionDAG &DAG) {
-  MVT VT = SVOp->getSimpleValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-  SmallVector<int, 8> MaskVec;
-
-  for (unsigned i = 0; i != NumElems; ++i) {
-    int Idx = SVOp->getMaskElt(i);
-    if (Idx >= 0) {
-      if (Idx < (int)NumElems)
-        Idx += NumElems;
-      else
-        Idx -= NumElems;
-    }
-    MaskVec.push_back(Idx);
-  }
-  return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
-                              SVOp->getOperand(0), &MaskVec[0]);
-}
-
  /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
  /// match movhlps. The lower half elements should come from upper half of
  /// V1 (and in order), and the upper half elements should come from the upper
@@ -7694,6 +7697,9 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    if (isSingleInputShuffleMask(Mask)) {
      // Check whether we can widen this to an i16 shuffle by duplicating bytes.
      // Notably, this handles splat and partial-splat shuffles more efficiently.
+    // However, it only makes sense if the pre-duplication shuffle simplifies
+    // things significantly. Currently, this means we need to be able to
+    // express the pre-duplication shuffle as an i16 shuffle.
      //
      // FIXME: We should check for other patterns which can be widened into an
      // i16 shuffle as well.
@@ -7704,7 +7710,9 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
        }
        return true;
      };
-    if (canWidenViaDuplication(Mask)) {
+    auto tryToWidenViaDuplication = [&]() -> SDValue {
+      if (!canWidenViaDuplication(Mask))
+        return SDValue();
        SmallVector<int, 4> LoInputs;
        std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
                     [](int M) { return M >= 0 && M < 8; });
@@ -7722,52 +7730,57 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
        ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
        ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
  
-      int ByteMask[16];
+      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
        SmallDenseMap<int, int, 8> LaneMap;
-      for (int i = 0; i < 16; ++i)
-        ByteMask[i] = -1;
        for (int I : InPlaceInputs) {
-        ByteMask[I] = I;
+        PreDupI16Shuffle[I/2] = I/2;
          LaneMap[I] = I;
        }
-      int FreeByteIdx = 0;
-      int TargetOffset = TargetLo ? 0 : 8;
-      for (int I : MovingInputs) {
-        // Walk the free index into the byte mask until we find an unoccupied
-        // spot. We bound this to 8 steps to catch bugs, the pigeonhole
-        // principle indicates that there *must* be a spot as we can only have
-        // 8 duplicated inputs. We have to walk the index using modular
-        // arithmetic to wrap around as necessary.
-        // FIXME: We could do a much better job of picking an inexpensive slot
-        // so this doesn't go through the worst case for the byte shuffle.
-        for (int j = 0; j < 8 && ByteMask[FreeByteIdx + TargetOffset] != -1;
-             ++j, FreeByteIdx = (FreeByteIdx + 1) % 8)
-          ;
-        assert(ByteMask[FreeByteIdx + TargetOffset] == -1 &&
-               "Failed to find a free byte!");
-        ByteMask[FreeByteIdx + TargetOffset] = I;
-        LaneMap[I] = FreeByteIdx + TargetOffset;
+      int j = TargetLo ? 0 : 4, je = j + 4;
+      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
+        // Check if j is already a shuffle of this input. This happens when
+        // there are two adjacent bytes after we move the low one.
+        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
+          // If we haven't yet mapped the input, search for a slot into which
+          // we can map it.
+          while (j < je && PreDupI16Shuffle[j] != -1)
+            ++j;
+
+          if (j == je)
+            // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
+            return SDValue();
+
+          // Map this input with the i16 shuffle.
+          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
+        }
+
+        // Update the lane map based on the mapping we ended up with.
+        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
        }
-      V1 = DAG.getVectorShuffle(MVT::v16i8, DL, V1, DAG.getUNDEF(MVT::v16i8),
-                                ByteMask);
-      for (int &M : Mask)
-        if (M != -1)
-          M = LaneMap[M];
+      V1 = DAG.getNode(
+          ISD::BITCAST, DL, MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL,
+                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
  
        // Unpack the bytes to form the i16s that will be shuffled into place.
        V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                         MVT::v16i8, V1, V1);
  
-      int I16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
        for (int i = 0; i < 16; i += 2) {
          if (Mask[i] != -1)
-          I16Shuffle[i / 2] = Mask[i] - (TargetLo ? 0 : 8);
-        assert(I16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
+          PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
+        assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
        }
-      return DAG.getVectorShuffle(MVT::v8i16, DL,
-                                  DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
-                                  DAG.getUNDEF(MVT::v8i16), I16Shuffle);
-    }
+      return DAG.getNode(
+          ISD::BITCAST, DL, MVT::v16i8,
+          DAG.getVectorShuffle(MVT::v8i16, DL,
+                               DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
+    };
+    if (SDValue V = tryToWidenViaDuplication())
+      return V;
    }
  
    // Check whether an interleaving lowering is likely to be more efficient.
@@ -7815,16 +7828,42 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  
    SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
  
-  auto buildLoAndHiV8s =
-      [&](SDValue V, ArrayRef<int> LoBlendMask, ArrayRef<int> HiBlendMask) {
-    SDValue LoV =
-        DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                    DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
-    SDValue HiV =
-        DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                    DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
-    SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, LoV, HiV, LoBlendMask);
-    SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, LoV, HiV, HiBlendMask);
+  auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
+                             MutableArrayRef<int> HiBlendMask) {
+    SDValue V1, V2;
+    // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
+    // them out and avoid using UNPCK{L,H} to extract the elements of V as
+    // i16s.
+    if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
+                     [](int M) { return M >= 0 && M % 2 == 1; }) &&
+        std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
+                     [](int M) { return M >= 0 && M % 2 == 1; })) {
+      // Use a mask to drop the high bytes.
+      V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
+      V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
+                       DAG.getConstant(0x00FF, MVT::v8i16));
+
+      // This will be a single vector shuffle instead of a blend so nuke V2.
+      V2 = DAG.getUNDEF(MVT::v8i16);
+
+      // Squash the masks to point directly into V1.
+      for (int &M : LoBlendMask)
+        if (M >= 0)
+          M /= 2;
+      for (int &M : HiBlendMask)
+        if (M >= 0)
+          M /= 2;
+    } else {
+      // Otherwise just unpack the low half of V into V1 and the high half into
+      // V2 so that we can blend them as i16s.
+      V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                       DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
+      V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                       DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
+    }
+
+    SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
+    SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
      return std::make_pair(BlendedLo, BlendedHi);
    };
    SDValue V1Lo, V1Hi, V2Lo, V2Hi;
@@ -7900,7 +7939,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
    // but in some cases the first operand may be transformed to UNDEF.
    // In this case we should just commute the node.
    if (V1IsUndef)
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
  
    // Check for non-undef masks pointing at an undef vector and make the masks
    // undef as well. This makes it easier to match the shuffle based solely on
@@ -7946,7 +7985,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
    // V2. This allows us to match the shuffle pattern strictly on how many
    // elements come from V1 without handling the symmetric cases.
    if (NumV2Elements > NumV1Elements)
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
  
    // When the number of V1 and V2 elements are the same, try to minimize the
    // number of uses of V2 in the low half of the vector.
@@ -7958,7 +7997,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
        else if (M >= 0)
          ++LowV1Elements;
      if (LowV2Elements > LowV1Elements)
-      return CommuteVectorShuffle(SVOp, DAG);
+      return DAG.getCommutedVectorShuffle(*SVOp);
    }
  
    // For each vector width, delegate to a specialized lowering routine.
@@ -9242,7 +9281,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
    // but in some cases the first operand may be transformed to UNDEF.
    // In this case we should just commute the node.
    if (V1IsUndef)
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
  
    // Vector shuffle lowering takes 3 steps:
    //
@@ -9354,7 +9393,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  
    if (ShouldXformToMOVHLPS(M, VT) ||
        ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
  
    if (isShift) {
      // No better options. Use a vshldq / vsrldq.
@@ -9426,7 +9465,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  
    // Normalize the node to match x86 shuffle ops if needed
    if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
-    return CommuteVectorShuffle(SVOp, DAG);
+    return DAG.getCommutedVectorShuffle(*SVOp);
  
    // The checks below are all present in isShuffleMaskLegal, but they are
    // inlined here right now to enable us to directly emit target specific
@@ -15120,10 +15159,23 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
    assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
           (VT == MVT::v8i32 && Subtarget->hasInt256()));
  
-  // Get the high parts.
+  // PMULxD operations multiply each even value (starting at 0) of LHS with
+  // the related value of RHS and produce a widen result.
+  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+  // => <2 x i64> <ae|cg>
+  //
+  // In other word, to have all the results, we need to perform two PMULxD:
+  // 1. one with the even values.
+  // 2. one with the odd values.
+  // To achieve #2, with need to place the odd values at an even position.
+  //
+  // Place the odd value at an even position (basically, shift all values 1
+  // step to the left):
    const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
-  SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
-  SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
+  // <a|b|c|d> => <b|undef|d|undef>
+  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
+  // <e|f|g|h> => <f|undef|h|undef>
+  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
  
    // Emit two multiplies, one for the lower 2 ints and one for the higher 2
    // ints.
@@ -15131,22 +15183,39 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
    bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
    unsigned Opcode =
        (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
+  // => <2 x i64> <ae|cg>
    SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
                               DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
+  // => <2 x i64> <bf|dh>
    SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
-                             DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1));
+                             DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
  
    // Shuffle it back into the right order.
+  // The internal representation is big endian.
+  // In other words, a i64 bitcasted to 2 x i32 has its high part at index 0
+  // and its low part at index 1.
+  // Moreover, we have: Mul1 = <ae|cg> ; Mul2 = <bf|dh>
+  // Vector index                0 1   ;          2 3
+  // We want      <ae|bf|cg|dh>
+  // Vector index   0  2  1  3
+  // Since each element is seen as 2 x i32, we get:
+  // high_mask[i] = 2 x vector_index[i]
+  // low_mask[i] = 2 x vector_index[i] + 1
+  // where vector_index = {0, Size/2, 1, Size/2 + 1, ...,
+  //                       Size/2 - 1, Size/2 + Size/2 - 1}
+  // where Size is the number of element of the final vector.
    SDValue Highs, Lows;
    if (VT == MVT::v8i32) {
-    const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
+    const int HighMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
      Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
-    const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
+    const int LowMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
      Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
    } else {
-    const int HighMask[] = {1, 5, 3, 7};
+    const int HighMask[] = {0, 4, 2, 6};
      Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
-    const int LowMask[] = {0, 4, 2, 6};
+    const int LowMask[] = {1, 5, 3, 7};
      Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
    }
  
@@ -15164,7 +15233,9 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
      Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
    }
  
-  return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows);
+  // The low part of a MUL_LOHI is supposed to be the first value and the
+  // high part the second value.
+  return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Lows, Highs);
  }
  
  static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
@@ -16812,6 +16883,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
    return (SVT.getVectorNumElements() == 2 ||
            ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
            isMOVLMask(M, SVT) ||
+          isMOVHLPSMask(M, SVT) ||
            isSHUFPMask(M, SVT) ||
            isPSHUFDMask(M, SVT) ||
            isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
@@ -18456,6 +18528,39 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
          return false;
  
        continue;
+
+    case X86ISD::UNPCKL:
+    case X86ISD::UNPCKH:
+      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
+      // shuffle into a preceding word shuffle.
+      if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
+        return false;
+
+      // Search for a half-shuffle which we can combine with.
+      unsigned CombineOp =
+          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+      if (V.getOperand(0) != V.getOperand(1) ||
+          !V->isOnlyUserOf(V.getOperand(0).getNode()))
+        return false;
+      V = V.getOperand(0);
+      do {
+        switch (V.getOpcode()) {
+        default:
+          return false; // Nothing to combine.
+
+        case X86ISD::PSHUFLW:
+        case X86ISD::PSHUFHW:
+          if (V.getOpcode() == CombineOp)
+            break;
+
+          // Fallthrough!
+        case ISD::BITCAST:
+          V = V.getOperand(0);
+          continue;
+        }
+        break;
+      } while (V.hasOneUse());
+      break;
      }
      // Break out of the loop if we break out of the switch.
      break;
@@ -18472,7 +18577,7 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
    SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
    for (int &M : Mask)
      M = VMask[M];
-  V = DAG.getNode(X86ISD::PSHUFD, DL, V.getValueType(), V.getOperand(0),
+  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
                    getV4X86ShuffleImm8ForMask(Mask, DAG));
  
    // It is possible that one of the combinable shuffles was completely absorbed
@@ -18625,6 +18730,47 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
        return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
      }
  
+    // Look for shuffle patterns which can be implemented as a single unpack.
+    // FIXME: This doesn't handle the location of the PSHUFD generically, and
+    // only works when we have a PSHUFD followed by two half-shuffles.
+    if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
+        (V.getOpcode() == X86ISD::PSHUFLW ||
+         V.getOpcode() == X86ISD::PSHUFHW) &&
+        V.getOpcode() != N.getOpcode() &&
+        V.hasOneUse()) {
+      SDValue D = V.getOperand(0);
+      while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
+        D = D.getOperand(0);
+      if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
+        SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+        SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
+        int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+        int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
+        int WordMask[8];
+        for (int i = 0; i < 4; ++i) {
+          WordMask[i + NOffset] = Mask[i] + NOffset;
+          WordMask[i + VOffset] = VMask[i] + VOffset;
+        }
+        // Map the word mask through the DWord mask.
+        int MappedMask[8];
+        for (int i = 0; i < 8; ++i)
+          MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
+        const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
+        const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
+        if (std::equal(std::begin(MappedMask), std::end(MappedMask),
+                       std::begin(UnpackLoMask)) ||
+            std::equal(std::begin(MappedMask), std::end(MappedMask),
+                       std::begin(UnpackHiMask))) {
+          // We can replace all three shuffles with an unpack.
+          V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
+          DCI.AddToWorklist(V.getNode());
+          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
+                                                : X86ISD::UNPCKH,
+                             DL, MVT::v8i16, V, V);
+        }
+      }
+    }
+
      break;
  
    case X86ISD::PSHUFD:
@@ -18646,49 +18792,6 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
    SDValue N1 = N->getOperand(1);
    EVT VT = N->getValueType(0);
  
-  // Canonicalize shuffles that perform 'addsub' on packed float vectors
-  // according to the rule:
-  //  (shuffle (FADD A, B), (FSUB A, B), Mask) ->
-  //  (shuffle (FSUB A, -B), (FADD A, -B), Mask)
-  //
-  // Where 'Mask' is:
-  //  <0,5,2,7>             -- for v4f32 and v4f64 shuffles;
-  //  <0,3>                 -- for v2f64 shuffles;
-  //  <0,9,2,11,4,13,6,15>  -- for v8f32 shuffles.
-  //
-  // This helps pattern-matching more SSE3/AVX ADDSUB instructions
-  // during ISel stage.
-  if (N->getOpcode() == ISD::VECTOR_SHUFFLE &&
-      ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
-       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
-      N0->getOpcode() == ISD::FADD && N1->getOpcode() == ISD::FSUB &&
-      // Operands to the FADD and FSUB must be the same.
-      ((N0->getOperand(0) == N1->getOperand(0) &&
-        N0->getOperand(1) == N1->getOperand(1)) ||
-       // FADD is commutable. See if by commuting the operands of the FADD
-       // we would still be able to match the operands of the FSUB dag node.
-       (N0->getOperand(1) == N1->getOperand(0) &&
-        N0->getOperand(0) == N1->getOperand(1))) &&
-      N0->getOperand(0)->getOpcode() != ISD::UNDEF &&
-      N0->getOperand(1)->getOpcode() != ISD::UNDEF) {
-    
-    ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
-    unsigned NumElts = VT.getVectorNumElements();
-    ArrayRef<int> Mask = SV->getMask();
-    bool CanFold = true;
-
-    for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i)
-      CanFold = Mask[i] == (int)((i & 1) ? i + NumElts : i);
-
-    if (CanFold) {
-      SDValue Op0 = N1->getOperand(0);
-      SDValue Op1 = DAG.getNode(ISD::FNEG, dl, VT, N1->getOperand(1));
-      SDValue Sub = DAG.getNode(ISD::FSUB, dl, VT, Op0, Op1);
-      SDValue Add = DAG.getNode(ISD::FADD, dl, VT, Op0, Op1);
-      return DAG.getVectorShuffle(VT, dl, Sub, Add, Mask);
-    }
-  }
-
    // Don't create instructions with illegal types after legalize types has run.
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
@@ -21693,8 +21796,59 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
+static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
+                                                         SelectionDAG &DAG) {
+  // Take advantage of vector comparisons producing 0 or -1 in each lane to
+  // optimize away operation when it's from a constant.
+  //
+  // The general transformation is:
+  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
+  //       AND(VECTOR_CMP(x,y), constant2)
+  //    constant2 = UNARYOP(constant)
+
+  // Early exit if this isn't a vector operation or if the operand of the
+  // unary operation isn't a bitwise AND.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
+      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  // Now check that the other operand of the AND is a constant splat. We could
+  // make the transformation for non-constant splats as well, but it's unclear
+  // that would be a benefit as it would not eliminate any operations, just
+  // perform one more step in scalar code before moving to the vector unit.
+  if (BuildVectorSDNode *BV =
+          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
+    // Bail out if the vector isn't a constant splat.
+    if (!BV->getConstantSplatNode())
+      return SDValue();
+
+    // Everything checks out. Build up the new and improved node.
+    SDLoc DL(N);
+    EVT IntVT = BV->getValueType(0);
+    // Create a new constant of the appropriate type for the transformed
+    // DAG.
+    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+    // The AND node needs bitcasts to/from an integer vector type around it.
+    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
+    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
+                                 N->getOperand(0)->getOperand(0), MaskConst);
+    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
+    return Res;
+  }
+
+  return SDValue();
+}
+
  static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
                                          const X86TargetLowering *XTLI) {
+  // First try to optimize away the conversion entirely when it's
+  // conditionally from a constant. Vectors only.
+  SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
+  if (Res != SDValue())
+    return Res;
+
+  // Now move on to more general possibilities.
    SDValue Op0 = N->getOperand(0);
    EVT InVT = Op0->getValueType(0);