[x86] Fix a crash and wrong-code bug in the new vector lowering all

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 38a6ba0d91ff622fa2de340987685b9ca13105b2..d1540562298002955c2e24e5721389e8be9859d0 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -255,7 +255,7 @@ void X86TargetLowering::resetOperationActions() {
    else
      setSchedulingPreference(Sched::RegPressure);
    const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
+      TM.getSubtarget<X86Subtarget>().getRegisterInfo();
    setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
  
    // Bypass expensive divides on Atom when compiling with O2
@@ -1775,9 +1775,10 @@ bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  }
  
  bool
-X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
-                                                 unsigned,
-                                                 bool *Fast) const {
+X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+                                                  unsigned,
+                                                  unsigned,
+                                                  bool *Fast) const {
    if (Fast)
      *Fast = Subtarget->isUnalignedMemAccessFast();
    return true;
@@ -1967,8 +1968,8 @@ X86TargetLowering::LowerReturn(SDValue Chain,
  
      // Returns in ST0/ST1 are handled specially: these are pushed as operands to
      // the RET instruction and handled by the FP Stackifier.
-    if (VA.getLocReg() == X86::ST0 ||
-        VA.getLocReg() == X86::ST1) {
+    if (VA.getLocReg() == X86::FP0 ||
+        VA.getLocReg() == X86::FP1) {
        // If this is a copy from an xmm register to ST(0), use an FPExtend to
        // change the value to the FP stack register class.
        if (isScalarFPTypeInSSEReg(VA.getValVT()))
@@ -2106,33 +2107,21 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
        report_fatal_error("SSE register return with SSE disabled");
      }
  
-    SDValue Val;
-
-    // If this is a call to a function that returns an fp value on the floating
-    // point stack, we must guarantee the value is popped from the stack, so
-    // a CopyFromReg is not good enough - the copy instruction may be eliminated
-    // if the return value is not used. We use the FpPOP_RETVAL instruction
-    // instead.
-    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
-      // If we prefer to use the value in xmm registers, copy it out as f80 and
-      // use a truncate to move it from fp stack reg to xmm reg.
-      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
-      SDValue Ops[] = { Chain, InFlag };
-      Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
-                                         MVT::Other, MVT::Glue, Ops), 1);
-      Val = Chain.getValue(0);
-
-      // Round the f80 to the right size, which also moves it to the appropriate
-      // xmm register.
-      if (CopyVT != VA.getValVT())
-        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
-                          // This truncation won't change the value.
-                          DAG.getIntPtrConstant(1));
-    } else {
-      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
-                                 CopyVT, InFlag).getValue(1);
-      Val = Chain.getValue(0);
-    }
+    // If we prefer to use the value in xmm registers, copy it out as f80 and
+    // use a truncate to move it from fp stack reg to xmm reg.
+    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+        isScalarFPTypeInSSEReg(VA.getValVT()))
+      CopyVT = MVT::f80;
+
+    Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+                               CopyVT, InFlag).getValue(1);
+    SDValue Val = Chain.getValue(0);
+
+    if (CopyVT != VA.getValVT())
+      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+                        // This truncation won't change the value.
+                        DAG.getIntPtrConstant(1));
+
      InFlag = Chain.getValue(2);
      InVals.push_back(Val);
    }
@@ -2468,7 +2457,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
          TotalNumXMMRegs = 0;
  
        if (IsWin64) {
-        const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
+        const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
          // Get to the caller-allocated home save location.  Add 8 to account
          // for the return address.
          int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
@@ -2732,8 +2721,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  
    // Walk the register/memloc assignments, inserting copies/loads.  In the case
    // of tail call optimization arguments are handle later.
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      // Skip inalloca arguments, they have already been written.
      ISD::ArgFlagsTy Flags = Outs[i].Flags;
@@ -3029,7 +3018,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                    RegsToPass[i].second.getValueType()));
  
    // Add a register mask operand representing the call-preserved registers.
-  const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
    const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
    assert(Mask && "Missing call preserved mask for calling convention");
    Ops.push_back(DAG.getRegisterMask(Mask));
@@ -3120,9 +3109,9 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                                 SelectionDAG& DAG) const {
    MachineFunction &MF = DAG.getMachineFunction();
    const TargetMachine &TM = MF.getTarget();
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
-  const TargetFrameLowering &TFI = *TM.getFrameLowering();
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      TM.getSubtargetImpl()->getRegisterInfo());
+  const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
    unsigned StackAlignment = TFI.getStackAlignment();
    uint64_t AlignMask = StackAlignment - 1;
    int64_t Offset = StackSize;
@@ -3235,8 +3224,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
  
    // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
    // emit a special epilogue.
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
    if (RegInfo->needsStackRealignment(MF))
      return false;
  
@@ -3290,7 +3279,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
      CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
      for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
        CCValAssign &VA = RVLocs[i];
-      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
          return false;
      }
    }
@@ -3349,7 +3338,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
        MachineFrameInfo *MFI = MF.getFrameInfo();
        const MachineRegisterInfo *MRI = &MF.getRegInfo();
        const X86InstrInfo *TII =
-          static_cast<const X86InstrInfo *>(DAG.getTarget().getInstrInfo());
+          static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
        for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
          CCValAssign &VA = ArgLocs[i];
          SDValue Arg = OutVals[i];
@@ -3419,6 +3408,7 @@ static bool MayFoldIntoStore(SDValue Op) {
  static bool isTargetShuffle(unsigned Opcode) {
    switch(Opcode) {
    default: return false;
+  case X86ISD::PSHUFB:
    case X86ISD::PSHUFD:
    case X86ISD::PSHUFHW:
    case X86ISD::PSHUFLW:
@@ -3500,8 +3490,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
  
  SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
    MachineFunction &MF = DAG.getMachineFunction();
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    int ReturnAddrIndex = FuncInfo->getRAIndex();
  
@@ -5144,30 +5134,38 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
  }
  
  /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
-/// target specific opcode. Returns true if the Mask could be calculated.
-/// Sets IsUnary to true if only uses one source.
+/// target specific opcode. Returns true if the Mask could be calculated. Sets
+/// IsUnary to true if only uses one source. Note that this will set IsUnary for
+/// shuffles which use a single input multiple times, and in those cases it will
+/// adjust the mask to only have indices within that single input.
  static bool getTargetShuffleMask(SDNode *N, MVT VT,
                                   SmallVectorImpl<int> &Mask, bool &IsUnary) {
    unsigned NumElems = VT.getVectorNumElements();
    SDValue ImmN;
  
    IsUnary = false;
+  bool IsFakeUnary = false;
    switch(N->getOpcode()) {
    case X86ISD::SHUFP:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
      break;
    case X86ISD::UNPCKH:
      DecodeUNPCKHMask(VT, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
      break;
    case X86ISD::UNPCKL:
      DecodeUNPCKLMask(VT, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
      break;
    case X86ISD::MOVHLPS:
      DecodeMOVHLPSMask(NumElems, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
      break;
    case X86ISD::MOVLHPS:
      DecodeMOVLHPSMask(NumElems, Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
      break;
    case X86ISD::PALIGNR:
      ImmN = N->getOperand(N->getNumOperands()-1);
@@ -5189,6 +5187,67 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
      DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
      IsUnary = true;
      break;
+  case X86ISD::PSHUFB: {
+    IsUnary = true;
+    SDValue MaskNode = N->getOperand(1);
+    while (MaskNode->getOpcode() == ISD::BITCAST)
+      MaskNode = MaskNode->getOperand(0);
+
+    if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
+      // If we have a build-vector, then things are easy.
+      EVT VT = MaskNode.getValueType();
+      assert(VT.isVector() &&
+             "Can't produce a non-vector with a build_vector!");
+      if (!VT.isInteger())
+        return false;
+
+      int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
+
+      SmallVector<uint64_t, 32> RawMask;
+      for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
+        auto *CN = dyn_cast<ConstantSDNode>(MaskNode->getOperand(i));
+        if (!CN)
+          return false;
+        APInt MaskElement = CN->getAPIntValue();
+
+        // We now have to decode the element which could be any integer size and
+        // extract each byte of it.
+        for (int j = 0; j < NumBytesPerElement; ++j) {
+          // Note that this is x86 and so always little endian: the low byte is
+          // the first byte of the mask.
+          RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
+          MaskElement = MaskElement.lshr(8);
+        }
+      }
+      DecodePSHUFBMask(RawMask, Mask);
+      break;
+    }
+
+    auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
+    if (!MaskLoad)
+      return false;
+
+    SDValue Ptr = MaskLoad->getBasePtr();
+    if (Ptr->getOpcode() == X86ISD::Wrapper)
+      Ptr = Ptr->getOperand(0);
+
+    auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
+    if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
+      return false;
+
+    if (auto *C = dyn_cast<ConstantDataSequential>(MaskCP->getConstVal())) {
+      // FIXME: Support AVX-512 here.
+      if (!C->getType()->isVectorTy() ||
+          (C->getNumElements() != 16 && C->getNumElements() != 32))
+        return false;
+
+      assert(C->getType()->isVectorTy() && "Expected a vector constant.");
+      DecodePSHUFBMask(C, Mask);
+      break;
+    }
+
+    return false;
+  }
    case X86ISD::VPERMI:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
@@ -5221,6 +5280,14 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
    default: llvm_unreachable("unknown target shuffle node");
    }
  
+  // If we have a fake unary shuffle, the shuffle mask is spread across two
+  // inputs that are actually the same node. Re-map the mask to always point
+  // into the first input.
+  if (IsFakeUnary)
+    for (int &M : Mask)
+      if (M >= (int)Mask.size())
+        M -= Mask.size();
+
    return true;
  }
  
@@ -7561,34 +7628,37 @@ static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
        if (GoodInputs.size() == 2) {
          // If the low inputs are spread across two dwords, pack them into
          // a single dword.
-        MoveMask[Mask[GoodInputs[0]] % 2 + MoveOffset] =
-            Mask[GoodInputs[0]] - MaskOffset;
-        MoveMask[Mask[GoodInputs[1]] % 2 + MoveOffset] =
-            Mask[GoodInputs[1]] - MaskOffset;
-        Mask[GoodInputs[0]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset;
-        Mask[GoodInputs[1]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset;
+        MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
+        MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
+        Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
+        Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
        } else {
-        // Otherwise pin the low inputs.
+        // Otherwise pin the good inputs.
          for (int GoodInput : GoodInputs)
            MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
        }
  
-      int MoveMaskIdx =
-          std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) -
-          std::begin(MoveMask);
-      assert(MoveMaskIdx >= MoveOffset && "Established above");
-
        if (BadInputs.size() == 2) {
+        // If we have two bad inputs then there may be either one or two good
+        // inputs fixed in place. Find a fixed input, and then find the *other*
+        // two adjacent indices by using modular arithmetic.
+        int GoodMaskIdx =
+            std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
+                         [](int M) { return M >= 0; }) -
+            std::begin(MoveMask);
+        int MoveMaskIdx =
+            (((GoodMaskIdx - MoveOffset) & ~1) + 2 % 4) + MoveOffset;
          assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
          assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
-        MoveMask[MoveMaskIdx + Mask[BadInputs[0]] % 2] =
-            Mask[BadInputs[0]] - MaskOffset;
-        MoveMask[MoveMaskIdx + Mask[BadInputs[1]] % 2] =
-            Mask[BadInputs[1]] - MaskOffset;
-        Mask[BadInputs[0]] = MoveMaskIdx + Mask[BadInputs[0]] % 2 + MaskOffset;
-        Mask[BadInputs[1]] = MoveMaskIdx + Mask[BadInputs[1]] % 2 + MaskOffset;
+        MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
+        MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
+        Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
+        Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
        } else {
          assert(BadInputs.size() == 1 && "All sizes handled");
+      int MoveMaskIdx =
+          std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) -
+          std::begin(MoveMask);
          MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
          Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
        }
@@ -7699,6 +7769,74 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                       DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
  }
  
+/// \brief Check whether a compaction lowering can be done by dropping even
+/// elements and compute how many times even elements must be dropped.
+///
+/// This handles shuffles which take every (2^N)th element, i.e. whose stride
+/// is a power of two. Example shuffle masks:
+///
+///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
+///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
+///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
+///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
+///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
+///
+/// Any of these lanes can of course be undef.
+///
+/// This routine only supports N <= 3 (strides of 2, 4, and 8).
+/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
+/// for larger N.
+///
+/// \returns the smallest viable N above, i.e. the number of times even
+/// elements must be dropped, if there is one. Otherwise returns zero.
+static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
+  // Figure out whether we're looping over two inputs or just one.
+  bool IsSingleInput = isSingleInputShuffleMask(Mask);
+
+  // Mask indices range over Mask.size() elements for a single input and over
+  // twice that for two inputs; that range is the modulus used below.
+  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
+  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
+         "We should only be called with masks with a power-of-2 size!");
+
+  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
+
+  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
+  // and 2^3 simultaneously. This is because we may have ambiguity with
+  // partially undef inputs.
+  bool ViableForN[3] = {true, true, true};
+
+  for (int i = 0, e = Mask.size(); i < e; ++i) {
+    // Ignore undef lanes, we'll optimistically collapse them to the pattern we
+    // want.
+    if (Mask[i] == -1)
+      continue;
+
+    bool IsAnyViable = false;
+    for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+      if (ViableForN[j]) {
+        uint64_t N = j + 1;
+
+        // The shuffle mask must be equal to (i * 2^N) % ShuffleModulus.
+        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
+          IsAnyViable = true;
+        else
+          ViableForN[j] = false;
+      }
+    // Early exit once no candidate stride remains viable.
+    if (!IsAnyViable)
+      break;
+  }
+
+  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+    if (ViableForN[j])
+      return j + 1;
+
+  // Return 0 as there is no viable power of two.
+  return 0;
+}
+
  /// \brief Generic lowering of v16i8 shuffles.
  ///
  /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
@@ -7838,6 +7976,79 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds);
    }
  
+  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
+  // with PSHUFB. It is important to do this before we attempt to generate any
+  // blends but after all of the single-input lowerings. If the single input
+  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
+  // want to preserve that and we can DAG combine any longer sequences into
+  // a PSHUFB in the end. But once we start blending from multiple inputs,
+  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
+  // and there are *very* few patterns that would actually be faster than the
+  // PSHUFB approach because of its ability to zero lanes.
+  //
+  // FIXME: The only exceptions to the above are blends which are exact
+  // interleavings with direct instructions supporting them. We currently don't
+  // handle those well here.
+  if (Subtarget->hasSSSE3()) {
+    SDValue V1Mask[16];
+    SDValue V2Mask[16];
+    for (int i = 0; i < 16; ++i)
+      if (Mask[i] == -1) {
+        V1Mask[i] = V2Mask[i] = DAG.getConstant(0x80, MVT::i8);
+      } else {
+        V1Mask[i] = DAG.getConstant(Mask[i] < 16 ? Mask[i] : 0x80, MVT::i8);
+        V2Mask[i] =
+            DAG.getConstant(Mask[i] < 16 ? 0x80 : Mask[i] - 16, MVT::i8);
+      }
+    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
+                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
+    if (isSingleInputShuffleMask(Mask))
+      return V1; // Single inputs are easy.
+
+    // Otherwise, blend the two.
+    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
+                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
+    return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
+  }
+
+  // Check whether a compaction lowering can be done. This handles shuffles
+  // which take every 2nd, 4th, or 8th element. See the helper function for
+  // details.
+  //
+  // We special case these as they can be particularly efficiently handled with
+  // the PACKUSWB instruction on x86 and they show up in common patterns of
+  // rearranging bytes to truncate wide elements.
+  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
+    // NumEvenDrops is the power of two stride of the elements. Another way of
+    // thinking about it is that we need to drop the even elements this many
+    // times to get the original input.
+    bool IsSingleInput = isSingleInputShuffleMask(Mask);
+
+    // First we need to zero all the dropped bytes.
+    assert(NumEvenDrops <= 3 &&
+           "No support for dropping even elements more than 3 times.");
+    // We use the mask type to pick which bytes are preserved based on how many
+    // elements are dropped.
+    MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
+    SDValue ByteClearMask =
+        DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
+                    DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
+    V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
+    if (!IsSingleInput)
+      V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
+
+    // Now pack things back together.
+    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
+    V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
+    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
+    for (int i = 1; i < NumEvenDrops; ++i) {
+      Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
+      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
+    }
+
+    return Result;
+  }
+
    int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
    int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
    int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
@@ -10636,7 +10847,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
      if (Subtarget->is64Bit())
        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
                             IDX, MachinePointerInfo(), MVT::i32,
-                           false, false, 0);
+                           false, false, false, 0);
      else
        IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
                          false, false, false, 0);
@@ -11021,7 +11232,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
    // FIXME: Avoid the extend by constructing the right constant pool?
    SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
                                   FudgePtr, MachinePointerInfo::getConstantPool(),
-                                 MVT::f32, false, false, 4);
+                                 MVT::f32, false, false, false, 4);
    // Extend everything to 80 bits to force it to be done on x87.
    SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
    return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
@@ -12918,7 +13129,8 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
        Load =
            DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
                           Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
-                         Ld->isNonTemporal(), Ld->getAlignment());
+                         Ld->isNonTemporal(), Ld->isInvariant(),
+                         Ld->getAlignment());
      }
  
      // Replace chain users with the new chain.
@@ -13397,7 +13609,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
      SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
      Chain = SP.getValue(1);
      unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
-    const TargetFrameLowering &TFI = *DAG.getTarget().getFrameLowering();
+    const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering();
      unsigned StackAlign = TFI.getStackAlignment();
      Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
      if (Align > StackAlign)
@@ -13455,8 +13667,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
  
      Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
  
-    const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+    const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+        DAG.getSubtarget().getRegisterInfo());
      unsigned SPReg = RegInfo->getStackRegister();
      SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
      Chain = SP.getValue(1);
@@ -14835,8 +15047,8 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
  
    if (Depth > 0) {
      SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
-    const X86RegisterInfo *RegInfo =
-      static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+    const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+        DAG.getSubtarget().getRegisterInfo());
      SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
      return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                         DAG.getNode(ISD::ADD, dl, PtrVT,
@@ -14857,8 +15069,8 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
    EVT VT = Op.getValueType();
    SDLoc dl(Op);  // FIXME probably not meaningful
    unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
    unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
    assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
            (FrameReg == X86::EBP && VT == MVT::i32)) &&
@@ -14886,8 +15098,8 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName,
  
  SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                       SelectionDAG &DAG) const {
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
    return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
  }
  
@@ -14898,8 +15110,8 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
    SDLoc dl      (Op);
  
    EVT PtrVT = getPointerTy();
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      DAG.getSubtarget().getRegisterInfo());
    unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
    assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
            (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
@@ -14946,7 +15158,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
    SDLoc dl (Op);
  
    const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
-  const TargetRegisterInfo* TRI = DAG.getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
  
    if (Subtarget->is64Bit()) {
      SDValue OutChains[6];
@@ -15110,7 +15322,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
  
    MachineFunction &MF = DAG.getMachineFunction();
    const TargetMachine &TM = MF.getTarget();
-  const TargetFrameLowering &TFI = *TM.getFrameLowering();
+  const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
    unsigned StackAlignment = TFI.getStackAlignment();
    MVT VT = Op.getSimpleValueType();
    SDLoc DL(Op);
@@ -17349,7 +17561,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(
    MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
  
    // Machine Information
-  const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
    const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
    const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
@@ -17605,7 +17817,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
    XMMSaveMBB->addSuccessor(EndMBB);
  
    // Now add the instructions.
-  const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();
  
    unsigned CountReg = MI->getOperand(0).getReg();
@@ -17688,7 +17900,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
  MachineBasicBlock *
  X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                       MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();
  
    // To "insert" a SELECT_CC instruction, we actually have to insert the
@@ -17714,7 +17926,8 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
  
    // If the EFLAGS register isn't dead in the terminator, then claim that it's
    // live into the sink and copy blocks.
-  const TargetRegisterInfo* TRI = BB->getParent()->getTarget().getRegisterInfo();
+  const TargetRegisterInfo *TRI =
+      BB->getParent()->getSubtarget().getRegisterInfo();
    if (!MI->killsRegister(X86::EFLAGS) &&
        !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
      copy0MBB->addLiveIn(X86::EFLAGS);
@@ -17756,7 +17969,7 @@ MachineBasicBlock *
  X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
                                          bool Is64Bit) const {
    MachineFunction *MF = BB->getParent();
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();
    const BasicBlock *LLVM_BB = BB->getBasicBlock();
  
@@ -17826,8 +18039,10 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
    BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
  
    // Calls into a routine in libgcc to allocate more space from the heap.
-  const uint32_t *RegMask =
-    MF->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+  const uint32_t *RegMask = MF->getTarget()
+                                .getSubtargetImpl()
+                                ->getRegisterInfo()
+                                ->getCallPreservedMask(CallingConv::C);
    if (Is64Bit) {
      BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
        .addReg(sizeVReg);
@@ -17876,7 +18091,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
  MachineBasicBlock *
  X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
                                          MachineBasicBlock *BB) const {
-  const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();
  
    assert(!Subtarget->isTargetMacho());
@@ -17933,8 +18148,8 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
    // or EAX and doing an indirect call.  The return value will then
    // be in the normal return register.
    MachineFunction *F = BB->getParent();
-  const X86InstrInfo *TII
-    = static_cast<const X86InstrInfo*>(F->getTarget().getInstrInfo());
+  const X86InstrInfo *TII =
+      static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo());
    DebugLoc DL = MI->getDebugLoc();
  
    assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
@@ -17943,8 +18158,10 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
    // Get a register mask for the lowered call.
    // FIXME: The 32-bit calls have non-standard calling conventions. Use a
    // proper register mask.
-  const uint32_t *RegMask =
-    F->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+  const uint32_t *RegMask = F->getTarget()
+                                .getSubtargetImpl()
+                                ->getRegisterInfo()
+                                ->getCallPreservedMask(CallingConv::C);
    if (Subtarget->is64Bit()) {
      MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                        TII->get(X86::MOV64rm), X86::RDI)
@@ -17989,7 +18206,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
                                      MachineBasicBlock *MBB) const {
    DebugLoc DL = MI->getDebugLoc();
    MachineFunction *MF = MBB->getParent();
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
    MachineRegisterInfo &MRI = MF->getRegInfo();
  
    const BasicBlock *BB = MBB->getBasicBlock();
@@ -18095,8 +18312,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
    MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
            .addMBB(restoreMBB);
  
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      MF->getSubtarget().getRegisterInfo());
    MIB.addRegMask(RegInfo->getNoPreservedMask());
    thisMBB->addSuccessor(mainMBB);
    thisMBB->addSuccessor(restoreMBB);
@@ -18126,7 +18343,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
                                       MachineBasicBlock *MBB) const {
    DebugLoc DL = MI->getDebugLoc();
    MachineFunction *MF = MBB->getParent();
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
    MachineRegisterInfo &MRI = MF->getRegInfo();
  
    // Memory Reference
@@ -18141,8 +18358,8 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
      (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
    unsigned Tmp = MRI.createVirtualRegister(RC);
    // Since FP is only updated here but NOT referenced, it's treated as GPR.
-  const X86RegisterInfo *RegInfo =
-    static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo());
+  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
+      MF->getSubtarget().getRegisterInfo());
    unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
    unsigned SP = RegInfo->getStackRegister();
  
@@ -18252,7 +18469,7 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
          default: llvm_unreachable("Unrecognized FMA variant.");
        }
  
-      const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+      const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
        MachineInstrBuilder MIB =
          BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
          .addOperand(MI->getOperand(0))
@@ -18318,7 +18535,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    case X86::FP80_TO_INT32_IN_MEM:
    case X86::FP80_TO_INT64_IN_MEM: {
      MachineFunction *F = BB->getParent();
-    const TargetInstrInfo *TII = F->getTarget().getInstrInfo();
+    const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
      DebugLoc DL = MI->getDebugLoc();
  
      // Change the floating point control register to use "round towards zero"
@@ -18402,7 +18619,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    case X86::VPCMPESTRM128MEM:
      assert(Subtarget->hasSSE42() &&
             "Target must have SSE4.2 or AVX features enabled");
-    return EmitPCMPSTRM(MI, BB, BB->getParent()->getTarget().getInstrInfo());
+    return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
  
    // String/text processing lowering.
    case X86::PCMPISTRIREG:
@@ -18415,15 +18632,16 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    case X86::VPCMPESTRIMEM:
      assert(Subtarget->hasSSE42() &&
             "Target must have SSE4.2 or AVX features enabled");
-    return EmitPCMPSTRI(MI, BB, BB->getParent()->getTarget().getInstrInfo());
+    return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
  
    // Thread synchronization.
    case X86::MONITOR:
-    return EmitMonitor(MI, BB, BB->getParent()->getTarget().getInstrInfo(), Subtarget);
+    return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(),
+                       Subtarget);
  
    // xbegin
    case X86::XBEGIN:
-    return EmitXBegin(MI, BB, BB->getParent()->getTarget().getInstrInfo());
+    return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
  
    case X86::VASTART_SAVE_XMM_REGS:
      return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
@@ -18706,7 +18924,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
  /// for this operation, or into a PSHUFB instruction which is a fully general
  /// instruction but should only be used to replace chains over a certain depth.
  static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
-                                   int Depth, SelectionDAG &DAG,
+                                   int Depth, bool HasPSHUFB, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget *Subtarget) {
    assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
@@ -18728,7 +18946,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
      return true;
    }
  
-  // Use the float domain if the operand type is a floatingc point type.
+  // Use the float domain if the operand type is a floating point type.
    bool FloatDomain = VT.isFloatingPoint();
  
    // If we don't have access to VEX encodings, the generic PSHUF instructions
@@ -18739,12 +18957,14 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
    // shuffle instructions freely as they can copy due to the extra register
    // operand.
    if (Subtarget->hasAVX()) {
-    // We have both floatincg point and integer variants of shuffles that dup
-    // either tho low or high half of the vector.
+    // We have both floating point and integer variants of shuffles that dup
+    // either the low or high half of the vector.
      if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
        bool Lo = Mask.equals(0, 0);
        unsigned Shuffle = FloatDomain ? (Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS)
                                       : (Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH);
+      if (Depth == 1 && Root->getOpcode() == Shuffle)
+        return false; // Nothing to do!
        MVT ShuffleVT = FloatDomain ? MVT::v4f32 : MVT::v2i64;
        Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
        DCI.AddToWorklist(Op.getNode());
@@ -18767,16 +18987,18 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
           Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
                       15))) {
        bool Lo = Mask[0] == 0;
+      unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+      if (Depth == 1 && Root->getOpcode() == Shuffle)
+        return false; // Nothing to do!
        MVT ShuffleVT;
        switch (Mask.size()) {
        case 4: ShuffleVT = MVT::v4i32; break;
-      case 8: ShuffleVT = MVT::v8i32; break;
-      case 16: ShuffleVT = MVT::v16i32; break;
+      case 8: ShuffleVT = MVT::v8i16; break;
+      case 16: ShuffleVT = MVT::v16i8; break;
        };
        Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
        DCI.AddToWorklist(Op.getNode());
-      Op = DAG.getNode(Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, ShuffleVT, Op,
-                       Op);
+      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
        DCI.AddToWorklist(Op.getNode());
        DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
                      /*AddTo*/ true);
@@ -18784,15 +19006,16 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
      }
    }
  
-  // Bail if we have fewer than 3 shuffle instructions in the chain.
-  if (Depth < 3)
+  // Don't try to re-form single instruction chains under any circumstances now
+  // that we've done encoding canonicalization for them.
+  if (Depth < 2)
      return false;
  
-  // If we have 3 or more shuffle instructions, we can replace them with
-  // a single PSHUFB instruction profitably. Intel's manuals suggest only using
-  // PSHUFB if doing so replacing 5 instructions, but in practice PSHUFB tends
-  // to be *very* fast so we're more aggressive.
-  if (Subtarget->hasSSSE3()) {
+  // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
+  // can replace them with a single PSHUFB instruction profitably. Intel's
+  // manuals suggest only using PSHUFB if doing so replaces 5 instructions, but
+  // in practice PSHUFB tends to be *very* fast so we're more aggressive.
+  if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
      SmallVector<SDValue, 16> PSHUFBMask;
      assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
      int Ratio = 16 / Mask.size();
@@ -18841,17 +19064,13 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
  /// This should never be an issue in practice as the shuffle lowering doesn't
  /// produce sequences of more than 8 instructions.
  ///
-/// FIXME: Currently, we don't collapse instructions *into* PSHUFB. We should,
-/// and we should do so more aggressively than we form PSHUFB because once we
-/// have a PSHUFB, we might as well do as much shuffling as we can.
-///
  /// FIXME: We will currently miss some cases where the redundant shuffling
  /// would simplify under the threshold for PSHUFB formation because of
  /// combine-ordering. To fix this, we should do the redundant instruction
  /// combining in this recursive walk.
  static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
                                            ArrayRef<int> IncomingMask, int Depth,
-                                          SelectionDAG &DAG,
+                                          bool HasPSHUFB, SelectionDAG &DAG,
                                            TargetLowering::DAGCombinerInfo &DCI,
                                            const X86Subtarget *Subtarget) {
    // Bound the depth of our recursive combine because this is ultimately
@@ -18913,12 +19132,14 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
  
    // See if we can recurse into the operand to combine more things.
    switch (Op.getOpcode()) {
+    case X86ISD::PSHUFB:
+      HasPSHUFB = true;
      case X86ISD::PSHUFD:
      case X86ISD::PSHUFHW:
      case X86ISD::PSHUFLW:
        if (Op.getOperand(0).hasOneUse() &&
            combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
-                                        DAG, DCI, Subtarget))
+                                        HasPSHUFB, DAG, DCI, Subtarget))
          return true;
        break;
  
@@ -18928,7 +19149,7 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
        // We can't check for single use, we have to check that this shuffle is the only user.
        if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
            combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
-                                        DAG, DCI, Subtarget))
+                                        HasPSHUFB, DAG, DCI, Subtarget))
            return true;
        break;
    }
@@ -18952,7 +19173,8 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
      Mask.swap(NewMask);
    }
  
-  return combineX86ShuffleChain(Op, Root, Mask, Depth, DAG, DCI, Subtarget);
+  return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
+                                Subtarget);
  }
  
  /// \brief Get the PSHUF-style mask from PSHUF node.
@@ -19381,7 +19603,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
      SmallVector<int, 1> NonceMask; // Just a placeholder.
      NonceMask.push_back(0);
      if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
-                                      /*Depth*/ 1, DAG, DCI, Subtarget))
+                                      /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
+                                      DCI, Subtarget))
        return SDValue(); // This routine will use CombineTo to replace N.
    }
  
@@ -22421,6 +22644,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case X86ISD::UNPCKL:
    case X86ISD::MOVHLPS:
    case X86ISD::MOVLHPS:
+  case X86ISD::PSHUFB:
    case X86ISD::PSHUFD:
    case X86ISD::PSHUFHW:
    case X86ISD::PSHUFLW:
@@ -23076,14 +23300,14 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
          Constraint[5] == ')' &&
          Constraint[6] == '}') {
  
-      Res.first = X86::ST0+Constraint[4]-'0';
+      Res.first = X86::FP0+Constraint[4]-'0';
        Res.second = &X86::RFP80RegClass;
        return Res;
      }
  
      // GCC allows "st(0)" to be called just plain "st".
      if (StringRef("{st}").equals_lower(Constraint)) {
-      Res.first = X86::ST0;
+      Res.first = X86::FP0;
        Res.second = &X86::RFP80RegClass;
        return Res;
      }