[X86] AVX512: Only allow k1-k7 as predicates to vpcmp*

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index c50f44daaee8c3903224bff1fe25d1fa7ffa5b5b..39537e358b9fa0d3e0f654bfcbc04892f0db78d7 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -58,6 +58,12 @@ using namespace llvm;
  
  STATISTIC(NumTailCalls, "Number of tail calls");
  
+static cl::opt<bool> ExperimentalVectorWideningLegalization(
+    "x86-experimental-vector-widening-legalization", cl::init(false),
+    cl::desc("Enable an experimental vector type legalization through widening "
+             "rather than promotion."),
+    cl::Hidden);
+
  static cl::opt<bool> ExperimentalVectorShuffleLowering(
      "x86-experimental-vector-shuffle-lowering", cl::init(false),
      cl::desc("Enable an experimental vector shuffle lowering code path."),
@@ -509,6 +515,14 @@ void X86TargetLowering::resetOperationActions() {
      }
    }
  
+  // Special handling for half-precision floating point conversions.
+  // If we don't have F16C support, then lower half float conversions
+  // into library calls.
+  if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
+    setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
+    setOperationAction(ISD::FP32_TO_FP16, MVT::i16, Expand);
+  }
+
    if (Subtarget->hasPOPCNT()) {
      setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
    } else {
@@ -592,21 +606,6 @@ void X86TargetLowering::resetOperationActions() {
      setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
    }
  
-  if (!Subtarget->is64Bit()) {
-    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
-  }
-
    if (Subtarget->hasCmpxchg16b()) {
      setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
    }
@@ -1603,6 +1602,16 @@ void X86TargetLowering::resetOperationActions() {
    setPrefFunctionAlignment(4); // 2^4 bytes.
  }
  
+TargetLoweringBase::LegalizeTypeAction
+X86TargetLowering::getPreferredVectorAction(EVT VT) const {
+  if (ExperimentalVectorWideningLegalization &&
+      VT.getVectorNumElements() != 1 &&
+      VT.getVectorElementType().getSimpleVT() != MVT::i1)
+    return TypeWidenVector;
+
+  return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
  EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
    if (!VT.isVector())
      return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
@@ -7105,7 +7114,7 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  
        // Now we do a normal shuffle of V1 by giving V1 as both operands to
        // a blend.
-      HighV = V1;
+      LowV = HighV = V1;
        NewMask[0] = Mask[0] < 4 ? 0 : 2;
        NewMask[1] = Mask[0] < 4 ? 2 : 0;
        NewMask[2] = Mask[2] < 4 ? 1 : 3;
@@ -7535,7 +7544,7 @@ static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
        } else {
          // Otherwise pin the low inputs.
          for (int GoodInput : GoodInputs)
-          MoveMask[Mask[GoodInput]] = Mask[GoodInput] - MaskOffset;
+          MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
        }
  
        int MoveMaskIdx =
@@ -7690,6 +7699,86 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
    MutableArrayRef<int> LoMask = Mask.slice(0, 8);
    MutableArrayRef<int> HiMask = Mask.slice(8, 8);
  
+  // For single-input shuffles, there are some nicer lowering tricks we can use.
+  if (isSingleInputShuffleMask(Mask)) {
+    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
+    // Notably, this handles splat and partial-splat shuffles more efficiently.
+    //
+    // FIXME: We should check for other patterns which can be widened into an
+    // i16 shuffle as well.
+    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
+      for (int i = 0; i < 16; i += 2) {
+        if (Mask[i] != Mask[i + 1])
+          return false;
+      }
+      return true;
+    };
+    if (canWidenViaDuplication(Mask)) {
+      SmallVector<int, 4> LoInputs;
+      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
+                   [](int M) { return M >= 0 && M < 8; });
+      std::sort(LoInputs.begin(), LoInputs.end());
+      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
+                     LoInputs.end());
+      SmallVector<int, 4> HiInputs;
+      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
+                   [](int M) { return M >= 8; });
+      std::sort(HiInputs.begin(), HiInputs.end());
+      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
+                     HiInputs.end());
+
+      bool TargetLo = LoInputs.size() >= HiInputs.size();
+      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
+      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
+
+      int ByteMask[16];
+      SmallDenseMap<int, int, 8> LaneMap;
+      for (int i = 0; i < 16; ++i)
+        ByteMask[i] = -1;
+      for (int I : InPlaceInputs) {
+        ByteMask[I] = I;
+        LaneMap[I] = I;
+      }
+      int FreeByteIdx = 0;
+      int TargetOffset = TargetLo ? 0 : 8;
+      for (int I : MovingInputs) {
+        // Walk the free index into the byte mask until we find an unoccupied
+        // spot. We bound this to 8 steps to catch bugs, the pigeonhole
+        // principle indicates that there *must* be a spot as we can only have
+        // 8 duplicated inputs. We have to walk the index using modular
+        // arithmetic to wrap around as necessary.
+        // FIXME: We could do a much better job of picking an inexpensive slot
+        // so this doesn't go through the worst case for the byte shuffle.
+        for (int j = 0; j < 8 && ByteMask[FreeByteIdx + TargetOffset] != -1;
+             ++j, FreeByteIdx = (FreeByteIdx + 1) % 8)
+          ;
+        assert(ByteMask[FreeByteIdx + TargetOffset] == -1 &&
+               "Failed to find a free byte!");
+        ByteMask[FreeByteIdx + TargetOffset] = I;
+        LaneMap[I] = FreeByteIdx + TargetOffset;
+      }
+      V1 = DAG.getVectorShuffle(MVT::v16i8, DL, V1, DAG.getUNDEF(MVT::v16i8),
+                                ByteMask);
+      for (int &M : Mask)
+        if (M != -1)
+          M = LaneMap[M];
+
+      // Unpack the bytes to form the i16s that will be shuffled into place.
+      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
+                       MVT::v16i8, V1, V1);
+
+      int I16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+      for (int i = 0; i < 16; i += 2) {
+        if (Mask[i] != -1)
+          I16Shuffle[i / 2] = Mask[i] - (TargetLo ? 0 : 8);
+        assert(I16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!");
+      }
+      return DAG.getVectorShuffle(MVT::v8i16, DL,
+                                  DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
+                                  DAG.getUNDEF(MVT::v8i16), I16Shuffle);
+    }
+  }
+
    // Check whether an interleaving lowering is likely to be more efficient.
    // This isn't perfect but it is a strong heuristic that tends to work well on
    // the kinds of shuffles that show up in practice.
@@ -7712,19 +7801,19 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
  
      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds);
    }
-
-  SDValue LoV1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                             DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1,
-                                         DAG.getUNDEF(MVT::v8i16)));
-  SDValue HiV1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                             DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1,
-                                         DAG.getUNDEF(MVT::v8i16)));
-  SDValue LoV2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                             DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V2,
-                                         DAG.getUNDEF(MVT::v8i16)));
-  SDValue HiV2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
-                             DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V2,
-                                         DAG.getUNDEF(MVT::v8i16)));
+  SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
+  SDValue LoV1 =
+      DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                  DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, Zero));
+  SDValue HiV1 =
+      DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                  DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, Zero));
+  SDValue LoV2 =
+      DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                  DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V2, Zero));
+  SDValue HiV2 =
+      DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
+                  DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V2, Zero));
  
    int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
    int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
@@ -7835,6 +7924,47 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
          return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
        }
  
+  // Check for a shuffle of a splat, and return just the splat. While DAG
+  // combining will do a similar transformation, this shows up with the
+  // internally created shuffles and so we handle it specially here as we won't
+  // have another chance to DAG-combine the generic shuffle instructions.
+  if (V2IsUndef) {
+    SDValue V = V1;
+
+    // Look through any bitcasts. These can't change the size, just the number
+    // of elements which we check later.
+    while (V.getOpcode() == ISD::BITCAST)
+      V = V->getOperand(0);
+
+    // A splat should always show up as a build vector node.
+    if (V.getOpcode() == ISD::BUILD_VECTOR) {
+      SDValue Base;
+      bool AllSame = true;
+      for (unsigned i = 0; i != V->getNumOperands(); ++i)
+        if (V->getOperand(i).getOpcode() != ISD::UNDEF) {
+          Base = V->getOperand(i);
+          break;
+        }
+      // Splat of <u, u, ..., u>, return <u, u, ..., u>
+      if (!Base)
+        return V1;
+      for (unsigned i = 0; i != V->getNumOperands(); ++i)
+        if (V->getOperand(i) != Base) {
+          AllSame = false;
+          break;
+        }
+      // Splat of <x, x, ..., x>, return <x, x, ..., x>, provided that the
+      // number of elements match or the value splatted is a zero constant.
+      if (AllSame) {
+        if (V.getValueType().getVectorNumElements() == (unsigned)NumElements)
+          return V1;
+        if (auto *C = dyn_cast<ConstantSDNode>(Base))
+          if (C->isNullValue())
+            return V1;
+      }
+    }
+  }
+
    // For integer vector shuffles, try to collapse them into a shuffle of fewer
    // lanes but wider integers. We cap this to not form integers larger than i64
    // but it might be interesting to form i128 integers to handle flipping the
@@ -14145,6 +14275,51 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
    return SDValue(Res, 0);
  }
  
+// getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
+// read performance monitor counters (x86_rdpmc).
+static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
+                              SelectionDAG &DAG, const X86Subtarget *Subtarget,
+                              SmallVectorImpl<SDValue> &Results) {
+  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue LO, HI;
+
+  // The ECX register is used to select the index of the performance counter
+  // to read.
+  SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
+                                   N->getOperand(2));
+  SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
+
+  // Reads the content of a 64-bit performance counter and returns it in the
+  // registers EDX:EAX.
+  if (Subtarget->is64Bit()) {
+    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
+    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
+                            LO.getValue(2));
+  } else {
+    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
+    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
+                            LO.getValue(2));
+  }
+  Chain = HI.getValue(1);
+
+  if (Subtarget->is64Bit()) {
+    // The EAX register is loaded with the low-order 32 bits. The EDX register
+    // is loaded with the supported high-order bits of the counter.
+    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
+                              DAG.getConstant(32, MVT::i8));
+    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
+    Results.push_back(Chain);
+    return;
+  }
+
+  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+  SDValue Ops[] = { LO, HI };
+  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
+  Results.push_back(Pair);
+  Results.push_back(Chain);
+}
+
  // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
  // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
  // also used to custom lower READCYCLECOUNTER nodes.
@@ -14209,7 +14384,7 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
  }
  
  enum IntrinsicType {
-  GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDTSC, XTEST
+  GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST
  };
  
  struct IntrinsicData {
@@ -14303,6 +14478,8 @@ static void InitIntinsicsMap() {
                                  IntrinsicData(RDTSC,  X86ISD::RDTSC_DAG, 0)));
    IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp,
                                  IntrinsicData(RDTSC,  X86ISD::RDTSCP_DAG, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdpmc,
+                                IntrinsicData(RDPMC,  X86ISD::RDPMC_DAG, 0)));
    Initialized = true;
  }
  
@@ -14378,6 +14555,12 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
      getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results);
      return DAG.getMergeValues(Results, dl);
    }
+  // Read Performance Monitoring Counters.
+  case RDPMC: {
+    SmallVector<SDValue, 2> Results;
+    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
+    return DAG.getMergeValues(Results, dl);
+  }
    // XTEST intrinsics.
    case XTEST: {
      SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
@@ -14966,7 +15149,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
    CLI.setDebugLoc(dl).setChain(InChain)
      .setCallee(getLibcallCallingConv(LC),
                 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
-               Callee, &Args, 0)
+               Callee, std::move(Args), 0)
      .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
  
    std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
@@ -15339,15 +15522,14 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
  
  static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
                            SelectionDAG &DAG) {
-
    MVT VT = Op.getSimpleValueType();
    SDLoc dl(Op);
    SDValue R = Op.getOperand(0);
    SDValue Amt = Op.getOperand(1);
    SDValue V;
  
-  if (!Subtarget->hasSSE2())
-    return SDValue();
+  assert(VT.isVector() && "Custom lowering only for vector shifts!");
+  assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
  
    V = LowerScalarImmediateShift(Op, DAG, Subtarget);
    if (V.getNode())
@@ -15966,7 +16148,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
  
    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
-    .setCallee(CallingConv::C, RetTy, Callee, &Args, 0);
+    .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
  
    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
  
@@ -16095,29 +16277,6 @@ static void ReplaceATOMIC_LOAD(SDNode *Node,
    Results.push_back(Swap.getValue(2));
  }
  
-static void
-ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
-                        SelectionDAG &DAG, unsigned NewOp) {
-  SDLoc dl(Node);
-  assert (Node->getValueType(0) == MVT::i64 &&
-          "Only know how to expand i64 atomics");
-
-  SDValue Chain = Node->getOperand(0);
-  SDValue In1 = Node->getOperand(1);
-  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                             Node->getOperand(2), DAG.getIntPtrConstant(0));
-  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                             Node->getOperand(2), DAG.getIntPtrConstant(1));
-  SDValue Ops[] = { Chain, In1, In2L, In2H };
-  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
-  SDValue Result =
-    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, MVT::i64,
-                            cast<MemSDNode>(Node)->getMemOperand());
-  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
-  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF));
-  Results.push_back(Result.getValue(2));
-}
-
  /// ReplaceNodeResults - Replace a node with an illegal result type
  /// with a new node built out of custom code.
  void X86TargetLowering::ReplaceNodeResults(SDNode *N,
@@ -16202,6 +16361,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
      case Intrinsic::x86_rdtscp:
        return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
                                       Results);
+    case Intrinsic::x86_rdpmc:
+      return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
      }
    }
    case ISD::READCYCLECOUNTER: {
@@ -16263,57 +16424,20 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
      Results.push_back(EFLAGS.getValue(1));
      return;
    }
+  case ISD::ATOMIC_SWAP:
    case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
    case ISD::ATOMIC_LOAD_AND:
-  case ISD::ATOMIC_LOAD_NAND:
    case ISD::ATOMIC_LOAD_OR:
-  case ISD::ATOMIC_LOAD_SUB:
    case ISD::ATOMIC_LOAD_XOR:
-  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_NAND:
    case ISD::ATOMIC_LOAD_MIN:
-  case ISD::ATOMIC_LOAD_UMAX:
+  case ISD::ATOMIC_LOAD_MAX:
    case ISD::ATOMIC_LOAD_UMIN:
-  case ISD::ATOMIC_SWAP: {
-    unsigned Opc;
-    switch (N->getOpcode()) {
-    default: llvm_unreachable("Unexpected opcode");
-    case ISD::ATOMIC_LOAD_ADD:
-      Opc = X86ISD::ATOMADD64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_AND:
-      Opc = X86ISD::ATOMAND64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_NAND:
-      Opc = X86ISD::ATOMNAND64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_OR:
-      Opc = X86ISD::ATOMOR64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_SUB:
-      Opc = X86ISD::ATOMSUB64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_XOR:
-      Opc = X86ISD::ATOMXOR64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_MAX:
-      Opc = X86ISD::ATOMMAX64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_MIN:
-      Opc = X86ISD::ATOMMIN64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_UMAX:
-      Opc = X86ISD::ATOMUMAX64_DAG;
-      break;
-    case ISD::ATOMIC_LOAD_UMIN:
-      Opc = X86ISD::ATOMUMIN64_DAG;
-      break;
-    case ISD::ATOMIC_SWAP:
-      Opc = X86ISD::ATOMSWAP64_DAG;
-      break;
-    }
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
-    return;
-  }
+  case ISD::ATOMIC_LOAD_UMAX:
+    // Delegate to generic TypeLegalization. Situations we can really handle
+    // should have already been dealt with by X86AtomicExpand.cpp.
+    break;
    case ISD::ATOMIC_LOAD: {
      ReplaceATOMIC_LOAD(N, Results, DAG);
      return;
@@ -16334,6 +16458,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                     MVT::v2f64, N->getOperand(0));
      SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
  
+    if (ExperimentalVectorWideningLegalization) {
+      // If we are legalizing vectors by widening, we already have the desired
+      // legal vector type, just return it.
+      Results.push_back(ToVecInt);
+      return;
+    }
+
      SmallVector<SDValue, 8> Elts;
      for (unsigned i = 0, e = NumElts; i != e; ++i)
        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
@@ -16366,6 +16497,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::CALL:               return "X86ISD::CALL";
    case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
    case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
+  case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
    case X86ISD::BT:                 return "X86ISD::BT";
    case X86ISD::CMP:                return "X86ISD::CMP";
    case X86ISD::COMI:               return "X86ISD::COMI";
@@ -16420,12 +16552,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
    case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
    case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
-  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
-  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
-  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
-  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
-  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
-  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
    case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
    case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
    case X86ISD::VZEXT:              return "X86ISD::VZEXT";
@@ -16816,685 +16942,6 @@ static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
    return sinkMBB;
  }
  
-// Get CMPXCHG opcode for the specified data type.
-static unsigned getCmpXChgOpcode(EVT VT) {
-  switch (VT.getSimpleVT().SimpleTy) {
-  case MVT::i8:  return X86::LCMPXCHG8;
-  case MVT::i16: return X86::LCMPXCHG16;
-  case MVT::i32: return X86::LCMPXCHG32;
-  case MVT::i64: return X86::LCMPXCHG64;
-  default:
-    break;
-  }
-  llvm_unreachable("Invalid operand size!");
-}
-
-// Get LOAD opcode for the specified data type.
-static unsigned getLoadOpcode(EVT VT) {
-  switch (VT.getSimpleVT().SimpleTy) {
-  case MVT::i8:  return X86::MOV8rm;
-  case MVT::i16: return X86::MOV16rm;
-  case MVT::i32: return X86::MOV32rm;
-  case MVT::i64: return X86::MOV64rm;
-  default:
-    break;
-  }
-  llvm_unreachable("Invalid operand size!");
-}
-
-// Get opcode of the non-atomic one from the specified atomic instruction.
-static unsigned getNonAtomicOpcode(unsigned Opc) {
-  switch (Opc) {
-  case X86::ATOMAND8:  return X86::AND8rr;
-  case X86::ATOMAND16: return X86::AND16rr;
-  case X86::ATOMAND32: return X86::AND32rr;
-  case X86::ATOMAND64: return X86::AND64rr;
-  case X86::ATOMOR8:   return X86::OR8rr;
-  case X86::ATOMOR16:  return X86::OR16rr;
-  case X86::ATOMOR32:  return X86::OR32rr;
-  case X86::ATOMOR64:  return X86::OR64rr;
-  case X86::ATOMXOR8:  return X86::XOR8rr;
-  case X86::ATOMXOR16: return X86::XOR16rr;
-  case X86::ATOMXOR32: return X86::XOR32rr;
-  case X86::ATOMXOR64: return X86::XOR64rr;
-  }
-  llvm_unreachable("Unhandled atomic-load-op opcode!");
-}
-
-// Get opcode of the non-atomic one from the specified atomic instruction with
-// extra opcode.
-static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
-                                               unsigned &ExtraOpc) {
-  switch (Opc) {
-  case X86::ATOMNAND8:  ExtraOpc = X86::NOT8r;   return X86::AND8rr;
-  case X86::ATOMNAND16: ExtraOpc = X86::NOT16r;  return X86::AND16rr;
-  case X86::ATOMNAND32: ExtraOpc = X86::NOT32r;  return X86::AND32rr;
-  case X86::ATOMNAND64: ExtraOpc = X86::NOT64r;  return X86::AND64rr;
-  case X86::ATOMMAX8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVL32rr;
-  case X86::ATOMMAX16:  ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr;
-  case X86::ATOMMAX32:  ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr;
-  case X86::ATOMMAX64:  ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr;
-  case X86::ATOMMIN8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVG32rr;
-  case X86::ATOMMIN16:  ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr;
-  case X86::ATOMMIN32:  ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr;
-  case X86::ATOMMIN64:  ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr;
-  case X86::ATOMUMAX8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVB32rr;
-  case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr;
-  case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr;
-  case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr;
-  case X86::ATOMUMIN8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVA32rr;
-  case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr;
-  case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr;
-  case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr;
-  }
-  llvm_unreachable("Unhandled atomic-load-op opcode!");
-}
-
-// Get opcode of the non-atomic one from the specified atomic instruction for
-// 64-bit data type on 32-bit target.
-static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
-  switch (Opc) {
-  case X86::ATOMAND6432:  HiOpc = X86::AND32rr; return X86::AND32rr;
-  case X86::ATOMOR6432:   HiOpc = X86::OR32rr;  return X86::OR32rr;
-  case X86::ATOMXOR6432:  HiOpc = X86::XOR32rr; return X86::XOR32rr;
-  case X86::ATOMADD6432:  HiOpc = X86::ADC32rr; return X86::ADD32rr;
-  case X86::ATOMSUB6432:  HiOpc = X86::SBB32rr; return X86::SUB32rr;
-  case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr;
-  case X86::ATOMMAX6432:  HiOpc = X86::SETLr;   return X86::SETLr;
-  case X86::ATOMMIN6432:  HiOpc = X86::SETGr;   return X86::SETGr;
-  case X86::ATOMUMAX6432: HiOpc = X86::SETBr;   return X86::SETBr;
-  case X86::ATOMUMIN6432: HiOpc = X86::SETAr;   return X86::SETAr;
-  }
-  llvm_unreachable("Unhandled atomic-load-op opcode!");
-}
-
-// Get opcode of the non-atomic one from the specified atomic instruction for
-// 64-bit data type on 32-bit target with extra opcode.
-static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
-                                                   unsigned &HiOpc,
-                                                   unsigned &ExtraOpc) {
-  switch (Opc) {
-  case X86::ATOMNAND6432:
-    ExtraOpc = X86::NOT32r;
-    HiOpc = X86::AND32rr;
-    return X86::AND32rr;
-  }
-  llvm_unreachable("Unhandled atomic-load-op opcode!");
-}
-
-// Get pseudo CMOV opcode from the specified data type.
-static unsigned getPseudoCMOVOpc(EVT VT) {
-  switch (VT.getSimpleVT().SimpleTy) {
-  case MVT::i8:  return X86::CMOV_GR8;
-  case MVT::i16: return X86::CMOV_GR16;
-  case MVT::i32: return X86::CMOV_GR32;
-  default:
-    break;
-  }
-  llvm_unreachable("Unknown CMOV opcode!");
-}
-
-// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions.
-// They will be translated into a spin-loop or compare-exchange loop from
-//
-//    ...
-//    dst = atomic-fetch-op MI.addr, MI.val
-//    ...
-//
-// to
-//
-//    ...
-//    t1 = LOAD MI.addr
-// loop:
-//    t4 = phi(t1, t3 / loop)
-//    t2 = OP MI.val, t4
-//    EAX = t4
-//    LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
-//    t3 = EAX
-//    JNE loop
-// sink:
-//    dst = t3
-//    ...
-MachineBasicBlock *
-X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
-                                       MachineBasicBlock *MBB) const {
-  MachineFunction *MF = MBB->getParent();
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
-  DebugLoc DL = MI->getDebugLoc();
-
-  MachineRegisterInfo &MRI = MF->getRegInfo();
-
-  const BasicBlock *BB = MBB->getBasicBlock();
-  MachineFunction::iterator I = MBB;
-  ++I;
-
-  assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
-         "Unexpected number of operands");
-
-  assert(MI->hasOneMemOperand() &&
-         "Expected atomic-load-op to have one memoperand");
-
-  // Memory Reference
-  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
-  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
-
-  unsigned DstReg, SrcReg;
-  unsigned MemOpndSlot;
-
-  unsigned CurOp = 0;
-
-  DstReg = MI->getOperand(CurOp++).getReg();
-  MemOpndSlot = CurOp;
-  CurOp += X86::AddrNumOperands;
-  SrcReg = MI->getOperand(CurOp++).getReg();
-
-  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
-  MVT::SimpleValueType VT = *RC->vt_begin();
-  unsigned t1 = MRI.createVirtualRegister(RC);
-  unsigned t2 = MRI.createVirtualRegister(RC);
-  unsigned t3 = MRI.createVirtualRegister(RC);
-  unsigned t4 = MRI.createVirtualRegister(RC);
-  unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT);
-
-  unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
-  unsigned LOADOpc = getLoadOpcode(VT);
-
-  // For the atomic load-arith operator, we generate
-  //
-  //  thisMBB:
-  //    t1 = LOAD [MI.addr]
-  //  mainMBB:
-  //    t4 = phi(t1 / thisMBB, t3 / mainMBB)
-  //    t1 = OP MI.val, EAX
-  //    EAX = t4
-  //    LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
-  //    t3 = EAX
-  //    JNE mainMBB
-  //  sinkMBB:
-  //    dst = t3
-
-  MachineBasicBlock *thisMBB = MBB;
-  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
-  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
-  MF->insert(I, mainMBB);
-  MF->insert(I, sinkMBB);
-
-  MachineInstrBuilder MIB;
-
-  // Transfer the remainder of BB and its successor edges to sinkMBB.
-  sinkMBB->splice(sinkMBB->begin(), MBB,
-                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
-  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
-
-  // thisMBB:
-  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1);
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
-    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
-    if (NewMO.isReg())
-      NewMO.setIsKill(false);
-    MIB.addOperand(NewMO);
-  }
-  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
-    unsigned flags = (*MMOI)->getFlags();
-    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
-    MachineMemOperand *MMO =
-      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
-                               (*MMOI)->getSize(),
-                               (*MMOI)->getBaseAlignment(),
-                               (*MMOI)->getTBAAInfo(),
-                               (*MMOI)->getRanges());
-    MIB.addMemOperand(MMO);
-  }
-
-  thisMBB->addSuccessor(mainMBB);
-
-  // mainMBB:
-  MachineBasicBlock *origMainMBB = mainMBB;
-
-  // Add a PHI.
-  MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4)
-                        .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
-
-  unsigned Opc = MI->getOpcode();
-  switch (Opc) {
-  default:
-    llvm_unreachable("Unhandled atomic-load-op opcode!");
-  case X86::ATOMAND8:
-  case X86::ATOMAND16:
-  case X86::ATOMAND32:
-  case X86::ATOMAND64:
-  case X86::ATOMOR8:
-  case X86::ATOMOR16:
-  case X86::ATOMOR32:
-  case X86::ATOMOR64:
-  case X86::ATOMXOR8:
-  case X86::ATOMXOR16:
-  case X86::ATOMXOR32:
-  case X86::ATOMXOR64: {
-    unsigned ARITHOpc = getNonAtomicOpcode(Opc);
-    BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg)
-      .addReg(t4);
-    break;
-  }
-  case X86::ATOMNAND8:
-  case X86::ATOMNAND16:
-  case X86::ATOMNAND32:
-  case X86::ATOMNAND64: {
-    unsigned Tmp = MRI.createVirtualRegister(RC);
-    unsigned NOTOpc;
-    unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
-    BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg)
-      .addReg(t4);
-    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp);
-    break;
-  }
-  case X86::ATOMMAX8:
-  case X86::ATOMMAX16:
-  case X86::ATOMMAX32:
-  case X86::ATOMMAX64:
-  case X86::ATOMMIN8:
-  case X86::ATOMMIN16:
-  case X86::ATOMMIN32:
-  case X86::ATOMMIN64:
-  case X86::ATOMUMAX8:
-  case X86::ATOMUMAX16:
-  case X86::ATOMUMAX32:
-  case X86::ATOMUMAX64:
-  case X86::ATOMUMIN8:
-  case X86::ATOMUMIN16:
-  case X86::ATOMUMIN32:
-  case X86::ATOMUMIN64: {
-    unsigned CMPOpc;
-    unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);
-
-    BuildMI(mainMBB, DL, TII->get(CMPOpc))
-      .addReg(SrcReg)
-      .addReg(t4);
-
-    if (Subtarget->hasCMov()) {
-      if (VT != MVT::i8) {
-        // Native support
-        BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
-          .addReg(SrcReg)
-          .addReg(t4);
-      } else {
-        // Promote i8 to i32 to use CMOV32
-        const TargetRegisterInfo* TRI = MF->getTarget().getRegisterInfo();
-        const TargetRegisterClass *RC32 =
-          TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit);
-        unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
-        unsigned AccReg32 = MRI.createVirtualRegister(RC32);
-        unsigned Tmp = MRI.createVirtualRegister(RC32);
-
-        unsigned Undef = MRI.createVirtualRegister(RC32);
-        BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);
-
-        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32)
-          .addReg(Undef)
-          .addReg(SrcReg)
-          .addImm(X86::sub_8bit);
-        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
-          .addReg(Undef)
-          .addReg(t4)
-          .addImm(X86::sub_8bit);
-
-        BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp)
-          .addReg(SrcReg32)
-          .addReg(AccReg32);
-
-        BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2)
-          .addReg(Tmp, 0, X86::sub_8bit);
-      }
-    } else {
-      // Use pseudo select and lower them.
-      assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
-             "Invalid atomic-load-op transformation!");
-      unsigned SelOpc = getPseudoCMOVOpc(VT);
-      X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
-      assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
-      MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2)
-              .addReg(SrcReg).addReg(t4)
-              .addImm(CC);
-      mainMBB = EmitLoweredSelect(MIB, mainMBB);
-      // Replace the original PHI node as mainMBB is changed after CMOV
-      // lowering.
-      BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4)
-        .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
-      Phi->eraseFromParent();
-    }
-    break;
-  }
-  }
-
-  // Copy PhyReg back from virtual register.
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg)
-    .addReg(t4);
-
-  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
-    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
-    if (NewMO.isReg())
-      NewMO.setIsKill(false);
-    MIB.addOperand(NewMO);
-  }
-  MIB.addReg(t2);
-  MIB.setMemRefs(MMOBegin, MMOEnd);
-
-  // Copy PhyReg back to virtual register.
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3)
-    .addReg(PhyReg);
-
-  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
-
-  mainMBB->addSuccessor(origMainMBB);
-  mainMBB->addSuccessor(sinkMBB);
-
-  // sinkMBB:
-  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
-          TII->get(TargetOpcode::COPY), DstReg)
-    .addReg(t3);
-
-  MI->eraseFromParent();
-  return sinkMBB;
-}
-
-// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic
-// instructions. They will be translated into a spin-loop or compare-exchange
-// loop from
-//
-//    ...
-//    dst = atomic-fetch-op MI.addr, MI.val
-//    ...
-//
-// to
-//
-//    ...
-//    t1L = LOAD [MI.addr + 0]
-//    t1H = LOAD [MI.addr + 4]
-// loop:
-//    t4L = phi(t1L, t3L / loop)
-//    t4H = phi(t1H, t3H / loop)
-//    t2L = OP MI.val.lo, t4L
-//    t2H = OP MI.val.hi, t4H
-//    EAX = t4L
-//    EDX = t4H
-//    EBX = t2L
-//    ECX = t2H
-//    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
-//    t3L = EAX
-//    t3H = EDX
-//    JNE loop
-// sink:
-//    dstL = t3L
-//    dstH = t3H
-//    ...
-MachineBasicBlock *
-X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
-                                           MachineBasicBlock *MBB) const {
-  MachineFunction *MF = MBB->getParent();
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
-  DebugLoc DL = MI->getDebugLoc();
-
-  MachineRegisterInfo &MRI = MF->getRegInfo();
-
-  const BasicBlock *BB = MBB->getBasicBlock();
-  MachineFunction::iterator I = MBB;
-  ++I;
-
-  assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 &&
-         "Unexpected number of operands");
-
-  assert(MI->hasOneMemOperand() &&
-         "Expected atomic-load-op32 to have one memoperand");
-
-  // Memory Reference
-  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
-  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
-
-  unsigned DstLoReg, DstHiReg;
-  unsigned SrcLoReg, SrcHiReg;
-  unsigned MemOpndSlot;
-
-  unsigned CurOp = 0;
-
-  DstLoReg = MI->getOperand(CurOp++).getReg();
-  DstHiReg = MI->getOperand(CurOp++).getReg();
-  MemOpndSlot = CurOp;
-  CurOp += X86::AddrNumOperands;
-  SrcLoReg = MI->getOperand(CurOp++).getReg();
-  SrcHiReg = MI->getOperand(CurOp++).getReg();
-
-  const TargetRegisterClass *RC = &X86::GR32RegClass;
-  const TargetRegisterClass *RC8 = &X86::GR8RegClass;
-
-  unsigned t1L = MRI.createVirtualRegister(RC);
-  unsigned t1H = MRI.createVirtualRegister(RC);
-  unsigned t2L = MRI.createVirtualRegister(RC);
-  unsigned t2H = MRI.createVirtualRegister(RC);
-  unsigned t3L = MRI.createVirtualRegister(RC);
-  unsigned t3H = MRI.createVirtualRegister(RC);
-  unsigned t4L = MRI.createVirtualRegister(RC);
-  unsigned t4H = MRI.createVirtualRegister(RC);
-
-  unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
-  unsigned LOADOpc = X86::MOV32rm;
-
-  // For the atomic load-arith operator, we generate
-  //
-  //  thisMBB:
-  //    t1L = LOAD [MI.addr + 0]
-  //    t1H = LOAD [MI.addr + 4]
-  //  mainMBB:
-  //    t4L = phi(t1L / thisMBB, t3L / mainMBB)
-  //    t4H = phi(t1H / thisMBB, t3H / mainMBB)
-  //    t2L = OP MI.val.lo, t4L
-  //    t2H = OP MI.val.hi, t4H
-  //    EBX = t2L
-  //    ECX = t2H
-  //    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
-  //    t3L = EAX
-  //    t3H = EDX
-  //    JNE loop
-  //  sinkMBB:
-  //    dstL = t3L
-  //    dstH = t3H
-
-  MachineBasicBlock *thisMBB = MBB;
-  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
-  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
-  MF->insert(I, mainMBB);
-  MF->insert(I, sinkMBB);
-
-  MachineInstrBuilder MIB;
-
-  // Transfer the remainder of BB and its successor edges to sinkMBB.
-  sinkMBB->splice(sinkMBB->begin(), MBB,
-                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
-  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
-
-  // thisMBB:
-  // Lo
-  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L);
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
-    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
-    if (NewMO.isReg())
-      NewMO.setIsKill(false);
-    MIB.addOperand(NewMO);
-  }
-  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
-    unsigned flags = (*MMOI)->getFlags();
-    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
-    MachineMemOperand *MMO =
-      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
-                               (*MMOI)->getSize(),
-                               (*MMOI)->getBaseAlignment(),
-                               (*MMOI)->getTBAAInfo(),
-                               (*MMOI)->getRanges());
-    MIB.addMemOperand(MMO);
-  };
-  MachineInstr *LowMI = MIB;
-
-  // Hi
-  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H);
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
-    if (i == X86::AddrDisp) {
-      MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
-    } else {
-      MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
-      if (NewMO.isReg())
-        NewMO.setIsKill(false);
-      MIB.addOperand(NewMO);
-    }
-  }
-  MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end());
-
-  thisMBB->addSuccessor(mainMBB);
-
-  // mainMBB:
-  MachineBasicBlock *origMainMBB = mainMBB;
-
-  // Add PHIs.
-  MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L)
-                        .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
-  MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H)
-                        .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
-
-  unsigned Opc = MI->getOpcode();
-  switch (Opc) {
-  default:
-    llvm_unreachable("Unhandled atomic-load-op6432 opcode!");
-  case X86::ATOMAND6432:
-  case X86::ATOMOR6432:
-  case X86::ATOMXOR6432:
-  case X86::ATOMADD6432:
-  case X86::ATOMSUB6432: {
-    unsigned HiOpc;
-    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
-    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L)
-      .addReg(SrcLoReg);
-    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H)
-      .addReg(SrcHiReg);
-    break;
-  }
-  case X86::ATOMNAND6432: {
-    unsigned HiOpc, NOTOpc;
-    unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
-    unsigned TmpL = MRI.createVirtualRegister(RC);
-    unsigned TmpH = MRI.createVirtualRegister(RC);
-    BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg)
-      .addReg(t4L);
-    BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg)
-      .addReg(t4H);
-    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL);
-    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH);
-    break;
-  }
-  case X86::ATOMMAX6432:
-  case X86::ATOMMIN6432:
-  case X86::ATOMUMAX6432:
-  case X86::ATOMUMIN6432: {
-    unsigned HiOpc;
-    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
-    unsigned cL = MRI.createVirtualRegister(RC8);
-    unsigned cH = MRI.createVirtualRegister(RC8);
-    unsigned cL32 = MRI.createVirtualRegister(RC);
-    unsigned cH32 = MRI.createVirtualRegister(RC);
-    unsigned cc = MRI.createVirtualRegister(RC);
-    // cl := cmp src_lo, lo
-    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
-      .addReg(SrcLoReg).addReg(t4L);
-    BuildMI(mainMBB, DL, TII->get(LoOpc), cL);
-    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL);
-    // ch := cmp src_hi, hi
-    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
-      .addReg(SrcHiReg).addReg(t4H);
-    BuildMI(mainMBB, DL, TII->get(HiOpc), cH);
-    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH);
-    // cc := if (src_hi == hi) ? cl : ch;
-    if (Subtarget->hasCMov()) {
-      BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc)
-        .addReg(cH32).addReg(cL32);
-    } else {
-      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc)
-              .addReg(cH32).addReg(cL32)
-              .addImm(X86::COND_E);
-      mainMBB = EmitLoweredSelect(MIB, mainMBB);
-    }
-    BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc);
-    if (Subtarget->hasCMov()) {
-      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L)
-        .addReg(SrcLoReg).addReg(t4L);
-      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H)
-        .addReg(SrcHiReg).addReg(t4H);
-    } else {
-      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L)
-              .addReg(SrcLoReg).addReg(t4L)
-              .addImm(X86::COND_NE);
-      mainMBB = EmitLoweredSelect(MIB, mainMBB);
-      // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the
-      // 2nd CMOV lowering.
-      mainMBB->addLiveIn(X86::EFLAGS);
-      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H)
-              .addReg(SrcHiReg).addReg(t4H)
-              .addImm(X86::COND_NE);
-      mainMBB = EmitLoweredSelect(MIB, mainMBB);
-      // Replace the original PHI node as mainMBB is changed after CMOV
-      // lowering.
-      BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L)
-        .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
-      BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H)
-        .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
-      PhiL->eraseFromParent();
-      PhiH->eraseFromParent();
-    }
-    break;
-  }
-  case X86::ATOMSWAP6432: {
-    unsigned HiOpc;
-    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
-    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg);
-    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg);
-    break;
-  }
-  }
-
-  // Copy EDX:EAX back from HiReg:LoReg
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L);
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H);
-  // Copy ECX:EBX from t1H:t1L
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L);
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H);
-
-  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
-  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
-    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
-    if (NewMO.isReg())
-      NewMO.setIsKill(false);
-    MIB.addOperand(NewMO);
-  }
-  MIB.setMemRefs(MMOBegin, MMOEnd);
-
-  // Copy EDX:EAX back to t3H:t3L
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX);
-  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX);
-
-  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
-
-  mainMBB->addSuccessor(origMainMBB);
-  mainMBB->addSuccessor(sinkMBB);
-
-  // sinkMBB:
-  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
-          TII->get(TargetOpcode::COPY), DstLoReg)
-    .addReg(t3L);
-  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
-          TII->get(TargetOpcode::COPY), DstHiReg)
-    .addReg(t3H);
-
-  MI->eraseFromParent();
-  return sinkMBB;
-}
-
  // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
  // or XMM0_V32I8 in AVX all of this code can be replaced with that
  // in the .td file.
@@ -18704,62 +18151,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    case X86::XBEGIN:
      return EmitXBegin(MI, BB, BB->getParent()->getTarget().getInstrInfo());
  
-  // Atomic Lowering.
-  case X86::ATOMAND8:
-  case X86::ATOMAND16:
-  case X86::ATOMAND32:
-  case X86::ATOMAND64:
-    // Fall through
-  case X86::ATOMOR8:
-  case X86::ATOMOR16:
-  case X86::ATOMOR32:
-  case X86::ATOMOR64:
-    // Fall through
-  case X86::ATOMXOR16:
-  case X86::ATOMXOR8:
-  case X86::ATOMXOR32:
-  case X86::ATOMXOR64:
-    // Fall through
-  case X86::ATOMNAND8:
-  case X86::ATOMNAND16:
-  case X86::ATOMNAND32:
-  case X86::ATOMNAND64:
-    // Fall through
-  case X86::ATOMMAX8:
-  case X86::ATOMMAX16:
-  case X86::ATOMMAX32:
-  case X86::ATOMMAX64:
-    // Fall through
-  case X86::ATOMMIN8:
-  case X86::ATOMMIN16:
-  case X86::ATOMMIN32:
-  case X86::ATOMMIN64:
-    // Fall through
-  case X86::ATOMUMAX8:
-  case X86::ATOMUMAX16:
-  case X86::ATOMUMAX32:
-  case X86::ATOMUMAX64:
-    // Fall through
-  case X86::ATOMUMIN8:
-  case X86::ATOMUMIN16:
-  case X86::ATOMUMIN32:
-  case X86::ATOMUMIN64:
-    return EmitAtomicLoadArith(MI, BB);
-
-  // This group does 64-bit operations on a 32-bit host.
-  case X86::ATOMAND6432:
-  case X86::ATOMOR6432:
-  case X86::ATOMXOR6432:
-  case X86::ATOMNAND6432:
-  case X86::ATOMADD6432:
-  case X86::ATOMSUB6432:
-  case X86::ATOMMAX6432:
-  case X86::ATOMMIN6432:
-  case X86::ATOMUMAX6432:
-  case X86::ATOMUMIN6432:
-  case X86::ATOMSWAP6432:
-    return EmitAtomicLoadArith6432(MI, BB);
-
    case X86::VASTART_SAVE_XMM_REGS:
      return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
  
@@ -19119,9 +18510,16 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
    SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
    for (int &M : Mask)
      M = VMask[M];
-  V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V.getOperand(0),
+  V = DAG.getNode(X86ISD::PSHUFD, DL, V.getValueType(), V.getOperand(0),
                    getV4X86ShuffleImm8ForMask(Mask, DAG));
  
+  // It is possible that one of the combinable shuffles was completely absorbed
+  // by the other, just replace it and revisit all users in that case.
+  if (Old.getNode() == V.getNode()) {
+    DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo=*/true);
+    return true;
+  }
+
    // Replace N with its operand as we're going to combine that shuffle away.
    DAG.ReplaceAllUsesWith(N, N.getOperand(0));