Explicitly update the number of nodes to be traversed

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index fc5ad788a488bea2b40d9fd9b9769eedda9aa81e..340524ca2091b93ba4717a80bb582b1d5896fdea 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5114,6 +5114,86 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
    return SDValue();
  }
  
+// LowerVectorFpExtend - Recognize the scalarized FP_EXTEND from v2f32 to v2f64
+// and convert it into X86ISD::VFPEXT due to the current ISD::FP_EXTEND has the
+// constraint of matching input/output vector elements.
+SDValue
+X86TargetLowering::LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  SDNode *N = Op.getNode();
+  EVT VT = Op.getValueType();
+  unsigned NumElts = Op.getNumOperands();
+
+  // Check supported types and sub-targets.
+  //
+  // Only v2f32 -> v2f64 needs special handling.
+  if (VT != MVT::v2f64 || !Subtarget->hasSSE2())
+    return SDValue();
+
+  SDValue VecIn;
+  EVT VecInVT;
+  SmallVector<int, 8> Mask;
+  EVT SrcVT = MVT::Other;
+
+  // Check the patterns could be translated into X86vfpext.
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue In = N->getOperand(i);
+    unsigned Opcode = In.getOpcode();
+
+    // Skip if the element is undefined.
+    if (Opcode == ISD::UNDEF) {
+      Mask.push_back(-1);
+      continue;
+    }
+
+    // Quit if one of the elements is not defined from 'fpext'.
+    if (Opcode != ISD::FP_EXTEND)
+      return SDValue();
+
+    // Check how the source of 'fpext' is defined.
+    SDValue L2In = In.getOperand(0);
+    EVT L2InVT = L2In.getValueType();
+
+    // Check the original type
+    if (SrcVT == MVT::Other)
+      SrcVT = L2InVT;
+    else if (SrcVT != L2InVT) // Quit if non-homogenous typed.
+      return SDValue();
+
+    // Check whether the value being 'fpext'ed is extracted from the same
+    // source.
+    Opcode = L2In.getOpcode();
+
+    // Quit if it's not extracted with a constant index.
+    if (Opcode != ISD::EXTRACT_VECTOR_ELT ||
+        !isa<ConstantSDNode>(L2In.getOperand(1)))
+      return SDValue();
+
+    SDValue ExtractedFromVec = L2In.getOperand(0);
+
+    if (VecIn.getNode() == 0) {
+      VecIn = ExtractedFromVec;
+      VecInVT = ExtractedFromVec.getValueType();
+    } else if (VecIn != ExtractedFromVec) // Quit if built from more than 1 vec.
+      return SDValue();
+
+    Mask.push_back(cast<ConstantSDNode>(L2In.getOperand(1))->getZExtValue());
+  }
+
+  // Quit if all operands of BUILD_VECTOR are undefined.
+  if (!VecIn.getNode())
+    return SDValue();
+
+  // Fill the remaining mask as undef.
+  for (unsigned i = NumElts; i < VecInVT.getVectorNumElements(); ++i)
+    Mask.push_back(-1);
+
+  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
+                     DAG.getVectorShuffle(VecInVT, DL,
+                                          VecIn, DAG.getUNDEF(VecInVT),
+                                          &Mask[0]));
+}
+
  SDValue
  X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
    DebugLoc dl = Op.getDebugLoc();
@@ -5146,6 +5226,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
    if (Broadcast.getNode())
      return Broadcast;
  
+  SDValue FpExt = LowerVectorFpExtend(Op, DAG);
+  if (FpExt.getNode())
+    return FpExt;
+
    unsigned EVTBits = ExtVT.getSizeInBits();
  
    unsigned NumZero  = 0;
@@ -8203,7 +8287,33 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
  
    unsigned Opcode = 0;
    unsigned NumOperands = 0;
-  switch (Op.getNode()->getOpcode()) {
+
+  // Truncate operations may prevent the merge of the SETCC instruction
+  // and the arithmetic intruction before it. Attempt to truncate the operands
+  // of the arithmetic instruction and use a reduced bit-width instruction.
+  bool NeedTruncation = false;
+  SDValue ArithOp = Op;
+  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
+    SDValue Arith = Op->getOperand(0);
+    // Both the trunc and the arithmetic op need to have one user each.
+    if (Arith->hasOneUse())
+      switch (Arith.getOpcode()) {
+        default: break;
+        case ISD::ADD:
+        case ISD::SUB:
+        case ISD::AND:
+        case ISD::OR:
+        case ISD::XOR: {
+          NeedTruncation = true;
+          ArithOp = Arith;
+        }
+      }
+  }
+
+  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
+  // which may be the result of a CAST.  We use the variable 'Op', which is the
+  // non-casted variable when we check for possible users.
+  switch (ArithOp.getOpcode()) {
    case ISD::ADD:
      // Due to an isel shortcoming, be conservative if this add is likely to be
      // selected as part of a load-modify-store instruction. When the root node
@@ -8223,7 +8333,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
          goto default_case;
  
      if (ConstantSDNode *C =
-        dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
+        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
        // An add of one will be selected as an INC.
        if (C->getAPIntValue() == 1) {
          Opcode = X86ISD::INC;
@@ -8259,7 +8369,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
  
        if (User->getOpcode() != ISD::BRCOND &&
            User->getOpcode() != ISD::SETCC &&
-          (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
+          !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
          NonFlagUse = true;
          break;
        }
@@ -8280,11 +8390,9 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
          goto default_case;
  
      // Otherwise use a regular EFLAGS-setting instruction.
-    switch (Op.getNode()->getOpcode()) {
+    switch (ArithOp.getOpcode()) {
      default: llvm_unreachable("unexpected operator!");
-    case ISD::SUB:
-      Opcode = X86ISD::SUB;
-      break;
+    case ISD::SUB: Opcode = X86ISD::SUB; break;
      case ISD::OR:  Opcode = X86ISD::OR;  break;
      case ISD::XOR: Opcode = X86ISD::XOR; break;
      case ISD::AND: Opcode = X86ISD::AND; break;
@@ -8305,19 +8413,40 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
      break;
    }
  
+  // If we found that truncation is beneficial, perform the truncation and
+  // update 'Op'.
+  if (NeedTruncation) {
+    EVT VT = Op.getValueType();
+    SDValue WideVal = Op->getOperand(0);
+    EVT WideVT = WideVal.getValueType();
+    unsigned ConvertedOp = 0;
+    // Use a target machine opcode to prevent further DAGCombine
+    // optimizations that may separate the arithmetic operations
+    // from the setcc node.
+    switch (WideVal.getOpcode()) {
+      default: break;
+      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
+      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
+      case ISD::AND: ConvertedOp = X86ISD::AND; break;
+      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
+      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
+    }
+
+    if (ConvertedOp) {
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
+        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
+        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
+        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
+      }
+    }
+  }
+
    if (Opcode == 0)
      // Emit a CMP with 0, which is the TEST pattern.
      return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                         DAG.getConstant(0, Op.getValueType()));
  
-  if (Opcode == X86ISD::CMP) {
-    SDValue New = DAG.getNode(Opcode, dl, MVT::i32, Op.getOperand(0),
-                              Op.getOperand(1));
-    // We can't replace usage of SUB with CMP.
-    // The SUB node will be removed later because there is no use of it.
-    return SDValue(New.getNode(), 0);
-  }
-
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
    SmallVector<SDValue, 4> Ops;
    for (unsigned i = 0; i != NumOperands; ++i)
@@ -9814,62 +9943,6 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
                                 Op.getOperand(1), Op.getOperand(2), DAG);
    }
  
-  // Fix vector shift instructions where the last operand is a non-immediate
-  // i32 value.
-  case Intrinsic::x86_mmx_pslli_w:
-  case Intrinsic::x86_mmx_pslli_d:
-  case Intrinsic::x86_mmx_pslli_q:
-  case Intrinsic::x86_mmx_psrli_w:
-  case Intrinsic::x86_mmx_psrli_d:
-  case Intrinsic::x86_mmx_psrli_q:
-  case Intrinsic::x86_mmx_psrai_w:
-  case Intrinsic::x86_mmx_psrai_d: {
-    SDValue ShAmt = Op.getOperand(2);
-    if (isa<ConstantSDNode>(ShAmt))
-      return SDValue();
-
-    unsigned NewIntNo;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_mmx_pslli_w:
-      NewIntNo = Intrinsic::x86_mmx_psll_w;
-      break;
-    case Intrinsic::x86_mmx_pslli_d:
-      NewIntNo = Intrinsic::x86_mmx_psll_d;
-      break;
-    case Intrinsic::x86_mmx_pslli_q:
-      NewIntNo = Intrinsic::x86_mmx_psll_q;
-      break;
-    case Intrinsic::x86_mmx_psrli_w:
-      NewIntNo = Intrinsic::x86_mmx_psrl_w;
-      break;
-    case Intrinsic::x86_mmx_psrli_d:
-      NewIntNo = Intrinsic::x86_mmx_psrl_d;
-      break;
-    case Intrinsic::x86_mmx_psrli_q:
-      NewIntNo = Intrinsic::x86_mmx_psrl_q;
-      break;
-    case Intrinsic::x86_mmx_psrai_w:
-      NewIntNo = Intrinsic::x86_mmx_psra_w;
-      break;
-    case Intrinsic::x86_mmx_psrai_d:
-      NewIntNo = Intrinsic::x86_mmx_psra_d;
-      break;
-    }
-
-    // The vector shift intrinsics with scalars uses 32b shift amounts but
-    // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits
-    // to be zero.
-    ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt,
-                         DAG.getConstant(0, MVT::i32));
-// FIXME this must be lowered to get rid of the invalid type.
-
-    EVT VT = Op.getValueType();
-    ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
-    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-                       DAG.getConstant(NewIntNo, MVT::i32),
-                       Op.getOperand(1), ShAmt);
-  }
    case Intrinsic::x86_sse42_pcmpistria128:
    case Intrinsic::x86_sse42_pcmpestria128:
    case Intrinsic::x86_sse42_pcmpistric128:
@@ -9948,6 +10021,78 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
      SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
      return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
    }
+  case Intrinsic::x86_fma_vfmadd_ps:
+  case Intrinsic::x86_fma_vfmadd_pd:
+  case Intrinsic::x86_fma_vfmsub_ps:
+  case Intrinsic::x86_fma_vfmsub_pd:
+  case Intrinsic::x86_fma_vfnmadd_ps:
+  case Intrinsic::x86_fma_vfnmadd_pd:
+  case Intrinsic::x86_fma_vfnmsub_ps:
+  case Intrinsic::x86_fma_vfnmsub_pd:
+  case Intrinsic::x86_fma_vfmaddsub_ps:
+  case Intrinsic::x86_fma_vfmaddsub_pd:
+  case Intrinsic::x86_fma_vfmsubadd_ps:
+  case Intrinsic::x86_fma_vfmsubadd_pd:
+  case Intrinsic::x86_fma_vfmadd_ps_256:
+  case Intrinsic::x86_fma_vfmadd_pd_256:
+  case Intrinsic::x86_fma_vfmsub_ps_256:
+  case Intrinsic::x86_fma_vfmsub_pd_256:
+  case Intrinsic::x86_fma_vfnmadd_ps_256:
+  case Intrinsic::x86_fma_vfnmadd_pd_256:
+  case Intrinsic::x86_fma_vfnmsub_ps_256:
+  case Intrinsic::x86_fma_vfnmsub_pd_256:
+  case Intrinsic::x86_fma_vfmaddsub_ps_256:
+  case Intrinsic::x86_fma_vfmaddsub_pd_256:
+  case Intrinsic::x86_fma_vfmsubadd_ps_256:
+  case Intrinsic::x86_fma_vfmsubadd_pd_256: {
+    // Only lower intrinsics if FMA is enabled. FMA4 still uses patterns.
+    if (!Subtarget->hasFMA())
+      return SDValue();
+
+    unsigned Opc;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_fma_vfmadd_ps:
+    case Intrinsic::x86_fma_vfmadd_pd:
+    case Intrinsic::x86_fma_vfmadd_ps_256:
+    case Intrinsic::x86_fma_vfmadd_pd_256:
+      Opc = X86ISD::FMADD;
+      break;
+    case Intrinsic::x86_fma_vfmsub_ps:
+    case Intrinsic::x86_fma_vfmsub_pd:
+    case Intrinsic::x86_fma_vfmsub_ps_256:
+    case Intrinsic::x86_fma_vfmsub_pd_256:
+      Opc = X86ISD::FMSUB;
+      break;
+    case Intrinsic::x86_fma_vfnmadd_ps:
+    case Intrinsic::x86_fma_vfnmadd_pd:
+    case Intrinsic::x86_fma_vfnmadd_ps_256:
+    case Intrinsic::x86_fma_vfnmadd_pd_256:
+      Opc = X86ISD::FNMADD;
+      break;
+    case Intrinsic::x86_fma_vfnmsub_ps:
+    case Intrinsic::x86_fma_vfnmsub_pd:
+    case Intrinsic::x86_fma_vfnmsub_ps_256:
+    case Intrinsic::x86_fma_vfnmsub_pd_256:
+      Opc = X86ISD::FNMSUB;
+      break;
+    case Intrinsic::x86_fma_vfmaddsub_ps:
+    case Intrinsic::x86_fma_vfmaddsub_pd:
+    case Intrinsic::x86_fma_vfmaddsub_ps_256:
+    case Intrinsic::x86_fma_vfmaddsub_pd_256:
+      Opc = X86ISD::FMADDSUB;
+      break;
+    case Intrinsic::x86_fma_vfmsubadd_ps:
+    case Intrinsic::x86_fma_vfmsubadd_pd:
+    case Intrinsic::x86_fma_vfmsubadd_ps_256:
+    case Intrinsic::x86_fma_vfmsubadd_pd_256:
+      Opc = X86ISD::FMSUBADD;
+      break;
+    }
+
+    return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
+                       Op.getOperand(2), Op.getOperand(3));
+  }
    }
  }
  
@@ -11122,9 +11267,9 @@ static void ReplaceATOMIC_LOAD(SDNode *Node,
    Results.push_back(Swap.getValue(1));
  }
  
-void X86TargetLowering::
+static void
  ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
-                        SelectionDAG &DAG, unsigned NewOp) const {
+                        SelectionDAG &DAG, unsigned NewOp) {
    DebugLoc dl = Node->getDebugLoc();
    assert (Node->getValueType(0) == MVT::i64 &&
            "Only know how to expand i64 atomics");
@@ -11245,26 +11390,40 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
      return;
    }
    case ISD::ATOMIC_LOAD_ADD:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
-    return;
    case ISD::ATOMIC_LOAD_AND:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
-    return;
    case ISD::ATOMIC_LOAD_NAND:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
-    return;
    case ISD::ATOMIC_LOAD_OR:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
-    return;
    case ISD::ATOMIC_LOAD_SUB:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
-    return;
    case ISD::ATOMIC_LOAD_XOR:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
-    return;
-  case ISD::ATOMIC_SWAP:
-    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
+  case ISD::ATOMIC_SWAP: {
+    unsigned Opc;
+    switch (N->getOpcode()) {
+    default: llvm_unreachable("Unexpected opcode");
+    case ISD::ATOMIC_LOAD_ADD:
+      Opc = X86ISD::ATOMADD64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_AND:
+      Opc = X86ISD::ATOMAND64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_NAND:
+      Opc = X86ISD::ATOMNAND64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_OR:
+      Opc = X86ISD::ATOMOR64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_SUB:
+      Opc = X86ISD::ATOMSUB64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_XOR:
+      Opc = X86ISD::ATOMXOR64_DAG;
+      break;
+    case ISD::ATOMIC_SWAP:
+      Opc = X86ISD::ATOMSWAP64_DAG;
+      break;
+    }
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
      return;
+  }
    case ISD::ATOMIC_LOAD:
      ReplaceATOMIC_LOAD(N, Results, DAG);
    }
@@ -11324,6 +11483,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::FHSUB:              return "X86ISD::FHSUB";
    case X86ISD::FMAX:               return "X86ISD::FMAX";
    case X86ISD::FMIN:               return "X86ISD::FMIN";
+  case X86ISD::FMAXC:              return "X86ISD::FMAXC";
+  case X86ISD::FMINC:              return "X86ISD::FMINC";
    case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
    case X86ISD::FRCP:               return "X86ISD::FRCP";
    case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
@@ -11342,7 +11503,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
    case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
    case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
+  case X86ISD::VSEXT_MOVL:         return "X86ISD::VSEXT_MOVL";
    case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
+  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
    case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
    case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
    case X86ISD::VSHL:               return "X86ISD::VSHL";
@@ -12792,183 +12955,208 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
      // String/text processing lowering.
    case X86::PCMPISTRM128REG:
    case X86::VPCMPISTRM128REG:
-    return EmitPCMP(MI, BB, 3, false /* in-mem */);
    case X86::PCMPISTRM128MEM:
    case X86::VPCMPISTRM128MEM:
-    return EmitPCMP(MI, BB, 3, true /* in-mem */);
    case X86::PCMPESTRM128REG:
    case X86::VPCMPESTRM128REG:
-    return EmitPCMP(MI, BB, 5, false /* in mem */);
    case X86::PCMPESTRM128MEM:
-  case X86::VPCMPESTRM128MEM:
-    return EmitPCMP(MI, BB, 5, true /* in mem */);
+  case X86::VPCMPESTRM128MEM: {
+    unsigned NumArgs;
+    bool MemArg;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::PCMPISTRM128REG:
+    case X86::VPCMPISTRM128REG:
+      NumArgs = 3; MemArg = false; break;
+    case X86::PCMPISTRM128MEM:
+    case X86::VPCMPISTRM128MEM:
+      NumArgs = 3; MemArg = true; break;
+    case X86::PCMPESTRM128REG:
+    case X86::VPCMPESTRM128REG:
+      NumArgs = 5; MemArg = false; break;
+    case X86::PCMPESTRM128MEM:
+    case X86::VPCMPESTRM128MEM:
+      NumArgs = 5; MemArg = true; break;
+    }
+    return EmitPCMP(MI, BB, NumArgs, MemArg);
+  }
  
      // Thread synchronization.
    case X86::MONITOR:
      return EmitMonitor(MI, BB);
  
      // Atomic Lowering.
-  case X86::ATOMAND32:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
-                                               X86::AND32ri, X86::MOV32rm,
-                                               X86::LCMPXCHG32,
-                                               X86::NOT32r, X86::EAX,
-                                               &X86::GR32RegClass);
-  case X86::ATOMOR32:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
-                                               X86::OR32ri, X86::MOV32rm,
-                                               X86::LCMPXCHG32,
-                                               X86::NOT32r, X86::EAX,
-                                               &X86::GR32RegClass);
-  case X86::ATOMXOR32:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
-                                               X86::XOR32ri, X86::MOV32rm,
-                                               X86::LCMPXCHG32,
-                                               X86::NOT32r, X86::EAX,
-                                               &X86::GR32RegClass);
-  case X86::ATOMNAND32:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
-                                               X86::AND32ri, X86::MOV32rm,
-                                               X86::LCMPXCHG32,
-                                               X86::NOT32r, X86::EAX,
-                                               &X86::GR32RegClass, true);
    case X86::ATOMMIN32:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
    case X86::ATOMMAX32:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
    case X86::ATOMUMIN32:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
    case X86::ATOMUMAX32:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
+  case X86::ATOMMIN16:
+  case X86::ATOMMAX16:
+  case X86::ATOMUMIN16:
+  case X86::ATOMUMAX16:
+  case X86::ATOMMIN64:
+  case X86::ATOMMAX64:
+  case X86::ATOMUMIN64:
+  case X86::ATOMUMAX64: {
+    unsigned Opc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::ATOMMIN32:  Opc = X86::CMOVL32rr; break;
+    case X86::ATOMMAX32:  Opc = X86::CMOVG32rr; break;
+    case X86::ATOMUMIN32: Opc = X86::CMOVB32rr; break;
+    case X86::ATOMUMAX32: Opc = X86::CMOVA32rr; break;
+    case X86::ATOMMIN16:  Opc = X86::CMOVL16rr; break;
+    case X86::ATOMMAX16:  Opc = X86::CMOVG16rr; break;
+    case X86::ATOMUMIN16: Opc = X86::CMOVB16rr; break;
+    case X86::ATOMUMAX16: Opc = X86::CMOVA16rr; break;
+    case X86::ATOMMIN64:  Opc = X86::CMOVL64rr; break;
+    case X86::ATOMMAX64:  Opc = X86::CMOVG64rr; break;
+    case X86::ATOMUMIN64: Opc = X86::CMOVB64rr; break;
+    case X86::ATOMUMAX64: Opc = X86::CMOVA64rr; break;
+    // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
+    }
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, Opc);
+  }
+
+  case X86::ATOMAND32:
+  case X86::ATOMOR32:
+  case X86::ATOMXOR32:
+  case X86::ATOMNAND32: {
+    bool Invert = false;
+    unsigned RegOpc, ImmOpc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::ATOMAND32:
+      RegOpc = X86::AND32rr; ImmOpc = X86::AND32ri; break;
+    case X86::ATOMOR32:
+      RegOpc = X86::OR32rr;  ImmOpc = X86::OR32ri; break;
+    case X86::ATOMXOR32:
+      RegOpc = X86::XOR32rr; ImmOpc = X86::XOR32ri; break;
+    case X86::ATOMNAND32:
+      RegOpc = X86::AND32rr; ImmOpc = X86::AND32ri; Invert = true; break;
+    }
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
+                                               X86::MOV32rm, X86::LCMPXCHG32,
+                                               X86::NOT32r, X86::EAX,
+                                               &X86::GR32RegClass, Invert);
+  }
  
    case X86::ATOMAND16:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
-                                               X86::AND16ri, X86::MOV16rm,
-                                               X86::LCMPXCHG16,
-                                               X86::NOT16r, X86::AX,
-                                               &X86::GR16RegClass);
    case X86::ATOMOR16:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
-                                               X86::OR16ri, X86::MOV16rm,
-                                               X86::LCMPXCHG16,
-                                               X86::NOT16r, X86::AX,
-                                               &X86::GR16RegClass);
    case X86::ATOMXOR16:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
-                                               X86::XOR16ri, X86::MOV16rm,
-                                               X86::LCMPXCHG16,
-                                               X86::NOT16r, X86::AX,
-                                               &X86::GR16RegClass);
-  case X86::ATOMNAND16:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
-                                               X86::AND16ri, X86::MOV16rm,
-                                               X86::LCMPXCHG16,
+  case X86::ATOMNAND16: {
+    bool Invert = false;
+    unsigned RegOpc, ImmOpc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::ATOMAND16:
+      RegOpc = X86::AND16rr; ImmOpc = X86::AND16ri; break;
+    case X86::ATOMOR16:
+      RegOpc = X86::OR16rr;  ImmOpc = X86::OR16ri; break;
+    case X86::ATOMXOR16:
+      RegOpc = X86::XOR16rr; ImmOpc = X86::XOR16ri; break;
+    case X86::ATOMNAND16:
+      RegOpc = X86::AND16rr; ImmOpc = X86::AND16ri; Invert = true; break;
+    }
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
+                                               X86::MOV16rm, X86::LCMPXCHG16,
                                                 X86::NOT16r, X86::AX,
-                                               &X86::GR16RegClass, true);
-  case X86::ATOMMIN16:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
-  case X86::ATOMMAX16:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
-  case X86::ATOMUMIN16:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
-  case X86::ATOMUMAX16:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
+                                               &X86::GR16RegClass, Invert);
+  }
  
    case X86::ATOMAND8:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
-                                               X86::AND8ri, X86::MOV8rm,
-                                               X86::LCMPXCHG8,
-                                               X86::NOT8r, X86::AL,
-                                               &X86::GR8RegClass);
    case X86::ATOMOR8:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
-                                               X86::OR8ri, X86::MOV8rm,
-                                               X86::LCMPXCHG8,
-                                               X86::NOT8r, X86::AL,
-                                               &X86::GR8RegClass);
    case X86::ATOMXOR8:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
-                                               X86::XOR8ri, X86::MOV8rm,
-                                               X86::LCMPXCHG8,
-                                               X86::NOT8r, X86::AL,
-                                               &X86::GR8RegClass);
-  case X86::ATOMNAND8:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
-                                               X86::AND8ri, X86::MOV8rm,
-                                               X86::LCMPXCHG8,
+  case X86::ATOMNAND8: {
+    bool Invert = false;
+    unsigned RegOpc, ImmOpc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::ATOMAND8:
+      RegOpc = X86::AND8rr; ImmOpc = X86::AND8ri; break;
+    case X86::ATOMOR8:
+      RegOpc = X86::OR8rr;  ImmOpc = X86::OR8ri; break;
+    case X86::ATOMXOR8:
+      RegOpc = X86::XOR8rr; ImmOpc = X86::XOR8ri; break;
+    case X86::ATOMNAND8:
+      RegOpc = X86::AND8rr; ImmOpc = X86::AND8ri; Invert = true; break;
+    }
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
+                                               X86::MOV8rm, X86::LCMPXCHG8,
                                                 X86::NOT8r, X86::AL,
-                                               &X86::GR8RegClass, true);
-  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
+                                               &X86::GR8RegClass, Invert);
+  }
+
    // This group is for 64-bit host.
    case X86::ATOMAND64:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
-                                               X86::AND64ri32, X86::MOV64rm,
-                                               X86::LCMPXCHG64,
-                                               X86::NOT64r, X86::RAX,
-                                               &X86::GR64RegClass);
    case X86::ATOMOR64:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
-                                               X86::OR64ri32, X86::MOV64rm,
-                                               X86::LCMPXCHG64,
-                                               X86::NOT64r, X86::RAX,
-                                               &X86::GR64RegClass);
    case X86::ATOMXOR64:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
-                                               X86::XOR64ri32, X86::MOV64rm,
-                                               X86::LCMPXCHG64,
-                                               X86::NOT64r, X86::RAX,
-                                               &X86::GR64RegClass);
-  case X86::ATOMNAND64:
-    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
-                                               X86::AND64ri32, X86::MOV64rm,
-                                               X86::LCMPXCHG64,
+  case X86::ATOMNAND64: {
+    bool Invert = false;
+    unsigned RegOpc, ImmOpc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::ATOMAND64:
+      RegOpc = X86::AND64rr; ImmOpc = X86::AND64ri32; break;
+    case X86::ATOMOR64:
+      RegOpc = X86::OR64rr;  ImmOpc = X86::OR64ri32; break;
+    case X86::ATOMXOR64:
+      RegOpc = X86::XOR64rr; ImmOpc = X86::XOR64ri32; break;
+    case X86::ATOMNAND64:
+      RegOpc = X86::AND64rr; ImmOpc = X86::AND64ri32; Invert = true; break;
+    }
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
+                                               X86::MOV64rm, X86::LCMPXCHG64,
                                                 X86::NOT64r, X86::RAX,
-                                               &X86::GR64RegClass, true);
-  case X86::ATOMMIN64:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
-  case X86::ATOMMAX64:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
-  case X86::ATOMUMIN64:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
-  case X86::ATOMUMAX64:
-    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
+                                               &X86::GR64RegClass, Invert);
+  }
  
    // This group does 64-bit operations on a 32-bit host.
    case X86::ATOMAND6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::AND32rr, X86::AND32rr,
-                                               X86::AND32ri, X86::AND32ri,
-                                               false);
    case X86::ATOMOR6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::OR32rr, X86::OR32rr,
-                                               X86::OR32ri, X86::OR32ri,
-                                               false);
    case X86::ATOMXOR6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::XOR32rr, X86::XOR32rr,
-                                               X86::XOR32ri, X86::XOR32ri,
-                                               false);
    case X86::ATOMNAND6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::AND32rr, X86::AND32rr,
-                                               X86::AND32ri, X86::AND32ri,
-                                               true);
    case X86::ATOMADD6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::ADD32rr, X86::ADC32rr,
-                                               X86::ADD32ri, X86::ADC32ri,
-                                               false);
    case X86::ATOMSUB6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::SUB32rr, X86::SBB32rr,
-                                               X86::SUB32ri, X86::SBB32ri,
-                                               false);
-  case X86::ATOMSWAP6432:
-    return EmitAtomicBit6432WithCustomInserter(MI, BB,
-                                               X86::MOV32rr, X86::MOV32rr,
-                                               X86::MOV32ri, X86::MOV32ri,
-                                               false);
+  case X86::ATOMSWAP6432: {
+    bool Invert = false;
+    unsigned RegOpcL, RegOpcH, ImmOpcL, ImmOpcH;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::ATOMAND6432:
+      RegOpcL = RegOpcH = X86::AND32rr;
+      ImmOpcL = ImmOpcH = X86::AND32ri;
+      break;
+    case X86::ATOMOR6432:
+      RegOpcL = RegOpcH = X86::OR32rr;
+      ImmOpcL = ImmOpcH = X86::OR32ri;
+      break;
+    case X86::ATOMXOR6432:
+      RegOpcL = RegOpcH = X86::XOR32rr;
+      ImmOpcL = ImmOpcH = X86::XOR32ri;
+      break;
+    case X86::ATOMNAND6432:
+      RegOpcL = RegOpcH = X86::AND32rr;
+      ImmOpcL = ImmOpcH = X86::AND32ri;
+      Invert = true;
+      break;
+    case X86::ATOMADD6432:
+      RegOpcL = X86::ADD32rr; RegOpcH = X86::ADC32rr;
+      ImmOpcL = X86::ADD32ri; ImmOpcH = X86::ADC32ri;
+      break;
+    case X86::ATOMSUB6432:
+      RegOpcL = X86::SUB32rr; RegOpcH = X86::SBB32rr;
+      ImmOpcL = X86::SUB32ri; ImmOpcH = X86::SBB32ri;
+      break;
+    case X86::ATOMSWAP6432:
+      RegOpcL = RegOpcH = X86::MOV32rr;
+      ImmOpcL = ImmOpcH = X86::MOV32ri;
+      break;
+    }
+    return EmitAtomicBit6432WithCustomInserter(MI, BB, RegOpcL, RegOpcH,
+                                               ImmOpcL, ImmOpcH, Invert);
+  }
+
    case X86::VASTART_SAVE_XMM_REGS:
      return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
  
@@ -13266,8 +13454,9 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
      // PSHUFD
      static const int ShufMask1[] = {0, 2, 0, 0};
  
-    OpLo = DAG.getVectorShuffle(VT, dl, OpLo, DAG.getUNDEF(VT), ShufMask1);
-    OpHi = DAG.getVectorShuffle(VT, dl, OpHi, DAG.getUNDEF(VT), ShufMask1);
+    SDValue Undef = DAG.getUNDEF(VT);
+    OpLo = DAG.getVectorShuffle(VT, dl, OpLo, Undef, ShufMask1);
+    OpHi = DAG.getVectorShuffle(VT, dl, OpHi, Undef, ShufMask1);
  
      // MOVLHPS
      static const int ShufMask2[] = {0, 1, 4, 5};
@@ -13325,10 +13514,9 @@ SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
      static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
                                     -1, -1, -1, -1, -1, -1, -1, -1};
  
-    OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, DAG.getUNDEF(MVT::v16i8),
-                                ShufMask1);
-    OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, DAG.getUNDEF(MVT::v16i8),
-                                ShufMask1);
+    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
+    OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, Undef, ShufMask1);
+    OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, Undef, ShufMask1);
  
      OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
      OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
@@ -13848,7 +14036,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
  //
  // where Op could be BRCOND or CMOV.
  //
-static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
+static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
    // Quit if not CMP and SUB with its value result used.
    if (Cmp.getOpcode() != X86ISD::CMP &&
        (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
@@ -13899,7 +14087,85 @@ static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
    return SetCC.getOperand(1);
  }
  
-static bool IsValidFCMOVCondition(X86::CondCode CC) {
+/// checkFlaggedOrCombine - DAG combination on X86ISD::OR, i.e. with EFLAGS
+/// updated. If only flag result is used and the result is evaluated from a
+/// series of element extraction, try to combine it into a PTEST.
+static SDValue checkFlaggedOrCombine(SDValue Or, X86::CondCode &CC,
+                                     SelectionDAG &DAG,
+                                     const X86Subtarget *Subtarget) {
+  SDNode *N = Or.getNode();
+  DebugLoc DL = N->getDebugLoc();
+
+  // Only SSE4.1 and beyond supports PTEST or like.
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  if (N->getOpcode() != X86ISD::OR)
+    return SDValue();
+
+  // Quit if the value result of OR is used.
+  if (N->hasAnyUseOfValue(0))
+    return SDValue();
+
+  // Quit if not used as a boolean value.
+  if (CC != X86::COND_E && CC != X86::COND_NE)
+    return SDValue();
+
+  SmallVector<SDValue, 8> Opnds;
+  SDValue VecIn;
+  EVT VT = MVT::Other;
+  unsigned Mask = 0;
+
+  // Recognize a special case where a vector is casted into wide integer to
+  // test all 0s.
+  Opnds.push_back(N->getOperand(0));
+  Opnds.push_back(N->getOperand(1));
+
+  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
+    SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
+    // BFS traverse all OR'd operands.
+    if (I->getOpcode() == ISD::OR) {
+      Opnds.push_back(I->getOperand(0));
+      Opnds.push_back(I->getOperand(1));
+      // Re-evaluate the number of nodes to be traversed.
+      e += 2;
+      continue;
+    }
+
+    // Quit if a non-EXTRACT_VECTOR_ELT
+    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // Quit if without a constant index.
+    SDValue Idx = I->getOperand(1);
+    if (!isa<ConstantSDNode>(Idx))
+      return SDValue();
+
+    // Check if all elements are extracted from the same vector.
+    SDValue ExtractedFromVec = I->getOperand(0);
+    if (VecIn.getNode() == 0) {
+      VT = ExtractedFromVec.getValueType();
+      // FIXME: only 128-bit vector is supported so far.
+      if (!VT.is128BitVector())
+        return SDValue();
+      VecIn = ExtractedFromVec;
+    } else if (VecIn != ExtractedFromVec)
+      return SDValue();
+
+    // Record the constant index.
+    Mask |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
+  }
+
+  assert(VT.is128BitVector() && "Only 128-bit vector PTEST is supported so far.");
+
+  // Quit if not all elements are used.
+  if (Mask != (1U << VT.getVectorNumElements()) - 1U)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIn, VecIn);
+}
+
+static bool isValidFCMOVCondition(X86::CondCode CC) {
    switch (CC) {
    default:
      return false;
@@ -13917,7 +14183,8 @@ static bool IsValidFCMOVCondition(X86::CondCode CC) {
  
  /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
  static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
-                                  TargetLowering::DAGCombinerInfo &DCI) {
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
    DebugLoc DL = N->getDebugLoc();
  
    // If the flag operand isn't dead, don't touch this CMOV.
@@ -13942,10 +14209,18 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
  
    SDValue Flags;
  
-  Flags = BoolTestSetCCCombine(Cond, CC);
+  Flags = checkBoolTestSetCCCombine(Cond, CC);
    if (Flags.getNode() &&
        // Extra check as FCMOV only supports a subset of X86 cond.
-      (FalseOp.getValueType() != MVT::f80 || IsValidFCMOVCondition(CC))) {
+      (FalseOp.getValueType() != MVT::f80 || isValidFCMOVCondition(CC))) {
+    SDValue Ops[] = { FalseOp, TrueOp,
+                      DAG.getConstant(CC, MVT::i8), Flags };
+    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
+                       Ops, array_lengthof(Ops));
+  }
+
+  Flags = checkFlaggedOrCombine(Cond, CC, DAG, Subtarget);
+  if (Flags.getNode()) {
      SDValue Ops[] = { FalseOp, TrueOp,
                        DAG.getConstant(CC, MVT::i8), Flags };
      return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
@@ -15202,6 +15477,29 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
    return SDValue();
  }
  
+/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
+/// X86ISD::FMAX nodes.
+static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
+
+  // Only perform optimizations if UnsafeMath is used.
+  if (!DAG.getTarget().Options.UnsafeFPMath)
+    return SDValue();
+
+  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
+  // into FMINC and MMAXC, which are Commutative operations.
+  unsigned NewOp = 0;
+  switch (N->getOpcode()) {
+    default: llvm_unreachable("unknown opcode");
+    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
+    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
+  }
+
+  return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0),
+                     N->getOperand(0), N->getOperand(1));
+}
+
+
  /// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
  static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
    // FAND(0.0, x) -> 0.0
@@ -15277,19 +15575,19 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
      // concat the vectors to original VT
  
      unsigned NumElems = OpVT.getVectorNumElements();
+    SDValue Undef = DAG.getUNDEF(OpVT);
+
      SmallVector<int,8> ShufMask1(NumElems, -1);
      for (unsigned i = 0; i != NumElems/2; ++i)
        ShufMask1[i] = i;
  
-    SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
-                                        &ShufMask1[0]);
+    SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask1[0]);
  
      SmallVector<int,8> ShufMask2(NumElems, -1);
      for (unsigned i = 0; i != NumElems/2; ++i)
        ShufMask2[i] = i + NumElems/2;
  
-    SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, DAG.getUNDEF(OpVT),
-                                        &ShufMask2[0]);
+    SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask2[0]);
  
      EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
                                    VT.getVectorNumElements()/2);
@@ -15430,7 +15728,9 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
  }
  
  // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
-static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
    DebugLoc DL = N->getDebugLoc();
    X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
    SDValue EFLAGS = N->getOperand(1);
@@ -15446,7 +15746,13 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
  
    SDValue Flags;
  
-  Flags = BoolTestSetCCCombine(EFLAGS, CC);
+  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+  if (Flags.getNode()) {
+    SDValue Cond = DAG.getConstant(CC, MVT::i8);
+    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
+  }
+
+  Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
    if (Flags.getNode()) {
      SDValue Cond = DAG.getConstant(CC, MVT::i8);
      return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
@@ -15468,7 +15774,14 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
  
    SDValue Flags;
  
-  Flags = BoolTestSetCCCombine(EFLAGS, CC);
+  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+  if (Flags.getNode()) {
+    SDValue Cond = DAG.getConstant(CC, MVT::i8);
+    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
+                       Flags);
+  }
+
+  Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
    if (Flags.getNode()) {
      SDValue Cond = DAG.getConstant(CC, MVT::i8);
      return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
@@ -15663,7 +15976,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
      return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
    case ISD::VSELECT:
    case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
-  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
+  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
    case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
    case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
    case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
@@ -15683,6 +15996,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
    case X86ISD::FXOR:
    case X86ISD::FOR:         return PerformFORCombine(N, DAG);
+  case X86ISD::FMIN:
+  case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
    case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
    case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
    case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
@@ -15691,7 +16006,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
    case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG, DCI);
    case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG);
-  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG);
+  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
    case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
    case X86ISD::SHUFP:       // Handle all target specific shuffles
    case X86ISD::PALIGN: