indentation

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 8ce28bf9ba14a42c1a775d905d90c870bdf06321..7c92e80147ed9acf0ce754ac3575e2acfaed3f80 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -38,8 +38,12 @@
  #include "llvm/Target/TargetOptions.h"
  #include "llvm/ADT/SmallSet.h"
  #include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/CommandLine.h"
  using namespace llvm;
  
+static cl::opt<bool>
+DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));
+
  // Forward declarations.
  static SDValue getMOVLMask(unsigned NumElems, SelectionDAG &DAG);
  
@@ -59,7 +63,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  
    // X86 is weird, it always uses i8 for shift amounts and setcc results.
    setShiftAmountType(MVT::i8);
-  setSetCCResultContents(ZeroOrOneSetCCResult);
+  setBooleanContents(ZeroOrOneBooleanContent);
    setSchedulingPreference(SchedulingForRegPressure);
    setShiftAmountFlavor(Mask);   // shl X, 32 == shl X, 0
    setStackPointerRegisterToSaveRestore(X86StackPtr);
@@ -84,7 +88,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    if (Subtarget->is64Bit())
      addRegisterClass(MVT::i64, X86::GR64RegisterClass);
  
-  setLoadXAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
  
    // We don't accept any truncstore of integer registers.  
    setTruncStoreAction(MVT::i64, MVT::i32, Expand);
@@ -92,7 +96,15 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
    setTruncStoreAction(MVT::i32, MVT::i16, Expand);
    setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
-  setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
+
+  // SETOEQ and SETUNE require checking two conditions.
+  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
+  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
  
    // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
    // operation.
@@ -104,10 +116,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
      setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
    } else {
-    if (X86ScalarSSEf64)
+    if (X86ScalarSSEf64) {
+      // We have an impenetrably clever algorithm for ui64->double only.
+      setOperationAction(ISD::UINT_TO_FP   , MVT::i64  , Custom);
        // If SSE i64 SINT_TO_FP is not available, expand i32 UINT_TO_FP.
        setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Expand);
-    else
+    } else
        setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
    }
  
@@ -292,15 +306,25 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::MEMBARRIER    , MVT::Other, Expand);
  
    // Expand certain atomics
-  setOperationAction(ISD::ATOMIC_CMP_SWAP_8 , MVT::i8, Custom);
-  setOperationAction(ISD::ATOMIC_CMP_SWAP_16, MVT::i16, Custom);
-  setOperationAction(ISD::ATOMIC_CMP_SWAP_32, MVT::i32, Custom);
-  setOperationAction(ISD::ATOMIC_CMP_SWAP_64, MVT::i64, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
  
-  setOperationAction(ISD::ATOMIC_LOAD_SUB_8, MVT::i8, Expand);
-  setOperationAction(ISD::ATOMIC_LOAD_SUB_16, MVT::i16, Expand);
-  setOperationAction(ISD::ATOMIC_LOAD_SUB_32, MVT::i32, Expand);
-  setOperationAction(ISD::ATOMIC_LOAD_SUB_64, MVT::i64, Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+
+  if (!Subtarget->is64Bit()) {
+    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
+  }
  
    // Use the default ISD::DBG_STOPPOINT, ISD::DECLARE expansion.
    setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
@@ -472,13 +496,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
+    bool ignored;
      APFloat TmpFlt(+0.0);
-    TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven);
+    TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
+                   &ignored);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
      APFloat TmpFlt2(+1.0);
-    TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven);
+    TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
+                    &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
@@ -494,24 +521,15 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
    setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
  
-  setOperationAction(ISD::FLOG, MVT::f32, Expand);
-  setOperationAction(ISD::FLOG, MVT::f64, Expand);
    setOperationAction(ISD::FLOG, MVT::f80, Expand);
-  setOperationAction(ISD::FLOG2, MVT::f32, Expand);
-  setOperationAction(ISD::FLOG2, MVT::f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::f80, Expand);
-  setOperationAction(ISD::FLOG10, MVT::f32, Expand);
-  setOperationAction(ISD::FLOG10, MVT::f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::f80, Expand);
-  setOperationAction(ISD::FEXP, MVT::f32, Expand);
-  setOperationAction(ISD::FEXP, MVT::f64, Expand);
    setOperationAction(ISD::FEXP, MVT::f80, Expand);
-  setOperationAction(ISD::FEXP2, MVT::f32, Expand);
-  setOperationAction(ISD::FEXP2, MVT::f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  
-  // First set operation action for all vector types to expand. Then we
-  // will selectively turn on ones that can be effectively codegen'd.
+  // First set operation action for all vector types to either promote
+  // (for widening) or expand (for scalarization). Then we will selectively
+  // turn on ones that can be effectively codegen'd.
    for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
      setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
@@ -552,9 +570,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    }
  
-  if (Subtarget->hasMMX()) {
+  if (!DisableMMX && Subtarget->hasMMX()) {
      addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass);
      addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
      addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
@@ -627,6 +650,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Custom);
  
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i16, Custom);
+
+    setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
+    setOperationAction(ISD::TRUNCATE,           MVT::v8i8, Expand);
+    setOperationAction(ISD::SELECT,             MVT::v8i8, Promote);
+    setOperationAction(ISD::SELECT,             MVT::v4i16, Promote);
+    setOperationAction(ISD::SELECT,             MVT::v2i32, Promote);
+    setOperationAction(ISD::SELECT,             MVT::v1i64, Custom);
    }
  
    if (Subtarget->hasSSE1()) {
@@ -657,6 +687,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
      setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
      setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
+    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
      setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
      setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
      setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
@@ -728,7 +759,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    if (Subtarget->hasSSE41()) {
      // FIXME: Do we need to handle scalar-to-vector here?
      setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
-    setOperationAction(ISD::MUL,                MVT::v2i64, Legal);
  
      // i8 and i16 vectors are custom , because the source register and source
      // source memory operand types are not the same width.  f32 vectors are
@@ -757,6 +787,20 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    // We want to custom lower some of our intrinsics.
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  
+  // Add/Sub/Mul with overflow operations are custom lowered.
+  setOperationAction(ISD::SADDO, MVT::i32, Custom);
+  setOperationAction(ISD::SADDO, MVT::i64, Custom);
+  setOperationAction(ISD::UADDO, MVT::i32, Custom);
+  setOperationAction(ISD::UADDO, MVT::i64, Custom);
+  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
+  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
+  setOperationAction(ISD::USUBO, MVT::i32, Custom);
+  setOperationAction(ISD::USUBO, MVT::i64, Custom);
+  setOperationAction(ISD::SMULO, MVT::i32, Custom);
+  setOperationAction(ISD::SMULO, MVT::i64, Custom);
+  setOperationAction(ISD::UMULO, MVT::i32, Custom);
+  setOperationAction(ISD::UMULO, MVT::i64, Custom);
+
    // We have target-specific dag combine patterns for the following nodes:
    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
    setTargetDAGCombine(ISD::BUILD_VECTOR);
@@ -832,10 +876,15 @@ unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  MVT
  X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
                                         bool isSrcConst, bool isSrcStr) const {
-  if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
-    return MVT::v4i32;
-  if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
-    return MVT::v4f32;
+  // FIXME: This disables xmm stores for memset/memcpy on targets whose stack
+  // alignment is below 16 bytes (e.g. Linux), because the stack realignment
+  // code can't handle cases like PR2962.  Remove this once PR2962 is fixed.
+  if (Subtarget->getStackAlignment() >= 16) {
+    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
+      return MVT::v4i32;
+    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
+      return MVT::v4f32;
+  }
    if (Subtarget->is64Bit() && Size >= 8)
      return MVT::i64;
    return MVT::i32;
@@ -885,7 +934,7 @@ SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
      SDValue TargetAddress = TailCall.getOperand(1);
      SDValue StackAdjustment = TailCall.getOperand(2);
      assert(((TargetAddress.getOpcode() == ISD::Register &&
-               (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::ECX ||
+               (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::EAX ||
                  cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R9)) ||
                TargetAddress.getOpcode() == ISD::TargetExternalSymbol ||
                TargetAddress.getOpcode() == ISD::TargetGlobalAddress) && 
@@ -972,12 +1021,12 @@ SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
  /// being lowered.  The returns a SDNode with the same number of values as the
  /// ISD::CALL.
  SDNode *X86TargetLowering::
-LowerCallResult(SDValue Chain, SDValue InFlag, SDNode *TheCall, 
+LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall, 
                  unsigned CallingConv, SelectionDAG &DAG) {
    
    // Assign locations to each value returned by this call.
    SmallVector<CCValAssign, 16> RVLocs;
-  bool isVarArg = cast<ConstantSDNode>(TheCall->getOperand(2))->getValue() != 0;
+  bool isVarArg = TheCall->isVarArg();
    CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
    CCInfo.AnalyzeCallResult(TheCall, RetCC_X86);
  
@@ -1014,8 +1063,8 @@ LowerCallResult(SDValue Chain, SDValue InFlag, SDNode *TheCall,
  
    // Merge everything together with a MERGE_VALUES node.
    ResultVals.push_back(Chain);
-  return DAG.getMergeValues(TheCall->getVTList(), &ResultVals[0],
-                            ResultVals.size()).getNode();
+  return DAG.getNode(ISD::MERGE_VALUES, TheCall->getVTList(), &ResultVals[0],
+                     ResultVals.size()).getNode();
  }
  
  
@@ -1042,12 +1091,12 @@ static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
  
  /// CallIsStructReturn - Determines whether a CALL node uses struct return
  /// semantics.
-static bool CallIsStructReturn(SDValue Op) {
-  unsigned NumOps = (Op.getNumOperands() - 5) / 2;
+static bool CallIsStructReturn(CallSDNode *TheCall) {
+  unsigned NumOps = TheCall->getNumArgs();
    if (!NumOps)
      return false;
  
-  return cast<ARG_FLAGSSDNode>(Op.getOperand(6))->getArgFlags().isSRet();
+  return TheCall->getArgFlags(0).isSRet();
  }
  
  /// ArgsAreStructReturn - Determines whether a FORMAL_ARGUMENTS node uses struct
@@ -1063,12 +1112,11 @@ static bool ArgsAreStructReturn(SDValue Op) {
  /// IsCalleePop - Determines whether a CALL or FORMAL_ARGUMENTS node requires
  /// the callee to pop its own arguments. Callee pop is necessary to support tail
  /// calls.
-bool X86TargetLowering::IsCalleePop(SDValue Op) {
-  bool IsVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+bool X86TargetLowering::IsCalleePop(bool IsVarArg, unsigned CallingConv) {
    if (IsVarArg)
      return false;
  
-  switch (cast<ConstantSDNode>(Op.getOperand(1))->getValue()) {
+  switch (CallingConv) {
    default:
      return false;
    case CallingConv::X86_StdCall:
@@ -1080,11 +1128,9 @@ bool X86TargetLowering::IsCalleePop(SDValue Op) {
    }
  }
  
-/// CCAssignFnForNode - Selects the correct CCAssignFn for a CALL or
-/// FORMAL_ARGUMENTS node.
-CCAssignFn *X86TargetLowering::CCAssignFnForNode(SDValue Op) const {
-  unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
-  
+/// CCAssignFnForNode - Selects the correct CCAssignFn for the
+/// given CallingConvention value.
+CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const {
    if (Subtarget->is64Bit()) {
      if (Subtarget->isTargetWin64())
        return CC_X86_Win64_C;
@@ -1096,8 +1142,8 @@ CCAssignFn *X86TargetLowering::CCAssignFnForNode(SDValue Op) const {
  
    if (CC == CallingConv::X86_FastCall)
      return CC_X86_32_FastCall;
-  else if (CC == CallingConv::Fast && PerformTailCallOpt)
-    return CC_X86_32_TailCall;
+  else if (CC == CallingConv::Fast)
+    return CC_X86_32_FastCC;
    else
      return CC_X86_32_C;
  }
@@ -1106,7 +1152,7 @@ CCAssignFn *X86TargetLowering::CCAssignFnForNode(SDValue Op) const {
  /// apply to a MachineFunction containing a given FORMAL_ARGUMENTS node.
  NameDecorationStyle
  X86TargetLowering::NameDecorationForFORMAL_ARGUMENTS(SDValue Op) {
-  unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+  unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (CC == CallingConv::X86_FastCall)
      return FastCall;
    else if (CC == CallingConv::X86_StdCall)
@@ -1184,7 +1230,7 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
    
    MachineFrameInfo *MFI = MF.getFrameInfo();
    SDValue Root = Op.getOperand(0);
-  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
    unsigned CC = MF.getFunction()->getCallingConv();
    bool Is64Bit = Subtarget->is64Bit();
    bool IsWin64 = Subtarget->isTargetWin64();
@@ -1195,7 +1241,7 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
    // Assign locations to all of the incoming arguments.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
-  CCInfo.AnalyzeFormalArguments(Op.getNode(), CCAssignFnForNode(Op));
+  CCInfo.AnalyzeFormalArguments(Op.getNode(), CCAssignFnForNode(CC));
    
    SmallVector<SDValue, 8> ArgValues;
    unsigned LastVal = ~0U;
@@ -1380,13 +1426,13 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
    ArgValues.push_back(Root);
  
    // Some CCs need callee pop.
-  if (IsCalleePop(Op)) {
+  if (IsCalleePop(isVarArg, CC)) {
      BytesToPopOnReturn  = StackSize; // Callee pops everything.
      BytesCallerReserves = 0;
    } else {
      BytesToPopOnReturn  = 0; // Callee pops nothing.
      // If this is an sret function, the return should pop the hidden pointer.
-    if (!Is64Bit && ArgsAreStructReturn(Op))
+    if (!Is64Bit && CC != CallingConv::Fast && ArgsAreStructReturn(Op))
        BytesToPopOnReturn = 4;  
      BytesCallerReserves = StackSize;
    }
@@ -1400,21 +1446,19 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
    FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
  
    // Return the new list of results.
-  return DAG.getMergeValues(Op.getNode()->getVTList(), &ArgValues[0],
-                            ArgValues.size()).getValue(Op.getResNo());
+  return DAG.getNode(ISD::MERGE_VALUES, Op.getNode()->getVTList(),
+                     &ArgValues[0], ArgValues.size()).getValue(Op.getResNo());
  }
  
  SDValue
-X86TargetLowering::LowerMemOpCallTo(SDValue Op, SelectionDAG &DAG,
+X86TargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
                                      const SDValue &StackPtr,
                                      const CCValAssign &VA,
                                      SDValue Chain,
-                                    SDValue Arg) {
+                                    SDValue Arg, ISD::ArgFlagsTy Flags) {
    unsigned LocMemOffset = VA.getLocMemOffset();
    SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
    PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
-  ISD::ArgFlagsTy Flags =
-    cast<ARG_FLAGSSDNode>(Op.getOperand(6+2*VA.getValNo()))->getArgFlags();
    if (Flags.isByVal()) {
      return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG);
    }
@@ -1462,14 +1506,15 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
  
  SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
    MachineFunction &MF = DAG.getMachineFunction();
-  SDValue Chain       = Op.getOperand(0);
-  unsigned CC         = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
-  bool isVarArg       = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
-  bool IsTailCall     = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0
-                        && CC == CallingConv::Fast && PerformTailCallOpt;
-  SDValue Callee      = Op.getOperand(4);
+  CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
+  SDValue Chain       = TheCall->getChain();
+  unsigned CC         = TheCall->getCallingConv();
+  bool isVarArg       = TheCall->isVarArg();
+  bool IsTailCall     = TheCall->isTailCall() &&
+                        CC == CallingConv::Fast && PerformTailCallOpt;
+  SDValue Callee      = TheCall->getCallee();
    bool Is64Bit        = Subtarget->is64Bit();
-  bool IsStructRet    = CallIsStructReturn(Op);
+  bool IsStructRet    = CallIsStructReturn(TheCall);
  
    assert(!(isVarArg && CC == CallingConv::Fast) &&
           "Var args not supported with calling convention fastcc");
@@ -1477,11 +1522,11 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
    // Analyze operands of the call, assigning locations to each operand.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
-  CCInfo.AnalyzeCallOperands(Op.getNode(), CCAssignFnForNode(Op));
+  CCInfo.AnalyzeCallOperands(TheCall, CCAssignFnForNode(CC));
    
    // Get a count of how many bytes are to be pushed on the stack.
    unsigned NumBytes = CCInfo.getNextStackOffset();
-  if (IsTailCall)
+  if (PerformTailCallOpt && CC == CallingConv::Fast)
      NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
  
    int FPDiff = 0;
@@ -1497,7 +1542,7 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
        MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
    }
  
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes));
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
  
    SDValue RetAddrFrIdx;
    // Load return adress for tail calls.
@@ -1512,9 +1557,9 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
    // of tail call optimization arguments are handle later.
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      CCValAssign &VA = ArgLocs[i];
-    SDValue Arg = Op.getOperand(5+2*VA.getValNo());
-    bool isByVal = cast<ARG_FLAGSSDNode>(Op.getOperand(6+2*VA.getValNo()))->
-      getArgFlags().isByVal();
+    SDValue Arg = TheCall->getArg(i);
+    ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
+    bool isByVal = Flags.isByVal();
    
      // Promote the value if needed.
      switch (VA.getLocInfo()) {
@@ -1563,8 +1608,8 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
          if (StackPtr.getNode() == 0)
            StackPtr = DAG.getCopyFromReg(Chain, X86StackPtr, getPointerTy());
          
-        MemOpChains.push_back(LowerMemOpCallTo(Op, DAG, StackPtr, VA, Chain,
-                                               Arg));
+        MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
+                                               Chain, Arg, Flags));
        }
      }
    }
@@ -1602,7 +1647,7 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
    if (CallRequiresFnAddressInReg(Is64Bit, IsTailCall)) {
      // Note: The actual moving to ecx is done further down.
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
-    if (G &&  !G->getGlobal()->hasHiddenVisibility() &&
+    if (G && !G->getGlobal()->hasHiddenVisibility() &&
          !G->getGlobal()->hasProtectedVisibility())
        Callee =  LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
@@ -1643,10 +1688,8 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
        CCValAssign &VA = ArgLocs[i];
        if (!VA.isRegLoc()) {
          assert(VA.isMemLoc());
-        SDValue Arg = Op.getOperand(5+2*VA.getValNo());
-        SDValue FlagsOp = Op.getOperand(6+2*VA.getValNo());
-        ISD::ArgFlagsTy Flags =
-          cast<ARG_FLAGSSDNode>(FlagsOp)->getArgFlags();
+        SDValue Arg = TheCall->getArg(i);
+        ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
          // Create frame index.
          int32_t Offset = VA.getLocMemOffset()+FPDiff;
          uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
@@ -1695,11 +1738,12 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
      // non-JIT mode.
      if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
                                          getTargetMachine(), true))
-      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy(),
+                                          G->getOffset());
    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
      Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
    } else if (IsTailCall) {
-    unsigned Opc = Is64Bit ? X86::R9 : X86::ECX;
+    unsigned Opc = Is64Bit ? X86::R9 : X86::EAX;
  
      Chain = DAG.getCopyToReg(Chain, 
                               DAG.getRegister(Opc, getPointerTy()), 
@@ -1715,8 +1759,8 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
  
    if (IsTailCall) {
      Ops.push_back(Chain);
-    Ops.push_back(DAG.getIntPtrConstant(NumBytes));
-    Ops.push_back(DAG.getIntPtrConstant(0));
+    Ops.push_back(DAG.getIntPtrConstant(NumBytes, true));
+    Ops.push_back(DAG.getIntPtrConstant(0, true));
      if (InFlag.getNode())
        Ops.push_back(InFlag);
      Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
@@ -1756,7 +1800,7 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
      assert(InFlag.getNode() && 
             "Flag must be set. Depend on flag being set in LowerRET");
      Chain = DAG.getNode(X86ISD::TAILCALL,
-                        Op.getNode()->getVTList(), &Ops[0], Ops.size());
+                        TheCall->getVTList(), &Ops[0], Ops.size());
        
      return SDValue(Chain.getNode(), Op.getResNo());
    }
@@ -1766,9 +1810,9 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
  
    // Create the CALLSEQ_END node.
    unsigned NumBytesForCalleeToPush;
-  if (IsCalleePop(Op))
+  if (IsCalleePop(isVarArg, CC))
      NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
-  else if (!Is64Bit && IsStructRet)
+  else if (!Is64Bit && CC != CallingConv::Fast && IsStructRet)
      // If this is is a call to a struct-return function, the callee
      // pops the hidden struct pointer, so we have to push it back.
      // This is common for Darwin/X86, Linux & Mingw32 targets.
@@ -1778,14 +1822,15 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
    
    // Returns a flag for retval copy to use.
    Chain = DAG.getCALLSEQ_END(Chain,
-                             DAG.getIntPtrConstant(NumBytes),
-                             DAG.getIntPtrConstant(NumBytesForCalleeToPush),
+                             DAG.getIntPtrConstant(NumBytes, true),
+                             DAG.getIntPtrConstant(NumBytesForCalleeToPush,
+                                                   true),
                               InFlag);
    InFlag = Chain.getValue(1);
  
    // Handle result values, copying them out of physregs into vregs that we
    // return.
-  return SDValue(LowerCallResult(Chain, InFlag, Op.getNode(), CC, DAG),
+  return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG),
                   Op.getResNo());
  }
  
@@ -1847,18 +1892,18 @@ unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
  /// following the call is a return. A function is eligible if caller/callee
  /// calling conventions match, currently only fastcc supports tail calls, and
  /// the function CALL is immediatly followed by a RET.
-bool X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Call,
+bool X86TargetLowering::IsEligibleForTailCallOptimization(CallSDNode *TheCall,
                                                        SDValue Ret,
                                                        SelectionDAG& DAG) const {
    if (!PerformTailCallOpt)
      return false;
  
-  if (CheckTailCallReturnConstraints(Call, Ret)) {
+  if (CheckTailCallReturnConstraints(TheCall, Ret)) {
      MachineFunction &MF = DAG.getMachineFunction();
      unsigned CallerCC = MF.getFunction()->getCallingConv();
-    unsigned CalleeCC = cast<ConstantSDNode>(Call.getOperand(1))->getValue();
+    unsigned CalleeCC= TheCall->getCallingConv();
      if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
-      SDValue Callee = Call.getOperand(4);
+      SDValue Callee = TheCall->getCallee();
        // On x86/32Bit PIC/GOT  tail calls are supported.
        if (getTargetMachine().getRelocationModel() != Reloc::PIC_ ||
            !Subtarget->isPICStyleGOT()|| !Subtarget->is64Bit())
@@ -1877,10 +1922,20 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Call,
  
  FastISel *
  X86TargetLowering::createFastISel(MachineFunction &mf,
+                                  MachineModuleInfo *mmo,
                                    DenseMap<const Value *, unsigned> &vm,
                                    DenseMap<const BasicBlock *,
-                                           MachineBasicBlock *> &bm) {
-  return X86::createFastISel(mf, vm, bm);
+                                           MachineBasicBlock *> &bm,
+                                  DenseMap<const AllocaInst *, int> &am
+#ifndef NDEBUG
+                                  , SmallSet<Instruction*, 8> &cil
+#endif
+                                  ) {
+  return X86::createFastISel(mf, mmo, vm, bm, am
+#ifndef NDEBUG
+                             , cil
+#endif
+                             );
  }
  
  
@@ -1924,7 +1979,7 @@ static bool translateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
          // X < 0   -> X == 0, jump on sign.
          X86CC = X86::COND_S;
          return true;
-      } else if (SetCCOpcode == ISD::SETLT && RHSC->getValue() == 1) {
+      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
          // X < 1   -> X <= 0
          RHS = DAG.getConstant(0, RHS.getValueType());
          X86CC = X86::COND_LE;
@@ -1933,7 +1988,7 @@ static bool translateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
      }
  
      switch (SetCCOpcode) {
-    default: break;
+    default: assert(0 && "Invalid integer condition!");
      case ISD::SETEQ:  X86CC = X86::COND_E;  break;
      case ISD::SETGT:  X86CC = X86::COND_G;  break;
      case ISD::SETGE:  X86CC = X86::COND_GE; break;
@@ -1945,75 +2000,55 @@ static bool translateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
      case ISD::SETULE: X86CC = X86::COND_BE; break;
      case ISD::SETUGE: X86CC = X86::COND_AE; break;
      }
-  } else {
-    // First determine if it requires or is profitable to flip the operands.
-    bool Flip = false;
-    switch (SetCCOpcode) {
-    default: break;
-    case ISD::SETOLT:
-    case ISD::SETOLE:
-    case ISD::SETUGT:
-    case ISD::SETUGE:
-      Flip = true;
-      break;
-    }
+    return true;
+  }
+  
+  // First determine if it is required or is profitable to flip the operands.
  
-    // If LHS is a foldable load, but RHS is not, flip the condition.
-    if (!Flip &&
-        (ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
-        !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
-      SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
-      Flip = true;
-    }
-    if (Flip)
-      std::swap(LHS, RHS);
-
-    // On a floating point condition, the flags are set as follows:
-    // ZF  PF  CF   op
-    //  0 | 0 | 0 | X > Y
-    //  0 | 0 | 1 | X < Y
-    //  1 | 0 | 0 | X == Y
-    //  1 | 1 | 1 | unordered
-    switch (SetCCOpcode) {
-    default: break;
-    case ISD::SETUEQ:
-    case ISD::SETEQ:
-      X86CC = X86::COND_E;
-      break;
-    case ISD::SETOLT:              // flipped
-    case ISD::SETOGT:
-    case ISD::SETGT:
-      X86CC = X86::COND_A;
-      break;
-    case ISD::SETOLE:              // flipped
-    case ISD::SETOGE:
-    case ISD::SETGE:
-      X86CC = X86::COND_AE;
-      break;
-    case ISD::SETUGT:              // flipped
-    case ISD::SETULT:
-    case ISD::SETLT:
-      X86CC = X86::COND_B;
-      break;
-    case ISD::SETUGE:              // flipped
-    case ISD::SETULE:
-    case ISD::SETLE:
-      X86CC = X86::COND_BE;
-      break;
-    case ISD::SETONE:
-    case ISD::SETNE:
-      X86CC = X86::COND_NE;
-      break;
-    case ISD::SETUO:
-      X86CC = X86::COND_P;
-      break;
-    case ISD::SETO:
-      X86CC = X86::COND_NP;
-      break;
-    }
+  // If LHS is a foldable load, but RHS is not, flip the condition.
+  if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
+      !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
+    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
+    std::swap(LHS, RHS);
+  }
+
+  switch (SetCCOpcode) {
+  default: break;
+  case ISD::SETOLT:
+  case ISD::SETOLE:
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+    std::swap(LHS, RHS);
+    break;
    }
  
-  return X86CC != X86::COND_INVALID;
+  // On a floating point condition, the flags are set as follows:
+  // ZF  PF  CF   op
+  //  0 | 0 | 0 | X > Y
+  //  0 | 0 | 1 | X < Y
+  //  1 | 0 | 0 | X == Y
+  //  1 | 1 | 1 | unordered
+  switch (SetCCOpcode) {
+  default: return false;
+  case ISD::SETUEQ:
+  case ISD::SETEQ:   X86CC = X86::COND_E; return true;
+  case ISD::SETOLT:              // flipped
+  case ISD::SETOGT:
+  case ISD::SETGT:   X86CC = X86::COND_A; return true;
+  case ISD::SETOLE:              // flipped
+  case ISD::SETOGE:
+  case ISD::SETGE:   X86CC = X86::COND_AE; return true;
+  case ISD::SETUGT:              // flipped
+  case ISD::SETULT:
+  case ISD::SETLT:   X86CC = X86::COND_B;  return true;
+  case ISD::SETUGE:              // flipped
+  case ISD::SETULE:
+  case ISD::SETLE:   X86CC = X86::COND_BE; return true;
+  case ISD::SETONE:
+  case ISD::SETNE:   X86CC = X86::COND_NE; return true;
+  case ISD::SETUO:   X86CC = X86::COND_P;  return true;
+  case ISD::SETO:    X86CC = X86::COND_NP; return true;
+  }
  }
  
  /// hasFPCMov - is there a floating point cmov for the specific X86 condition
@@ -2041,7 +2076,7 @@ static bool isUndefOrInRange(SDValue Op, unsigned Low, unsigned Hi) {
    if (Op.getOpcode() == ISD::UNDEF)
      return true;
  
-  unsigned Val = cast<ConstantSDNode>(Op)->getValue();
+  unsigned Val = cast<ConstantSDNode>(Op)->getZExtValue();
    return (Val >= Low && Val < Hi);
  }
  
@@ -2050,7 +2085,7 @@ static bool isUndefOrInRange(SDValue Op, unsigned Low, unsigned Hi) {
  static bool isUndefOrEqual(SDValue Op, unsigned Val) {
    if (Op.getOpcode() == ISD::UNDEF)
      return true;
-  return cast<ConstantSDNode>(Op)->getValue() == Val;
+  return cast<ConstantSDNode>(Op)->getZExtValue() == Val;
  }
  
  /// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand
@@ -2066,7 +2101,7 @@ bool X86::isPSHUFDMask(SDNode *N) {
      SDValue Arg = N->getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-    if (cast<ConstantSDNode>(Arg)->getValue() >= e)
+    if (cast<ConstantSDNode>(Arg)->getZExtValue() >= e)
        return false;
    }
  
@@ -2086,7 +2121,7 @@ bool X86::isPSHUFHWMask(SDNode *N) {
      SDValue Arg = N->getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-    if (cast<ConstantSDNode>(Arg)->getValue() != i)
+    if (cast<ConstantSDNode>(Arg)->getZExtValue() != i)
        return false;
    }
  
@@ -2095,7 +2130,7 @@ bool X86::isPSHUFHWMask(SDNode *N) {
      SDValue Arg = N->getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
      if (Val < 4 || Val > 7)
        return false;
    }
@@ -2411,7 +2446,7 @@ bool X86::isMOVSHDUPMask(SDNode *N) {
      SDValue Arg = N->getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
      if (Val != 1) return false;
    }
  
@@ -2420,7 +2455,7 @@ bool X86::isMOVSHDUPMask(SDNode *N) {
      SDValue Arg = N->getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
      if (Val != 3) return false;
      HasHi = true;
    }
@@ -2442,7 +2477,7 @@ bool X86::isMOVSLDUPMask(SDNode *N) {
      SDValue Arg = N->getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
      if (Val != 0) return false;
    }
  
@@ -2451,7 +2486,7 @@ bool X86::isMOVSLDUPMask(SDNode *N) {
      SDValue Arg = N->getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
      if (Val != 2) return false;
      HasHi = true;
    }
@@ -2499,9 +2534,26 @@ static bool isSplatMask(SDNode *N) {
    }
  
    // Make sure it is a splat of the first vector operand.
-  return cast<ConstantSDNode>(ElementBase)->getValue() < NumElems;
+  return cast<ConstantSDNode>(ElementBase)->getZExtValue() < NumElems;
+}
+
+/// getSplatMaskEltNo - Given a splat mask N, return its first operand that is
+/// a ConstantSDNode; that constant is the index of the element to splat.
+/// Non-constant operands are skipped.  Asserts if N is not a splat mask or if
+/// no constant operand exists.
+static SDValue getSplatMaskEltNo(SDNode *N) {
+  assert(isSplatMask(N) && "Not a splat mask");
+  unsigned NumElems = N->getNumOperands();
+  for (unsigned i = 0; i != NumElems; ++i) {
+    SDValue Elt = N->getOperand(i);
+    if (isa<ConstantSDNode>(Elt))
+      return Elt;
+  }
+  assert(0 && " No splat value found!");
+  return SDValue();
  }
  
+
  /// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies
  /// a splat of a single element and it's a 2 or 4 element mask.
  bool X86::isSplatMask(SDNode *N) {
@@ -2524,6 +2576,21 @@ bool X86::isSplatLoMask(SDNode *N) {
    return true;
  }
  
+/// isMOVDDUPMask - Return true if the BUILD_VECTOR mask N of a VECTOR_SHUFFLE
+/// is suitable for MOVDDUP: with Half = NumOperands / 2, operand i and operand
+/// Half+i must each be undef or the constant i.
+bool X86::isMOVDDUPMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  // Walk both halves in one pass; Idx % Half is the expected index.
+  const unsigned Half = N->getNumOperands() / 2;
+  for (unsigned Idx = 0, End = 2 * Half; Idx != End; ++Idx) {
+    if (!isUndefOrEqual(N->getOperand(Idx), Idx % Half))
+      return false;
+  }
+  return true;
+}
+
  /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
  /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
  /// instructions.
@@ -2535,7 +2602,7 @@ unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
      unsigned Val = 0;
      SDValue Arg = N->getOperand(NumOperands-i-1);
      if (Arg.getOpcode() != ISD::UNDEF)
-      Val = cast<ConstantSDNode>(Arg)->getValue();
+      Val = cast<ConstantSDNode>(Arg)->getZExtValue();
      if (Val >= NumOperands) Val -= NumOperands;
      Mask |= Val;
      if (i != NumOperands - 1)
@@ -2555,7 +2622,7 @@ unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
      unsigned Val = 0;
      SDValue Arg = N->getOperand(i);
      if (Arg.getOpcode() != ISD::UNDEF)
-      Val = cast<ConstantSDNode>(Arg)->getValue();
+      Val = cast<ConstantSDNode>(Arg)->getZExtValue();
      Mask |= (Val - 4);
      if (i != 4)
        Mask <<= 2;
@@ -2574,7 +2641,7 @@ unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
      unsigned Val = 0;
      SDValue Arg = N->getOperand(i);
      if (Arg.getOpcode() != ISD::UNDEF)
-      Val = cast<ConstantSDNode>(Arg)->getValue();
+      Val = cast<ConstantSDNode>(Arg)->getZExtValue();
      Mask |= Val;
      if (i != 0)
        Mask <<= 2;
@@ -2597,7 +2664,7 @@ static bool isPSHUFHW_PSHUFLWMask(SDNode *N) {
      SDValue Arg = N->getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
      if (Val >= 4)
        return false;
    }
@@ -2607,7 +2674,7 @@ static bool isPSHUFHW_PSHUFLWMask(SDNode *N) {
      SDValue Arg = N->getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
      if (Val < 4 || Val > 7)
        return false;
    }
@@ -2633,7 +2700,7 @@ static SDValue CommuteVectorShuffle(SDValue Op, SDValue &V1,
        continue;
      }
      assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
      if (Val < NumElems)
        MaskVec.push_back(DAG.getConstant(Val + NumElems, EltVT));
      else
@@ -2660,7 +2727,7 @@ SDValue CommuteVectorShuffleMask(SDValue Mask, SelectionDAG &DAG) {
        continue;
      }
      assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
      if (Val < NumElems)
        MaskVec.push_back(DAG.getConstant(Val + NumElems, EltVT));
      else
@@ -2691,15 +2758,14 @@ static bool ShouldXformToMOVHLPS(SDNode *Mask) {
  /// is promoted to a vector. It also returns the LoadSDNode by reference if
  /// required.
  static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
-  if (N->getOpcode() == ISD::SCALAR_TO_VECTOR) {
-    N = N->getOperand(0).getNode();
-    if (ISD::isNON_EXTLoad(N)) {
-      if (LD)
-        *LD = cast<LoadSDNode>(N);
-      return true;
-    }
-  }
-  return false;
+  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
+    return false;
+  N = N->getOperand(0).getNode();
+  if (!ISD::isNON_EXTLoad(N))
+    return false;
+  if (LD)
+    *LD = cast<LoadSDNode>(N);
+  return true;
  }
  
  /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
@@ -2753,7 +2819,7 @@ static bool isUndefShuffle(SDNode *N) {
    for (unsigned i = 0; i != NumElems; ++i) {
      SDValue Arg = Mask.getOperand(i);
      if (Arg.getOpcode() != ISD::UNDEF) {
-      unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+      unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
        if (Val < NumElems && V1.getOpcode() != ISD::UNDEF)
          return false;
        else if (Val >= NumElems && V2.getOpcode() != ISD::UNDEF)
@@ -2767,7 +2833,7 @@ static bool isUndefShuffle(SDNode *N) {
  /// constant +0.0.
  static inline bool isZeroNode(SDValue Elt) {
    return ((isa<ConstantSDNode>(Elt) &&
-           cast<ConstantSDNode>(Elt)->getValue() == 0) ||
+           cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
            (isa<ConstantFPSDNode>(Elt) &&
             cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
  }
@@ -2787,7 +2853,7 @@ static bool isZeroShuffle(SDNode *N) {
      if (Arg.getOpcode() == ISD::UNDEF)
        continue;
      
-    unsigned Idx = cast<ConstantSDNode>(Arg)->getValue();
+    unsigned Idx = cast<ConstantSDNode>(Arg)->getZExtValue();
      if (Idx < NumElems) {
        unsigned Opc = V1.getNode()->getOpcode();
        if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
@@ -2856,7 +2922,7 @@ static SDValue NormalizeMask(SDValue Mask, SelectionDAG &DAG) {
    for (unsigned i = 0; i != NumElems; ++i) {
      SDValue Arg = Mask.getOperand(i);
      if (Arg.getOpcode() != ISD::UNDEF) {
-      unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+      unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
        if (Val > NumElems) {
          Arg = DAG.getConstant(NumElems, Arg.getValueType());
          Changed = true;
@@ -2934,15 +3000,26 @@ static SDValue PromoteSplat(SDValue Op, SelectionDAG &DAG, bool HasSSE2) {
      return Op;
    SDValue V1 = Op.getOperand(0);
    SDValue Mask = Op.getOperand(2);
-  unsigned NumElems = Mask.getNumOperands();
+  unsigned MaskNumElems = Mask.getNumOperands();
+  unsigned NumElems = MaskNumElems;
    // Special handling of v4f32 -> v4i32.
    if (VT != MVT::v4f32) {
-    Mask = getUnpacklMask(NumElems, DAG);
+    // Find which element we want to splat.
+    SDNode* EltNoNode = getSplatMaskEltNo(Mask.getNode()).getNode();
+    unsigned EltNo = cast<ConstantSDNode>(EltNoNode)->getZExtValue();
+    // unpack elements to the correct location
      while (NumElems > 4) {
+      if (EltNo < NumElems/2) {
+        Mask = getUnpacklMask(MaskNumElems, DAG);
+      } else {
+        Mask = getUnpackhMask(MaskNumElems, DAG);
+        EltNo -= NumElems/2;
+      }
        V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1, Mask);
        NumElems >>= 1;
      }
-    Mask = getZeroVector(MVT::v4i32, true, DAG);
+    SDValue Cst = DAG.getConstant(EltNo, MVT::i32);
+    Mask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Cst, Cst, Cst, Cst);
    }
  
    V1 = DAG.getNode(ISD::BIT_CONVERT, PVT, V1);
@@ -2951,6 +3028,46 @@ static SDValue PromoteSplat(SDValue Op, SelectionDAG &DAG, bool HasSSE2) {
    return DAG.getNode(ISD::BIT_CONVERT, VT, Shuffle);
  }
  
+/// isVectorLoad - Return true if Op is itself a load node, or if Op is a
+/// SCALAR_TO_VECTOR or BIT_CONVERT whose first operand is a load node.
+static bool isVectorLoad(SDValue Op) {
+  assert(Op.getValueType().isVector() && "Expected a vector type");
+  // Look through a single promotion/bitcast wrapper, if there is one.
+  unsigned Opc = Op.getOpcode();
+  if (Opc == ISD::SCALAR_TO_VECTOR || Opc == ISD::BIT_CONVERT)
+    return isa<LoadSDNode>(Op.getOperand(0));
+  return isa<LoadSDNode>(Op);
+}
+
+
+/// CanonicalizeMovddup - Canonicalize a movddup shuffle of V1 to v2f64, or to
+/// v4f32 when movlhps is preferred; returns Op unchanged if VT already matches.
+static SDValue CanonicalizeMovddup(SDValue Op, SDValue V1, SDValue Mask,
+                                   SelectionDAG &DAG, bool HasSSE3) {
+  // If we have sse3 and the shuffle has more than one use or its input is a
+  // load, then use movddup (v2f64). Otherwise, use movlhps (v4f32).
+  bool UseMovddup = HasSSE3 && (!Op.hasOneUse() || isVectorLoad(V1));
+  MVT PVT = UseMovddup ? MVT::v2f64 : MVT::v4f32;
+  MVT VT = Op.getValueType();
+  if (VT == PVT)
+    return Op;  // Already in the chosen type; nothing to rewrite.
+  unsigned NumElems = PVT.getVectorNumElements();
+  if (NumElems == 2) {  // v2f64: the incoming Mask is replaced by <0, 0>.
+    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+    Mask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, Cst, Cst);
+  } else {  // v4f32: the incoming Mask is replaced by <0, 1, 0, 1>.
+    assert(NumElems == 4);
+    SDValue Cst0 = DAG.getTargetConstant(0, MVT::i32);
+    SDValue Cst1 = DAG.getTargetConstant(1, MVT::i32);
+    Mask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Cst0, Cst1, Cst0, Cst1);
+  }
+
+  V1 = DAG.getNode(ISD::BIT_CONVERT, PVT, V1);  // Reinterpret input as PVT.
+  SDValue Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, PVT, V1,
+                                DAG.getNode(ISD::UNDEF, PVT), Mask);
+  return DAG.getNode(ISD::BIT_CONVERT, VT, Shuffle);  // Back to original VT.
+}
+
  /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
  /// vector of zero or undef vector.  This produces a shuffle where the low
  /// element of V2 is swizzled into the zero/undef vector, landing at element
@@ -3020,7 +3137,7 @@ static bool isVectorShift(SDValue Op, SDValue Mask, SelectionDAG &DAG,
      SDValue Idx = Mask.getOperand(isLeft ? i : (i - NumZeros));
      if (Idx.getOpcode() == ISD::UNDEF)
        continue;
-    unsigned Index = cast<ConstantSDNode>(Idx)->getValue();
+    unsigned Index = cast<ConstantSDNode>(Idx)->getZExtValue();
      if (Index < NumElems)
        SeenV1 = true;
      else {
@@ -3385,7 +3502,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
      SDValue Elt = MaskElts[i];
      if (Elt.getOpcode() == ISD::UNDEF)
        continue;
-    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
      int QuadIdx = EltIdx / 4;
      ++LowQuad[QuadIdx];
    }
@@ -3405,7 +3522,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
      SDValue Elt = MaskElts[i];
      if (Elt.getOpcode() == ISD::UNDEF)
        continue;
-    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
      int QuadIdx = EltIdx / 4;
      ++HighQuad[QuadIdx];
    }
@@ -3453,7 +3570,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
            MaskVec.push_back(Elt);
            InOrder.set(i);
          } else {
-          unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+          unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
            if (EltIdx != i)
              AnyOutOrder = true;
  
@@ -3487,7 +3604,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
            MaskVec.push_back(Elt);
            InOrder.set(i);
          } else {
-          unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+          unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
            if (EltIdx != i)
              AnyOutOrder = true;
  
@@ -3513,7 +3630,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
        SDValue Elt = MaskElts[i];
        if (Elt.getOpcode() == ISD::UNDEF)
          continue;
-      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
        SDValue ExtOp = (EltIdx < 8)
          ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V1,
                        DAG.getConstant(EltIdx, PtrVT))
@@ -3544,7 +3661,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
        ++V2InOrder;
        continue;
      }
-    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
      if (EltIdx == i) {
        V1Elts.push_back(Elt);
        V2Elts.push_back(DAG.getConstant(i+8, MaskEVT));
@@ -3555,8 +3672,10 @@ SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
        ++V2InOrder;
      } else if (EltIdx < 8) {
        V1Elts.push_back(Elt);
+      V2Elts.push_back(DAG.getConstant(i+8, MaskEVT));
        ++V1FromV1;
      } else {
+      V1Elts.push_back(Elt);
        V2Elts.push_back(DAG.getConstant(EltIdx-8, MaskEVT));
        ++V2FromV2;
      }
@@ -3581,7 +3700,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
            MaskVec.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
            continue;
          }
-        unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+        unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
          if (EltIdx >= 8)
            MaskVec.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
          else
@@ -3596,7 +3715,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
        SDValue Elt = V1Elts[i];
        if (Elt.getOpcode() == ISD::UNDEF)
          continue;
-      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
        if (EltIdx < 8)
          continue;
        SDValue ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V2,
@@ -3612,7 +3731,7 @@ SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
        SDValue Elt = V1Elts[i];
        if (Elt.getOpcode() == ISD::UNDEF)
          continue;
-      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
        SDValue ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V1,
                                      DAG.getConstant(EltIdx, PtrVT));
        NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, NewV, ExtOp,
@@ -3659,7 +3778,7 @@ SDValue RewriteAsNarrowerShuffle(SDValue V1, SDValue V2,
        SDValue Elt = PermMask.getOperand(i+j);
        if (Elt.getOpcode() == ISD::UNDEF)
          continue;
-      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
        if (StartIdx == ~0U)
          StartIdx = EltIdx - (EltIdx % Scale);
        if (EltIdx != StartIdx + j)
@@ -3728,7 +3847,7 @@ LowerVECTOR_SHUFFLE_4wide(SDValue V1, SDValue V2,
      if (Elt.getOpcode() == ISD::UNDEF) {
        Locs[i] = std::make_pair(-1, -1);
      } else {
-      unsigned Val = cast<ConstantSDNode>(Elt)->getValue();
+      unsigned Val = cast<ConstantSDNode>(Elt)->getZExtValue();
        assert(Val < 8 && "Invalid VECTOR_SHUFFLE index!");
        if (Val < 4) {
          Locs[i] = std::make_pair(0, NumLo);
@@ -3786,7 +3905,7 @@ LowerVECTOR_SHUFFLE_4wide(SDValue V1, SDValue V2,
        SDValue Elt = PermMask.getOperand(HiIndex);
        if (Elt.getOpcode() == ISD::UNDEF)
          continue;
-      unsigned Val = cast<ConstantSDNode>(Elt)->getValue();
+      unsigned Val = cast<ConstantSDNode>(Elt)->getZExtValue();
        if (Val >= 4)
          break;
      }
@@ -3811,11 +3930,13 @@ LowerVECTOR_SHUFFLE_4wide(SDValue V1, SDValue V2,
        Mask1[2] = PermMask.getOperand(2);
        Mask1[3] = PermMask.getOperand(3);
        if (Mask1[2].getOpcode() != ISD::UNDEF)
-        Mask1[2] = DAG.getConstant(cast<ConstantSDNode>(Mask1[2])->getValue()+4,
-                                   MaskEVT);
+        Mask1[2] =
+          DAG.getConstant(cast<ConstantSDNode>(Mask1[2])->getZExtValue()+4,
+                          MaskEVT);
        if (Mask1[3].getOpcode() != ISD::UNDEF)
-        Mask1[3] = DAG.getConstant(cast<ConstantSDNode>(Mask1[3])->getValue()+4,
-                                   MaskEVT);
+        Mask1[3] =
+          DAG.getConstant(cast<ConstantSDNode>(Mask1[3])->getZExtValue()+4,
+                          MaskEVT);
        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V2, V1,
                           DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &Mask1[0], 4));
      }
@@ -3839,7 +3960,7 @@ LowerVECTOR_SHUFFLE_4wide(SDValue V1, SDValue V2,
      SDValue Elt = PermMask.getOperand(i);
      if (Elt.getOpcode() == ISD::UNDEF) {
        Locs[i] = std::make_pair(-1, -1);
-    } else if (cast<ConstantSDNode>(Elt)->getValue() < 4) {
+    } else if (cast<ConstantSDNode>(Elt)->getZExtValue() < 4) {
        Locs[i] = std::make_pair(MaskIdx, LoIdx);
        (*MaskPtr)[LoIdx] = Elt;
        LoIdx++;
@@ -3894,6 +4015,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
    else if (isIdentityMask(PermMask.getNode(), true))
      return V2;
  
+  // Canonicalize movddup shuffles.
+  if (V2IsUndef && Subtarget->hasSSE2() &&
+      VT.getSizeInBits() == 128 &&
+      X86::isMOVDDUPMask(PermMask.getNode()))
+    return CanonicalizeMovddup(Op, V1, PermMask, DAG, Subtarget->hasSSE3());
+
    if (isSplatMask(PermMask.getNode())) {
      if (isMMX || NumElems < 4) return Op;
      // Promote it to a v4{if}32 splat.
@@ -4103,11 +4230,15 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
    } else if (VT == MVT::f32) {
      // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
      // the result back to FR32 register. It's only worth matching if the
-    // result has a single use which is a store or a bitcast to i32.
+    // result has a single use which is a store or a bitcast to i32.  And in
+    // the case of a store, it's not worth it if the index is a constant 0,
+    // because a MOVSSmr can be used instead, which is smaller and faster.
      if (!Op.hasOneUse())
        return SDValue();
      SDNode *User = *Op.getNode()->use_begin();
-    if (User->getOpcode() != ISD::STORE &&
+    if ((User->getOpcode() != ISD::STORE ||
+         (isa<ConstantSDNode>(Op.getOperand(1)) &&
+          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
          (User->getOpcode() != ISD::BIT_CONVERT ||
           User->getValueType(0) != MVT::i32))
        return SDValue();
@@ -4135,7 +4266,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
    // TODO: handle v16i8.
    if (VT.getSizeInBits() == 16) {
      SDValue Vec = Op.getOperand(0);
-    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      if (Idx == 0)
        return DAG.getNode(ISD::TRUNCATE, MVT::i16,
                           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32,
@@ -4149,7 +4280,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
                                      DAG.getValueType(VT));
      return DAG.getNode(ISD::TRUNCATE, VT, Assert);
    } else if (VT.getSizeInBits() == 32) {
-    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      if (Idx == 0)
        return Op;
      // SHUFPS the element to the lowest double word, then movss.
@@ -4174,7 +4305,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
      // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
      // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
      //        to match extract_elt for f64.
-    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      if (Idx == 0)
        return Op;
  
@@ -4216,7 +4347,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
      if (N1.getValueType() != MVT::i32)
        N1 = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, N1);
      if (N2.getValueType() != MVT::i32)
-      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getValue());
+      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
      return DAG.getNode(Opc, VT, N0, N1, N2);
    } else if (EVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
      // Bits [7:6] of the constant are the source select.  This will always be
@@ -4227,7 +4358,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
      //  value of the incoming immediate.
      // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may 
      //   combine either bitwise AND or insert of float 0.0 to set these bits.
-    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getValue() << 4);
+    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
      return DAG.getNode(X86ISD::INSERTPS, VT, N0, N1, N2);
    }
    return SDValue();
@@ -4254,7 +4385,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
      if (N1.getValueType() != MVT::i32)
        N1 = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, N1);
      if (N2.getValueType() != MVT::i32)
-      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getValue());
+      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
      return DAG.getNode(X86ISD::PINSRW, VT, N0, N1, N2);
    }
    return SDValue();
@@ -4306,13 +4437,25 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
  }
  
  SDValue
-X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
-  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
-  SDValue Result = DAG.getTargetGlobalAddress(GV, getPointerTy());
+X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
+                                      int64_t Offset,
+                                      SelectionDAG &DAG) const {
+  bool IsPic = getTargetMachine().getRelocationModel() == Reloc::PIC_;
+  bool ExtraLoadRequired =
+    Subtarget->GVRequiresExtraLoad(GV, getTargetMachine(), false);
+
+  // Create the TargetGlobalAddress node, folding in the constant
+  // offset if it is legal.
+  SDValue Result;
+  if (!IsPic && !ExtraLoadRequired && isInt32(Offset)) {
+    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
+    Offset = 0;
+  } else
+    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0);
    Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
+
    // With PIC, the address is actually $g + Offset.
-  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
-      !Subtarget->isPICStyleRIPRel()) {
+  if (IsPic && !Subtarget->isPICStyleRIPRel()) {
      Result = DAG.getNode(ISD::ADD, getPointerTy(),
                           DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
                           Result);
@@ -4323,13 +4466,26 @@ X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
    // the GlobalAddress must be in the base or index register of the address, not
    // the GV offset field. Platform check is inside GVRequiresExtraLoad() call
    // The same applies for external symbols during PIC codegen
-  if (Subtarget->GVRequiresExtraLoad(GV, getTargetMachine(), false))
+  if (ExtraLoadRequired)
      Result = DAG.getLoad(getPointerTy(), DAG.getEntryNode(), Result,
                           PseudoSourceValue::getGOT(), 0);
  
+  // If there was a non-zero offset that we didn't fold, create an explicit
+  // addition for it.
+  if (Offset != 0)
+    Result = DAG.getNode(ISD::ADD, getPointerTy(), Result,
+                         DAG.getConstant(Offset, getPointerTy()));
+
    return Result;
  }
  
+SDValue
+X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
+  // Unpack the global and its constant offset, then defer to the overload.
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  return LowerGlobalAddress(GA->getGlobal(), GA->getOffset(), DAG);
+}
+
  // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
  static SDValue
  LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
@@ -4577,6 +4733,103 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
    return Result;
  }
  
+SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  MVT SrcVT = Op.getOperand(0).getValueType();
+  assert(SrcVT.getSimpleVT() == MVT::i64 && "Unknown UINT_TO_FP to lower!");
+  
+  // We only handle SSE2 f64 target here; caller can handle the rest.
+  if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64)
+    return SDValue();
+  
+  // This algorithm is not obvious.  Here it is in C code, more or less:
+/*
+ double uint64_to_double( uint32_t hi, uint32_t lo )
+  {
+    static const __m128i exp = { 0x4330000045300000ULL, 0 };
+    static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
+
+    // copy ints to xmm registers
+    __m128i xh = _mm_cvtsi32_si128( hi );
+    __m128i xl = _mm_cvtsi32_si128( lo );
+
+    // combine into low half of a single xmm register
+    __m128i x = _mm_unpacklo_epi32( xh, xl );
+    __m128d d;
+    double sd;
+
+    // merge in appropriate exponents to give the integer bits the 
+    // right magnitude
+    x = _mm_unpacklo_epi32( x, exp );
+
+    // subtract away the biases to deal with the IEEE-754 double precision
+    // implicit 1
+    d = _mm_sub_pd( (__m128d) x, bias );
+
+    // All conversions up to here are exact. The correctly rounded result
+    // is calculated in the current rounding mode by the following
+    // horizontal add.
+    d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
+    _mm_store_sd( &sd, d );   //since we are returning doubles in XMM, this
+    // store doesn't really need to be here (except maybe to zero the other
+    // double)
+    return sd;
+  }
+*/
+
+  // Build some magic constants.
+  std::vector<Constant*>CV0;
+  CV0.push_back(ConstantInt::get(APInt(32, 0x45300000)));
+  CV0.push_back(ConstantInt::get(APInt(32, 0x43300000)));
+  CV0.push_back(ConstantInt::get(APInt(32, 0)));
+  CV0.push_back(ConstantInt::get(APInt(32, 0)));
+  Constant *C0 = ConstantVector::get(CV0);
+  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 4);
+
+  std::vector<Constant*>CV1;
+  CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4530000000000000ULL))));
+  CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4330000000000000ULL))));
+  Constant *C1 = ConstantVector::get(CV1);
+  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 4);
+
+  SmallVector<SDValue, 4> MaskVec;
+  MaskVec.push_back(DAG.getConstant(0, MVT::i32));
+  MaskVec.push_back(DAG.getConstant(4, MVT::i32));
+  MaskVec.push_back(DAG.getConstant(1, MVT::i32));
+  MaskVec.push_back(DAG.getConstant(5, MVT::i32));
+  SDValue UnpcklMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, &MaskVec[0],
+                                   MaskVec.size());
+  SmallVector<SDValue, 4> MaskVec2;
+  MaskVec2.push_back(DAG.getConstant(1, MVT::i32));
+  MaskVec2.push_back(DAG.getConstant(0, MVT::i32));
+  SDValue ShufMask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, &MaskVec2[0],
+                                 MaskVec2.size());
+
+  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v4i32,
+                            DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32,
+                                        Op.getOperand(0),
+                                        DAG.getIntPtrConstant(1)));
+  SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v4i32,
+                            DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32,
+                                        Op.getOperand(0),
+                                        DAG.getIntPtrConstant(0)));
+  SDValue Unpck1 = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v4i32,
+                                XR1, XR2, UnpcklMask);
+  SDValue CLod0 = DAG.getLoad(MVT::v4i32, DAG.getEntryNode(), CPIdx0,
+                         PseudoSourceValue::getConstantPool(), 0, false, 16);
+  SDValue Unpck2 = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v4i32,
+                                Unpck1, CLod0, UnpcklMask);
+  SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, MVT::v2f64, Unpck2);
+  SDValue CLod1 = DAG.getLoad(MVT::v2f64, CLod0.getValue(1), CPIdx1,
+                         PseudoSourceValue::getConstantPool(), 0, false, 16);
+  SDValue Sub = DAG.getNode(ISD::FSUB, MVT::v2f64, XR2F, CLod1);
+  // Add the halves; easiest way is to swap them into another reg first.
+  SDValue Shuf = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v2f64,
+                             Sub, Sub, ShufMask);
+  SDValue Add = DAG.getNode(ISD::FADD, MVT::v2f64, Shuf, Sub);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::f64, Add,
+                     DAG.getIntPtrConstant(0));
+}
+
  std::pair<SDValue,SDValue> X86TargetLowering::
  FP_TO_SINTHelper(SDValue Op, SelectionDAG &DAG) {
    assert(Op.getValueType().getSimpleVT() <= MVT::i64 &&
@@ -4638,22 +4891,6 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
    return DAG.getLoad(Op.getValueType(), FIST, StackSlot, NULL, 0);
  }
  
-SDNode *X86TargetLowering::ExpandFP_TO_SINT(SDNode *N, SelectionDAG &DAG) {
-  std::pair<SDValue,SDValue> Vals = FP_TO_SINTHelper(SDValue(N, 0), DAG);
-  SDValue FIST = Vals.first, StackSlot = Vals.second;
-  if (FIST.getNode() == 0) return 0;
-
-  MVT VT = N->getValueType(0);
-
-  // Return a load from the stack slot.
-  SDValue Res = DAG.getLoad(VT, FIST, StackSlot, NULL, 0);
-
-  // Use MERGE_VALUES to drop the chain result value and get a node with one
-  // result.  This requires turning off getMergeValues simplification, since
-  // otherwise it will give us Res back.
-  return DAG.getMergeValues(&Res, 1, false).getNode();
-}
-
  SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) {
    MVT VT = Op.getValueType();
    MVT EltVT = VT;
@@ -4791,37 +5028,17 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);
    SDValue CC = Op.getOperand(2);
-  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
    bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
    unsigned X86CC;
  
-  if (translateX86CC(cast<CondCodeSDNode>(CC)->get(), isFP, X86CC,
-                     Op0, Op1, DAG)) {
-    Cond = DAG.getNode(X86ISD::CMP, MVT::i32, Op0, Op1);
-    return DAG.getNode(X86ISD::SETCC, MVT::i8,
-                       DAG.getConstant(X86CC, MVT::i8), Cond);
-  }
-
-  assert(isFP && "Illegal integer SetCC!");
-
+  if (!translateX86CC(cast<CondCodeSDNode>(CC)->get(), isFP, X86CC,
+                     Op0, Op1, DAG))
+    assert(0 && "Illegal SetCC!");
+    
+    
    Cond = DAG.getNode(X86ISD::CMP, MVT::i32, Op0, Op1);
-  switch (SetCCOpcode) {
-  default: assert(false && "Illegal floating point SetCC!");
-  case ISD::SETOEQ: {  // !PF & ZF
-    SDValue Tmp1 = DAG.getNode(X86ISD::SETCC, MVT::i8,
-                                 DAG.getConstant(X86::COND_NP, MVT::i8), Cond);
-    SDValue Tmp2 = DAG.getNode(X86ISD::SETCC, MVT::i8,
-                                 DAG.getConstant(X86::COND_E, MVT::i8), Cond);
-    return DAG.getNode(ISD::AND, MVT::i8, Tmp1, Tmp2);
-  }
-  case ISD::SETUNE: {  // PF | !ZF
-    SDValue Tmp1 = DAG.getNode(X86ISD::SETCC, MVT::i8,
-                                 DAG.getConstant(X86::COND_P, MVT::i8), Cond);
-    SDValue Tmp2 = DAG.getNode(X86ISD::SETCC, MVT::i8,
-                                 DAG.getConstant(X86::COND_NE, MVT::i8), Cond);
-    return DAG.getNode(ISD::OR, MVT::i8, Tmp1, Tmp2);
-  }
-  }
+  return DAG.getNode(X86ISD::SETCC, MVT::i8,
+                     DAG.getConstant(X86CC, MVT::i8), Cond);
  }
  
  SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
@@ -4940,6 +5157,11 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
    return Result;
  }
  
+// isX86LogicalCmp - Return true if opcode is an X86 logical comparison.
+static bool isX86LogicalCmp(unsigned Opc) {
+  return Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI;
+}
+
  SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
    bool addTest = true;
    SDValue Cond  = Op.getOperand(0);
@@ -4960,11 +5182,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
      bool IllegalFPCMov = false;
      if (VT.isFloatingPoint() && !VT.isVector() &&
          !isScalarFPTypeInSSEReg(VT))  // FPStack?
-      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSignExtended());
+      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
      
-    if ((Opc == X86ISD::CMP ||
-         Opc == X86ISD::COMI ||
-         Opc == X86ISD::UCOMI) && !IllegalFPCMov) {
+    if (isX86LogicalCmp(Opc) && !IllegalFPCMov) {
        Cond = Cmp;
        addTest = false;
      }
@@ -4987,6 +5207,19 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
    return DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size());
  }
  
+// isAndOrOfSetCCs - Return true if node is an ISD::AND or
+// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
+// from the AND / OR.
+static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
+  Opc = Op.getOpcode();
+  if (Opc != ISD::OR && Opc != ISD::AND)
+    return false;
+  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+          Op.getOperand(0).hasOneUse() &&
+          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
+          Op.getOperand(1).hasOneUse());
+}
+
  SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
    bool addTest = true;
    SDValue Chain = Op.getOperand(0);
@@ -4996,6 +5229,11 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
  
    if (Cond.getOpcode() == ISD::SETCC)
      Cond = LowerSETCC(Cond, DAG);
+  else if (Cond.getOpcode() == X86ISD::ADD  ||
+           Cond.getOpcode() == X86ISD::SUB  ||
+           Cond.getOpcode() == X86ISD::SMUL ||
+           Cond.getOpcode() == X86ISD::UMUL)
+    Cond = LowerXALUO(Cond, DAG);
  
    // If condition flag is set by a X86ISD::CMP, then use it as the condition
    // setting operand in place of the X86ISD::SETCC.
@@ -5004,11 +5242,74 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
  
      SDValue Cmp = Cond.getOperand(1);
      unsigned Opc = Cmp.getOpcode();
-    if (Opc == X86ISD::CMP ||
-        Opc == X86ISD::COMI ||
-        Opc == X86ISD::UCOMI) {
+    if (isX86LogicalCmp(Opc)) {
        Cond = Cmp;
        addTest = false;
+    } else {
+      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
+      default: break;
+      case X86::COND_O:
+      case X86::COND_C:
+        // These can only come from an arithmetic instruction with overflow, e.g.
+        // SADDO, UADDO.
+        Cond = Cond.getNode()->getOperand(1);
+        addTest = false;
+        break;
+      }
+    }
+  } else {
+    unsigned CondOpc;
+    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
+      SDValue Cmp = Cond.getOperand(0).getOperand(1);
+      unsigned Opc = Cmp.getOpcode();
+      if (CondOpc == ISD::OR) {
+        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
+        // two branches instead of an explicit OR instruction with a
+        // separate test.
+        if (Cmp == Cond.getOperand(1).getOperand(1) &&
+            isX86LogicalCmp(Opc)) {
+          CC = Cond.getOperand(0).getOperand(0);
+          Chain = DAG.getNode(X86ISD::BRCOND, Op.getValueType(),
+                              Chain, Dest, CC, Cmp);
+          CC = Cond.getOperand(1).getOperand(0);
+          Cond = Cmp;
+          addTest = false;
+        }
+      } else { // ISD::AND
+        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
+        // two branches instead of an explicit AND instruction with a
+        // separate test. However, we only do this if this block doesn't
+        // have a fall-through edge, because this requires an explicit
+        // jmp when the condition is false.
+        if (Cmp == Cond.getOperand(1).getOperand(1) &&
+            isX86LogicalCmp(Opc) &&
+            Op.getNode()->hasOneUse()) {
+          X86::CondCode CCode =
+            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+          CCode = X86::GetOppositeBranchCondition(CCode);
+          CC = DAG.getConstant(CCode, MVT::i8);
+          SDValue User = SDValue(*Op.getNode()->use_begin(), 0);
+          // Look for an unconditional branch following this conditional branch.
+          // We need this because we need to reverse the successors in order
+          // to implement FCMP_OEQ.
+          if (User.getOpcode() == ISD::BR) {
+            SDValue FalseBB = User.getOperand(1);
+            SDValue NewBR =
+              DAG.UpdateNodeOperands(User, User.getOperand(0), Dest);
+            assert(NewBR == User);
+            Dest = FalseBB;
+
+            Chain = DAG.getNode(X86ISD::BRCOND, Op.getValueType(),
+                                Chain, Dest, CC, Cmp);
+            X86::CondCode CCode =
+              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
+            CCode = X86::GetOppositeBranchCondition(CCode);
+            CC = DAG.getConstant(CCode, MVT::i8);
+            Cond = Cmp;
+            addTest = false;
+          }
+        }
+      }
      }
    }
  
@@ -5017,7 +5318,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
      Cond= DAG.getNode(X86ISD::CMP, MVT::i32, Cond, DAG.getConstant(0, MVT::i8));
    }
    return DAG.getNode(X86ISD::BRCOND, Op.getValueType(),
-                     Chain, Op.getOperand(2), CC, Cond);
+                     Chain, Dest, CC, Cond);
  }
  
  
@@ -5042,7 +5343,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
    MVT IntPtr = getPointerTy();
    MVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
  
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0));
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
  
    Chain = DAG.getCopyToReg(Chain, X86::EAX, Size, Flag);
    Flag = Chain.getValue(1);
@@ -5057,8 +5358,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
    Flag = Chain.getValue(1);
  
    Chain = DAG.getCALLSEQ_END(Chain,
-                             DAG.getIntPtrConstant(0),
-                             DAG.getIntPtrConstant(0),
+                             DAG.getIntPtrConstant(0, true),
+                             DAG.getIntPtrConstant(0, true),
                               Flag);
  
    Chain = DAG.getCopyFromReg(Chain, X86StackPtr, SPTy).getValue(1);
@@ -5069,24 +5370,27 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
  
  SDValue
  X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG,
-                                        SDValue Chain,
-                                        SDValue Dst, SDValue Src,
-                                        SDValue Size, unsigned Align,
-                                        const Value *DstSV, uint64_t DstSVOff) {
+                                           SDValue Chain,
+                                           SDValue Dst, SDValue Src,
+                                           SDValue Size, unsigned Align,
+                                           const Value *DstSV,
+                                           uint64_t DstSVOff) {
    ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  
-  /// If not DWORD aligned or size is more than the threshold, call the library.
-  /// The libc version is likely to be faster for these cases. It can use the
-  /// address value and run time information about the CPU.
+  // If not DWORD aligned or size is more than the threshold, call the library.
+  // The libc version is likely to be faster for these cases. It can use the
+  // address value and run time information about the CPU.
    if ((Align & 3) != 0 ||
        !ConstantSize ||
-      ConstantSize->getValue() > getSubtarget()->getMaxInlineSizeThreshold()) {
+      ConstantSize->getZExtValue() >
+        getSubtarget()->getMaxInlineSizeThreshold()) {
      SDValue InFlag(0, 0);
  
      // Check to see if there is a specialized entry-point for memory zeroing.
      ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
-    if (const char *bzeroEntry = 
-          V && V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
+
+    if (const char *bzeroEntry =  V &&
+        V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
        MVT IntPtr = getPointerTy();
        const Type *IntPtrTy = TD->getIntPtrType();
        TargetLowering::ArgListTy Args; 
@@ -5097,9 +5401,9 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG,
        Entry.Node = Size;
        Args.push_back(Entry);
        std::pair<SDValue,SDValue> CallResult =
-        LowerCallTo(Chain, Type::VoidTy, false, false, false, CallingConv::C,
-                    false, DAG.getExternalSymbol(bzeroEntry, IntPtr),
-                    Args, DAG);
+        LowerCallTo(Chain, Type::VoidTy, false, false, false, false, 
+                    CallingConv::C, false, 
+                    DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG);
        return CallResult.second;
      }
  
@@ -5107,7 +5411,7 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG,
      return SDValue();
    }
  
-  uint64_t SizeVal = ConstantSize->getValue();
+  uint64_t SizeVal = ConstantSize->getZExtValue();
    SDValue InFlag(0, 0);
    MVT AVT;
    SDValue Count;
@@ -5116,7 +5420,7 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG,
    bool TwoRepStos = false;
    if (ValC) {
      unsigned ValReg;
-    uint64_t Val = ValC->getValue() & 255;
+    uint64_t Val = ValC->getZExtValue() & 255;
  
      // If the value is a constant, then we can potentially use larger sets.
      switch (Align & 3) {
@@ -5218,7 +5522,7 @@ X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG,
    ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
    if (!ConstantSize)
      return SDValue();
-  uint64_t SizeVal = ConstantSize->getValue();
+  uint64_t SizeVal = ConstantSize->getZExtValue();
    if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
      return SDValue();
  
@@ -5276,36 +5580,6 @@ X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG,
    return DAG.getNode(ISD::TokenFactor, MVT::Other, &Results[0], Results.size());
  }
  
-/// Expand the result of: i64,outchain = READCYCLECOUNTER inchain
-SDNode *X86TargetLowering::ExpandREADCYCLECOUNTER(SDNode *N, SelectionDAG &DAG){
-  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
-  SDValue TheChain = N->getOperand(0);
-  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, Tys, &TheChain, 1);
-  if (Subtarget->is64Bit()) {
-    SDValue rax = DAG.getCopyFromReg(rd, X86::RAX, MVT::i64, rd.getValue(1));
-    SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), X86::RDX,
-                                       MVT::i64, rax.getValue(2));
-    SDValue Tmp = DAG.getNode(ISD::SHL, MVT::i64, rdx,
-                                DAG.getConstant(32, MVT::i8));
-    SDValue Ops[] = {
-      DAG.getNode(ISD::OR, MVT::i64, rax, Tmp), rdx.getValue(1)
-    };
-    
-    return DAG.getMergeValues(Ops, 2).getNode();
-  }
-  
-  SDValue eax = DAG.getCopyFromReg(rd, X86::EAX, MVT::i32, rd.getValue(1));
-  SDValue edx = DAG.getCopyFromReg(eax.getValue(1), X86::EDX,
-                                       MVT::i32, eax.getValue(2));
-  // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
-  SDValue Ops[] = { eax, edx };
-  Ops[0] = DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Ops, 2);
-
-  // Use a MERGE_VALUES to return the value and chain.
-  Ops[1] = edx.getValue(1);
-  return DAG.getMergeValues(Ops, 2).getNode();
-}
-
  SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) {
    const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  
@@ -5378,7 +5652,7 @@ SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) {
  
  SDValue
  X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
-  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getValue();
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    switch (IntNo) {
    default: return SDValue();    // Don't custom lower most intrinsics.
    // Comparison intrinsics.
@@ -5576,7 +5850,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
  
  SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
    // Depths > 0 not supported yet!
-  if (cast<ConstantSDNode>(Op.getOperand(0))->getValue() > 0)
+  if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
      return SDValue();
    
    // Just load the return address
@@ -5585,13 +5859,15 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
  }
  
  SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
-  // Depths > 0 not supported yet!
-  if (cast<ConstantSDNode>(Op.getOperand(0))->getValue() > 0)
-    return SDValue();
-
-  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
-  return DAG.getNode(ISD::SUB, getPointerTy(), RetAddrFI,
-                     DAG.getIntPtrConstant(TD->getPointerSize()));
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+  MVT VT = Op.getValueType();
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), FrameReg, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, DAG.getEntryNode(), FrameAddr, NULL, 0);
+  return FrameAddr;
  }
  
  SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
@@ -5697,7 +5973,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
  
        // Check that ECX wasn't needed by an 'inreg' parameter.
        const FunctionType *FTy = Func->getFunctionType();
-      const PAListPtr &Attrs = Func->getParamAttrs();
+      const AttrListPtr &Attrs = Func->getAttributes();
  
        if (!Attrs.isEmpty() && !Func->isVarArg()) {
          unsigned InRegCount = 0;
@@ -5705,7 +5981,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
  
          for (FunctionType::param_iterator I = FTy->param_begin(),
               E = FTy->param_end(); I != E; ++I, ++Idx)
-          if (Attrs.paramHasAttr(Idx, ParamAttr::InReg))
+          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
              // FIXME: should only count parameters that are lowered to integers.
              InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
  
@@ -5783,7 +6059,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  
    SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, MVT::Other,
-                                DAG.getEntryNode(), StackSlot);
+                              DAG.getEntryNode(), StackSlot);
  
    // Load FP Control Word from stack slot
    SDValue CWD = DAG.getLoad(MVT::i16, Chain, StackSlot, NULL, 0);
@@ -5872,6 +6148,101 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
    return Op;
  }
  
+SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getValueType();
+  assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
+  
+  //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
+  //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
+  //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
+  //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
+  //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
+  //
+  //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
+  //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
+  //  return AloBlo + AloBhi + AhiBlo;
+
+  SDValue A = Op.getOperand(0);
+  SDValue B = Op.getOperand(1);
+  
+  SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, VT,
+                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
+                       A, DAG.getConstant(32, MVT::i32));
+  SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, VT,
+                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
+                       B, DAG.getConstant(32, MVT::i32));
+  SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, VT,
+                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
+                       A, B);
+  SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, VT,
+                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
+                       A, Bhi);
+  SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, VT,
+                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
+                       Ahi, B);
+  AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, VT,
+                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
+                       AloBhi, DAG.getConstant(32, MVT::i32));
+  AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, VT,
+                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
+                       AhiBlo, DAG.getConstant(32, MVT::i32));
+  SDValue Res = DAG.getNode(ISD::ADD, VT, AloBlo, AloBhi);
+  Res = DAG.getNode(ISD::ADD, VT, Res, AhiBlo);
+  return Res;
+}
+
+
+SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) {
+  // Lower the "add/sub/mul with overflow" instruction into a regular
+  // instruction plus a "setcc" instruction that checks the overflow or carry
+  // flag. The "brcond" lowering looks for this combination and may remove the
+  // "setcc" instruction if the "setcc" has only one use.
+  SDNode *N = Op.getNode();
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  unsigned BaseOp = 0;
+  unsigned Cond = 0;
+
+  switch (Op.getOpcode()) {
+  default: assert(0 && "Unknown ovf instruction!");
+  case ISD::SADDO:
+    BaseOp = X86ISD::ADD;
+    Cond = X86::COND_O;
+    break;
+  case ISD::UADDO:
+    BaseOp = X86ISD::ADD;
+    Cond = X86::COND_C;
+    break;
+  case ISD::SSUBO:
+    BaseOp = X86ISD::SUB;
+    Cond = X86::COND_O;
+    break;
+  case ISD::USUBO:
+    BaseOp = X86ISD::SUB;
+    Cond = X86::COND_C;
+    break;
+  case ISD::SMULO:
+    BaseOp = X86ISD::SMUL;
+    Cond = X86::COND_O;
+    break;
+  case ISD::UMULO:
+    BaseOp = X86ISD::UMUL;
+    Cond = X86::COND_C;
+    break;
+  }
+
+  // Also sets EFLAGS.
+  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
+  SDValue Sum = DAG.getNode(BaseOp, VTs, LHS, RHS);
+
+  SDValue SetCC =
+    DAG.getNode(X86ISD::SETCC, N->getValueType(1),
+                DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
+
+  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
+  return Sum;
+}
+
  SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
    MVT T = Op.getValueType();
    unsigned Reg = 0;
@@ -5883,19 +6254,17 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
    case MVT::i16: Reg = X86::AX;  size = 2; break;
    case MVT::i32: Reg = X86::EAX; size = 4; break;
    case MVT::i64: 
-    if (Subtarget->is64Bit()) {
-      Reg = X86::RAX; size = 8;
-    } else //Should go away when LowerType stuff lands
-      return SDValue(ExpandATOMIC_CMP_SWAP(Op.getNode(), DAG), 0);
+    assert(Subtarget->is64Bit() && "Node not type legal!");
+    Reg = X86::RAX; size = 8;
      break;
-  };
+  }
    SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), Reg,
-                                    Op.getOperand(3), SDValue());
+                                    Op.getOperand(2), SDValue());
    SDValue Ops[] = { cpIn.getValue(0),
-                      Op.getOperand(1),
-                      Op.getOperand(2),
-                      DAG.getTargetConstant(size, MVT::i8),
-                      cpIn.getValue(1) };
+                    Op.getOperand(1),
+                    Op.getOperand(3),
+                    DAG.getTargetConstant(size, MVT::i8),
+                    cpIn.getValue(1) };
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, Tys, Ops, 5);
    SDValue cpOut = 
@@ -5903,55 +6272,35 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
    return cpOut;
  }
  
-SDNode* X86TargetLowering::ExpandATOMIC_CMP_SWAP(SDNode* Op,
+SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
                                                   SelectionDAG &DAG) {
-  MVT T = Op->getValueType(0);
-  assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
-  SDValue cpInL, cpInH;
-  cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op->getOperand(3),
-                      DAG.getConstant(0, MVT::i32));
-  cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op->getOperand(3),
-                      DAG.getConstant(1, MVT::i32));
-  cpInL = DAG.getCopyToReg(Op->getOperand(0), X86::EAX,
-                           cpInL, SDValue());
-  cpInH = DAG.getCopyToReg(cpInL.getValue(0), X86::EDX,
-                           cpInH, cpInL.getValue(1));
-  SDValue swapInL, swapInH;
-  swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op->getOperand(2),
-                        DAG.getConstant(0, MVT::i32));
-  swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Op->getOperand(2),
-                        DAG.getConstant(1, MVT::i32));
-  swapInL = DAG.getCopyToReg(cpInH.getValue(0), X86::EBX,
-                             swapInL, cpInH.getValue(1));
-  swapInH = DAG.getCopyToReg(swapInL.getValue(0), X86::ECX,
-                             swapInH, swapInL.getValue(1));
-  SDValue Ops[] = { swapInH.getValue(0),
-                      Op->getOperand(1),
-                      swapInH.getValue(1)};
+  assert(Subtarget->is64Bit() && "Result not type legalized?");
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
-  SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, Tys, Ops, 3);
-  SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), X86::EAX, MVT::i32, 
-                                        Result.getValue(1));
-  SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), X86::EDX, MVT::i32, 
-                                        cpOutL.getValue(2));
-  SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
-  SDValue ResultVal = DAG.getNode(ISD::BUILD_PAIR, MVT::i64, OpsF, 2);
-  SDValue Vals[2] = { ResultVal, cpOutH.getValue(1) };
-  return DAG.getMergeValues(Vals, 2).getNode();
+  SDValue TheChain = Op.getOperand(0);
+  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, Tys, &TheChain, 1);
+  SDValue rax = DAG.getCopyFromReg(rd, X86::RAX, MVT::i64, rd.getValue(1));
+  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), X86::RDX, MVT::i64,
+                                   rax.getValue(2));
+  SDValue Tmp = DAG.getNode(ISD::SHL, MVT::i64, rdx,
+                            DAG.getConstant(32, MVT::i8));
+  SDValue Ops[] = {
+    DAG.getNode(ISD::OR, MVT::i64, rax, Tmp),
+    rdx.getValue(1)
+  };
+  return DAG.getMergeValues(Ops, 2);
  }
  
-SDNode* X86TargetLowering::ExpandATOMIC_LOAD_SUB(SDNode* Op,
-                                                 SelectionDAG &DAG) {
-  MVT T = Op->getValueType(0);
+SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  MVT T = Node->getValueType(0);
    SDValue negOp = DAG.getNode(ISD::SUB, T,
-                                DAG.getConstant(0, T), Op->getOperand(2));
-  return DAG.getAtomic((T==MVT::i8 ? ISD::ATOMIC_LOAD_ADD_8:
-                        T==MVT::i16 ? ISD::ATOMIC_LOAD_ADD_16:
-                        T==MVT::i32 ? ISD::ATOMIC_LOAD_ADD_32:
-                        T==MVT::i64 ? ISD::ATOMIC_LOAD_ADD_64: 0),
-                       Op->getOperand(0), Op->getOperand(1), negOp,
-                       cast<AtomicSDNode>(Op)->getSrcValue(),
-                       cast<AtomicSDNode>(Op)->getAlignment()).getNode();
+                                DAG.getConstant(0, T), Node->getOperand(2));
+  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD,
+                       cast<AtomicSDNode>(Node)->getMemoryVT(),
+                       Node->getOperand(0),
+                       Node->getOperand(1), negOp,
+                       cast<AtomicSDNode>(Node)->getSrcValue(),
+                       cast<AtomicSDNode>(Node)->getAlignment());
  }
  
  /// LowerOperation - Provide custom lowering hooks for some operations.
@@ -5959,10 +6308,8 @@ SDNode* X86TargetLowering::ExpandATOMIC_LOAD_SUB(SDNode* Op,
  SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
    switch (Op.getOpcode()) {
    default: assert(0 && "Should not custom lower this!");
-  case ISD::ATOMIC_CMP_SWAP_8:  return LowerCMP_SWAP(Op,DAG);
-  case ISD::ATOMIC_CMP_SWAP_16: return LowerCMP_SWAP(Op,DAG);
-  case ISD::ATOMIC_CMP_SWAP_32: return LowerCMP_SWAP(Op,DAG);
-  case ISD::ATOMIC_CMP_SWAP_64: return LowerCMP_SWAP(Op,DAG);
+  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
+  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
    case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
    case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
    case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
@@ -5976,6 +6323,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
    case ISD::SRA_PARTS:
    case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
    case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
+  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
    case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
    case ISD::FABS:               return LowerFABS(Op, DAG);
    case ISD::FNEG:               return LowerFNEG(Op, DAG);
@@ -6002,25 +6350,127 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
    case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
    case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
    case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
-      
-  // FIXME: REMOVE THIS WHEN LegalizeDAGTypes lands.
-  case ISD::READCYCLECOUNTER:
-    return SDValue(ExpandREADCYCLECOUNTER(Op.getNode(), DAG), 0);
-  }
+  case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
+  case ISD::SADDO:
+  case ISD::UADDO:
+  case ISD::SSUBO:
+  case ISD::USUBO:
+  case ISD::SMULO:
+  case ISD::UMULO:              return LowerXALUO(Op, DAG);
+  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
+  }
+}
+
+/// ReplaceATOMIC_BINARY_64 - Replace an i64 atomic binary node with the
+/// X86-specific node NewOp, which takes the value operand as two i32 halves
+/// and produces two i32 results plus a chain.  The i64 value rebuilt from
+/// the two results and the output chain are appended to Results, in that
+/// order.
+void X86TargetLowering::
+ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
+                        SelectionDAG &DAG, unsigned NewOp) {
+  assert (Node->getValueType(0) == MVT::i64 &&
+          "Only know how to expand i64 atomics");
+
+  // Split the i64 value operand into element 0 (low) and element 1 (high).
+  SDValue Val = Node->getOperand(2);
+  SDValue ValLo = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Val,
+                              DAG.getIntPtrConstant(0));
+  SDValue ValHi = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, Val,
+                              DAG.getIntPtrConstant(1));
+
+  // The node built below is a generalized SDNode, not an AtomicSDNode, so it
+  // doesn't have a MemOperand.  Pass the info through as a normal operand.
+  SDValue MemInfo = DAG.getMemOperand(cast<MemSDNode>(Node)->getMemOperand());
+  SDValue NewOps[] = { Node->getOperand(0), Node->getOperand(1),
+                       ValLo, ValHi, MemInfo };
+  SDValue Halves = DAG.getNode(NewOp,
+                               DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
+                               NewOps, 5);
+
+  // Combine the two i32 results back into an i64 and forward the chain.
+  SDValue Pair[] = { Halves.getValue(0), Halves.getValue(1) };
+  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Pair, 2));
+  Results.push_back(Halves.getValue(2));
  }
  
  /// ReplaceNodeResults - Replace a node with an illegal result type
  /// with a new node built out of custom code.
+/// The replacement values for N are appended to Results.
-SDNode *X86TargetLowering::ReplaceNodeResults(SDNode *N, SelectionDAG &DAG) {
+void X86TargetLowering::ReplaceNodeResults(SDNode *N,
+                                           SmallVectorImpl<SDValue>&Results,
+                                           SelectionDAG &DAG) {
    switch (N->getOpcode()) {
-  default: assert(0 && "Should not custom lower this!");
-  case ISD::FP_TO_SINT:         return ExpandFP_TO_SINT(N, DAG);
-  case ISD::READCYCLECOUNTER:   return ExpandREADCYCLECOUNTER(N, DAG);
-  case ISD::ATOMIC_CMP_SWAP_64: return ExpandATOMIC_CMP_SWAP(N, DAG);
-  case ISD::ATOMIC_LOAD_SUB_8:  return ExpandATOMIC_LOAD_SUB(N,DAG);
-  case ISD::ATOMIC_LOAD_SUB_16: return ExpandATOMIC_LOAD_SUB(N,DAG);
-  case ISD::ATOMIC_LOAD_SUB_32: return ExpandATOMIC_LOAD_SUB(N,DAG);
-  case ISD::ATOMIC_LOAD_SUB_64: return ExpandATOMIC_LOAD_SUB(N,DAG);
+  default:
+    assert(false && "Do not know how to custom type legalize this operation!");
+    return;
+  case ISD::FP_TO_SINT: {
+    // The helper returns a (node, stack slot) pair.  If it produced no node,
+    // nothing is appended to Results.
+    std::pair<SDValue,SDValue> Vals = FP_TO_SINTHelper(SDValue(N, 0), DAG);
+    SDValue FIST = Vals.first, StackSlot = Vals.second;
+    if (FIST.getNode() != 0) {
+      MVT VT = N->getValueType(0);
+      // Return a load from the stack slot.
+      Results.push_back(DAG.getLoad(VT, FIST, StackSlot, NULL, 0));
+    }
+    return;
+  }
+  case ISD::READCYCLECOUNTER: {
+    // Emit RDTSC, then copy its result out of EAX and EDX as two i32 values.
+    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+    SDValue TheChain = N->getOperand(0);
+    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, Tys, &TheChain, 1);
+    SDValue eax = DAG.getCopyFromReg(rd, X86::EAX, MVT::i32, rd.getValue(1));
+    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), X86::EDX, MVT::i32,
+                                     eax.getValue(2));
+    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+    SDValue Ops[] = { eax, edx };
+    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Ops, 2));
+    Results.push_back(edx.getValue(1));
+    return;
+  }
+  case ISD::ATOMIC_CMP_SWAP: {
+    MVT T = N->getValueType(0);
+    assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
+    // Split the compare value (operand 2) into i32 halves and copy them into
+    // EAX (element 0) and EDX (element 1).  Each copy below consumes the
+    // results of the previous one, so the copies stay ordered.
+    SDValue cpInL, cpInH;
+    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, N->getOperand(2),
+                        DAG.getConstant(0, MVT::i32));
+    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, N->getOperand(2),
+                        DAG.getConstant(1, MVT::i32));
+    cpInL = DAG.getCopyToReg(N->getOperand(0), X86::EAX, cpInL, SDValue());
+    cpInH = DAG.getCopyToReg(cpInL.getValue(0), X86::EDX, cpInH,
+                             cpInL.getValue(1));
+    // Likewise split the swap value (operand 3) into EBX (element 0) and
+    // ECX (element 1).
+    SDValue swapInL, swapInH;
+    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, N->getOperand(3),
+                          DAG.getConstant(0, MVT::i32));
+    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, MVT::i32, N->getOperand(3),
+                          DAG.getConstant(1, MVT::i32));
+    swapInL = DAG.getCopyToReg(cpInH.getValue(0), X86::EBX, swapInL,
+                               cpInH.getValue(1));
+    swapInH = DAG.getCopyToReg(swapInL.getValue(0), X86::ECX, swapInH,
+                               swapInL.getValue(1));
+    // Build the LCMPXCHG8_DAG node from the last copy's results and the
+    // original operand 1.
+    SDValue Ops[] = { swapInH.getValue(0),
+                      N->getOperand(1),
+                      swapInH.getValue(1) };
+    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+    SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, Tys, Ops, 3);
+    // Read the result back from EAX and EDX and merge it into an i64.
+    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), X86::EAX, MVT::i32,
+                                        Result.getValue(1));
+    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), X86::EDX, MVT::i32,
+                                        cpOutL.getValue(2));
+    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
+    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, MVT::i64, OpsF, 2));
+    Results.push_back(cpOutH.getValue(1));
+    return;
+  }
+  // The remaining atomic operations are all handed to
+  // ReplaceATOMIC_BINARY_64 with the corresponding X86ISD::ATOM*64_DAG opcode.
+  case ISD::ATOMIC_LOAD_ADD:
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_AND:
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_NAND:
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_OR:
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_SUB:
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_XOR:
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
+    return;
+  case ISD::ATOMIC_SWAP:
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
+    return;
    }
  }
  
@@ -6045,6 +6495,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::CALL:               return "X86ISD::CALL";
    case X86ISD::TAILCALL:           return "X86ISD::TAILCALL";
    case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
+  case X86ISD::BT:                 return "X86ISD::BT";
    case X86ISD::CMP:                return "X86ISD::CMP";
    case X86ISD::COMI:               return "X86ISD::COMI";
    case X86ISD::UCOMI:              return "X86ISD::UCOMI";
@@ -6072,6 +6523,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
    case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
    case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
+  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
+  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
+  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
+  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
+  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
+  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
    case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
    case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
    case X86ISD::VSHL:               return "X86ISD::VSHL";
@@ -6086,6 +6543,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::PCMPGTW:            return "X86ISD::PCMPGTW";
    case X86ISD::PCMPGTD:            return "X86ISD::PCMPGTD";
    case X86ISD::PCMPGTQ:            return "X86ISD::PCMPGTQ";
+  case X86ISD::ADD:                return "X86ISD::ADD";
+  case X86ISD::SUB:                return "X86ISD::SUB";
+  case X86ISD::SMUL:               return "X86ISD::SMUL";
+  case X86ISD::UMUL:               return "X86ISD::UMUL";
    }
  }
  
@@ -6103,6 +6564,10 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
      // We can only fold this if we don't need an extra load.
      if (Subtarget->GVRequiresExtraLoad(AM.BaseGV, getTargetMachine(), false))
        return false;
+    // If BaseGV requires a register, we cannot also have a BaseReg.
+    if (Subtarget->GVRequiresRegister(AM.BaseGV, getTargetMachine(), false) &&
+        AM.HasBaseReg)
+      return false;
  
      // X86-64 only supports addr of globals in small code model.
      if (Subtarget->is64Bit()) {
@@ -6267,8 +6732,9 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
      tt = t1;
  
    unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
-  assert(   (argOpers[valArgIndx]->isReg() || argOpers[valArgIndx]->isImm())
-         && "invalid operand");
+  assert((argOpers[valArgIndx]->isReg() ||
+          argOpers[valArgIndx]->isImm()) &&
+         "invalid operand");
    if (argOpers[valArgIndx]->isReg())
      MIB = BuildMI(newMBB, TII->get(regOpc), t2);
    else
@@ -6296,6 +6762,153 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
    return nextMBB;
  }
  
+// private utility function:  64 bit atomics on 32 bit host.
+//
+// Expands the 64-bit atomic pseudo instruction bInstr into a retry loop built
+// around LCMPXCHG8B, working on pairs of 32-bit registers.
+//   regOpcL/regOpcH - opcodes applied to the low/high half when the value
+//                     operand is a register.
+//   immOpcL/immOpcH - opcodes applied to the low/high half when the value
+//                     operand is an immediate.
+//   invSrc          - if true, the current memory value is passed through
+//                     NOT32r before the operation is applied.
+// bInstr is deleted; the block following the loop is returned.
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
+                                                       MachineBasicBlock *MBB,
+                                                       unsigned regOpcL,
+                                                       unsigned regOpcH,
+                                                       unsigned immOpcL,
+                                                       unsigned immOpcH,
+                                                       bool invSrc) {
+  // For the atomic bitwise operator, we generate
+  //   thisMBB (instructions are in pairs, except cmpxchg8b)
+  //     ld t1,t2 = [bitinstr.addr]
+  //   newMBB:
+  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
+  //     op  t5, t6 <- out1, out2, [bitinstr.val]
+  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
+  //     mov EAX, EDX <- out1, out2
+  //     mov EBX, ECX <- t5, t6
+  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
+  //     mov t3, t4 <- EAX, EDX
+  //     jne newMBB
+  //     result in out1, out2
+  //     fallthrough -->nextMBB
+
+  const TargetRegisterClass *RC = X86::GR32RegisterClass;
+  const unsigned LoadOpc = X86::MOV32rm;
+  const unsigned copyOpc = X86::MOV32rr;
+  const unsigned NotOpc = X86::NOT32r;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  MachineFunction::iterator MBBIter = MBB;
+  ++MBBIter;
+
+  /// First build the CFG
+  MachineFunction *F = MBB->getParent();
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(MBBIter, newMBB);
+  F->insert(MBBIter, nextMBB);
+
+  // Move all successors to thisMBB to nextMBB
+  nextMBB->transferSuccessors(thisMBB);
+
+  // Update thisMBB to fall through to newMBB
+  thisMBB->addSuccessor(newMBB);
+
+  // newMBB jumps to itself and fall through to nextMBB
+  newMBB->addSuccessor(nextMBB);
+  newMBB->addSuccessor(newMBB);
+
+  // Insert instructions into newMBB based on incoming instruction
+  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
+  assert(bInstr->getNumOperands() < 18 && "unexpected number of operands");
+  MachineOperand& dest1Oper = bInstr->getOperand(0);
+  MachineOperand& dest2Oper = bInstr->getOperand(1);
+  MachineOperand* argOpers[6];
+  for (int i=0; i < 6; ++i)
+    argOpers[i] = &bInstr->getOperand(i+2);
+
+  // x86 address has 4 operands: base, index, scale, and displacement
+  int lastAddrIndx = 3; // [0,3]
+
+  // Initial load of the low half, done once before entering the loop.
+  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
+  MachineInstrBuilder MIB = BuildMI(thisMBB, TII->get(LoadOpc), t1);
+  for (int i=0; i <= lastAddrIndx; ++i)
+    (*MIB).addOperand(*argOpers[i]);
+  // Initial load of the high half, from the same address with 4 added to the
+  // displacement.
+  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
+  MIB = BuildMI(thisMBB, TII->get(LoadOpc), t2);
+  // add 4 to displacement.
+  for (int i=0; i <= lastAddrIndx-1; ++i)
+    (*MIB).addOperand(*argOpers[i]);
+  MachineOperand newOp3 = *(argOpers[3]);
+  if (newOp3.isImm())
+    newOp3.setImm(newOp3.getImm()+4);
+  else
+    newOp3.setOffset(newOp3.getOffset()+4);
+  (*MIB).addOperand(newOp3);
+
+  // t3/4 are defined later, at the bottom of the loop
+  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
+  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
+  BuildMI(newMBB, TII->get(X86::PHI), dest1Oper.getReg())
+    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
+  BuildMI(newMBB, TII->get(X86::PHI), dest2Oper.getReg())
+    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
+
+  // Everything below must work on the PHI results, not on t1/t2: t1/t2 only
+  // hold the value loaded before the loop, whereas the PHIs also carry the
+  // value cmpxchg8b handed back in EAX:EDX on the previous iteration.  Using
+  // t1/t2 here would make every retry compare against the same stale value.
+  unsigned cur1 = dest1Oper.getReg();
+  unsigned cur2 = dest2Oper.getReg();
+
+  // Optionally invert the current value before applying the operation.
+  unsigned tt1 = cur1;
+  unsigned tt2 = cur2;
+  if (invSrc) {
+    tt1 = F->getRegInfo().createVirtualRegister(RC);
+    tt2 = F->getRegInfo().createVirtualRegister(RC);
+    BuildMI(newMBB, TII->get(NotOpc), tt1).addReg(cur1);
+    BuildMI(newMBB, TII->get(NotOpc), tt2).addReg(cur2);
+  }
+
+  assert((argOpers[4]->isReg() || argOpers[4]->isImm()) &&
+         "invalid operand");
+  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
+  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
+  // Low half of the new value.  A plain move (SWAP) takes no current-value
+  // input, only the value operand.
+  if (argOpers[4]->isReg())
+    MIB = BuildMI(newMBB, TII->get(regOpcL), t5);
+  else
+    MIB = BuildMI(newMBB, TII->get(immOpcL), t5);
+  if (regOpcL != X86::MOV32rr)
+    MIB.addReg(tt1);
+  (*MIB).addOperand(*argOpers[4]);
+  assert(argOpers[5]->isReg() == argOpers[4]->isReg());
+  assert(argOpers[5]->isImm() == argOpers[4]->isImm());
+  // High half of the new value; emitted directly after the low half.
+  if (argOpers[5]->isReg())
+    MIB = BuildMI(newMBB, TII->get(regOpcH), t6);
+  else
+    MIB = BuildMI(newMBB, TII->get(immOpcH), t6);
+  if (regOpcH != X86::MOV32rr)
+    MIB.addReg(tt2);
+  (*MIB).addOperand(*argOpers[5]);
+
+  // Expected (current, un-inverted) value goes in EAX (low) and EDX (high).
+  MIB = BuildMI(newMBB, TII->get(copyOpc), X86::EAX);
+  MIB.addReg(cur1);
+  MIB = BuildMI(newMBB, TII->get(copyOpc), X86::EDX);
+  MIB.addReg(cur2);
+
+  // New value goes in EBX (low) and ECX (high).
+  MIB = BuildMI(newMBB, TII->get(copyOpc), X86::EBX);
+  MIB.addReg(t5);
+  MIB = BuildMI(newMBB, TII->get(copyOpc), X86::ECX);
+  MIB.addReg(t6);
+
+  MIB = BuildMI(newMBB, TII->get(X86::LCMPXCHG8B));
+  for (int i=0; i <= lastAddrIndx; ++i)
+    (*MIB).addOperand(*argOpers[i]);
+
+  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
+  (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
+
+  // Capture EAX:EDX after the exchange; these feed the PHIs on a retry.
+  MIB = BuildMI(newMBB, TII->get(copyOpc), t3);
+  MIB.addReg(X86::EAX);
+  MIB = BuildMI(newMBB, TII->get(copyOpc), t4);
+  MIB.addReg(X86::EDX);
+
+  // insert branch
+  BuildMI(newMBB, TII->get(X86::JNE)).addMBB(newMBB);
+
+  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
+  return nextMBB;
+}
+
  // private utility function
  MachineBasicBlock *
  X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
@@ -6354,8 +6967,9 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
      (*MIB).addOperand(*argOpers[i]);
  
    // We only support register and immediate values
-  assert(   (argOpers[valArgIndx]->isReg() || argOpers[valArgIndx]->isImm())
-         && "invalid operand");
+  assert((argOpers[valArgIndx]->isReg() ||
+          argOpers[valArgIndx]->isImm()) &&
+         "invalid operand");
    
    unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);  
    if (argOpers[valArgIndx]->isReg())
@@ -6402,6 +7016,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
    switch (MI->getOpcode()) {
    default: assert(false && "Unexpected instr type to insert");
+  case X86::CMOV_V1I64:
    case X86::CMOV_FR32:
    case X86::CMOV_FR64:
    case X86::CMOV_V4F32:
@@ -6506,7 +7121,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
  
      X86AddressMode AM;
      MachineOperand &Op = MI->getOperand(0);
-    if (Op.isRegister()) {
+    if (Op.isReg()) {
        AM.BaseType = X86AddressMode::RegBase;
        AM.Base.Reg = Op.getReg();
      } else {
@@ -6514,13 +7129,13 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
        AM.Base.FrameIndex = Op.getIndex();
      }
      Op = MI->getOperand(1);
-    if (Op.isImmediate())
+    if (Op.isImm())
        AM.Scale = Op.getImm();
      Op = MI->getOperand(2);
-    if (Op.isImmediate())
+    if (Op.isImm())
        AM.IndexReg = Op.getImm();
      Op = MI->getOperand(3);
-    if (Op.isGlobalAddress()) {
+    if (Op.isGlobal()) {
        AM.GV = Op.getGlobal();
      } else {
        AM.Disp = Op.getImm();
@@ -6625,6 +7240,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                 X86::NOT8r, X86::AL,
                                                 X86::GR8RegisterClass, true);
    // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
+  // This group is for 64-bit host.
    case X86::ATOMAND64:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
                                                 X86::AND64ri32, X86::MOV64rm, 
@@ -6657,6 +7273,43 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
      return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
    case X86::ATOMUMAX64:
      return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
+
+  // This group does 64-bit operations on a 32-bit host.
+  case X86::ATOMAND6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB, 
+                                               X86::AND32rr, X86::AND32rr,
+                                               X86::AND32ri, X86::AND32ri,
+                                               false);
+  case X86::ATOMOR6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB, 
+                                               X86::OR32rr, X86::OR32rr,
+                                               X86::OR32ri, X86::OR32ri,
+                                               false);
+  case X86::ATOMXOR6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB, 
+                                               X86::XOR32rr, X86::XOR32rr,
+                                               X86::XOR32ri, X86::XOR32ri,
+                                               false);
+  case X86::ATOMNAND6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB, 
+                                               X86::AND32rr, X86::AND32rr,
+                                               X86::AND32ri, X86::AND32ri,
+                                               true);
+  case X86::ATOMADD6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB, 
+                                               X86::ADD32rr, X86::ADC32rr,
+                                               X86::ADD32ri, X86::ADC32ri,
+                                               false);
+  case X86::ATOMSUB6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB, 
+                                               X86::SUB32rr, X86::SBB32rr,
+                                               X86::SUB32ri, X86::SBB32ri,
+                                               false);
+  case X86::ATOMSWAP6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB, 
+                                               X86::MOV32rr, X86::MOV32rr,
+                                               X86::MOV32ri, X86::MOV32ri,
+                                               false);
    }
  }
  
@@ -6695,6 +7348,7 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
    if (N->getOpcode() == X86ISD::Wrapper) {
      if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
        GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
+      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
        return true;
      }
    }
@@ -6772,8 +7426,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
  
  /// PerformBuildVectorCombine - build_vector 0,(load i64 / f64) -> movq / movsd.
  static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
-                                           const X86Subtarget *Subtarget,
-                                           const TargetLowering &TLI) {
+                                         const X86Subtarget *Subtarget,
+                                         const TargetLowering &TLI) {
    unsigned NumOps = N->getNumOperands();
  
    // Ignore single operand BUILD_VECTOR.
@@ -6809,7 +7463,11 @@ static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
    if (LD->getExtensionType() != ISD::NON_EXTLOAD)
      return SDValue();
    
-  return DAG.getNode(X86ISD::VZEXT_LOAD, VT, LD->getChain(), LD->getBasePtr());
+  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+  SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
+  SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, Tys, Ops, 2);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Base, 1), ResNode.getValue(1));
+  return ResNode;
  }                                           
  
  /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
@@ -7004,7 +7662,7 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
  
  
  SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
-                                               DAGCombinerInfo &DCI) const {
+                                             DAGCombinerInfo &DCI) const {
    SelectionDAG &DAG = DCI.DAG;
    switch (N->getOpcode()) {
    default: break;
@@ -7032,6 +7690,7 @@ X86TargetLowering::getConstraintType(const std::string &Constraint) const {
    if (Constraint.size() == 1) {
      switch (Constraint[0]) {
      case 'A':
+      return C_Register;
      case 'f':
      case 'r':
      case 'R':
@@ -7070,6 +7729,7 @@ LowerXConstraint(MVT ConstraintVT) const {
  /// vector.  If it is invalid, don't add anything to Ops.
  void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                       char Constraint,
+                                                     bool hasMemory,
                                                       std::vector<SDValue>&Ops,
                                                       SelectionDAG &DAG) const {
    SDValue Result(0, 0);
@@ -7078,16 +7738,24 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
    default: break;
    case 'I':
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
-      if (C->getValue() <= 31) {
-        Result = DAG.getTargetConstant(C->getValue(), Op.getValueType());
+      if (C->getZExtValue() <= 31) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'J':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() <= 63) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
          break;
        }
      }
      return;
    case 'N':
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
-      if (C->getValue() <= 255) {
-        Result = DAG.getTargetConstant(C->getValue(), Op.getValueType());
+      if (C->getZExtValue() <= 255) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
          break;
        }
      }
@@ -7095,7 +7763,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
    case 'i': {
      // Literal immediates are always ok.
      if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
-      Result = DAG.getTargetConstant(CST->getValue(), Op.getValueType());
+      Result = DAG.getTargetConstant(CST->getZExtValue(), Op.getValueType());
        break;
      }
  
@@ -7111,26 +7779,23 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
        ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
        GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0));
        if (C && GA) {
-        Offset = GA->getOffset()+C->getValue();
+        Offset = GA->getOffset()+C->getZExtValue();
        } else {
          C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
          GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0));
          if (C && GA)
-          Offset = GA->getOffset()+C->getValue();
+          Offset = GA->getOffset()+C->getZExtValue();
          else
            C = 0, GA = 0;
        }
      }
      
      if (GA) {
-      // If addressing this global requires a load (e.g. in PIC mode), we can't
-      // match.
-      if (Subtarget->GVRequiresExtraLoad(GA->getGlobal(), getTargetMachine(),
-                                         false))
-        return;
-
-      Op = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
-                                      Offset);
+      if (hasMemory) 
+        Op = LowerGlobalAddress(GA->getGlobal(), Offset, DAG);
+      else
+        Op = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
+                                        Offset);
        Result = Op;
        break;
      }
@@ -7144,7 +7809,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
      Ops.push_back(Result);
      return;
    }
-  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
+                                                      Ops, DAG);
  }
  
  std::vector<unsigned> X86TargetLowering::
@@ -7154,10 +7820,6 @@ getRegClassForInlineAsmConstraint(const std::string &Constraint,
      // FIXME: not handling fp-stack yet!
      switch (Constraint[0]) {      // GCC X86 Constraint Letters
      default: break;  // Unknown constraint letter
-    case 'A':   // EAX/EDX
-      if (VT == MVT::i32 || VT == MVT::i64)
-        return make_vector<unsigned>(X86::EAX, X86::EDX, 0);
-      break;
      case 'q':   // Q_REGS (GENERAL_REGS in 64-bit mode)
      case 'Q':   // Q_REGS
        if (VT == MVT::i32)
@@ -7187,15 +7849,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
      case 'r':   // GENERAL_REGS
      case 'R':   // LEGACY_REGS
      case 'l':   // INDEX_REGS
-      if (VT == MVT::i64 && Subtarget->is64Bit())
-        return std::make_pair(0U, X86::GR64RegisterClass);
-      if (VT == MVT::i32)
-        return std::make_pair(0U, X86::GR32RegisterClass);
-      else if (VT == MVT::i16)
-        return std::make_pair(0U, X86::GR16RegisterClass);
-      else if (VT == MVT::i8)
+      if (VT == MVT::i8)
          return std::make_pair(0U, X86::GR8RegisterClass);
-      break;
+      if (VT == MVT::i16)
+        return std::make_pair(0U, X86::GR16RegisterClass);
+      if (VT == MVT::i32 || !Subtarget->is64Bit())
+        return std::make_pair(0U, X86::GR32RegisterClass);  
+      return std::make_pair(0U, X86::GR64RegisterClass);
      case 'f':  // FP Stack registers.
        // If SSE is enabled for this VT, use f80 to ensure the isel moves the
        // value to the correct fpstack register class.
@@ -7207,7 +7867,6 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
      case 'y':   // MMX_REGS if MMX allowed.
        if (!Subtarget->hasMMX()) break;
        return std::make_pair(0U, X86::VR64RegisterClass);
-      break;
      case 'Y':   // SSE_REGS if SSE2 allowed
        if (!Subtarget->hasSSE2()) break;
        // FALL THROUGH.
@@ -7248,7 +7907,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
        Res.first = X86::ST0;
        Res.second = X86::RFP80RegisterClass;
      }
-
+    // 'A' means EAX + EDX.
+    if (Constraint == "A") {
+      Res.first = X86::EAX;
+      Res.second = X86::GRADRegisterClass;
+    }
      return Res;
    }
  
@@ -7328,3 +7991,41 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
  
    return Res;
  }
+
+//===----------------------------------------------------------------------===//
+//                           X86 Widen vector type
+//===----------------------------------------------------------------------===//
+
+/// getWidenVectorType - Return the vector type that VT should be widened to
+/// (e.g., v7i8 to v8i8).  A vector type that is already legal is returned
+/// unchanged.  MVT::Other is returned when there is no vector type we want
+/// to widen to.  When and where to widen is target dependent, based on the
+/// cost of scalarizing vs using the wider vector type.
+MVT X86TargetLowering::getWidenVectorType(MVT VT) {
+  assert(VT.isVector());
+  if (isTypeLegal(VT))
+    return VT;
+
+  // TODO: In computeRegisterProperty, we can compute the list of legal vector
+  //       type based on element type.  This would speed up our search (though
+  //       it may not be worth it since the size of the list is relatively
+  //       small).
+  MVT ElemVT = VT.getVectorElementType();
+  const unsigned NumElems = VT.getVectorNumElements();
+
+  // On X86, it only makes sense to widen vectors with more than one element.
+  if (NumElems <= 1)
+    return MVT::Other;
+
+  // Walk the simple vector value types in enumeration order and take the
+  // first legal one with the same element type and more elements.
+  for (unsigned Idx = MVT::FIRST_VECTOR_VALUETYPE;
+       Idx <= MVT::LAST_VECTOR_VALUETYPE; ++Idx) {
+    MVT WideVT = (MVT::SimpleValueType)Idx;
+    if (!isTypeLegal(WideVT))
+      continue;
+    if (WideVT.getVectorElementType() == ElemVT &&
+        WideVT.getVectorNumElements() > NumElems)
+      return WideVT;
+  }
+  return MVT::Other;
+}