Handle some 64-bit atomics on x86-32, some of the time.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 3401a2c4d35872b8b1fba51549135a4086cd8fee..f65167bd8877862f5c8834d42cf5e345be645b69 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -297,10 +297,20 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setOperationAction(ISD::ATOMIC_CMP_SWAP_32, MVT::i32, Custom);
    setOperationAction(ISD::ATOMIC_CMP_SWAP_64, MVT::i64, Custom);
  
-  setOperationAction(ISD::ATOMIC_LOAD_SUB_8, MVT::i8, Expand);
-  setOperationAction(ISD::ATOMIC_LOAD_SUB_16, MVT::i16, Expand);
-  setOperationAction(ISD::ATOMIC_LOAD_SUB_32, MVT::i32, Expand);
-  setOperationAction(ISD::ATOMIC_LOAD_SUB_64, MVT::i64, Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB_8 , MVT::i8, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB_16, MVT::i16, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB_32, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB_64, MVT::i64, Custom);
+
+  if (!Subtarget->is64Bit()) {
+    setOperationAction(ISD::ATOMIC_LOAD_ADD_64, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB_64, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_AND_64, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_OR_64, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_XOR_64, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_NAND_64, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_SWAP_64, MVT::i64, Custom);
+  }
  
    // Use the default ISD::DBG_STOPPOINT, ISD::DECLARE expansion.
    setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
@@ -494,20 +504,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
    setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
  
-  setOperationAction(ISD::FLOG, MVT::f32, Expand);
-  setOperationAction(ISD::FLOG, MVT::f64, Expand);
    setOperationAction(ISD::FLOG, MVT::f80, Expand);
-  setOperationAction(ISD::FLOG2, MVT::f32, Expand);
-  setOperationAction(ISD::FLOG2, MVT::f64, Expand);
    setOperationAction(ISD::FLOG2, MVT::f80, Expand);
-  setOperationAction(ISD::FLOG10, MVT::f32, Expand);
-  setOperationAction(ISD::FLOG10, MVT::f64, Expand);
    setOperationAction(ISD::FLOG10, MVT::f80, Expand);
-  setOperationAction(ISD::FEXP, MVT::f32, Expand);
-  setOperationAction(ISD::FEXP, MVT::f64, Expand);
    setOperationAction(ISD::FEXP, MVT::f80, Expand);
-  setOperationAction(ISD::FEXP2, MVT::f32, Expand);
-  setOperationAction(ISD::FEXP2, MVT::f64, Expand);
    setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  
    // First set operation action for all vector types to expand. Then we
@@ -890,7 +890,7 @@ SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
      SDValue TargetAddress = TailCall.getOperand(1);
      SDValue StackAdjustment = TailCall.getOperand(2);
      assert(((TargetAddress.getOpcode() == ISD::Register &&
-               (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::ECX ||
+               (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::EAX ||
                  cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R9)) ||
                TargetAddress.getOpcode() == ISD::TargetExternalSymbol ||
                TargetAddress.getOpcode() == ISD::TargetGlobalAddress) && 
@@ -1098,8 +1098,6 @@ CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const {
  
    if (CC == CallingConv::X86_FastCall)
      return CC_X86_32_FastCall;
-  else if (CC == CallingConv::Fast && PerformTailCallOpt)
-    return CC_X86_32_TailCall;
    else if (CC == CallingConv::Fast)
      return CC_X86_32_FastCC;
    else
@@ -1605,7 +1603,7 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
    if (CallRequiresFnAddressInReg(Is64Bit, IsTailCall)) {
      // Note: The actual move into the call register (EAX on x86-32, R9 on x86-64) is done further down.
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
-    if (G &&  !G->getGlobal()->hasHiddenVisibility() &&
+    if (G && !G->getGlobal()->hasHiddenVisibility() &&
          !G->getGlobal()->hasProtectedVisibility())
        Callee =  LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
@@ -1700,7 +1698,7 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
      Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
    } else if (IsTailCall) {
-    unsigned Opc = Is64Bit ? X86::R9 : X86::ECX;
+    unsigned Opc = Is64Bit ? X86::R9 : X86::EAX;
  
      Chain = DAG.getCopyToReg(Chain, 
                               DAG.getRegister(Opc, getPointerTy()), 
@@ -1878,12 +1876,13 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(CallSDNode *TheCall,
  
  FastISel *
  X86TargetLowering::createFastISel(MachineFunction &mf,
+                                  MachineModuleInfo *mmo,
                                    DenseMap<const Value *, unsigned> &vm,
                                    DenseMap<const BasicBlock *,
                                             MachineBasicBlock *> &bm,
                                    DenseMap<const AllocaInst *, int> &am) {
                                           
-  return X86::createFastISel(mf, vm, bm, am);
+  return X86::createFastISel(mf, mmo, vm, bm, am);  // pure forwarder: every argument is passed through unchanged
  }
  
  
@@ -2527,6 +2526,21 @@ bool X86::isSplatLoMask(SDNode *N) {
    return true;
  }
  
+/// isMOVDDUPMask - Return true if the given VECTOR_SHUFFLE mask (a
+/// BUILD_VECTOR node) is usable as a MOVDDUP mask: isUndefOrEqual must accept
+/// index i for entry i of the low half and for entry i of the high half.
+bool X86::isMOVDDUPMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  unsigned Half = N->getNumOperands() / 2;
+  // Low half: entry Idx is checked against index Idx.
+  for (unsigned Idx = 0; Idx != Half; ++Idx) {
+    if (isUndefOrEqual(N->getOperand(Idx), Idx))
+      continue;
+    return false;
+  }
+  // High half: entry Half+Idx is checked against the same index Idx.
+  for (unsigned Idx = 0; Idx != Half; ++Idx) {
+    if (isUndefOrEqual(N->getOperand(Half + Idx), Idx))
+      continue;
+    return false;
+  }
+  return true;
+}
+
  /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
  /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
  /// instructions.
@@ -2694,15 +2708,14 @@ static bool ShouldXformToMOVHLPS(SDNode *Mask) {
  /// is promoted to a vector. It also returns the LoadSDNode by reference if
  /// required.
  static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
-  if (N->getOpcode() == ISD::SCALAR_TO_VECTOR) {
-    N = N->getOperand(0).getNode();
-    if (ISD::isNON_EXTLoad(N)) {
-      if (LD)
-        *LD = cast<LoadSDNode>(N);
-      return true;
-    }
-  }
-  return false;
+  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)  // only SCALAR_TO_VECTOR nodes qualify
+    return false;
+  N = N->getOperand(0).getNode();  // continue with the node feeding the SCALAR_TO_VECTOR
+  if (!ISD::isNON_EXTLoad(N))  // that node must satisfy ISD::isNON_EXTLoad
+    return false;
+  if (LD)
+    *LD = cast<LoadSDNode>(N);  // optional out-parameter: hand the load node back to the caller
+  return true;
  }
  
  /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
@@ -2954,6 +2967,46 @@ static SDValue PromoteSplat(SDValue Op, SelectionDAG &DAG, bool HasSSE2) {
    return DAG.getNode(ISD::BIT_CONVERT, VT, Shuffle);
  }
  
+/// isVectorLoad - Return true if Op is itself a load, or is a
+/// SCALAR_TO_VECTOR / BIT_CONVERT node whose first operand is a load.
+/// Op must have a vector value type.
+static bool isVectorLoad(SDValue Op) {
+  assert(Op.getValueType().isVector() && "Expected a vector type");
+  switch (Op.getOpcode()) {
+  case ISD::SCALAR_TO_VECTOR:
+  case ISD::BIT_CONVERT:
+    // Look through the wrapper node at what it wraps.
+    return isa<LoadSDNode>(Op.getOperand(0));
+  default:
+    return isa<LoadSDNode>(Op);
+  }
+}
+
+
+/// CanonicalizeMovddup - Canonicalize a movddup shuffle: rebuild it as a
+/// v2f64 shuffle when movddup is preferred and as a v4f32 shuffle otherwise.
+/// Op is returned unchanged when it already has the chosen type.
+///
+static SDValue CanonicalizeMovddup(SDValue Op, SDValue V1, SDValue Mask,
+                                   SelectionDAG &DAG, bool HasSSE3) {
+  // movddup is picked only with SSE3, and only if the shuffle has more than
+  // one use or its input is a load. Every other case goes to movlhps.
+  bool PreferMovddup = false;
+  if (HasSSE3)
+    PreferMovddup = !Op.hasOneUse() || isVectorLoad(V1);
+
+  MVT ShufVT = PreferMovddup ? MVT::v2f64 : MVT::v4f32;
+  MVT OrigVT = Op.getValueType();
+  if (OrigVT == ShufVT)
+    return Op;
+
+  // Build the mask in the chosen element width: <0,0> for two elements,
+  // <0,1,0,1> for four.
+  unsigned EltCount = ShufVT.getVectorNumElements();
+  if (EltCount != 2) {
+    assert(EltCount == 4);
+    SDValue Zero = DAG.getTargetConstant(0, MVT::i32);
+    SDValue One = DAG.getTargetConstant(1, MVT::i32);
+    Mask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, Zero, One, Zero, One);
+  } else {
+    SDValue Zero = DAG.getTargetConstant(0, MVT::i32);
+    Mask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, Zero, Zero);
+  }
+
+  // Shuffle in the chosen type, then convert back to the original type.
+  V1 = DAG.getNode(ISD::BIT_CONVERT, ShufVT, V1);
+  SDValue NewShuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, ShufVT, V1,
+                                   DAG.getNode(ISD::UNDEF, ShufVT), Mask);
+  return DAG.getNode(ISD::BIT_CONVERT, OrigVT, NewShuffle);
+}
+
  /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
  /// vector of zero or undef vector.  This produces a shuffle where the low
  /// element of V2 is swizzled into the zero/undef vector, landing at element
@@ -3899,6 +3952,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
    else if (isIdentityMask(PermMask.getNode(), true))
      return V2;
  
+  // Canonicalize movddup shuffles.
+  if (V2IsUndef && Subtarget->hasSSE2() &&
+      X86::isMOVDDUPMask(PermMask.getNode()))
+    return CanonicalizeMovddup(Op, V1, PermMask, DAG, Subtarget->hasSSE3());
+
    if (isSplatMask(PermMask.getNode())) {
      if (isMMX || NumElems < 4) return Op;
      // Promote it to a v4{if}32 splat.
@@ -4311,8 +4369,8 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
  }
  
  SDValue
-X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
-  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
+                                      SelectionDAG &DAG) const {
    SDValue Result = DAG.getTargetGlobalAddress(GV, getPointerTy());
    Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
    // With PIC, the address is actually $g + Offset.
@@ -4335,6 +4393,12 @@ X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
    return Result;
  }
  
+// SDValue flavour of LowerGlobalAddress: pulls the GlobalValue out of the
+// GlobalAddressSDNode behind Op and defers to the GlobalValue overload.
+SDValue
+X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
+  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *Global = GA->getGlobal();
+  return LowerGlobalAddress(Global, DAG);
+}
+
  // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
  static SDValue
  LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
@@ -4965,7 +5029,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
      bool IllegalFPCMov = false;
      if (VT.isFloatingPoint() && !VT.isVector() &&
          !isScalarFPTypeInSSEReg(VT))  // FPStack?
-      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSignExtended());
+      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
      
      if ((Opc == X86ISD::CMP ||
           Opc == X86ISD::COMI ||
@@ -5074,15 +5138,16 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
  
  SDValue
  X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG,
-                                        SDValue Chain,
-                                        SDValue Dst, SDValue Src,
-                                        SDValue Size, unsigned Align,
-                                        const Value *DstSV, uint64_t DstSVOff) {
+                                           SDValue Chain,
+                                           SDValue Dst, SDValue Src,
+                                           SDValue Size, unsigned Align,
+                                           const Value *DstSV,
+                                           uint64_t DstSVOff) {
    ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  
-  /// If not DWORD aligned or size is more than the threshold, call the library.
-  /// The libc version is likely to be faster for these cases. It can use the
-  /// address value and run time information about the CPU.
+  // If not DWORD aligned or size is more than the threshold, call the library.
+  // The libc version is likely to be faster for these cases. It can use the
+  // address value and run time information about the CPU.
    if ((Align & 3) != 0 ||
        !ConstantSize ||
        ConstantSize->getZExtValue() >
@@ -5091,8 +5156,9 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG,
  
      // Check to see if there is a specialized entry-point for memory zeroing.
      ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
-    if (const char *bzeroEntry = 
-          V && V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
+
+    if (const char *bzeroEntry =  V &&
+        V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
        MVT IntPtr = getPointerTy();
        const Type *IntPtrTy = TD->getIntPtrType();
        TargetLowering::ArgListTy Args; 
@@ -5103,9 +5169,9 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG,
        Entry.Node = Size;
        Args.push_back(Entry);
        std::pair<SDValue,SDValue> CallResult =
-        LowerCallTo(Chain, Type::VoidTy, false, false, false, CallingConv::C,
-                    false, DAG.getExternalSymbol(bzeroEntry, IntPtr),
-                    Args, DAG);
+        LowerCallTo(Chain, Type::VoidTy, false, false, false, false, 
+                    CallingConv::C, false, 
+                    DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG);
        return CallResult.second;
      }
  
@@ -5591,13 +5657,15 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
  }
  
  SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
-  // Depths > 0 not supported yet!
-  if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
-    return SDValue();
-
-  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
-  return DAG.getNode(ISD::SUB, getPointerTy(), RetAddrFI,
-                     DAG.getIntPtrConstant(TD->getPointerSize()));
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);  // record on the frame info that the frame address is taken
+  MVT VT = Op.getValueType();
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();  // operand 0 is the constant depth
+  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;  // RBP in 64-bit mode, EBP otherwise
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), FrameReg, VT);  // depth 0: the frame register itself
+  while (Depth--)  // one load per level: the value stored at FrameAddr becomes the new FrameAddr
+    FrameAddr = DAG.getLoad(VT, DAG.getEntryNode(), FrameAddr, NULL, 0);
+  return FrameAddr;
  }
  
  SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
@@ -5703,7 +5771,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
  
        // Check that ECX wasn't needed by an 'inreg' parameter.
        const FunctionType *FTy = Func->getFunctionType();
-      const PAListPtr &Attrs = Func->getParamAttrs();
+      const AttrListPtr &Attrs = Func->getAttributes();
  
        if (!Attrs.isEmpty() && !Func->isVarArg()) {
          unsigned InRegCount = 0;
@@ -5711,7 +5779,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
  
          for (FunctionType::param_iterator I = FTy->param_begin(),
               E = FTy->param_end(); I != E; ++I, ++Idx)
-          if (Attrs.paramHasAttr(Idx, ParamAttr::InReg))
+          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
              // FIXME: should only count parameters that are lowered to integers.
              InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
  
@@ -5789,7 +5857,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  
    SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, MVT::Other,
-                                DAG.getEntryNode(), StackSlot);
+                              DAG.getEntryNode(), StackSlot);
  
    // Load FP Control Word from stack slot
    SDValue CWD = DAG.getLoad(MVT::i16, Chain, StackSlot, NULL, 0);
@@ -5898,10 +5966,10 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
    SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), Reg,
                                      Op.getOperand(2), SDValue());
    SDValue Ops[] = { cpIn.getValue(0),
-                      Op.getOperand(1),
-                      Op.getOperand(3),
-                      DAG.getTargetConstant(size, MVT::i8),
-                      cpIn.getValue(1) };
+                    Op.getOperand(1),
+                    Op.getOperand(3),
+                    DAG.getTargetConstant(size, MVT::i8),
+                    cpIn.getValue(1) };
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, Tys, Ops, 5);
    SDValue cpOut = 
@@ -5932,8 +6000,8 @@ SDNode* X86TargetLowering::ExpandATOMIC_CMP_SWAP(SDNode* Op,
    swapInH = DAG.getCopyToReg(swapInL.getValue(0), X86::ECX,
                               swapInH, swapInL.getValue(1));
    SDValue Ops[] = { swapInH.getValue(0),
-                      Op->getOperand(1),
-                      swapInH.getValue(1)};
+                    Op->getOperand(1),
+                    swapInH.getValue(1) };
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, Tys, Ops, 3);
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), X86::EAX, MVT::i32, 
@@ -5946,18 +6014,43 @@ SDNode* X86TargetLowering::ExpandATOMIC_CMP_SWAP(SDNode* Op,
    return DAG.getMergeValues(Vals, 2).getNode();
  }
  
-SDNode* X86TargetLowering::ExpandATOMIC_LOAD_SUB(SDNode* Op,
-                                                 SelectionDAG &DAG) {
-  MVT T = Op->getValueType(0);
+// LowerATOMIC_BINARY_64 - Rewrite an i64 atomic binary node as the target
+// node NewOp. NewOp takes the chain, the address operand and the two halves
+// of the BUILD_PAIR value operand, and produces two i32 results plus a chain.
+// The i32 results are paired back into an i64 and merged with the chain.
+SDValue X86TargetLowering::LowerATOMIC_BINARY_64(SDValue Op,
+                                                 SelectionDAG &DAG,
+                                                 unsigned NewOp) {
+  SDNode *N = Op.getNode();
+  MVT T = N->getValueType(0);
+  assert (T == MVT::i64 && "Only know how to expand i64 atomics");
+  
+  // Operand 2 must already be a BUILD_PAIR so its halves can be taken apart.
+  SDNode *Pair = N->getOperand(2).getNode();
+  assert(Pair->getOpcode()==ISD::BUILD_PAIR);
+
+  SDValue NewOperands[] = { N->getOperand(0),      // chain
+                            N->getOperand(1),      // operand 1 of the atomic
+                            Pair->getOperand(0),   // first half of the value
+                            Pair->getOperand(1) }; // second half of the value
+  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+  SDValue Lowered = DAG.getNode(NewOp, ResultTys, NewOperands, 4);
+
+  // Glue the two i32 results back into a single i64 value.
+  SDValue Halves[] = { Lowered.getValue(0), Lowered.getValue(1)};
+  SDValue Joined = DAG.getNode(ISD::BUILD_PAIR, MVT::i64, Halves, 2);
+
+  SDValue Merged[2] = { Joined, Lowered.getValue(2) };
+  return SDValue(DAG.getMergeValues(Merged, 2).getNode(), 0);
+}
+
+SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  MVT T = Node->getValueType(0);
    SDValue negOp = DAG.getNode(ISD::SUB, T,
-                                DAG.getConstant(0, T), Op->getOperand(2));
-  return DAG.getAtomic((T==MVT::i8 ? ISD::ATOMIC_LOAD_ADD_8:
-                        T==MVT::i16 ? ISD::ATOMIC_LOAD_ADD_16:
-                        T==MVT::i32 ? ISD::ATOMIC_LOAD_ADD_32:
-                        T==MVT::i64 ? ISD::ATOMIC_LOAD_ADD_64: 0),
-                       Op->getOperand(0), Op->getOperand(1), negOp,
-                       cast<AtomicSDNode>(Op)->getSrcValue(),
-                       cast<AtomicSDNode>(Op)->getAlignment()).getNode();
+                                DAG.getConstant(0, T), Node->getOperand(2));  // negOp = 0 - operand 2
+  return DAG.getAtomic((Op.getOpcode()==ISD::ATOMIC_LOAD_SUB_8 ? 
+                                        ISD::ATOMIC_LOAD_ADD_8 :
+                        Op.getOpcode()==ISD::ATOMIC_LOAD_SUB_16 ? 
+                                        ISD::ATOMIC_LOAD_ADD_16 :
+                        Op.getOpcode()==ISD::ATOMIC_LOAD_SUB_32 ? 
+                                        ISD::ATOMIC_LOAD_ADD_32 :
+                                        ISD::ATOMIC_LOAD_ADD_64),  // any other opcode falls through to the 64-bit add
+                       Node->getOperand(0),
+                       Node->getOperand(1), negOp,  // the atomic add receives the negated value
+                       cast<AtomicSDNode>(Node)->getSrcValue(),
+                       cast<AtomicSDNode>(Node)->getAlignment());
  }
  
  /// LowerOperation - Provide custom lowering hooks for some operations.
@@ -5965,10 +6058,27 @@ SDNode* X86TargetLowering::ExpandATOMIC_LOAD_SUB(SDNode* Op,
  SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
    switch (Op.getOpcode()) {
    default: assert(0 && "Should not custom lower this!");
-  case ISD::ATOMIC_CMP_SWAP_8:  return LowerCMP_SWAP(Op,DAG);
-  case ISD::ATOMIC_CMP_SWAP_16: return LowerCMP_SWAP(Op,DAG);
-  case ISD::ATOMIC_CMP_SWAP_32: return LowerCMP_SWAP(Op,DAG);
+  case ISD::ATOMIC_CMP_SWAP_8:  
+  case ISD::ATOMIC_CMP_SWAP_16: 
+  case ISD::ATOMIC_CMP_SWAP_32: 
    case ISD::ATOMIC_CMP_SWAP_64: return LowerCMP_SWAP(Op,DAG);
+  case ISD::ATOMIC_LOAD_SUB_8:  
+  case ISD::ATOMIC_LOAD_SUB_16: 
+  case ISD::ATOMIC_LOAD_SUB_32: return LowerLOAD_SUB(Op,DAG);
+  case ISD::ATOMIC_LOAD_SUB_64: return (Subtarget->is64Bit()) ?
+                                  LowerLOAD_SUB(Op,DAG) :
+                                  LowerATOMIC_BINARY_64(Op,DAG,
+                                        X86ISD::ATOMSUB64_DAG);
+  case ISD::ATOMIC_LOAD_AND_64: return LowerATOMIC_BINARY_64(Op,DAG,
+                                        X86ISD::ATOMAND64_DAG);
+  case ISD::ATOMIC_LOAD_OR_64: return LowerATOMIC_BINARY_64(Op, DAG,
+                                        X86ISD::ATOMOR64_DAG);
+  case ISD::ATOMIC_LOAD_XOR_64: return LowerATOMIC_BINARY_64(Op,DAG,
+                                        X86ISD::ATOMXOR64_DAG);
+  case ISD::ATOMIC_LOAD_NAND_64: return LowerATOMIC_BINARY_64(Op,DAG,
+                                        X86ISD::ATOMNAND64_DAG);
+  case ISD::ATOMIC_LOAD_ADD_64: return LowerATOMIC_BINARY_64(Op,DAG,
+                                        X86ISD::ATOMADD64_DAG);
    case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
    case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
    case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
@@ -6023,10 +6133,6 @@ SDNode *X86TargetLowering::ReplaceNodeResults(SDNode *N, SelectionDAG &DAG) {
    case ISD::FP_TO_SINT:         return ExpandFP_TO_SINT(N, DAG);
    case ISD::READCYCLECOUNTER:   return ExpandREADCYCLECOUNTER(N, DAG);
    case ISD::ATOMIC_CMP_SWAP_64: return ExpandATOMIC_CMP_SWAP(N, DAG);
-  case ISD::ATOMIC_LOAD_SUB_8:  return ExpandATOMIC_LOAD_SUB(N,DAG);
-  case ISD::ATOMIC_LOAD_SUB_16: return ExpandATOMIC_LOAD_SUB(N,DAG);
-  case ISD::ATOMIC_LOAD_SUB_32: return ExpandATOMIC_LOAD_SUB(N,DAG);
-  case ISD::ATOMIC_LOAD_SUB_64: return ExpandATOMIC_LOAD_SUB(N,DAG);
    }
  }
  
@@ -6078,6 +6184,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
    case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
    case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
+  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
+  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
+  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
+  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
+  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
+  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
    case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
    case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
    case X86ISD::VSHL:               return "X86ISD::VSHL";
@@ -6303,6 +6415,146 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
    return nextMBB;
  }
  
+// private utility function
+//
+// EmitAtomicBit6432WithCustomInserter - Expand the 64-bit atomic pseudo
+// instruction bInstr into 32-bit register operations wrapped in a
+// LCMPXCHG8B retry loop.
+//   bInstr          - the pseudo instruction; deleted before returning.
+//   MBB             - the block containing bInstr; it becomes thisMBB.
+//   regOpcL/regOpcH - opcodes for the low/high half when the value operand
+//                     is a register.
+//   immOpcL/immOpcH - opcodes for the low/high half when the value operand
+//                     is an immediate.
+//   invSrc          - when true, both halves of the current value are
+//                     complemented (NOT32r) before the operation is applied.
+// Returns nextMBB, the block that follows the loop.
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
+                                                       MachineBasicBlock *MBB,
+                                                       unsigned regOpcL,
+                                                       unsigned regOpcH,
+                                                       unsigned immOpcL,
+                                                       unsigned immOpcH,
+                                                       bool invSrc) {
+  // For the atomic bitwise operator, we generate
+  //   thisMBB (instructions are in pairs, except cmpxchg8b)
+  //     ld t1,t2 = [bitinstr.addr]
+  //   newMBB:
+  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
+  //     op  t5, t6 <- out1, out2, [bitinstr.val]
+  //     mov EBX, ECX <- t5, t6
+  //     mov EAX, EDX <- out1, out2
+  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
+  //     mov t3, t4 <- EAX, EDX
+  //     jne newMBB
+  //     result in out1, out2
+  //     fallthrough -->nextMBB
+
+  const TargetRegisterClass *RC = X86::GR32RegisterClass;
+  const unsigned LoadOpc = X86::MOV32rm;
+  const unsigned copyOpc = X86::MOV32rr;
+  const unsigned NotOpc = X86::NOT32r;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  MachineFunction::iterator MBBIter = MBB;
+  ++MBBIter;
+  
+  /// First build the CFG
+  MachineFunction *F = MBB->getParent();
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(MBBIter, newMBB);
+  F->insert(MBBIter, nextMBB);
+  
+  // Move all successors to thisMBB to nextMBB
+  nextMBB->transferSuccessors(thisMBB);
+    
+  // Update thisMBB to fall through to newMBB
+  thisMBB->addSuccessor(newMBB);
+  
+  // newMBB jumps to itself and fall through to nextMBB
+  newMBB->addSuccessor(nextMBB);
+  newMBB->addSuccessor(newMBB);
+  
+  // Insert instructions into newMBB based on incoming instruction
+  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
+  assert(bInstr->getNumOperands() < 18 && "unexpected number of operands");
+  MachineOperand& dest1Oper = bInstr->getOperand(0);
+  MachineOperand& dest2Oper = bInstr->getOperand(1);
+  MachineOperand* argOpers[6];
+  for (int i=0; i < 6; ++i)
+    argOpers[i] = &bInstr->getOperand(i+2);
+
+  // x86 address has 4 operands: base, index, scale, and displacement
+  int lastAddrIndx = 3; // [0,3]
+  
+  // Initial load of the first half, emitted once in thisMBB.
+  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
+  MachineInstrBuilder MIB = BuildMI(thisMBB, TII->get(LoadOpc), t1);
+  for (int i=0; i <= lastAddrIndx; ++i)
+    (*MIB).addOperand(*argOpers[i]);
+  // Initial load of the second half, from the same address plus 4.
+  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
+  MIB = BuildMI(thisMBB, TII->get(LoadOpc), t2);
+  // add 4 to displacement.  getImm verifies it's immediate.
+  for (int i=0; i <= lastAddrIndx-1; ++i)
+    (*MIB).addOperand(*argOpers[i]);
+  MachineOperand newOp3 = MachineOperand::CreateImm(argOpers[3]->getImm()+4);
+  (*MIB).addOperand(newOp3);
+
+  // t3/4 are defined later, at the bottom of the loop
+  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
+  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
+  BuildMI(newMBB, TII->get(X86::PHI), dest1Oper.getReg())
+    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
+  BuildMI(newMBB, TII->get(X86::PHI), dest2Oper.getReg())
+    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
+
+  // Everything below must read the PHI results: they hold the value from the
+  // initial load on entry and the value copied out of EAX/EDX after each
+  // cmpxchg8b on a retry.  Reading t1/t2 directly would reuse the initial
+  // load on every iteration, so a failed compare-exchange could never
+  // succeed on retry.
+  unsigned cur1 = dest1Oper.getReg();
+  unsigned cur2 = dest2Oper.getReg();
+
+  unsigned tt1 = F->getRegInfo().createVirtualRegister(RC);
+  unsigned tt2 = F->getRegInfo().createVirtualRegister(RC);
+  if (invSrc) {  
+    MIB = BuildMI(newMBB, TII->get(NotOpc), tt1).addReg(cur1);
+    MIB = BuildMI(newMBB, TII->get(NotOpc), tt2).addReg(cur2);
+  } else {
+    tt1 = cur1;
+    tt2 = cur2;
+  }
+
+  assert((argOpers[4]->isRegister() || argOpers[4]->isImmediate()) &&
+         "invalid operand");
+  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
+  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
+  // First half of the new value: t5 = tt1 <op> value operand 4.
+  if (argOpers[4]->isRegister())
+    MIB = BuildMI(newMBB, TII->get(regOpcL), t5);
+  else
+    MIB = BuildMI(newMBB, TII->get(immOpcL), t5);
+  MIB.addReg(tt1);
+  (*MIB).addOperand(*argOpers[4]);
+  assert(argOpers[5]->isRegister() == argOpers[4]->isRegister());
+  assert(argOpers[5]->isImmediate() == argOpers[4]->isImmediate());
+  // Second half of the new value: t6 = tt2 <op> value operand 5.
+  if (argOpers[5]->isRegister())
+    MIB = BuildMI(newMBB, TII->get(regOpcH), t6);
+  else
+    MIB = BuildMI(newMBB, TII->get(immOpcH), t6);
+  MIB.addReg(tt2);
+  (*MIB).addOperand(*argOpers[5]);
+
+  // EAX/EDX receive the value the loop currently believes is in memory.
+  MIB = BuildMI(newMBB, TII->get(copyOpc), X86::EAX);
+  MIB.addReg(cur1);
+  MIB = BuildMI(newMBB, TII->get(copyOpc), X86::EDX);
+  MIB.addReg(cur2);
+
+  // EBX/ECX receive the freshly computed replacement value.
+  MIB = BuildMI(newMBB, TII->get(copyOpc), X86::EBX);
+  MIB.addReg(t5);
+  MIB = BuildMI(newMBB, TII->get(copyOpc), X86::ECX);
+  MIB.addReg(t6);
+  
+  MIB = BuildMI(newMBB, TII->get(X86::LCMPXCHG8B));
+  for (int i=0; i <= lastAddrIndx; ++i)
+    (*MIB).addOperand(*argOpers[i]);
+
+  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
+  (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
+
+  // Copy EAX/EDX out after the exchange; these feed the PHIs on a retry.
+  MIB = BuildMI(newMBB, TII->get(copyOpc), t3);
+  MIB.addReg(X86::EAX);
+  MIB = BuildMI(newMBB, TII->get(copyOpc), t4);
+  MIB.addReg(X86::EDX);
+  
+  // insert branch
+  BuildMI(newMBB, TII->get(X86::JNE)).addMBB(newMBB);
+
+  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
+  return nextMBB;
+}
+
  // private utility function
  MachineBasicBlock *
  X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
@@ -6633,6 +6885,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                 X86::NOT8r, X86::AL,
                                                 X86::GR8RegisterClass, true);
    // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
+  // This group is for 64-bit host.
    case X86::ATOMAND64:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
                                                 X86::AND64ri32, X86::MOV64rm, 
@@ -6665,6 +6918,40 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
      return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
    case X86::ATOMUMAX64:
      return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
+
+  // This group does 64-bit operations on a 32-bit host.
+  case X86::ATOMAND6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB, 
+                                               X86::AND32rr, X86::AND32rr,
+                                               X86::AND32ri, X86::AND32ri,
+                                               false);
+  case X86::ATOMOR6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB, 
+                                               X86::OR32rr, X86::OR32rr,
+                                               X86::OR32ri, X86::OR32ri,
+                                               false);
+  case X86::ATOMXOR6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB, 
+                                               X86::XOR32rr, X86::XOR32rr,
+                                               X86::XOR32ri, X86::XOR32ri,
+                                               false);
+  case X86::ATOMNAND6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB, 
+                                               X86::AND32rr, X86::AND32rr,
+                                               X86::AND32ri, X86::AND32ri,
+                                               true);
+  // FIXME carry
+  case X86::ATOMADD6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB, 
+                                               X86::ADD32rr, X86::ADC32rr,
+                                               X86::ADD32ri, X86::ADC32ri,
+                                               false);
+  // FIXME carry
+  case X86::ATOMSUB6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB, 
+                                               X86::SUB32rr, X86::SBB32rr,
+                                               X86::SUB32ri, X86::SBB32ri,
+                                               false);
    }
  }
  
@@ -6780,8 +7067,8 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
  
  /// PerformBuildVectorCombine - build_vector 0,(load i64 / f64) -> movq / movsd.
  static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
-                                           const X86Subtarget *Subtarget,
-                                           const TargetLowering &TLI) {
+                                         const X86Subtarget *Subtarget,
+                                         const TargetLowering &TLI) {
    unsigned NumOps = N->getNumOperands();
  
    // Ignore single operand BUILD_VECTOR.
@@ -6817,7 +7104,11 @@ static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
    if (LD->getExtensionType() != ISD::NON_EXTLOAD)
      return SDValue();
    
-  return DAG.getNode(X86ISD::VZEXT_LOAD, VT, LD->getChain(), LD->getBasePtr());
+  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+  SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
+  SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, Tys, Ops, 2);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Base, 1), ResNode.getValue(1));
+  return ResNode;
  }                                           
  
  /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
@@ -7078,6 +7369,7 @@ LowerXConstraint(MVT ConstraintVT) const {
  /// vector.  If it is invalid, don't add anything to Ops.
  void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                       char Constraint,
+                                                     bool hasMemory,
                                                       std::vector<SDValue>&Ops,
                                                       SelectionDAG &DAG) const {
    SDValue Result(0, 0);
@@ -7092,6 +7384,14 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
        }
      }
      return;
+  case 'J':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() <= 63) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+        break;
+      }
+    }
+    return;
    case 'N':
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
        if (C->getZExtValue() <= 255) {
@@ -7131,14 +7431,11 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
      }
      
      if (GA) {
-      // If addressing this global requires a load (e.g. in PIC mode), we can't
-      // match.
-      if (Subtarget->GVRequiresExtraLoad(GA->getGlobal(), getTargetMachine(),
-                                         false))
-        return;
-
-      Op = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
-                                      Offset);
+      if (hasMemory) 
+        Op = LowerGlobalAddress(GA->getGlobal(), DAG);
+      else
+        Op = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
+                                        Offset);
        Result = Op;
        break;
      }
@@ -7152,7 +7449,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
      Ops.push_back(Result);
      return;
    }
-  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
+                                                      Ops, DAG);
  }
  
  std::vector<unsigned> X86TargetLowering::