Custom lower the memory barrier instructions and add support

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index d5385f814c54aa0286a05c8568f56be451b9c456..634c08e54de8b820fbd6d2fdc8dd7530e6d435ce 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -62,21 +62,19 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                         SDValue V2);
  
  static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
-  switch (TM.getSubtarget<X86Subtarget>().TargetType) {
-  default: llvm_unreachable("unknown subtarget type");
-  case X86Subtarget::isDarwin:
-    if (TM.getSubtarget<X86Subtarget>().is64Bit())
-      return new X8664_MachoTargetObjectFile();
+  
+  bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+  
+  if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) {
+    if (is64Bit) return new X8664_MachoTargetObjectFile();
      return new TargetLoweringObjectFileMachO();
-  case X86Subtarget::isELF:
-   if (TM.getSubtarget<X86Subtarget>().is64Bit())
-     return new X8664_ELFTargetObjectFile(TM);
+  } else if (TM.getSubtarget<X86Subtarget>().isTargetELF() ){
+    if (is64Bit) return new X8664_ELFTargetObjectFile(TM);
      return new X8632_ELFTargetObjectFile(TM);
-  case X86Subtarget::isMingw:
-  case X86Subtarget::isCygwin:
-  case X86Subtarget::isWindows:
+  } else if (TM.getSubtarget<X86Subtarget>().isTargetCOFF()) {
      return new TargetLoweringObjectFileCOFF();
-  }
+  }  
+  llvm_unreachable("unknown subtarget type");
  }
  
  X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
@@ -345,8 +343,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    if (Subtarget->hasSSE1())
      setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
  
-  if (!Subtarget->hasSSE2())
-    setOperationAction(ISD::MEMBARRIER    , MVT::Other, Expand);
+  // We may not have a libcall for MEMBARRIER so we should lower this.
+  setOperationAction(ISD::MEMBARRIER    , MVT::Other, Custom);
+  
    // On X86 and X86-64, atomic operations are lowered to locked instructions.
    // Locked instructions, in turn, have implicit fence semantics (all memory
    // operations are flushed before issuing the locked instruction, and they
@@ -617,7 +616,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass, false);
      addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false);
      addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false);
-    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass, false);
+    
      addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false);
  
      setOperationAction(ISD::ADD,                MVT::v8i8,  Legal);
@@ -663,14 +662,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      AddPromotedToType (ISD::LOAD,               MVT::v4i16, MVT::v1i64);
      setOperationAction(ISD::LOAD,               MVT::v2i32, Promote);
      AddPromotedToType (ISD::LOAD,               MVT::v2i32, MVT::v1i64);
-    setOperationAction(ISD::LOAD,               MVT::v2f32, Promote);
-    AddPromotedToType (ISD::LOAD,               MVT::v2f32, MVT::v1i64);
      setOperationAction(ISD::LOAD,               MVT::v1i64, Legal);
  
      setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i8,  Custom);
      setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i16, Custom);
      setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i32, Custom);
-    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f32, Custom);
      setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i64, Custom);
  
      setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i8,  Custom);
@@ -678,7 +674,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i32, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v1i64, Custom);
  
-    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2f32, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Custom);
@@ -697,7 +692,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
        setOperationAction(ISD::BIT_CONVERT,        MVT::v8i8,  Custom);
        setOperationAction(ISD::BIT_CONVERT,        MVT::v4i16, Custom);
        setOperationAction(ISD::BIT_CONVERT,        MVT::v2i32, Custom);
-      setOperationAction(ISD::BIT_CONVERT,        MVT::v2f32, Custom);
        setOperationAction(ISD::BIT_CONVERT,        MVT::v1i64, Custom);
      }
    }
@@ -798,9 +792,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
        EVT VT = SVT;
  
        // Do not attempt to promote non-128-bit vectors
-      if (!VT.is128BitVector()) {
+      if (!VT.is128BitVector())
          continue;
-      }
        
        setOperationAction(ISD::AND,    SVT, Promote);
        AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
@@ -1197,6 +1190,27 @@ unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
    return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
  }
  
+bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
+                                               unsigned &Offset) const {
+  if (!Subtarget->isTargetLinux())
+    return false;
+
+  if (Subtarget->is64Bit()) {
+    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
+    Offset = 0x28;
+    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
+      AddressSpace = 256;
+    else
+      AddressSpace = 257;
+  } else {
+    // %gs:0x14 on i386
+    Offset = 0x14;
+    AddressSpace = 256;
+  }
+  return true;
+}
+
+
  //===----------------------------------------------------------------------===//
  //               Return Value Calling Convention Implementation
  //===----------------------------------------------------------------------===//
@@ -1205,19 +1219,19 @@ unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  
  bool 
  X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
-                        const SmallVectorImpl<EVT> &OutTys,
-                        const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags,
-                        SelectionDAG &DAG) const {
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        LLVMContext &Context) const {
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
-                 RVLocs, *DAG.getContext());
-  return CCInfo.CheckReturn(OutTys, ArgsFlags, RetCC_X86);
+                 RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_X86);
  }
  
  SDValue
  X86TargetLowering::LowerReturn(SDValue Chain,
                                 CallingConv::ID CallConv, bool isVarArg,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
                                 DebugLoc dl, SelectionDAG &DAG) const {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
@@ -1245,7 +1259,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
    for (unsigned i = 0; i != RVLocs.size(); ++i) {
      CCValAssign &VA = RVLocs[i];
      assert(VA.isRegLoc() && "Can only return in registers!");
-    SDValue ValToCopy = Outs[i].Val;
+    SDValue ValToCopy = OutVals[i];
  
      // Returns in ST0/ST1 are handled specially: these are pushed as operands to
      // the RET instruction and handled by the FP Stackifier.
@@ -1267,7 +1281,8 @@ X86TargetLowering::LowerReturn(SDValue Chain,
        if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
          ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
          if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
-          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy);
+          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+                                  ValToCopy);
        }
      }
  
@@ -1333,17 +1348,34 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
        report_fatal_error("SSE register return with SSE disabled");
      }
  
+    SDValue Val;
+
      // If this is a call to a function that returns an fp value on the floating
-    // point stack, but where we prefer to use the value in xmm registers, copy
-    // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
-    if ((VA.getLocReg() == X86::ST0 ||
-         VA.getLocReg() == X86::ST1) &&
-        isScalarFPTypeInSSEReg(VA.getValVT())) {
-      CopyVT = MVT::f80;
-    }
+    // point stack, we must guarantee the value is popped from the stack, so
+    // a CopyFromReg is not good enough - the copy instruction may be eliminated
+    // if the return value is not used. We use the FpGET_ST0 instructions
+    // instead.
+    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
+      // If we prefer to use the value in xmm registers, copy it out as f80 and
+      // use a truncate to move it from fp stack reg to xmm reg.
+      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
+      bool isST0 = VA.getLocReg() == X86::ST0;
+      unsigned Opc = 0;
+      if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32;
+      if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64;
+      if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80:X86::FpGET_ST1_80;
+      SDValue Ops[] = { Chain, InFlag };
+      Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Flag,
+                                         Ops, 2), 1);
+      Val = Chain.getValue(0);
  
-    SDValue Val;
-    if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
+      // Round the f80 to the right size, which also moves it to the appropriate
+      // xmm register.
+      if (CopyVT != VA.getValVT())
+        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+                          // This truncation won't change the value.
+                          DAG.getIntPtrConstant(1));
+    } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
        // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
@@ -1363,15 +1395,6 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
        Val = Chain.getValue(0);
      }
      InFlag = Chain.getValue(2);
-
-    if (CopyVT != VA.getValVT()) {
-      // Round the F80 the right size, which also moves to the appropriate xmm
-      // register.
-      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
-                        // This truncation won't change the value.
-                        DAG.getIntPtrConstant(1));
-    }
-
      InVals.push_back(Val);
    }
  
@@ -1485,11 +1508,11 @@ X86TargetLowering::LowerMemArgument(SDValue Chain,
    // could be overwritten by lowering of arguments in case of a tail call.
    if (Flags.isByVal()) {
      int FI = MFI->CreateFixedObject(Flags.getByValSize(),
-                                    VA.getLocMemOffset(), isImmutable, false);
+                                    VA.getLocMemOffset(), isImmutable);
      return DAG.getFrameIndex(FI, getPointerTy());
    } else {
      int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
-                                    VA.getLocMemOffset(), isImmutable, false);
+                                    VA.getLocMemOffset(), isImmutable);
      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
      return DAG.getLoad(ValVT, dl, Chain, FIN,
                         PseudoSourceValue::getFixedStack(FI), 0,
@@ -1617,8 +1640,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
    if (isVarArg) {
      if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
                      CallConv != CallingConv::X86_ThisCall)) {
-      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,
-                                                            true, false));
+      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
      }
      if (Is64Bit) {
        unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
@@ -1790,7 +1812,7 @@ EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
    // Calculate the new stack slot for the return address.
    int SlotSize = Is64Bit ? 8 : 4;
    int NewReturnAddrFI =
-    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false, false);
+    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
    EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
    SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
    Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
@@ -1804,6 +1826,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                               CallingConv::ID CallConv, bool isVarArg,
                               bool &isTailCall,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                             const SmallVectorImpl<SDValue> &OutVals,
                               const SmallVectorImpl<ISD::InputArg> &Ins,
                               DebugLoc dl, SelectionDAG &DAG,
                               SmallVectorImpl<SDValue> &InVals) const {
@@ -1816,7 +1839,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
      // Check if it's really possible to do a tail call.
      isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                      isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
-                                                   Outs, Ins, DAG);
+                                                   Outs, OutVals, Ins, DAG);
  
      // Sibcalls are automatically detected tailcalls which do not require
      // ABI changes.
@@ -1876,7 +1899,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      CCValAssign &VA = ArgLocs[i];
      EVT RegVT = VA.getLocVT();
-    SDValue Arg = Outs[i].Val;
+    SDValue Arg = OutVals[i];
      ISD::ArgFlagsTy Flags = Outs[i].Flags;
      bool isByVal = Flags.isByVal();
  
@@ -1969,7 +1992,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
      }
    }
  
-  if (Is64Bit && isVarArg) {
+  if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) {
      // From AMD64 ABI document:
      // For calls that may call functions that use varargs or stdargs
      // (prototype-less calls or calls to functions containing ellipsis (...) in
@@ -1978,7 +2001,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
      // the number of registers, but must be an ubound on the number of SSE
      // registers used and is in the range 0 - 8 inclusive.
  
-    // FIXME: Verify this on Win64
      // Count the number of XMM registers allocated.
      static const unsigned XMMArgRegs[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
@@ -2015,12 +2037,12 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
          if (VA.isRegLoc())
            continue;
          assert(VA.isMemLoc());
-        SDValue Arg = Outs[i].Val;
+        SDValue Arg = OutVals[i];
          ISD::ArgFlagsTy Flags = Outs[i].Flags;
          // Create frame index.
          int32_t Offset = VA.getLocMemOffset()+FPDiff;
          uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
-        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true, false);
+        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
          FIN = DAG.getFrameIndex(FI, getPointerTy());
  
          if (Flags.isByVal()) {
@@ -2061,7 +2083,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                                       FPDiff, dl);
    }
  
-  bool WasGlobalOrExternal = false;
    if (getTargetMachine().getCodeModel() == CodeModel::Large) {
      assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
      // In the 64-bit large code model, we have to make all calls
@@ -2069,7 +2090,6 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
      // pc-relative offset may not be large enough to hold the whole
      // address.
    } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
-    WasGlobalOrExternal = true;
      // If the callee is a GlobalAddress node (quite common, every direct call
      // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
      // it.
@@ -2097,11 +2117,10 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
          OpFlags = X86II::MO_DARWIN_STUB;
        }
  
-      Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(),
+      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
                                            G->getOffset(), OpFlags);
      }
    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
-    WasGlobalOrExternal = true;
      unsigned char OpFlags = 0;
  
      // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
@@ -2311,6 +2330,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                       bool isCalleeStructRet,
                                                       bool isCallerStructRet,
                                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                                       SelectionDAG& DAG) const {
    if (!IsTailCallConvention(CalleeCC) &&
@@ -2337,8 +2357,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
    if (RegInfo->needsStackRealignment(MF))
      return false;
  
-  // Do not sibcall optimize vararg calls unless the call site is not passing any
-  // arguments.
+  // Do not sibcall optimize vararg calls unless the call site is not passing
+  // any arguments.
    if (isVarArg && !Outs.empty())
      return false;
  
@@ -2424,8 +2444,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
          ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
        for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
          CCValAssign &VA = ArgLocs[i];
-        EVT RegVT = VA.getLocVT();
-        SDValue Arg = Outs[i].Val;
+        SDValue Arg = OutVals[i];
          ISD::ArgFlagsTy Flags = Outs[i].Flags;
          if (VA.getLocInfo() == CCValAssign::Indirect)
            return false;
@@ -2440,17 +2459,23 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
      // If the tailcall address may be in a register, then make sure it's
      // possible to register allocate for it. In 32-bit, the call address can
      // only target EAX, EDX, or ECX since the tail call must be scheduled after
-    // callee-saved registers are restored. In 64-bit, it's RAX, RCX, RDX, RSI,
-    // RDI, R8, R9, R11.
-    if (!isa<GlobalAddressSDNode>(Callee) &&
+    // callee-saved registers are restored. These happen to be the same
+    // registers used to pass 'inreg' arguments so watch out for those.
+    if (!Subtarget->is64Bit() &&
+        !isa<GlobalAddressSDNode>(Callee) &&
          !isa<ExternalSymbolSDNode>(Callee)) {
-      unsigned Limit = Subtarget->is64Bit() ? 8 : 3;
        unsigned NumInRegs = 0;
        for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
          CCValAssign &VA = ArgLocs[i];
-        if (VA.isRegLoc()) {
-          if (++NumInRegs == Limit)
+        if (!VA.isRegLoc())
+          continue;
+        unsigned Reg = VA.getLocReg();
+        switch (Reg) {
+        default: break;
+        case X86::EAX: case X86::EDX: case X86::ECX:
+          if (++NumInRegs == 3)
              return false;
+          break;
          }
        }
      }
@@ -2460,20 +2485,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
  }
  
  FastISel *
-X86TargetLowering::createFastISel(MachineFunction &mf,
-                            DenseMap<const Value *, unsigned> &vm,
-                            DenseMap<const BasicBlock*, MachineBasicBlock*> &bm,
-                            DenseMap<const AllocaInst *, int> &am,
-                            std::vector<std::pair<MachineInstr*, unsigned> > &pn
-#ifndef NDEBUG
-                          , SmallSet<const Instruction *, 8> &cil
-#endif
-                                  ) const {
-  return X86::createFastISel(mf, vm, bm, am, pn
-#ifndef NDEBUG
-                             , cil
-#endif
-                             );
+X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
+  return X86::createFastISel(funcInfo);
  }
  
  
@@ -2491,7 +2504,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
      // Set up a frame object for the return address.
      uint64_t SlotSize = TD->getPointerSize();
      ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
-                                                           false, false);
+                                                           false);
      FuncInfo->setRAIndex(ReturnAddrIndex);
    }
  
@@ -4448,7 +4461,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
  }
  
  /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
-/// ones, or rewriting v4i32 / v2f32 as 2 wide ones if possible. This can be
+/// ones, or rewriting v4i32 / v2i32 as 2 wide ones if possible. This can be
  /// done when every pair / quad of shuffle mask elements point to elements in
  /// the right sequence. e.g.
  /// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
@@ -4462,7 +4475,6 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
    unsigned NumElems = VT.getVectorNumElements();
    unsigned NewWidth = (NumElems == 4) ? 2 : 4;
    EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
-  EVT MaskEltVT = MaskVT.getVectorElementType();
    EVT NewVT = MaskVT;
    switch (VT.getSimpleVT().SimpleTy) {
    default: assert(false && "Unexpected!");
@@ -5074,13 +5086,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
  SDValue
  X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
    DebugLoc dl = Op.getDebugLoc();
-  if (Op.getValueType() == MVT::v2f32)
-    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32,
-                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32,
-                                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
-                                               Op.getOperand(0))));
-
-  if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64)
+  
+  if (Op.getValueType() == MVT::v1i64 &&
+      Op.getOperand(0).getValueType() == MVT::i64)
      return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
  
    SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
@@ -5245,10 +5253,10 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
    if (OpFlags == X86II::MO_NO_FLAG &&
        X86::isOffsetSuitableForCodeModel(Offset, M)) {
      // A direct static reference to a global.
-    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
+    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
      Offset = 0;
    } else {
-    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags);
+    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
    }
  
    if (Subtarget->isPICStyleRIPRel() &&
@@ -5293,7 +5301,7 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
    DebugLoc dl = GA->getDebugLoc();
-  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
                                             GA->getOffset(),
                                             OperandFlags);
@@ -5366,7 +5374,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
  
    // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
    // exec)
-  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 
+                                           GA->getValueType(0),
                                             GA->getOffset(), OperandFlags);
    SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
  
@@ -5423,12 +5432,10 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
        OpFlag = X86II::MO_TLVP_PIC_BASE;
      else
        OpFlag = X86II::MO_TLVP;
-    
-    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), 
+    DebugLoc DL = Op.getDebugLoc();    
+    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                  getPointerTy(),
                                                  GA->getOffset(), OpFlag);
-    
-    DebugLoc DL = Op.getDebugLoc();
      SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
    
      // With PIC32, the address is actually $g + Offset.
@@ -5777,7 +5784,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
  
    // Load the value out, extending it from f32 to f80.
    // FIXME: Avoid the extend by constructing the right constant pool?
-  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
+  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(),
                                   FudgePtr, PseudoSourceValue::getConstantPool(),
                                   0, MVT::f32, false, false, 4);
    // Extend everything to 80 bits to force it to be done on x87.
@@ -6026,6 +6033,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
    bool NeedCF = false;
    bool NeedOF = false;
    switch (X86CC) {
+  default: break;
    case X86::COND_A: case X86::COND_AE:
    case X86::COND_B: case X86::COND_BE:
      NeedCF = true;
@@ -6035,120 +6043,129 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
    case X86::COND_O: case X86::COND_NO:
      NeedOF = true;
      break;
-  default: break;
    }
  
    // See if we can use the EFLAGS value from the operand instead of
    // doing a separate TEST. TEST always sets OF and CF to 0, so unless
    // we prove that the arithmetic won't overflow, we can't use OF or CF.
-  if (Op.getResNo() == 0 && !NeedOF && !NeedCF) {
-    unsigned Opcode = 0;
-    unsigned NumOperands = 0;
-    switch (Op.getNode()->getOpcode()) {
-    case ISD::ADD:
-      // Due to an isel shortcoming, be conservative if this add is
-      // likely to be selected as part of a load-modify-store
-      // instruction. When the root node in a match is a store, isel
-      // doesn't know how to remap non-chain non-flag uses of other
-      // nodes in the match, such as the ADD in this case. This leads
-      // to the ADD being left around and reselected, with the result
-      // being two adds in the output.  Alas, even if none our users
-      // are stores, that doesn't prove we're O.K.  Ergo, if we have
-      // any parents that aren't CopyToReg or SETCC, eschew INC/DEC.
-      // A better fix seems to require climbing the DAG back to the
-      // root, and it doesn't seem to be worth the effort.
-      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
-             UE = Op.getNode()->use_end(); UI != UE; ++UI)
-        if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
-          goto default_case;
-      if (ConstantSDNode *C =
-            dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
-        // An add of one will be selected as an INC.
-        if (C->getAPIntValue() == 1) {
-          Opcode = X86ISD::INC;
-          NumOperands = 1;
-          break;
-        }
-        // An add of negative one (subtract of one) will be selected as a DEC.
-        if (C->getAPIntValue().isAllOnesValue()) {
-          Opcode = X86ISD::DEC;
-          NumOperands = 1;
-          break;
-        }
+  if (Op.getResNo() != 0 || NeedOF || NeedCF)
+    // Emit a CMP with 0, which is the TEST pattern.
+    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+                       DAG.getConstant(0, Op.getValueType()));
+
+  unsigned Opcode = 0;
+  unsigned NumOperands = 0;
+  switch (Op.getNode()->getOpcode()) {
+  case ISD::ADD:
+    // Due to an isel shortcoming, be conservative if this add is likely to be
+    // selected as part of a load-modify-store instruction. When the root node
+    // in a match is a store, isel doesn't know how to remap non-chain non-flag
+    // uses of other nodes in the match, such as the ADD in this case. This
+    // leads to the ADD being left around and reselected, with the result being
+    // two adds in the output.  Alas, even if none of our users are stores, that
+    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
+    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
+    // climbing the DAG back to the root, and it doesn't seem to be worth the
+    // effort.
+    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+           UE = Op.getNode()->use_end(); UI != UE; ++UI)
+      if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
+        goto default_case;
+
+    if (ConstantSDNode *C =
+        dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
+      // An add of one will be selected as an INC.
+      if (C->getAPIntValue() == 1) {
+        Opcode = X86ISD::INC;
+        NumOperands = 1;
+        break;
        }
-      // Otherwise use a regular EFLAGS-setting add.
-      Opcode = X86ISD::ADD;
-      NumOperands = 2;
-      break;
-    case ISD::AND: {
-      // If the primary and result isn't used, don't bother using X86ISD::AND,
-      // because a TEST instruction will be better.
-      bool NonFlagUse = false;
-      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
-             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
-        SDNode *User = *UI;
-        unsigned UOpNo = UI.getOperandNo();
-        if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
-          // Look pass truncate.
-          UOpNo = User->use_begin().getOperandNo();
-          User = *User->use_begin();
-        }
-        if (User->getOpcode() != ISD::BRCOND &&
-            User->getOpcode() != ISD::SETCC &&
-            (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
-          NonFlagUse = true;
-          break;
-        }
+
+      // An add of negative one (subtract of one) will be selected as a DEC.
+      if (C->getAPIntValue().isAllOnesValue()) {
+        Opcode = X86ISD::DEC;
+        NumOperands = 1;
+        break;
+      }
+    }
+
+    // Otherwise use a regular EFLAGS-setting add.
+    Opcode = X86ISD::ADD;
+    NumOperands = 2;
+    break;
+  case ISD::AND: {
+    // If the primary result of the AND isn't used, don't bother using
+    // X86ISD::AND, because a TEST instruction will be better.
+    bool NonFlagUse = false;
+    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
+      SDNode *User = *UI;
+      unsigned UOpNo = UI.getOperandNo();
+      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
+        // Look past the truncate.
+        UOpNo = User->use_begin().getOperandNo();
+        User = *User->use_begin();
        }
-      if (!NonFlagUse)
+
+      if (User->getOpcode() != ISD::BRCOND &&
+          User->getOpcode() != ISD::SETCC &&
+          (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
+        NonFlagUse = true;
          break;
+      }
      }
+
+    if (!NonFlagUse)
+      break;
+  }
      // FALL THROUGH
-    case ISD::SUB:
-    case ISD::OR:
-    case ISD::XOR:
-      // Due to the ISEL shortcoming noted above, be conservative if this op is
-      // likely to be selected as part of a load-modify-store instruction.
-      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+  case ISD::SUB:
+  case ISD::OR:
+  case ISD::XOR:
+    // Due to the ISEL shortcoming noted above, be conservative if this op is
+    // likely to be selected as part of a load-modify-store instruction.
+    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
             UE = Op.getNode()->use_end(); UI != UE; ++UI)
-        if (UI->getOpcode() == ISD::STORE)
-          goto default_case;
-      // Otherwise use a regular EFLAGS-setting instruction.
-      switch (Op.getNode()->getOpcode()) {
-      case ISD::SUB: Opcode = X86ISD::SUB; break;
-      case ISD::OR:  Opcode = X86ISD::OR;  break;
-      case ISD::XOR: Opcode = X86ISD::XOR; break;
-      case ISD::AND: Opcode = X86ISD::AND; break;
-      default: llvm_unreachable("unexpected operator!");
-      }
-      NumOperands = 2;
-      break;
-    case X86ISD::ADD:
-    case X86ISD::SUB:
-    case X86ISD::INC:
-    case X86ISD::DEC:
-    case X86ISD::OR:
-    case X86ISD::XOR:
-    case X86ISD::AND:
-      return SDValue(Op.getNode(), 1);
-    default:
-    default_case:
-      break;
-    }
-    if (Opcode != 0) {
-      SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
-      SmallVector<SDValue, 4> Ops;
-      for (unsigned i = 0; i != NumOperands; ++i)
-        Ops.push_back(Op.getOperand(i));
-      SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
-      DAG.ReplaceAllUsesWith(Op, New);
-      return SDValue(New.getNode(), 1);
+      if (UI->getOpcode() == ISD::STORE)
+        goto default_case;
+
+    // Otherwise use a regular EFLAGS-setting instruction.
+    switch (Op.getNode()->getOpcode()) {
+    default: llvm_unreachable("unexpected operator!");
+    case ISD::SUB: Opcode = X86ISD::SUB; break;
+    case ISD::OR:  Opcode = X86ISD::OR;  break;
+    case ISD::XOR: Opcode = X86ISD::XOR; break;
+    case ISD::AND: Opcode = X86ISD::AND; break;
      }
+
+    NumOperands = 2;
+    break;
+  case X86ISD::ADD:
+  case X86ISD::SUB:
+  case X86ISD::INC:
+  case X86ISD::DEC:
+  case X86ISD::OR:
+  case X86ISD::XOR:
+  case X86ISD::AND:
+    return SDValue(Op.getNode(), 1);
+  default:
+  default_case:
+    break;
    }
  
-  // Otherwise just emit a CMP with 0, which is the TEST pattern.
-  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
-                     DAG.getConstant(0, Op.getValueType()));
+  if (Opcode == 0)
+    // Emit a CMP with 0, which is the TEST pattern.
+    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+                       DAG.getConstant(0, Op.getValueType()));
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+  SmallVector<SDValue, 4> Ops;
+  for (unsigned i = 0; i != NumOperands; ++i)
+    Ops.push_back(Op.getOperand(i));
+
+  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
+  DAG.ReplaceAllUsesWith(Op, New);
+  return SDValue(New.getNode(), 1);
  }
  
  /// Emit nodes that will be selected as "cmp Op0,Op1", or something
@@ -6701,7 +6718,6 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
  
    SDValue Flag;
  
-  EVT IntPtr = getPointerTy();
    EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
  
    Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
@@ -6754,7 +6770,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
    Store = DAG.getStore(Op.getOperand(0), dl,
                         DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
                                         MVT::i32),
-                       FIN, SV, 0, false, false, 0);
+                       FIN, SV, 4, false, false, 0);
    MemOps.push_back(Store);
  
    // Store ptr to overflow_arg_area
@@ -6762,7 +6778,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
                      FIN, DAG.getIntPtrConstant(4));
    SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
                                      getPointerTy());
-  Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0,
+  Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 8,
                         false, false, 0);
    MemOps.push_back(Store);
  
@@ -6771,7 +6787,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
                      FIN, DAG.getIntPtrConstant(8));
    SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                      getPointerTy());
-  Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0,
+  Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 16,
                         false, false, 0);
    MemOps.push_back(Store);
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
@@ -6781,9 +6797,6 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
    // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
    assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
-  SDValue Chain = Op.getOperand(0);
-  SDValue SrcPtr = Op.getOperand(1);
-  SDValue SrcSV = Op.getOperand(2);
  
    report_fatal_error("VAArgInst is not yet implemented for x86-64!");
    return SDValue();
@@ -7206,7 +7219,8 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
              InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
  
          if (InRegCount > 2) {
-          report_fatal_error("Nest register in use - reduce number of inreg parameters!");
+          report_fatal_error("Nest register in use - reduce number of inreg"
+                             " parameters!");
          }
        }
        break;
@@ -7496,6 +7510,36 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
    return Sum;
  }
  
+/// Custom lowering for ISD::MEMBARRIER.
+///
+/// Operand 0 of \p Op is forwarded as the first operand of whichever node is
+/// produced; operands 1-5 are read as constants and steer the choice:
+///   - no SSE2:          X86ISD::MEMBARRIER carrying an extra i32 zero operand
+///   - operand 5 == 0:   X86ISD::MEMBARRIER
+///   - operand 5 != 0:   X86ISD::SFENCE, X86ISD::LFENCE or X86ISD::MFENCE,
+///                       picked from the pattern of operands 1-4.
+SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Chain = Op.getOperand(0);
+
+  // Targets lacking SSE2 get the generic barrier node with a zero i32 operand.
+  if (!Subtarget->hasSSE2())
+    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Chain,
+                       DAG.getConstant(0, MVT::i32));
+
+  // Operand 5 decides whether an explicit fence node is required.
+  // NOTE(review): presumably this is the "device" flag of the barrier -
+  // confirm against the intrinsic definition.
+  const bool NeedsFence =
+    cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue() != 0;
+  if (!NeedsFence)
+    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Chain);
+
+  // The four ordering flags, in operand order.
+  const bool Flag1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue() != 0;
+  const bool Flag2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
+  const bool Flag3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue() != 0;
+  const bool Flag4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue() != 0;
+
+  // Default, mirroring:
+  // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)),
+  //           (MFENCE)>;
+  unsigned FenceOpc = X86ISD::MFENCE;
+
+  if (!Flag2 && !Flag3) {
+    // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
+    if (!Flag1 && Flag4)
+      FenceOpc = X86ISD::SFENCE;
+    // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
+    else if (Flag1 && !Flag4)
+      FenceOpc = X86ISD::LFENCE;
+  }
+
+  return DAG.getNode(FenceOpc, dl, MVT::Other, Chain);
+}
+
  SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
    EVT T = Op.getValueType();
    DebugLoc dl = Op.getDebugLoc();
@@ -7585,6 +7629,7 @@ SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
  SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    switch (Op.getOpcode()) {
    default: llvm_unreachable("Should not custom lower this!");
+  case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op,DAG);
    case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
    case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
    case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
@@ -7987,7 +8032,6 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
                                                         unsigned immOpc,
                                                         unsigned LoadOpc,
                                                         unsigned CXchgOpc,
-                                                       unsigned copyOpc,
                                                         unsigned notOpc,
                                                         unsigned EAXreg,
                                                         TargetRegisterClass *RC,
@@ -8014,8 +8058,11 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
    F->insert(MBBIter, newMBB);
    F->insert(MBBIter, nextMBB);
  
-  // Move all successors to thisMBB to nextMBB
-  nextMBB->transferSuccessors(thisMBB);
+  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
+  nextMBB->splice(nextMBB->begin(), thisMBB,
+                  llvm::next(MachineBasicBlock::iterator(bInstr)),
+                  thisMBB->end());
+  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
  
    // Update thisMBB to fall through to newMBB
    thisMBB->addSuccessor(newMBB);
@@ -8025,17 +8072,17 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
    newMBB->addSuccessor(newMBB);
  
    // Insert instructions into newMBB based on incoming instruction
-  assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
+  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
           "unexpected number of operands");
    DebugLoc dl = bInstr->getDebugLoc();
    MachineOperand& destOper = bInstr->getOperand(0);
-  MachineOperand* argOpers[2 + X86AddrNumOperands];
+  MachineOperand* argOpers[2 + X86::AddrNumOperands];
    int numArgs = bInstr->getNumOperands() - 1;
    for (int i=0; i < numArgs; ++i)
      argOpers[i] = &bInstr->getOperand(i+1);
  
    // x86 address has 4 operands: base, index, scale, and displacement
-  int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
+  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
    int valArgIndx = lastAddrIndx + 1;
  
    unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
@@ -8061,7 +8108,7 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
    MIB.addReg(tt);
    (*MIB).addOperand(*argOpers[valArgIndx]);
  
-  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
+  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
    MIB.addReg(t1);
  
    MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
@@ -8072,13 +8119,13 @@ X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
    (*MIB).setMemRefs(bInstr->memoperands_begin(),
                      bInstr->memoperands_end());
  
-  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
+  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
    MIB.addReg(EAXreg);
  
    // insert branch
    BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
  
-  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
+  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
    return nextMBB;
  }
  
@@ -8108,7 +8155,6 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
  
    const TargetRegisterClass *RC = X86::GR32RegisterClass;
    const unsigned LoadOpc = X86::MOV32rm;
-  const unsigned copyOpc = X86::MOV32rr;
    const unsigned NotOpc = X86::NOT32r;
    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
@@ -8123,8 +8169,11 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
    F->insert(MBBIter, newMBB);
    F->insert(MBBIter, nextMBB);
  
-  // Move all successors to thisMBB to nextMBB
-  nextMBB->transferSuccessors(thisMBB);
+  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
+  nextMBB->splice(nextMBB->begin(), thisMBB,
+                  llvm::next(MachineBasicBlock::iterator(bInstr)),
+                  thisMBB->end());
+  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
  
    // Update thisMBB to fall through to newMBB
    thisMBB->addSuccessor(newMBB);
@@ -8136,12 +8185,12 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
    DebugLoc dl = bInstr->getDebugLoc();
    // Insert instructions into newMBB based on incoming instruction
    // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
-  assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 &&
+  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
           "unexpected number of operands");
    MachineOperand& dest1Oper = bInstr->getOperand(0);
    MachineOperand& dest2Oper = bInstr->getOperand(1);
-  MachineOperand* argOpers[2 + X86AddrNumOperands];
-  for (int i=0; i < 2 + X86AddrNumOperands; ++i) {
+  MachineOperand* argOpers[2 + X86::AddrNumOperands];
+  for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
      argOpers[i] = &bInstr->getOperand(i+2);
  
      // We use some of the operands multiple times, so conservatively just
@@ -8151,7 +8200,7 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
    }
  
    // x86 address has 5 operands: base, index, scale, displacement, and segment.
-  int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
+  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
  
    unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
    MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
@@ -8215,14 +8264,14 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
      MIB.addReg(t2);
    (*MIB).addOperand(*argOpers[valArgIndx + 1]);
  
-  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX);
+  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
    MIB.addReg(t1);
-  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX);
+  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
    MIB.addReg(t2);
  
-  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX);
+  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
    MIB.addReg(t5);
-  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX);
+  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
    MIB.addReg(t6);
  
    MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
@@ -8233,15 +8282,15 @@ X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
    (*MIB).setMemRefs(bInstr->memoperands_begin(),
                      bInstr->memoperands_end());
  
-  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
+  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
    MIB.addReg(X86::EAX);
-  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
+  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
    MIB.addReg(X86::EDX);
  
    // insert branch
    BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
  
-  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
+  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
    return nextMBB;
  }
  
@@ -8275,8 +8324,11 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
    F->insert(MBBIter, newMBB);
    F->insert(MBBIter, nextMBB);
  
-  // Move all successors of thisMBB to nextMBB
-  nextMBB->transferSuccessors(thisMBB);
+  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
+  nextMBB->splice(nextMBB->begin(), thisMBB,
+                  llvm::next(MachineBasicBlock::iterator(mInstr)),
+                  thisMBB->end());
+  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
  
    // Update thisMBB to fall through to newMBB
    thisMBB->addSuccessor(newMBB);
@@ -8287,16 +8339,16 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
  
    DebugLoc dl = mInstr->getDebugLoc();
    // Insert instructions into newMBB based on incoming instruction
-  assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
+  assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
           "unexpected number of operands");
    MachineOperand& destOper = mInstr->getOperand(0);
-  MachineOperand* argOpers[2 + X86AddrNumOperands];
+  MachineOperand* argOpers[2 + X86::AddrNumOperands];
    int numArgs = mInstr->getNumOperands() - 1;
    for (int i=0; i < numArgs; ++i)
      argOpers[i] = &mInstr->getOperand(i+1);
  
    // x86 address has 4 operands: base, index, scale, and displacement
-  int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
+  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
    int valArgIndx = lastAddrIndx + 1;
  
    unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
@@ -8311,12 +8363,12 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
  
    unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
    if (argOpers[valArgIndx]->isReg())
-    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
+    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
    else
      MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
    (*MIB).addOperand(*argOpers[valArgIndx]);
  
-  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
+  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
    MIB.addReg(t1);
  
    MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
@@ -8338,13 +8390,13 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
    (*MIB).setMemRefs(mInstr->memoperands_begin(),
                      mInstr->memoperands_end());
  
-  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg());
+  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
    MIB.addReg(X86::EAX);
  
    // insert branch
    BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
  
-  F->DeleteMachineInstr(mInstr);   // The pseudo instruction is gone now.
+  mInstr->eraseFromParent();   // The pseudo instruction is gone now.
    return nextMBB;
  }
  
@@ -8354,7 +8406,6 @@ MachineBasicBlock *
  X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
                              unsigned numArgs, bool memArg) const {
  
-  MachineFunction *F = BB->getParent();
    DebugLoc dl = MI->getDebugLoc();
    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  
@@ -8376,7 +8427,7 @@ X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
    BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
      .addReg(X86::XMM0);
  
-  F->DeleteMachineInstr(MI);
+  MI->eraseFromParent();
  
    return BB;
  }
@@ -8405,9 +8456,12 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
    F->insert(MBBIter, XMMSaveMBB);
    F->insert(MBBIter, EndMBB);
  
-  // Set up the CFG.
-  // Move any original successors of MBB to the end block.
-  EndMBB->transferSuccessors(MBB);
+  // Transfer the remainder of MBB and its successor edges to EndMBB.
+  EndMBB->splice(EndMBB->begin(), MBB,
+                 llvm::next(MachineBasicBlock::iterator(MI)),
+                 MBB->end());
+  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
    // The original block will now fall through to the XMM save block.
    MBB->addSuccessor(XMMSaveMBB);
    // The XMMSaveMBB will fall through to the end block.
@@ -8446,7 +8500,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
        .addMemOperand(MMO);
    }
  
-  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
  
    return EndMBB;
  }
@@ -8475,44 +8529,39 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
    MachineFunction *F = BB->getParent();
    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
-  unsigned Opc =
-    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
-
-  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
    F->insert(It, copy0MBB);
    F->insert(It, sinkMBB);
  
-  // Update machine-CFG edges by first adding all successors of the current
-  // block to the new block which will contain the Phi node for the select.
-  for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
-         E = BB->succ_end(); I != E; ++I)
-    sinkMBB->addSuccessor(*I);
-
-  // Next, remove all successors of the current block, and add the true
-  // and fallthrough blocks as its successors.
-  while (!BB->succ_empty())
-    BB->removeSuccessor(BB->succ_begin());
-
-  // Add the true and fallthrough blocks as its successors.
-  BB->addSuccessor(copy0MBB);
-  BB->addSuccessor(sinkMBB);
-
    // If the EFLAGS register isn't dead in the terminator, then claim that it's
    // live into the sink and copy blocks.
    const MachineFunction *MF = BB->getParent();
    const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
    BitVector ReservedRegs = TRI->getReservedRegs(*MF);
-  const MachineInstr *Term = BB->getFirstTerminator();
  
-  for (unsigned I = 0, E = Term->getNumOperands(); I != E; ++I) {
-    const MachineOperand &MO = Term->getOperand(I);
-    if (!MO.isReg() || MO.isKill() || MO.isDead()) continue;
+  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+    const MachineOperand &MO = MI->getOperand(I);
+    if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue;
      unsigned Reg = MO.getReg();
      if (Reg != X86::EFLAGS) continue;
      copy0MBB->addLiveIn(Reg);
      sinkMBB->addLiveIn(Reg);
    }
  
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  // Create the conditional branch instruction.
+  unsigned Opc =
+    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
+  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
+
    //  copy0MBB:
    //   %FalseValue = ...
    //   # fallthrough to sinkMBB
@@ -8521,11 +8570,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
    //  sinkMBB:
    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
    //  ...
-  BuildMI(sinkMBB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg())
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(X86::PHI), MI->getOperand(0).getReg())
      .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
  
-  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
    return sinkMBB;
  }
  
@@ -8534,21 +8584,20 @@ X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI,
                                            MachineBasicBlock *BB) const {
    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();
-  MachineFunction *F = BB->getParent();
  
    // The lowering is pretty easy: we're just emitting the call to _alloca.  The
    // non-trivial part is impdef of ESP.
    // FIXME: The code should be tweaked as soon as we'll try to do codegen for
    // mingw-w64.
  
-  BuildMI(BB, DL, TII->get(X86::CALLpcrel32))
+  BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
      .addExternalSymbol("_alloca")
      .addReg(X86::EAX, RegState::Implicit)
      .addReg(X86::ESP, RegState::Implicit)
      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
      .addReg(X86::ESP, RegState::Define | RegState::Implicit);
  
-  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
    return BB;
  }
  
@@ -8567,35 +8616,38 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
    assert(MI->getOperand(3).isGlobal() && "This should be a global");
    
    if (Subtarget->is64Bit()) {
-    MachineInstrBuilder MIB = BuildMI(BB, DL, TII->get(X86::MOV64rm), X86::RDI)
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+                                      TII->get(X86::MOV64rm), X86::RDI)
      .addReg(X86::RIP)
      .addImm(0).addReg(0)
      .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 
                        MI->getOperand(3).getTargetFlags())
      .addReg(0);
-    MIB = BuildMI(BB, DL, TII->get(X86::CALL64m));
-    addDirectMem(MIB, X86::RDI).addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
+    addDirectMem(MIB, X86::RDI);
    } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
-    MachineInstrBuilder MIB = BuildMI(BB, DL, TII->get(X86::MOV32rm), X86::EAX)
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+                                      TII->get(X86::MOV32rm), X86::EAX)
      .addReg(0)
      .addImm(0).addReg(0)
      .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 
                        MI->getOperand(3).getTargetFlags())
      .addReg(0);
-    MIB = BuildMI(BB, DL, TII->get(X86::CALL32m));
-    addDirectMem(MIB, X86::EAX).addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+    addDirectMem(MIB, X86::EAX);
    } else {
-    MachineInstrBuilder MIB = BuildMI(BB, DL, TII->get(X86::MOV32rm), X86::EAX)
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+                                      TII->get(X86::MOV32rm), X86::EAX)
      .addReg(TII->getGlobalBaseReg(F))
      .addImm(0).addReg(0)
      .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 
                        MI->getOperand(3).getTargetFlags())
      .addReg(0);
-    MIB = BuildMI(BB, DL, TII->get(X86::CALL32m));
-    addDirectMem(MIB, X86::EAX).addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+    addDirectMem(MIB, X86::EAX);
    }
    
-  F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
    return BB;
  }
  
@@ -8639,23 +8691,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
      // mode when truncating to an integer value.
      MachineFunction *F = BB->getParent();
      int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
-    addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx);
+    addFrameReference(BuildMI(*BB, MI, DL,
+                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
  
      // Load the old value of the high byte of the control word...
      unsigned OldCW =
        F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
-    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW),
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
                        CWFrameIdx);
  
      // Set the high part to be round to zero...
-    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
        .addImm(0xC7F);
  
      // Reload the modified control word now...
-    addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);
+    addFrameReference(BuildMI(*BB, MI, DL,
+                              TII->get(X86::FLDCW16m)), CWFrameIdx);
  
      // Restore the memory image of control word to original value
-    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
        .addReg(OldCW);
  
      // Get the X86 opcode to use.
@@ -8694,13 +8748,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
      } else {
        AM.Disp = Op.getImm();
      }
-    addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM)
-                      .addReg(MI->getOperand(X86AddrNumOperands).getReg());
+    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
+                      .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
  
      // Reload the original control word now.
-    addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);
+    addFrameReference(BuildMI(*BB, MI, DL,
+                              TII->get(X86::FLDCW16m)), CWFrameIdx);
  
-    F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+    MI->eraseFromParent();   // The pseudo instruction is gone now.
      return BB;
    }
      // String/text processing lowering.
@@ -8717,25 +8772,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    case X86::ATOMAND32:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
                                                 X86::AND32ri, X86::MOV32rm,
-                                               X86::LCMPXCHG32, X86::MOV32rr,
+                                               X86::LCMPXCHG32,
                                                 X86::NOT32r, X86::EAX,
                                                 X86::GR32RegisterClass);
    case X86::ATOMOR32:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
                                                 X86::OR32ri, X86::MOV32rm,
-                                               X86::LCMPXCHG32, X86::MOV32rr,
+                                               X86::LCMPXCHG32,
                                                 X86::NOT32r, X86::EAX,
                                                 X86::GR32RegisterClass);
    case X86::ATOMXOR32:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
                                                 X86::XOR32ri, X86::MOV32rm,
-                                               X86::LCMPXCHG32, X86::MOV32rr,
+                                               X86::LCMPXCHG32,
                                                 X86::NOT32r, X86::EAX,
                                                 X86::GR32RegisterClass);
    case X86::ATOMNAND32:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
                                                 X86::AND32ri, X86::MOV32rm,
-                                               X86::LCMPXCHG32, X86::MOV32rr,
+                                               X86::LCMPXCHG32,
                                                 X86::NOT32r, X86::EAX,
                                                 X86::GR32RegisterClass, true);
    case X86::ATOMMIN32:
@@ -8750,25 +8805,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    case X86::ATOMAND16:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
                                                 X86::AND16ri, X86::MOV16rm,
-                                               X86::LCMPXCHG16, X86::MOV16rr,
+                                               X86::LCMPXCHG16,
                                                 X86::NOT16r, X86::AX,
                                                 X86::GR16RegisterClass);
    case X86::ATOMOR16:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
                                                 X86::OR16ri, X86::MOV16rm,
-                                               X86::LCMPXCHG16, X86::MOV16rr,
+                                               X86::LCMPXCHG16,
                                                 X86::NOT16r, X86::AX,
                                                 X86::GR16RegisterClass);
    case X86::ATOMXOR16:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
                                                 X86::XOR16ri, X86::MOV16rm,
-                                               X86::LCMPXCHG16, X86::MOV16rr,
+                                               X86::LCMPXCHG16,
                                                 X86::NOT16r, X86::AX,
                                                 X86::GR16RegisterClass);
    case X86::ATOMNAND16:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
                                                 X86::AND16ri, X86::MOV16rm,
-                                               X86::LCMPXCHG16, X86::MOV16rr,
+                                               X86::LCMPXCHG16,
                                                 X86::NOT16r, X86::AX,
                                                 X86::GR16RegisterClass, true);
    case X86::ATOMMIN16:
@@ -8783,25 +8838,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    case X86::ATOMAND8:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
                                                 X86::AND8ri, X86::MOV8rm,
-                                               X86::LCMPXCHG8, X86::MOV8rr,
+                                               X86::LCMPXCHG8,
                                                 X86::NOT8r, X86::AL,
                                                 X86::GR8RegisterClass);
    case X86::ATOMOR8:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
                                                 X86::OR8ri, X86::MOV8rm,
-                                               X86::LCMPXCHG8, X86::MOV8rr,
+                                               X86::LCMPXCHG8,
                                                 X86::NOT8r, X86::AL,
                                                 X86::GR8RegisterClass);
    case X86::ATOMXOR8:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
                                                 X86::XOR8ri, X86::MOV8rm,
-                                               X86::LCMPXCHG8, X86::MOV8rr,
+                                               X86::LCMPXCHG8,
                                                 X86::NOT8r, X86::AL,
                                                 X86::GR8RegisterClass);
    case X86::ATOMNAND8:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
                                                 X86::AND8ri, X86::MOV8rm,
-                                               X86::LCMPXCHG8, X86::MOV8rr,
+                                               X86::LCMPXCHG8,
                                                 X86::NOT8r, X86::AL,
                                                 X86::GR8RegisterClass, true);
    // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
@@ -8809,25 +8864,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
    case X86::ATOMAND64:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
                                                 X86::AND64ri32, X86::MOV64rm,
-                                               X86::LCMPXCHG64, X86::MOV64rr,
+                                               X86::LCMPXCHG64,
                                                 X86::NOT64r, X86::RAX,
                                                 X86::GR64RegisterClass);
    case X86::ATOMOR64:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
                                                 X86::OR64ri32, X86::MOV64rm,
-                                               X86::LCMPXCHG64, X86::MOV64rr,
+                                               X86::LCMPXCHG64,
                                                 X86::NOT64r, X86::RAX,
                                                 X86::GR64RegisterClass);
    case X86::ATOMXOR64:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
                                                 X86::XOR64ri32, X86::MOV64rm,
-                                               X86::LCMPXCHG64, X86::MOV64rr,
+                                               X86::LCMPXCHG64,
                                                 X86::NOT64r, X86::RAX,
                                                 X86::GR64RegisterClass);
    case X86::ATOMNAND64:
      return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
                                                 X86::AND64ri32, X86::MOV64rm,
-                                               X86::LCMPXCHG64, X86::MOV64rr,
+                                               X86::LCMPXCHG64,
                                                 X86::NOT64r, X86::RAX,
                                                 X86::GR64RegisterClass, true);
    case X86::ATOMMIN64:
@@ -9008,8 +9063,8 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
  
    // Store the value to a temporary stack slot.
    SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
-  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL, 0,
-                            false, false, 0);
+  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL,
+                            0, false, false, 0);
  
    // Replace each use (extract) with a load of the appropriate element.
    for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
@@ -9023,11 +9078,12 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
      uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
      SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
  
-    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), OffsetVal, StackPtr);
+    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(),
+                                     OffsetVal, StackPtr);
  
      // Load the scalar.
-    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, ScalarAddr,
-                          NULL, 0, false, false, 0);
+    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
+                                     ScalarAddr, NULL, 0, false, false, 0);
  
      // Replace the extract with the load.
      DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
@@ -9065,8 +9121,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
          // Converting this to a min would handle NaNs incorrectly, and swapping
          // the operands would cause it to handle comparisons between positive
          // and negative zero incorrectly.
-        if (!FiniteOnlyFPMath() &&
-            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
+        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
            if (!UnsafeFPMath &&
                !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
              break;
@@ -9104,8 +9159,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
          // Converting this to a max would handle NaNs incorrectly, and swapping
          // the operands would cause it to handle comparisons between positive
          // and negative zero incorrectly.
-        if (!FiniteOnlyFPMath() &&
-            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
+        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
            if (!UnsafeFPMath &&
                !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
              break;
@@ -9134,8 +9188,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
          // cause it to handle NaNs incorrectly.
          if (!UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
-          if (!FiniteOnlyFPMath() &&
-              (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
              break;
            std::swap(LHS, RHS);
          }
@@ -9160,8 +9213,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
  
        case ISD::SETULT:
          // Converting this to a max would handle NaNs incorrectly.
-        if (!FiniteOnlyFPMath() &&
-            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          Opcode = X86ISD::FMAX;
          break;
@@ -9171,8 +9223,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
          // cause it to handle NaNs incorrectly.
          if (!UnsafeFPMath &&
              !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
-          if (!FiniteOnlyFPMath() &&
-              (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
              break;
            std::swap(LHS, RHS);
          }
@@ -10021,8 +10072,8 @@ static bool LowerToBSwap(CallInst *CI) {
    // so don't worry about this.
  
    // Verify this is a simple bswap.
-  if (CI->getNumOperands() != 2 ||
-      CI->getType() != CI->getOperand(1)->getType() ||
+  if (CI->getNumArgOperands() != 1 ||
+      CI->getType() != CI->getArgOperand(0)->getType() ||
        !CI->getType()->isIntegerTy())
      return false;
  
@@ -10035,7 +10086,7 @@ static bool LowerToBSwap(CallInst *CI) {
    Module *M = CI->getParent()->getParent()->getParent();
    Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
  
-  Value *Op = CI->getOperand(1);
+  Value *Op = CI->getArgOperand(0);
    Op = CallInst::Create(Int, Op, CI->getName(), CI);
  
    CI->replaceAllUsesWith(Op);
@@ -10244,8 +10295,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
      // In any sort of PIC mode addresses need to be computed at runtime by
      // adding in a register or some sort of table lookup.  These can't
      // be used as immediates.
-    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC() ||
-        Subtarget->isPICStyleRIPRel())
+    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
        return;
  
      // If we are in non-pic codegen mode, we allow the address of a global (with
@@ -10283,7 +10333,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                          getTargetMachine())))
        return;
  
-    Result = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset);
+    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
+                                        GA->getValueType(0), Offset);
      break;
    }
    }