Simplify some code based on the fact that PIC styles other than None are only valid in PIC mode.

[oota-llvm.git] / lib / Target / X86 / X86ISelLowering.cpp
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 882ee3a01f1e7f3ec3875ff72c0354748e6016b0..d859423300117c26c769e6abdf81c3c998c64844 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -15,30 +15,30 @@
  #include "X86.h"
  #include "X86InstrBuilder.h"
  #include "X86ISelLowering.h"
-#include "X86MachineFunctionInfo.h"
  #include "X86TargetMachine.h"
  #include "llvm/CallingConv.h"
  #include "llvm/Constants.h"
  #include "llvm/DerivedTypes.h"
+#include "llvm/GlobalAlias.h"
  #include "llvm/GlobalVariable.h"
  #include "llvm/Function.h"
  #include "llvm/Intrinsics.h"
  #include "llvm/ADT/BitVector.h"
  #include "llvm/ADT/VectorExtras.h"
-#include "llvm/CodeGen/CallingConvLower.h"
  #include "llvm/CodeGen/MachineFrameInfo.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/CodeGen/MachineModuleInfo.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
  #include "llvm/CodeGen/PseudoSourceValue.h"
-#include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/Support/MathExtras.h"
  #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
  #include "llvm/Target/TargetOptions.h"
  #include "llvm/ADT/SmallSet.h"
  #include "llvm/ADT/StringExtras.h"
  #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
  using namespace llvm;
  
  static cl::opt<bool>
@@ -129,7 +129,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
    setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
  
-  if (!UseSoftFloat && !NoImplicitFloat) {
+  if (!UseSoftFloat) {
      // SSE has no i16 to fp conversion, only i32
      if (X86ScalarSSEf32) {
        setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
@@ -553,6 +553,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
      setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    }
  
    // FIXME: In order to prevent SSE instructions being expanded to MMX ones
@@ -698,6 +702,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
        // Do not attempt to custom lower non-power-of-2 vectors
        if (!isPowerOf2_32(VT.getVectorNumElements()))
          continue;
+      // Do not attempt to custom lower non-128-bit vectors
+      if (!VT.is128BitVector())
+        continue;
        setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
        setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -716,17 +723,23 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      }
  
      // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
-    for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
-      setOperationAction(ISD::AND,    (MVT::SimpleValueType)VT, Promote);
-      AddPromotedToType (ISD::AND,    (MVT::SimpleValueType)VT, MVT::v2i64);
-      setOperationAction(ISD::OR,     (MVT::SimpleValueType)VT, Promote);
-      AddPromotedToType (ISD::OR,     (MVT::SimpleValueType)VT, MVT::v2i64);
-      setOperationAction(ISD::XOR,    (MVT::SimpleValueType)VT, Promote);
-      AddPromotedToType (ISD::XOR,    (MVT::SimpleValueType)VT, MVT::v2i64);
-      setOperationAction(ISD::LOAD,   (MVT::SimpleValueType)VT, Promote);
-      AddPromotedToType (ISD::LOAD,   (MVT::SimpleValueType)VT, MVT::v2i64);
-      setOperationAction(ISD::SELECT, (MVT::SimpleValueType)VT, Promote);
-      AddPromotedToType (ISD::SELECT, (MVT::SimpleValueType)VT, MVT::v2i64);
+    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
+      MVT VT = (MVT::SimpleValueType)i;
+
+      // Do not attempt to promote non-128-bit vectors
+      if (!VT.is128BitVector()) {
+        continue;
+      }
+      setOperationAction(ISD::AND,    VT, Promote);
+      AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
+      setOperationAction(ISD::OR,     VT, Promote);
+      AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
+      setOperationAction(ISD::XOR,    VT, Promote);
+      AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
+      setOperationAction(ISD::LOAD,   VT, Promote);
+      AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
      }
  
      setTruncStoreAction(MVT::f64, MVT::f32, Expand);
@@ -737,6 +750,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
      setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
  
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
+    if (!DisableMMX && Subtarget->hasMMX()) {
+      setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
+      setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
+    }
    }
  
    if (Subtarget->hasSSE41()) {
@@ -767,6 +786,114 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
      setOperationAction(ISD::VSETCC,             MVT::v2i64, Custom);
    }
  
+  if (!UseSoftFloat && Subtarget->hasAVX()) {
+    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
+    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
+    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
+    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
+
+    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v8i32, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
+    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
+    //setOperationAction(ISD::BUILD_VECTOR,       MVT::v8f32, Custom);
+    //setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8f32, Custom);
+    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
+    //setOperationAction(ISD::SELECT,             MVT::v8f32, Custom);
+    //setOperationAction(ISD::VSETCC,             MVT::v8f32, Custom);
+
+    // Operations to consider commented out -v16i16 v32i8
+    //setOperationAction(ISD::ADD,                MVT::v16i16, Legal);
+    setOperationAction(ISD::ADD,                MVT::v8i32, Custom);
+    setOperationAction(ISD::ADD,                MVT::v4i64, Custom);
+    //setOperationAction(ISD::SUB,                MVT::v32i8, Legal);
+    //setOperationAction(ISD::SUB,                MVT::v16i16, Legal);
+    setOperationAction(ISD::SUB,                MVT::v8i32, Custom);
+    setOperationAction(ISD::SUB,                MVT::v4i64, Custom);
+    //setOperationAction(ISD::MUL,                MVT::v16i16, Legal);
+    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
+
+    setOperationAction(ISD::VSETCC,             MVT::v4f64, Custom);
+    // setOperationAction(ISD::VSETCC,             MVT::v32i8, Custom);
+    // setOperationAction(ISD::VSETCC,             MVT::v16i16, Custom);
+    setOperationAction(ISD::VSETCC,             MVT::v8i32, Custom);
+
+    // setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v32i8, Custom);
+    // setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i16, Custom);
+    // setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i32, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8f32, Custom);
+
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f64, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i64, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);
+
+#if 0
+    // Not sure we want to do this since there are no 256-bit integer
+    // operations in AVX
+
+    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+    // This includes 256-bit vectors
+    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
+
+      // Do not attempt to custom lower non-power-of-2 vectors
+      if (!isPowerOf2_32(VT.getVectorNumElements()))
+        continue;
+
+      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+    }
+
+    if (Subtarget->is64Bit()) {
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i64, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
+    }    
+#endif
+
+#if 0
+    // Not sure we want to do this since there are no 256-bit integer
+    // operations in AVX
+
+    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
+    // Including 256-bit vectors
+    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
+      MVT VT = (MVT::SimpleValueType)i;
+
+      if (!VT.is256BitVector()) {
+        continue;
+      }
+      setOperationAction(ISD::AND,    VT, Promote);
+      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
+      setOperationAction(ISD::OR,     VT, Promote);
+      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
+      setOperationAction(ISD::XOR,    VT, Promote);
+      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
+      setOperationAction(ISD::LOAD,   VT, Promote);
+      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
+    }
+
+    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+#endif
+  }
+
    // We want to custom lower some of our intrinsics.
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  
@@ -781,8 +908,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setOperationAction(ISD::USUBO, MVT::i64, Custom);
    setOperationAction(ISD::SMULO, MVT::i32, Custom);
    setOperationAction(ISD::SMULO, MVT::i64, Custom);
-  setOperationAction(ISD::UMULO, MVT::i32, Custom);
-  setOperationAction(ISD::UMULO, MVT::i64, Custom);
  
    if (!Subtarget->is64Bit()) {
      // These libcalls are not available in 32-bit.
@@ -799,6 +924,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
    setTargetDAGCombine(ISD::SRA);
    setTargetDAGCombine(ISD::SRL);
    setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::MEMBARRIER);
    if (Subtarget->is64Bit())
      setTargetDAGCombine(ISD::MUL);
  
@@ -871,11 +997,14 @@ unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  /// determining it.
  MVT
  X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
-                                       bool isSrcConst, bool isSrcStr) const {
+                                       bool isSrcConst, bool isSrcStr,
+                                       SelectionDAG &DAG) const {
    // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
    // linux.  This is because the stack realignment code can't handle certain
    // cases like PR2962.  This should be removed when PR2962 is fixed.
-  if (!NoImplicitFloat && Subtarget->getStackAlignment() >= 16) {
+  const Function *F = DAG.getMachineFunction().getFunction();
+  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
+  if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) {
      if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
        return MVT::v4i32;
      if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
@@ -892,7 +1021,7 @@ SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                        SelectionDAG &DAG) const {
    if (usesGlobalOffsetTable())
      return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy());
-  if (!Subtarget->isPICStyleRIPRel())
+  if (!Subtarget->is64Bit())
      // This doesn't have DebugLoc associated with it, but is not really the
      // same as a Register.
      return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
@@ -900,6 +1029,11 @@ SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
    return Table;
  }
  
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
+  return F->hasFnAttr(Attribute::OptimizeForSize) ? 1 : 4;
+}
+
  //===----------------------------------------------------------------------===//
  //               Return Value Calling Convention Implementation
  //===----------------------------------------------------------------------===//
@@ -934,7 +1068,7 @@ SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
      SDValue StackAdjustment = TailCall.getOperand(2);
      assert(((TargetAddress.getOpcode() == ISD::Register &&
                 (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::EAX ||
-                cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R9)) ||
+                cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R11)) ||
                TargetAddress.getOpcode() == ISD::TargetExternalSymbol ||
                TargetAddress.getOpcode() == ISD::TargetGlobalAddress) &&
               "Expecting an global address, external symbol, or register");
@@ -1053,8 +1187,7 @@ LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
      // If this is x86-64, and we disabled SSE, we can't return FP values
      if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
          ((Is64Bit || TheCall->isInreg()) && !Subtarget->hasSSE1())) {
-      cerr << "SSE register return with SSE disabled\n";
-      exit(1);
+      llvm_report_error("SSE register return with SSE disabled");
      }
  
      // If this is a call to a function that returns an fp value on the floating
@@ -1161,8 +1294,6 @@ CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const {
    if (Subtarget->is64Bit()) {
      if (Subtarget->isTargetWin64())
        return CC_X86_Win64_C;
-    else if (CC == CallingConv::Fast && PerformTailCallOpt)
-      return CC_X86_64_TailCall;
      else
        return CC_X86_64_C;
    }
@@ -1188,23 +1319,6 @@ X86TargetLowering::NameDecorationForFORMAL_ARGUMENTS(SDValue Op) {
  }
  
  
-/// CallRequiresGOTInRegister - Check whether the call requires the GOT pointer
-/// in a register before calling.
-bool X86TargetLowering::CallRequiresGOTPtrInReg(bool Is64Bit, bool IsTailCall) {
-  return !IsTailCall && !Is64Bit &&
-    getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
-    Subtarget->isPICStyleGOT();
-}
-
-/// CallRequiresFnAddressInReg - Check whether the call requires the function
-/// address to be loaded in a register.
-bool
-X86TargetLowering::CallRequiresFnAddressInReg(bool Is64Bit, bool IsTailCall) {
-  return !Is64Bit && IsTailCall &&
-    getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
-    Subtarget->isPICStyleGOT();
-}
-
  /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
  /// by "Src" to address "Dst" with size and alignment information specified by
  /// the specific parameter attribute. The copy will be passed as a byval
@@ -1407,11 +1521,12 @@ X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
        unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
                                                         TotalNumXMMRegs);
  
+      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
        assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
               "SSE register cannot be used when SSE is disabled!");
-      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloat) &&
+      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
               "SSE register cannot be used when SSE is disabled!");
-      if (UseSoftFloat || NoImplicitFloat || !Subtarget->hasSSE1())
+      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
          // Kernel mode asks for SSE to be disabled, so don't push them
          // on the stack.
          TotalNumXMMRegs = 0;
@@ -1670,30 +1785,34 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
        InFlag = Chain.getValue(1);
      }
  
-  // ELF / PIC requires GOT in the EBX register before function calls via PLT
-  // GOT pointer.
-  if (CallRequiresGOTPtrInReg(Is64Bit, IsTailCall)) {
-    Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
-                             DAG.getNode(X86ISD::GlobalBaseReg,
-                                         DebugLoc::getUnknownLoc(),
-                                         getPointerTy()),
-                             InFlag);
-    InFlag = Chain.getValue(1);
-  }
-  // If we are tail calling and generating PIC/GOT style code load the address
-  // of the callee into ecx. The value in ecx is used as target of the tail
-  // jump. This is done to circumvent the ebx/callee-saved problem for tail
-  // calls on PIC/GOT architectures. Normally we would just put the address of
-  // GOT into ebx and then call target@PLT. But for tail callss ebx would be
-  // restored (since ebx is callee saved) before jumping to the target@PLT.
-  if (CallRequiresFnAddressInReg(Is64Bit, IsTailCall)) {
-    // Note: The actual moving to ecx is done further down.
-    GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
-    if (G && !G->getGlobal()->hasHiddenVisibility() &&
-        !G->getGlobal()->hasProtectedVisibility())
-      Callee =  LowerGlobalAddress(Callee, DAG);
-    else if (isa<ExternalSymbolSDNode>(Callee))
-      Callee = LowerExternalSymbol(Callee,DAG);
+  
+  if (Subtarget->isPICStyleGOT()) {
+    // ELF / PIC requires GOT in the EBX register before function calls via PLT
+    // GOT pointer.
+    if (!IsTailCall) {
+      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
+                               DAG.getNode(X86ISD::GlobalBaseReg,
+                                           DebugLoc::getUnknownLoc(),
+                                           getPointerTy()),
+                               InFlag);
+      InFlag = Chain.getValue(1);
+    } else {
+      // If we are tail calling and generating PIC/GOT style code load the
+      // address of the callee into ECX. The value in ecx is used as target of
+      // the tail jump. This is done to circumvent the ebx/callee-saved problem
+      // for tail calls on PIC/GOT architectures. Normally we would just put the
+      // address of GOT into ebx and then call target@PLT. But for tail calls
+      // ebx would be restored (since ebx is callee saved) before jumping to the
+      // target@PLT.
+
+      // Note: The actual moving to ECX is done further down.
+      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+      if (G && !G->getGlobal()->hasHiddenVisibility() &&
+          !G->getGlobal()->hasProtectedVisibility())
+        Callee = LowerGlobalAddress(Callee, DAG);
+      else if (isa<ExternalSymbolSDNode>(Callee))
+        Callee = LowerExternalSymbol(Callee, DAG);
+    }
    }
  
    if (Is64Bit && isVarArg) {
@@ -1788,7 +1907,7 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
      Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
    } else if (IsTailCall) {
-    unsigned Opc = Is64Bit ? X86::R9 : X86::EAX;
+    unsigned Opc = Is64Bit ? X86::R11 : X86::EAX;
  
      Chain = DAG.getCopyToReg(Chain,  dl,
                               DAG.getRegister(Opc, getPointerTy()),
@@ -1825,9 +1944,7 @@ SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
                                    RegsToPass[i].second.getValueType()));
  
    // Add an implicit use GOT pointer in EBX.
-  if (!IsTailCall && !Is64Bit &&
-      getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
-      Subtarget->isPICStyleGOT())
+  if (!IsTailCall && Subtarget->isPICStyleGOT())
      Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
  
    // Add an implicit use of AL for x86 vararg functions.
@@ -1940,22 +2057,11 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(CallSDNode *TheCall,
      return false;
  
    if (CheckTailCallReturnConstraints(TheCall, Ret)) {
-    MachineFunction &MF = DAG.getMachineFunction();
-    unsigned CallerCC = MF.getFunction()->getCallingConv();
-    unsigned CalleeCC= TheCall->getCallingConv();
-    if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
-      SDValue Callee = TheCall->getCallee();
-      // On x86/32Bit PIC/GOT  tail calls are supported.
-      if (getTargetMachine().getRelocationModel() != Reloc::PIC_ ||
-          !Subtarget->isPICStyleGOT()|| !Subtarget->is64Bit())
-        return true;
-
-      // Can only do local tail calls (in same module, hidden or protected) on
-      // x86_64 PIC/GOT at the moment.
-      if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
-        return G->getGlobal()->hasHiddenVisibility()
-            || G->getGlobal()->hasProtectedVisibility();
-    }
+    unsigned CallerCC =
+      DAG.getMachineFunction().getFunction()->getCallingConv();
+    unsigned CalleeCC = TheCall->getCallingConv();
+    if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC)
+      return true;
    }
  
    return false;
@@ -2417,9 +2523,10 @@ bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
  /// specifies a shuffle of elements that is suitable for input to MOVSS,
  /// MOVSD, and MOVD, i.e. setting the lowest element.
  static bool isMOVLMask(const SmallVectorImpl<int> &Mask, MVT VT) {
-  int NumElts = VT.getVectorNumElements();
-  if (NumElts != 2 && NumElts != 4)
+  if (VT.getVectorElementType().getSizeInBits() < 32)
      return false;
+
+  int NumElts = VT.getVectorNumElements();
    
    if (!isUndefOrEqual(Mask[0], NumElts))
      return false;
@@ -3071,7 +3178,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
    }
  
    // Special case for single non-zero, non-undef, element.
-  if (NumNonZero == 1 && NumElems <= 4) {
+  if (NumNonZero == 1) {
      unsigned Idx = CountTrailingZeros_32(NonZeros);
      SDValue Item = Op.getOperand(Idx);
  
@@ -3112,15 +3219,24 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
      // If we have a constant or non-constant insertion into the low element of
      // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
      // the rest of the elements.  This will be matched as movd/movq/movss/movsd
-    // depending on what the source datatype is.  Because we can only get here
-    // when NumElems <= 4, this only needs to handle i32/f32/i64/f64.
-    if (Idx == 0 &&
-        // Don't do this for i64 values on x86-32.
-        (EVT != MVT::i64 || Subtarget->is64Bit())) {
-      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
-      // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
-      return getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
-                                         Subtarget->hasSSE2(), DAG);
+    // depending on what the source datatype is.
+    if (Idx == 0) {
+      if (NumZero == 0) {
+        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+      } else if (EVT == MVT::i32 || EVT == MVT::f32 || EVT == MVT::f64 ||
+          (EVT == MVT::i64 && Subtarget->is64Bit())) {
+        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
+        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
+                                           DAG);
+      } else if (EVT == MVT::i16 || EVT == MVT::i8) {
+        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
+        MVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32;
+        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
+        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
+                                           Subtarget->hasSSE2(), DAG);
+        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item);
+      }
      }
  
      // Is it a vector logical left shift?
@@ -4251,7 +4367,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
    SDValue N1 = Op.getOperand(1);
    SDValue N2 = Op.getOperand(2);
  
-  if (EVT.getSizeInBits() == 16) {
+  if (EVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
      // Transform it so it match pinsrw which expects a 16-bit value in a GR32
      // as its second argument.
      if (N1.getValueType() != MVT::i32)
@@ -4294,21 +4410,104 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
  SDValue
  X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
    ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
-  // FIXME there isn't really any debug info here, should come from the parent
-  DebugLoc dl = CP->getDebugLoc();
+  
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  
+  if (Subtarget->is64Bit() &&
+      getTargetMachine().getCodeModel() == CodeModel::Small) {
+    WrapperKind = X86ISD::WrapperRIP;
+  } else if (Subtarget->isPICStyleGOT()) {
+    OpFlag = X86II::MO_GOTOFF;
+  } else if (Subtarget->isPICStyleStub() &&
+             getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+  }
+  
    SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
-                                             CP->getAlignment());
-  Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+                                             CP->getAlignment(),
+                                             CP->getOffset(), OpFlag);
+  DebugLoc DL = CP->getDebugLoc();
+  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+  // With PIC, the address is actually $g + Offset.
+  if (OpFlag) {
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg,
+                                     DebugLoc::getUnknownLoc(), getPointerTy()),
+                         Result);
+  }
+
+  return Result;
+}
+
+SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  
+  if (Subtarget->is64Bit()) {
+    WrapperKind = X86ISD::WrapperRIP;
+  } else if (Subtarget->isPICStyleGOT()) {
+    OpFlag = X86II::MO_GOTOFF;
+  } else if (Subtarget->isPICStyleStub() &&
+             getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+  }
+  
+  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
+                                          OpFlag);
+  DebugLoc DL = JT->getDebugLoc();
+  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+  
+  // With PIC, the address is actually $g + Offset.
+  if (OpFlag) {
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg,
+                                     DebugLoc::getUnknownLoc(), getPointerTy()),
+                         Result);
+  }
+  
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
+  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+  
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  if (Subtarget->is64Bit()) {
+    WrapperKind = X86ISD::WrapperRIP;
+  } else if (Subtarget->isPICStyleGOT()) {
+    OpFlag = X86II::MO_GOTOFF;
+  } else if (Subtarget->isPICStyleStub() &&
+             getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+  }
+  
+  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
+  
+  DebugLoc DL = Op.getDebugLoc();
+  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+  
+  
    // With PIC, the address is actually $g + Offset.
    if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
-      !Subtarget->isPICStyleRIPRel()) {
-    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+      !Subtarget->is64Bit()) {
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                           DAG.getNode(X86ISD::GlobalBaseReg,
                                       DebugLoc::getUnknownLoc(),
                                       getPointerTy()),
                           Result);
    }
-
+  
    return Result;
  }
  
@@ -4324,14 +4523,35 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
    // offset if it is legal.
    SDValue Result;
    if (!IsPic && !ExtraLoadRequired && isInt32(Offset)) {
+    // A direct static reference to a global.
      Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
      Offset = 0;
-  } else
-    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0);
-  Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+  } else {
+    unsigned char OpFlags = 0;
+    
+    if (GV->hasDLLImportLinkage())
+      OpFlags = X86II::MO_DLLIMPORT;
+    else if (Subtarget->isPICStyleRIPRel()) {
+      if (ExtraLoadRequired)
+        OpFlags = X86II::MO_GOTPCREL;
+    } else if (Subtarget->isPICStyleGOT()) {
+      if (ExtraLoadRequired)
+        OpFlags = X86II::MO_GOT;
+      else
+        OpFlags = X86II::MO_GOTOFF;
+    }
+    
+    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags);
+  }
+  
+  if (Subtarget->is64Bit() &&
+      getTargetMachine().getCodeModel() == CodeModel::Small)
+    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+  else
+    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
  
    // With PIC, the address is actually $g + Offset.
-  if (IsPic && !Subtarget->isPICStyleRIPRel()) {
+  if (IsPic && !Subtarget->is64Bit()) {
      Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                           DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                           Result);
@@ -4364,12 +4584,14 @@ X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
  
  static SDValue
  GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
-           SDValue *InFlag, const MVT PtrVT, unsigned ReturnReg) {
+           SDValue *InFlag, const MVT PtrVT, unsigned ReturnReg,
+           unsigned char OperandFlags) {
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
    DebugLoc dl = GA->getDebugLoc();
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
                                             GA->getValueType(0),
-                                           GA->getOffset());
+                                           GA->getOffset(),
+                                           OperandFlags);
    if (InFlag) {
      SDValue Ops[] = { Chain,  TGA, *InFlag };
      Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
@@ -4393,14 +4615,15 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                                   PtrVT), InFlag);
    InFlag = Chain.getValue(1);
  
-  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX);
+  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
  }
  
  // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
  static SDValue
  LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                  const MVT PtrVT) {
-  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX);
+  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
+                    X86::RAX, X86II::MO_TLSGD);
  }
  
  // Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
@@ -4418,12 +4641,26 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
    SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
                                        NULL, 0);
  
+  unsigned char OperandFlags = 0;
+  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
+  // initialexec.
+  unsigned WrapperKind = X86ISD::Wrapper;
+  if (model == TLSModel::LocalExec) {
+    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
+  } else if (is64Bit) {
+    assert(model == TLSModel::InitialExec);
+    OperandFlags = X86II::MO_GOTTPOFF;
+    WrapperKind = X86ISD::WrapperRIP;
+  } else {
+    assert(model == TLSModel::InitialExec);
+    OperandFlags = X86II::MO_INDNTPOFF;
+  }
+  
    // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
    // exec)
-  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
-                                             GA->getValueType(0),
-                                             GA->getOffset());
-  SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
+                                           GA->getOffset(), OperandFlags);
+  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
  
    if (model == TLSModel::InitialExec)
      Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
@@ -4441,72 +4678,33 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
    assert(Subtarget->isTargetELF() &&
           "TLS not implemented for non-ELF targets");
    GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
-  GlobalValue *GV = GA->getGlobal();
-  TLSModel::Model model =
-    getTLSModel (GV, getTargetMachine().getRelocationModel());
-  if (Subtarget->is64Bit()) {
-    switch (model) {
-    case TLSModel::GeneralDynamic:
-    case TLSModel::LocalDynamic: // not implemented
+  const GlobalValue *GV = GA->getGlobal();
+  
+  // If GV is an alias then use the aliasee for determining
+  // thread-localness.
+  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+    GV = GA->resolveAliasedGlobal(false);
+  
+  TLSModel::Model model = getTLSModel(GV,
+                                      getTargetMachine().getRelocationModel());
+  
+  switch (model) {
+  case TLSModel::GeneralDynamic:
+  case TLSModel::LocalDynamic: // not implemented
+    if (Subtarget->is64Bit())
        return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
-
-    case TLSModel::InitialExec:
-    case TLSModel::LocalExec:
-      return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, true);
-    }
-  } else {
-    switch (model) {
-    case TLSModel::GeneralDynamic:
-    case TLSModel::LocalDynamic: // not implemented
-      return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
-
-    case TLSModel::InitialExec:
-    case TLSModel::LocalExec:
-      return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, false);
-    }
+    return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
+    
+  case TLSModel::InitialExec:
+  case TLSModel::LocalExec:
+    return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
+                               Subtarget->is64Bit());
    }
+  
    assert(0 && "Unreachable");
    return SDValue();
  }
  
-SDValue
-X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
-  // FIXME there isn't really any debug info here
-  DebugLoc dl = Op.getDebugLoc();
-  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
-  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy());
-  Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
-  // With PIC, the address is actually $g + Offset.
-  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
-      !Subtarget->isPICStyleRIPRel()) {
-    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
-                         DAG.getNode(X86ISD::GlobalBaseReg,
-                                     DebugLoc::getUnknownLoc(),
-                                     getPointerTy()),
-                         Result);
-  }
-
-  return Result;
-}
-
-SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
-  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
-  // FIXME there isn't really any debug into here
-  DebugLoc dl = JT->getDebugLoc();
-  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy());
-  Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
-  // With PIC, the address is actually $g + Offset.
-  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
-      !Subtarget->isPICStyleRIPRel()) {
-    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
-                         DAG.getNode(X86ISD::GlobalBaseReg,
-                                     DebugLoc::getUnknownLoc(),
-                                     getPointerTy()),
-                         Result);
-  }
-
-  return Result;
-}
  
  /// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
  /// take a 2 x i32 value to shift plus a shift amount.
@@ -4557,6 +4755,14 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
  
  SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
    MVT SrcVT = Op.getOperand(0).getValueType();
+
+  if (SrcVT.isVector()) {
+    if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) {
+      return Op;
+    }
+    return SDValue();
+  }
+
    assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
           "Unknown SINT_TO_FP to lower!");
  
@@ -4848,6 +5054,14 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) {
  }
  
  SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
+  if (Op.getValueType().isVector()) {
+    if (Op.getValueType() == MVT::v2i32 &&
+        Op.getOperand(0).getValueType() == MVT::v2f64) {
+      return Op;
+    }
+    return SDValue();
+  }
+
    std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
@@ -5577,7 +5791,7 @@ X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
        Args.push_back(Entry);
        std::pair<SDValue,SDValue> CallResult =
          LowerCallTo(Chain, Type::VoidTy, false, false, false, false,
-                    CallingConv::C, false,
+                    0, CallingConv::C, false,
                      DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl);
        return CallResult.second;
      }
@@ -5818,8 +6032,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) {
    SDValue SrcPtr = Op.getOperand(1);
    SDValue SrcSV = Op.getOperand(2);
  
-  assert(0 && "VAArgInst is not yet implemented for x86-64!");
-  abort();
+  llvm_report_error("VAArgInst is not yet implemented for x86-64!");
    return SDValue();
  }
  
@@ -6020,7 +6233,7 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
        case Intrinsic::x86_mmx_psrai_d:
          NewIntNo = Intrinsic::x86_mmx_psra_d;
          break;
-      default: abort();  // Can't reach here.
+      default: LLVM_UNREACHABLE("Impossible intrinsic");  // Can't reach here.
        }
        break;
      }
@@ -6192,8 +6405,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
              InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
  
          if (InRegCount > 2) {
-          cerr << "Nest register in use - reduce number of inreg parameters!\n";
-          abort();
+          llvm_report_error("Nest register in use - reduce number of inreg parameters!");
          }
        }
        break;
@@ -6747,6 +6959,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
    case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
    case X86ISD::Wrapper:            return "X86ISD::Wrapper";
+  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
    case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
    case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
    case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
@@ -7654,13 +7867,15 @@ static bool isBaseAlignmentOfN(unsigned N, SDNode *Base,
  }
  
  static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
-                                     MVT EVT, SDNode *&Base,
+                                     MVT EVT, LoadSDNode *&LDBase,
+                                     unsigned &LastLoadedElt,
                                       SelectionDAG &DAG, MachineFrameInfo *MFI,
                                       const TargetLowering &TLI) {
-  Base = NULL;
+  LDBase = NULL;
+  LastLoadedElt = -1U;
    for (unsigned i = 0; i < NumElems; ++i) {
      if (N->getMaskElt(i) < 0) {
-      if (!Base)
+      if (!LDBase)
          return false;
        continue;
      }
@@ -7669,18 +7884,20 @@ static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
      if (!Elt.getNode() ||
          (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
        return false;
-    if (!Base) {
-      Base = Elt.getNode();
-      if (Base->getOpcode() == ISD::UNDEF)
+    if (!LDBase) {
+      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
          return false;
+      LDBase = cast<LoadSDNode>(Elt.getNode());
+      LastLoadedElt = i;
        continue;
      }
      if (Elt.getOpcode() == ISD::UNDEF)
        continue;
  
-    if (!TLI.isConsecutiveLoad(Elt.getNode(), Base,
-                               EVT.getSizeInBits()/8, i, MFI))
+    LoadSDNode *LD = cast<LoadSDNode>(Elt);
+    if (!TLI.isConsecutiveLoad(LD, LDBase, EVT.getSizeInBits()/8, i, MFI))
        return false;
+    LastLoadedElt = i;
    }
    return true;
  }
@@ -7699,99 +7916,32 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
    ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
    unsigned NumElems = VT.getVectorNumElements();
  
-  // For x86-32 machines, if we see an insert and then a shuffle in a v2i64
-  // where the upper half is 0, it is advantageous to rewrite it as a build
-  // vector of (0, val) so it can use movq.
-  if (VT == MVT::v2i64) {
-    SDValue In[2];
-    In[0] = N->getOperand(0);
-    In[1] = N->getOperand(1);
-    int Idx0 = SVN->getMaskElt(0);
-    int Idx1 = SVN->getMaskElt(1);
-    // FIXME: can we take advantage of undef index?
-    if (Idx0 >= 0 && Idx1 >= 0 &&
-        In[Idx0/2].getOpcode() == ISD::INSERT_VECTOR_ELT &&
-        In[Idx1/2].getOpcode() == ISD::BUILD_VECTOR) {
-      ConstantSDNode* InsertVecIdx =
-                             dyn_cast<ConstantSDNode>(In[Idx0/2].getOperand(2));
-      if (InsertVecIdx &&
-          InsertVecIdx->getZExtValue() == (unsigned)(Idx0 % 2) &&
-          isZeroNode(In[Idx1/2].getOperand(Idx1 % 2))) {
-        return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
-                           In[Idx0/2].getOperand(1),
-                           In[Idx1/2].getOperand(Idx1 % 2));
-      }
-    }
-  }
+  if (VT.getSizeInBits() != 128)
+    return SDValue();
  
    // Try to combine a vector_shuffle into a 128-bit load.
    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
-  SDNode *Base = NULL;
-  if (!EltsFromConsecutiveLoads(SVN, NumElems, EVT, Base, DAG, MFI, TLI))
+  LoadSDNode *LD = NULL;
+  unsigned LastLoadedElt;
+  if (!EltsFromConsecutiveLoads(SVN, NumElems, EVT, LD, LastLoadedElt, DAG,
+                                MFI, TLI))
      return SDValue();
  
-  LoadSDNode *LD = cast<LoadSDNode>(Base);
-  if (isBaseAlignmentOfN(16, Base->getOperand(1).getNode(), TLI))
+  if (LastLoadedElt == NumElems - 1) {
+    if (isBaseAlignmentOfN(16, LD->getBasePtr().getNode(), TLI))
+      return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
+                         LD->getSrcValue(), LD->getSrcValueOffset(),
+                         LD->isVolatile());
      return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
                         LD->getSrcValue(), LD->getSrcValueOffset(),
-                       LD->isVolatile());
-  return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
-                     LD->getSrcValue(), LD->getSrcValueOffset(),
-                     LD->isVolatile(), LD->getAlignment());
-}
-
-/// PerformBuildVectorCombine - build_vector 0,(load i64 / f64) -> movq / movsd.
-static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
-                                         TargetLowering::DAGCombinerInfo &DCI,
-                                         const X86Subtarget *Subtarget,
-                                         const TargetLowering &TLI) {
-  unsigned NumOps = N->getNumOperands();
-  DebugLoc dl = N->getDebugLoc();
-
-  // Ignore single operand BUILD_VECTOR.
-  if (NumOps == 1)
-    return SDValue();
-
-  MVT VT = N->getValueType(0);
-  MVT EVT = VT.getVectorElementType();
-  if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit())
-    // We are looking for load i64 and zero extend. We want to transform
-    // it before legalizer has a chance to expand it. Also look for i64
-    // BUILD_PAIR bit casted to f64.
-    return SDValue();
-  // This must be an insertion into a zero vector.
-  SDValue HighElt = N->getOperand(1);
-  if (!isZeroNode(HighElt))
-    return SDValue();
-
-  // Value must be a load.
-  SDNode *Base = N->getOperand(0).getNode();
-  if (!isa<LoadSDNode>(Base)) {
-    if (Base->getOpcode() != ISD::BIT_CONVERT)
-      return SDValue();
-    Base = Base->getOperand(0).getNode();
-    if (!isa<LoadSDNode>(Base))
-      return SDValue();
+                       LD->isVolatile(), LD->getAlignment());
+  } else if (NumElems == 4 && LastLoadedElt == 1) {
+    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+    SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
+    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
    }
-
-  // Transform it into VZEXT_LOAD addr.
-  LoadSDNode *LD = cast<LoadSDNode>(Base);
-
-  // Load must not be an extload.
-  if (LD->getExtensionType() != ISD::NON_EXTLOAD)
-    return SDValue();
-
-  // Load type should legal type so we don't have to legalize it.
-  if (!TLI.isTypeLegal(VT))
-    return SDValue();
-
-  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
-  SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
-  SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
-  TargetLowering::TargetLoweringOpt TLO(DAG);
-  TLO.CombineTo(SDValue(Base, 1), ResNode.getValue(1));
-  DCI.CommitTargetLoweringOpt(TLO);
-  return ResNode;
+  return SDValue();
  }
  
  /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
@@ -8245,7 +8395,10 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
    if (VT.getSizeInBits() != 64)
      return SDValue();
  
-  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloat && Subtarget->hasSSE2();
+  const Function *F = DAG.getMachineFunction().getFunction();
+  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
+  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 
+    && Subtarget->hasSSE2();
    if ((VT.isVector() ||
         (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
        isa<LoadSDNode>(St->getValue()) &&
@@ -8387,14 +8540,77 @@ static SDValue PerformBTCombine(SDNode *N,
    return SDValue();
  }
  
+static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op = N->getOperand(0);  // N is the X86ISD::VZEXT_MOVL being combined
+  if (Op.getOpcode() == ISD::BIT_CONVERT)
+    Op = Op.getOperand(0);  // look through a single bitcast
+  MVT VT = N->getValueType(0), OpVT = Op.getValueType();
+  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
+      VT.getVectorElementType().getSizeInBits() == 
+      OpVT.getVectorElementType().getSizeInBits()) {  // equal element widths
+    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
+  }
+  return SDValue();  // operand is not a VZEXT_LOAD, or element widths differ
+}
+
+// On X86 and X86-64, atomic operations are lowered to locked instructions.
+// Locked instructions, in turn, have implicit fence semantics (all memory
+// operations are flushed before issuing the locked instruction, and they
+// are not buffered), so we can fold away the common pattern of 
+// fence-atomic-fence.
+static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
+  SDValue atomic = N->getOperand(0);  // node feeding operand 0 of barrier N
+  switch (atomic.getOpcode()) {
+    case ISD::ATOMIC_CMP_SWAP:
+    case ISD::ATOMIC_SWAP:
+    case ISD::ATOMIC_LOAD_ADD:
+    case ISD::ATOMIC_LOAD_SUB:
+    case ISD::ATOMIC_LOAD_AND:
+    case ISD::ATOMIC_LOAD_OR:
+    case ISD::ATOMIC_LOAD_XOR:
+    case ISD::ATOMIC_LOAD_NAND:
+    case ISD::ATOMIC_LOAD_MIN:
+    case ISD::ATOMIC_LOAD_MAX:
+    case ISD::ATOMIC_LOAD_UMIN:
+    case ISD::ATOMIC_LOAD_UMAX:
+      break;  // one of the handled atomic opcodes: keep checking
+    default:
+      return SDValue();  // any other opcode: no fold
+  }
+  
+  SDValue fence = atomic.getOperand(0);  // operand 0 of the atomic
+  if (fence.getOpcode() != ISD::MEMBARRIER)
+    return SDValue();  // no barrier feeding the atomic: no fold
+  
+  switch (atomic.getOpcode()) {
+    case ISD::ATOMIC_CMP_SWAP:
+      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
+                                    atomic.getOperand(1), atomic.getOperand(2),
+                                    atomic.getOperand(3));  // 4-operand form
+    case ISD::ATOMIC_SWAP:
+    case ISD::ATOMIC_LOAD_ADD:
+    case ISD::ATOMIC_LOAD_SUB:
+    case ISD::ATOMIC_LOAD_AND:
+    case ISD::ATOMIC_LOAD_OR:
+    case ISD::ATOMIC_LOAD_XOR:
+    case ISD::ATOMIC_LOAD_NAND:
+    case ISD::ATOMIC_LOAD_MIN:
+    case ISD::ATOMIC_LOAD_MAX:
+    case ISD::ATOMIC_LOAD_UMIN:
+    case ISD::ATOMIC_LOAD_UMAX:
+      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
+                                    atomic.getOperand(1), atomic.getOperand(2));  // 3-operand form
+    default:
+      return SDValue();  // not reached: the first switch filters the same opcodes
+  }
+}
+
  SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
    SelectionDAG &DAG = DCI.DAG;
    switch (N->getOpcode()) {
    default: break;
    case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
-  case ISD::BUILD_VECTOR:
-    return PerformBuildVectorCombine(N, DAG, DCI, Subtarget, *this);
    case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
    case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
    case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
@@ -8406,6 +8622,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
    case X86ISD::FOR:         return PerformFORCombine(N, DAG);
    case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
    case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
+  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
+  case ISD::MEMBARRIER:     return PerformMEMBARRIERCombine(N, DAG);
    }
  
    return SDValue();
@@ -8487,6 +8705,14 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
        }
      }
      return;
+  case 'K':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+        break;
+      }
+    }
+    return;
    case 'N':
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
        if (C->getZExtValue() <= 255) {
@@ -8553,10 +8779,15 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
            continue;
          }
        }
-      
+
        // Otherwise, this isn't something we can handle, reject it.
        return;
      }
+    // If we require an extra load to get this address, as in PIC mode, we
+    // can't accept it.
+    if (Subtarget->GVRequiresExtraLoad(GA->getGlobal(),
+                                       getTargetMachine(), false))
+      return;
  
      if (hasMemory)
        Op = LowerGlobalAddress(GA->getGlobal(), Op.getDebugLoc(), Offset, DAG);