Revert r185595-185596 which broke buildbots.

[oota-llvm.git] / lib / Target / ARM / ARMISelLowering.cpp
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp

index b2eb5784879370213fb9bccd3699ef79f2fc2fe5..9b68590f856d07ac6b8c2c24afb57b4fce933466 100644 (file)
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -23,14 +23,8 @@
  #include "ARMTargetMachine.h"
  #include "ARMTargetObjectFile.h"
  #include "MCTargetDesc/ARMAddressingModes.h"
-#include "llvm/CallingConv.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/GlobalValue.h"
-#include "llvm/Instruction.h"
-#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/Type.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
  #include "llvm/CodeGen/CallingConvLower.h"
  #include "llvm/CodeGen/IntrinsicLowering.h"
  #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -40,14 +34,20 @@
  #include "llvm/CodeGen/MachineModuleInfo.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
  #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
  #include "llvm/MC/MCSectionMachO.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/Statistic.h"
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/ErrorHandling.h"
  #include "llvm/Support/MathExtras.h"
  #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
  using namespace llvm;
  
  STATISTIC(NumTailCalls, "Number of tail calls");
@@ -504,6 +504,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
      setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
      setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
+    setOperationAction(ISD::FMA, MVT::v2f64, Expand);
  
      setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
      setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
@@ -515,8 +516,29 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
      setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
      setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
+    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
+    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
      setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
  
+    // Mark v2f32 intrinsics.
+    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
+    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
+    setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
+    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
+    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
+    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
+    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
+    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
+
      // Neon does not support some operations on v1i64 and v2i64 types.
      setOperationAction(ISD::MUL, MVT::v1i64, Expand);
      // Custom handling for some quad-vector types to detect VMULL.
@@ -539,6 +561,33 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
  
+    setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
+    setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
+
+    // Custom expand long extensions to vectors.
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32,  Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32,  Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64,  Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64,  Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
+
+    // NEON does not have single instruction CTPOP for vectors with element
+    // types wider than 8-bits.  However, custom lowering can leverage the
+    // v8i8/v16i8 vcnt instruction.
+    setOperationAction(ISD::CTPOP,      MVT::v2i32, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
+
+    // NEON only has FMA instructions as of VFP4.
+    if (!Subtarget->hasVFP4()) {
+      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
+      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
+    }
+
      setTargetDAGCombine(ISD::INTRINSIC_VOID);
      setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
      setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
@@ -632,6 +681,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
    setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i32  , Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF  , MVT::i32  , Expand);
  
+  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
+
    // Only ARMv6 has BSWAP.
    if (!Subtarget->hasV6Ops())
      setOperationAction(ISD::BSWAP, MVT::i32, Expand);
@@ -680,7 +731,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
        (Subtarget->hasV6Ops() && !Subtarget->isThumb())) {
      // membarrier needs custom lowering; the rest are legal and handled
      // normally.
-    setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
      setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
      // Custom lowering for 64-bit ops
      setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i64, Custom);
@@ -688,13 +738,16 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i64, Custom);
      setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i64, Custom);
      setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i64, Custom);
-    setOperationAction(ISD::ATOMIC_SWAP,  MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_MIN,  MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_MAX,  MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
      setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
      // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
      setInsertFencesForAtomic(true);
    } else {
      // Set them all for expansion, which will force libcalls.
-    setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
      setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other, Expand);
      setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
      setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
@@ -712,8 +765,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      // Unordered/Monotonic case.
      setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
      setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
-    // Since the libcalls include locking, fold in the fences
-    setShouldFoldAtomicFences(true);
    }
  
    setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
@@ -762,6 +813,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
    setOperationAction(ISD::FSIN,      MVT::f32, Expand);
    setOperationAction(ISD::FCOS,      MVT::f32, Expand);
    setOperationAction(ISD::FCOS,      MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS,   MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
    setOperationAction(ISD::FREM,      MVT::f64, Expand);
    setOperationAction(ISD::FREM,      MVT::f32, Expand);
    if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
@@ -814,18 +867,19 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
      setSchedulingPreference(Sched::Hybrid);
  
    //// temporary - rewrite interface to use type
-  maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 1;
-  maxStoresPerMemset = 16;
-  maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+  MaxStoresPerMemset = 8;
+  MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+  MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
+  MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
+  MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
+  MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
  
    // On ARM arguments smaller than 4 bytes are extended, so all arguments
    // are at least 4 bytes aligned.
    setMinStackArgumentAlignment(4);
  
-  benefitFromCodePlacementOpt = true;
-
    // Prefer likely predicted branches to selects on out-of-order cores.
-  predictableSelectIsExpensive = Subtarget->isLikeA9();
+  PredictableSelectIsExpensive = Subtarget->isLikeA9();
  
    setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
  }
@@ -841,10 +895,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
  // due to the common occurrence of cross class copies and subregister insertions
  // and extractions.
  std::pair<const TargetRegisterClass*, uint8_t>
-ARMTargetLowering::findRepresentativeClass(EVT VT) const{
+ARMTargetLowering::findRepresentativeClass(MVT VT) const{
    const TargetRegisterClass *RRC = 0;
    uint8_t Cost = 1;
-  switch (VT.getSimpleVT().SimpleTy) {
+  switch (VT.SimpleTy) {
    default:
      return TargetLowering::findRepresentativeClass(VT);
    // Use DPR as representative register class for all floating point
@@ -1017,14 +1071,14 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
    }
  }
  
-EVT ARMTargetLowering::getSetCCResultType(EVT VT) const {
+EVT ARMTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
    if (!VT.isVector()) return getPointerTy();
    return VT.changeVectorElementTypeToInteger();
  }
  
  /// getRegClassFor - Return the register class that should be used for the
  /// specified value type.
-const TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const {
+const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
    // Map v4i64 to QQ registers but do not make the type legal. Similarly map
    // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
    // load / store 4 to 8 consecutive D registers.
@@ -1181,8 +1235,9 @@ SDValue
  ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                     CallingConv::ID CallConv, bool isVarArg,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
-                                   DebugLoc dl, SelectionDAG &DAG,
-                                   SmallVectorImpl<SDValue> &InVals) const {
+                                   SDLoc dl, SelectionDAG &DAG,
+                                   SmallVectorImpl<SDValue> &InVals,
+                                   bool isThisReturn, SDValue ThisVal) const {
  
    // Assign locations to each value returned by this call.
    SmallVector<CCValAssign, 16> RVLocs;
@@ -1196,6 +1251,15 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
    for (unsigned i = 0; i != RVLocs.size(); ++i) {
      CCValAssign VA = RVLocs[i];
  
+    // Pass 'this' value directly from the argument to return value, to avoid
+    // reg unit interference
+    if (i == 0 && isThisReturn) {
+      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
+             "unexpected return calling convention register assignment");
+      InVals.push_back(ThisVal);
+      continue;
+    }
+
      SDValue Val;
      if (VA.needsCustom()) {
        // Handle f64 or half of a v2f64.
@@ -1252,7 +1316,7 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
  SDValue
  ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
                                      SDValue StackPtr, SDValue Arg,
-                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SDLoc dl, SelectionDAG &DAG,
                                      const CCValAssign &VA,
                                      ISD::ArgFlagsTy Flags) const {
    unsigned LocMemOffset = VA.getLocMemOffset();
@@ -1263,7 +1327,7 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
                        false, false, 0);
  }
  
-void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
+void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
                                           SDValue Chain, SDValue &Arg,
                                           RegsToPassVector &RegsToPass,
                                           CCValAssign &VA, CCValAssign &NextVA,
@@ -1295,7 +1359,7 @@ SDValue
  ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                               SmallVectorImpl<SDValue> &InVals) const {
    SelectionDAG &DAG                     = CLI.DAG;
-  DebugLoc &dl                          = CLI.DL;
+  SDLoc &dl                          = CLI.DL;
    SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
    SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
    SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
@@ -1307,21 +1371,22 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    bool isVarArg                         = CLI.IsVarArg;
  
    MachineFunction &MF = DAG.getMachineFunction();
-  bool IsStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
-  bool IsSibCall = false;
+  bool isStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+  bool isThisReturn   = false;
+  bool isSibCall      = false;
    // Disable tail calls if they're not supported.
    if (!EnableARMTailCalls && !Subtarget->supportsTailCall())
      isTailCall = false;
    if (isTailCall) {
      // Check if it's really possible to do a tail call.
      isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
-                    isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
+                    isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(),
                                                     Outs, OutVals, Ins, DAG);
      // We don't support GuaranteedTailCallOpt for ARM, only automatically
      // detected sibcalls.
      if (isTailCall) {
        ++NumTailCalls;
-      IsSibCall = true;
+      isSibCall = true;
      }
    }
  
@@ -1337,13 +1402,14 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    unsigned NumBytes = CCInfo.getNextStackOffset();
  
    // For tail calls, memory operands are available in our caller's stack.
-  if (IsSibCall)
+  if (isSibCall)
      NumBytes = 0;
  
    // Adjust the stack pointer for the new arguments...
    // These operations are automatically eliminated by the prolog/epilog pass
-  if (!IsSibCall)
-    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+  if (!isSibCall)
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                                 dl);
  
    SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
  
@@ -1404,6 +1470,13 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                           StackPtr, MemOpChains, Flags);
        }
      } else if (VA.isRegLoc()) {
+      if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) {
+        assert(VA.getLocVT() == MVT::i32 &&
+               "unexpected calling convention register assignment");
+        assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
+               "unexpected use of 'returned'");
+        isThisReturn = true;
+      }
        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      } else if (isByVal) {
        assert(VA.isMemLoc());
@@ -1411,10 +1484,17 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  
        // True if this byval aggregate will be split between registers
        // and memory.
-      if (CCInfo.isFirstByValRegValid()) {
+      unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
+      unsigned CurByValIdx = CCInfo.getInRegsParamsProceed();
+
+      if (CurByValIdx < ByValArgsCount) {
+
+        unsigned RegBegin, RegEnd;
+        CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
+
          EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
          unsigned int i, j;
-        for (i = 0, j = CCInfo.getFirstByValReg(); j < ARM::R4; i++, j++) {
+        for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
            SDValue Const = DAG.getConstant(4*i, MVT::i32);
            SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
            SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
@@ -1423,11 +1503,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
            MemOpChains.push_back(Load.getValue(1));
            RegsToPass.push_back(std::make_pair(j, Load));
          }
-        offset = ARM::R4 - CCInfo.getFirstByValReg();
-        CCInfo.clearFirstByValReg();
+
+        // If parameter size outsides register area, "offset" value
+        // helps us to calculate stack slot for remained part properly.
+        offset = RegEnd - RegBegin;
+
+        CCInfo.nextInRegsParam();
        }
  
-      if (Flags.getByValSize() - 4*offset > 0) {
+      if (Flags.getByValSize() > 4*offset) {
          unsigned LocMemOffset = VA.getLocMemOffset();
          SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset);
          SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
@@ -1443,7 +1527,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
          MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
                                            Ops, array_lengthof(Ops)));
        }
-    } else if (!IsSibCall) {
+    } else if (!isSibCall) {
        assert(VA.isMemLoc());
  
        MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
@@ -1483,7 +1567,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                 RegsToPass[i].second, InFlag);
        InFlag = Chain.getValue(1);
      }
-    InFlag =SDValue();
+    InFlag = SDValue();
    }
  
    // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
@@ -1557,7 +1641,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
        // On ELF targets for PIC code, direct calls should go through the PLT
        unsigned OpFlags = 0;
        if (Subtarget->isTargetELF() &&
-                  getTargetMachine().getRelocationModel() == Reloc::PIC_)
+          getTargetMachine().getRelocationModel() == Reloc::PIC_)
          OpFlags = ARMII::MO_PLT;
        Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
      }
@@ -1594,19 +1678,19 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
  
    // FIXME: handle tail calls differently.
    unsigned CallOpc;
+  bool HasMinSizeAttr = MF.getFunction()->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
    if (Subtarget->isThumb()) {
      if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
        CallOpc = ARMISD::CALL_NOLINK;
-    else if (doesNotRet && isDirect && !isARMFunc &&
-             Subtarget->hasRAS() && !Subtarget->isThumb1Only())
-      // "mov lr, pc; b _foo" to avoid confusing the RSP
-      CallOpc = ARMISD::CALL_NOLINK;
      else
        CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
    } else {
-    if (!isDirect && !Subtarget->hasV5TOps()) {
+    if (!isDirect && !Subtarget->hasV5TOps())
        CallOpc = ARMISD::CALL_NOLINK;
-    } else if (doesNotRet && isDirect && Subtarget->hasRAS())
+    else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
+               // Emit regular call when code size is the priority
+               !HasMinSizeAttr)
        // "mov lr, pc; b _foo" to avoid confusing the RSP
        CallOpc = ARMISD::CALL_NOLINK;
      else
@@ -1624,8 +1708,22 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                    RegsToPass[i].second.getValueType()));
  
    // Add a register mask operand representing the call-preserved registers.
+  const uint32_t *Mask;
    const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
-  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+  const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo*>(TRI);
+  if (isThisReturn) {
+    // For 'this' returns, use the R0-preserving mask if applicable
+    Mask = ARI->getThisReturnPreservedMask(CallConv);
+    if (!Mask) {
+      // Set isThisReturn to false if the calling convention is not one that
+      // allows 'returned' to be modeled in this way, so LowerCallResult does
+      // not try to pass 'this' straight through 
+      isThisReturn = false;
+      Mask = ARI->getCallPreservedMask(CallConv);
+    }
+  } else
+    Mask = ARI->getCallPreservedMask(CallConv);
+
    assert(Mask && "Missing call preserved mask for calling convention");
    Ops.push_back(DAG.getRegisterMask(Mask));
  
@@ -1641,14 +1739,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    InFlag = Chain.getValue(1);
  
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
-                             DAG.getIntPtrConstant(0, true), InFlag);
+                             DAG.getIntPtrConstant(0, true), InFlag, dl);
    if (!Ins.empty())
      InFlag = Chain.getValue(1);
  
    // Handle result values, copying them out of physregs into vregs that we
    // return.
-  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins,
-                         dl, DAG, InVals);
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
+                         InVals, isThisReturn,
+                         isThisReturn ? OutVals[0] : SDValue());
  }
  
  /// HandleByVal - Every parameter *after* a byval parameter is passed
@@ -1662,8 +1761,24 @@ ARMTargetLowering::HandleByVal(
    assert((State->getCallOrPrologue() == Prologue ||
            State->getCallOrPrologue() == Call) &&
           "unhandled ParmContext");
-  if ((!State->isFirstByValRegValid()) &&
-      (ARM::R0 <= reg) && (reg <= ARM::R3)) {
+
+  // For in-prologue parameters handling, we also introduce stack offset
+  // for byval registers: see CallingConvLower.cpp, CCState::HandleByVal.
+  // This behaviour outsides AAPCS rules (5.5 Parameters Passing) of how
+  // NSAA should be evaluted (NSAA means "next stacked argument address").
+  // So: NextStackOffset = NSAAOffset + SizeOfByValParamsStoredInRegs.
+  // Then: NSAAOffset = NextStackOffset - SizeOfByValParamsStoredInRegs.
+  unsigned NSAAOffset = State->getNextStackOffset();
+  if (State->getCallOrPrologue() != Call) {
+    for (unsigned i = 0, e = State->getInRegsParamsCount(); i != e; ++i) {
+      unsigned RB, RE;
+      State->getInRegsParamInfo(i, RB, RE);
+      assert(NSAAOffset >= (RE-RB)*4 &&
+             "Stack offset for byval regs doesn't introduced anymore?");
+      NSAAOffset -= (RE-RB)*4;
+    }
+  }
+  if ((ARM::R0 <= reg) && (reg <= ARM::R3)) {
      if (Subtarget->isAAPCS_ABI() && Align > 4) {
        unsigned AlignInRegs = Align / 4;
        unsigned Waste = (ARM::R4 - reg) % AlignInRegs;
@@ -1671,22 +1786,45 @@ ARMTargetLowering::HandleByVal(
          reg = State->AllocateReg(GPRArgRegs, 4);
      }
      if (reg != 0) {
-      State->setFirstByValReg(reg);
+      unsigned excess = 4 * (ARM::R4 - reg);
+
+      // Special case when NSAA != SP and parameter size greater than size of
+      // all remained GPR regs. In that case we can't split parameter, we must
+      // send it to stack. We also must set NCRN to R4, so waste all
+      // remained registers.
+      if (Subtarget->isAAPCS_ABI() && NSAAOffset != 0 && size > excess) {
+        while (State->AllocateReg(GPRArgRegs, 4))
+          ;
+        return;
+      }
+
+      // First register for byval parameter is the first register that wasn't
+      // allocated before this method call, so it would be "reg".
+      // If parameter is small enough to be saved in range [reg, r4), then
+      // the end (first after last) register would be reg + param-size-in-regs,
+      // else parameter would be splitted between registers and stack,
+      // end register would be r4 in this case.
+      unsigned ByValRegBegin = reg;
+      unsigned ByValRegEnd = (size < excess) ? reg + size/4 : (unsigned)ARM::R4;
+      State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
+      // Note, first register is allocated in the beginning of function already,
+      // allocate remained amount of registers we need.
+      for (unsigned i = reg+1; i != ByValRegEnd; ++i)
+        State->AllocateReg(GPRArgRegs, 4);
        // At a call site, a byval parameter that is split between
        // registers and memory needs its size truncated here.  In a
        // function prologue, such byval parameters are reassembled in
        // memory, and are not truncated.
        if (State->getCallOrPrologue() == Call) {
-        unsigned excess = 4 * (ARM::R4 - reg);
-        assert(size >= excess && "expected larger existing stack allocation");
-        size -= excess;
+        // Make remained size equal to 0 in case, when
+        // the whole structure may be stored into registers.
+        if (size < excess)
+          size = 0;
+        else
+          size -= excess;
        }
      }
    }
-  // Confiscate any remaining parameter registers to preclude their
-  // assignment to subsequent parameters.
-  while (State->AllocateReg(GPRArgRegs, 4))
-    ;
  }
  
  /// MatchingStackOffset - Return true if the given stack call argument is
@@ -1818,7 +1956,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
    // local frame.
    const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction().
                                        getInfo<ARMFunctionInfo>();
-  if (AFI_Caller->getVarArgsRegSaveSize())
+  if (AFI_Caller->getArgRegsSaveSize())
      return false;
  
    // If the callee takes no arguments then go on to check the results of the
@@ -1875,12 +2013,23 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
    return true;
  }
  
+bool
+ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                  MachineFunction &MF, bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true,
+                                                    isVarArg));
+}
+
  SDValue
  ARMTargetLowering::LowerReturn(SDValue Chain,
                                 CallingConv::ID CallConv, bool isVarArg,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
                                 const SmallVectorImpl<SDValue> &OutVals,
-                               DebugLoc dl, SelectionDAG &DAG) const {
+                               SDLoc dl, SelectionDAG &DAG) const {
  
    // CCValAssign - represent the assignment of the return value to a location.
    SmallVector<CCValAssign, 16> RVLocs;
@@ -1893,15 +2042,9 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
    CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
                                                 isVarArg));
  
-  // If this is the first return lowered for this function, add
-  // the regs to the liveout set for the function.
-  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
-    for (unsigned i = 0; i != RVLocs.size(); ++i)
-      if (RVLocs[i].isRegLoc())
-        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
-  }
-
    SDValue Flag;
+  SmallVector<SDValue, 4> RetOps;
+  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  
    // Copy the result values into the output registers.
    for (unsigned i = 0, realRVLocIdx = 0;
@@ -1930,10 +2073,12 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
  
          Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag);
          Flag = Chain.getValue(1);
+        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
          VA = RVLocs[++i]; // skip ahead to next loc
          Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                   HalfGPRs.getValue(1), Flag);
          Flag = Chain.getValue(1);
+        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
          VA = RVLocs[++i]; // skip ahead to next loc
  
          // Extract the 2nd half and fall through to handle it as an f64 value.
@@ -1946,6 +2091,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
                                    DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag);
        Flag = Chain.getValue(1);
+      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc
        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(1),
                                 Flag);
@@ -1955,15 +2101,16 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
      // Guarantee that all emitted copies are
      // stuck together, avoiding something bad.
      Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
    }
  
-  SDValue result;
+  // Update chain and glue.
+  RetOps[0] = Chain;
    if (Flag.getNode())
-    result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
-  else // Return Void
-    result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain);
+    RetOps.push_back(Flag);
  
-  return result;
+  return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other,
+                     RetOps.data(), RetOps.size());
  }
  
  bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
@@ -2010,7 +2157,7 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
      Copy = *Copy->use_begin();
      if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
        return false;
-    Chain = Copy->getOperand(0);
+    TCChain = Copy->getOperand(0);
    } else {
      return false;
    }
@@ -2049,7 +2196,7 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
  static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
    EVT PtrVT = Op.getValueType();
    // FIXME there is no actual debug info here
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
    SDValue Res;
    if (CP->isMachineConstantPoolEntry())
@@ -2070,7 +2217,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = 0;
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
    EVT PtrVT = getPointerTy();
    const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
    Reloc::Model RelocM = getTargetMachine().getRelocationModel();
@@ -2099,7 +2246,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
  SDValue
  ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                                   SelectionDAG &DAG) const {
-  DebugLoc dl = GA->getDebugLoc();
+  SDLoc dl(GA);
    EVT PtrVT = getPointerTy();
    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
    MachineFunction &MF = DAG.getMachineFunction();
@@ -2142,7 +2289,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                                          SelectionDAG &DAG,
                                          TLSModel::Model model) const {
    const GlobalValue *GV = GA->getGlobal();
-  DebugLoc dl = GA->getDebugLoc();
+  SDLoc dl(GA);
    SDValue Offset;
    SDValue Chain = DAG.getEntryNode();
    EVT PtrVT = getPointerTy();
@@ -2212,10 +2359,9 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
                                                   SelectionDAG &DAG) const {
    EVT PtrVT = getPointerTy();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
-  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
-  if (RelocM == Reloc::PIC_) {
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
      bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
      ARMConstantPoolValue *CPV =
        ARMConstantPoolConstant::Create(GV,
@@ -2256,11 +2402,9 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
  SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
                                                      SelectionDAG &DAG) const {
    EVT PtrVT = getPointerTy();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
    Reloc::Model RelocM = getTargetMachine().getRelocationModel();
-  MachineFunction &MF = DAG.getMachineFunction();
-  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  
    // FIXME: Enable this for static codegen when tool issues are fixed.  Also
    // update ARMFastISel::ARMMaterializeGV.
@@ -2288,6 +2432,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
    if (RelocM == Reloc::Static) {
      CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
    } else {
+    ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
      ARMPCLabelIndex = AFI->createPICLabelUId();
      unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8);
      ARMConstantPoolValue *CPV =
@@ -2322,7 +2467,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
    EVT PtrVT = getPointerTy();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
    ARMConstantPoolValue *CPV =
      ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_",
@@ -2338,7 +2483,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
  
  SDValue
  ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    SDValue Val = DAG.getConstant(0, MVT::i32);
    return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
                       DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
@@ -2347,7 +2492,7 @@ ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
  
  SDValue
  ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
                       Op.getOperand(1), DAG.getConstant(0, MVT::i32));
  }
@@ -2356,7 +2501,7 @@ SDValue
  ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                                            const ARMSubtarget *Subtarget) const {
    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    switch (IntNo) {
    default: return SDValue();    // Don't custom lower most intrinsics.
    case Intrinsic::arm_thread_pointer: {
@@ -2368,7 +2513,6 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
      ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
      EVT PtrVT = getPointerTy();
-    DebugLoc dl = Op.getDebugLoc();
      Reloc::Model RelocM = getTargetMachine().getRelocationModel();
      SDValue CPAddr;
      unsigned PCAdj = (RelocM != Reloc::PIC_)
@@ -2393,45 +2537,16 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
    case Intrinsic::arm_neon_vmullu: {
      unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
        ? ARMISD::VMULLs : ARMISD::VMULLu;
-    return DAG.getNode(NewOpc, Op.getDebugLoc(), Op.getValueType(),
+    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2));
    }
    }
  }
  
-static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG,
-                               const ARMSubtarget *Subtarget) {
-  DebugLoc dl = Op.getDebugLoc();
-  if (!Subtarget->hasDataBarrier()) {
-    // Some ARMv6 cpus can support data barriers with an mcr instruction.
-    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
-    // here.
-    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
-           "Unexpected ISD::MEMBARRIER encountered. Should be libcall!");
-    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
-                       DAG.getConstant(0, MVT::i32));
-  }
-
-  SDValue Op5 = Op.getOperand(5);
-  bool isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue() != 0;
-  unsigned isLL = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-  unsigned isLS = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
-  bool isOnlyStoreBarrier = (isLL == 0 && isLS == 0);
-
-  ARM_MB::MemBOpt DMBOpt;
-  if (isDeviceBarrier)
-    DMBOpt = isOnlyStoreBarrier ? ARM_MB::ST : ARM_MB::SY;
-  else
-    DMBOpt = isOnlyStoreBarrier ? ARM_MB::ISHST : ARM_MB::ISH;
-  return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
-                     DAG.getConstant(DMBOpt, MVT::i32));
-}
-
-
  static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
                                   const ARMSubtarget *Subtarget) {
    // FIXME: handle "fence singlethread" more efficiently.
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    if (!Subtarget->hasDataBarrier()) {
      // Some ARMv6 cpus can support data barriers with an mcr instruction.
      // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
@@ -2442,8 +2557,18 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
                         DAG.getConstant(0, MVT::i32));
    }
  
+  ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
+  AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
+  unsigned Domain = ARM_MB::ISH;
+  if (Subtarget->isSwift() && Ord == Release) {
+    // Swift happens to implement ISHST barriers in a way that's compatible with
+    // Release semantics but weaker than ISH so we'd be fools not to use
+    // it. Beware: other processors probably don't!
+    Domain = ARM_MB::ISHST;
+  }
+
    return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
-                     DAG.getConstant(ARM_MB::ISH, MVT::i32));
+                     DAG.getConstant(Domain, MVT::i32));
  }
  
  static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
@@ -2454,7 +2579,7 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
      // Just preserve the chain.
      return Op.getOperand(0);
  
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
    if (!isRead &&
        (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
@@ -2479,7 +2604,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
  
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
    const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
@@ -2490,7 +2615,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
  SDValue
  ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
                                          SDValue &Root, SelectionDAG &DAG,
-                                        DebugLoc dl) const {
+                                        SDLoc dl) const {
    MachineFunction &MF = DAG.getMachineFunction();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  
@@ -2524,12 +2649,17 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
  
  void
  ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
-                                  unsigned &VARegSize, unsigned &VARegSaveSize)
+                                  unsigned InRegsParamRecordIdx,
+                                  unsigned ArgSize,
+                                  unsigned &ArgRegsSize,
+                                  unsigned &ArgRegsSaveSize)
    const {
    unsigned NumGPRs;
-  if (CCInfo.isFirstByValRegValid())
-    NumGPRs = ARM::R4 - CCInfo.getFirstByValReg();
-  else {
+  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
+    unsigned RBegin, REnd;
+    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
+    NumGPRs = REnd - RBegin;
+  } else {
      unsigned int firstUnalloced;
      firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs,
                                                  sizeof(GPRArgRegs) /
@@ -2538,50 +2668,103 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
    }
  
    unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
-  VARegSize = NumGPRs * 4;
-  VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1);
+  ArgRegsSize = NumGPRs * 4;
+
+  // If parameter is split between stack and GPRs...
+  if (NumGPRs && Align == 8 &&
+      (ArgRegsSize < ArgSize ||
+        InRegsParamRecordIdx >= CCInfo.getInRegsParamsCount())) {
+    // Add padding for part of param recovered from GPRs, so
+    // its last byte must be at address K*8 - 1.
+    // We need to do it, since remained (stack) part of parameter has
+    // stack alignment, and we need to "attach" "GPRs head" without gaps
+    // to it:
+    // Stack:
+    // |---- 8 bytes block ----| |---- 8 bytes block ----| |---- 8 bytes...
+    // [ [padding] [GPRs head] ] [        Tail passed via stack       ....
+    //
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    unsigned Padding =
+        ((ArgRegsSize + AFI->getArgRegsSaveSize() + Align - 1) & ~(Align-1)) -
+        (ArgRegsSize + AFI->getArgRegsSaveSize());
+    ArgRegsSaveSize = ArgRegsSize + Padding;
+  } else
+    // We don't need to extend regs save size for byval parameters if they
+    // are passed via GPRs only.
+    ArgRegsSaveSize = ArgRegsSize;
  }
  
  // The remaining GPRs hold either the beginning of variable-argument
-// data, or the beginning of an aggregate passed by value (usuall
+// data, or the beginning of an aggregate passed by value (usually
  // byval).  Either way, we allocate stack slots adjacent to the data
  // provided by our caller, and store the unallocated registers there.
  // If this is a variadic function, the va_list pointer will begin with
  // these values; otherwise, this reassembles a (byval) structure that
  // was split between registers and memory.
-void
-ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
-                                        DebugLoc dl, SDValue &Chain,
-                                        const Value *OrigArg,
-                                        unsigned OffsetFromOrigArg,
-                                        unsigned ArgOffset) const {
+// Return: The frame index registers were stored into.
+int
+ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
+                                  SDLoc dl, SDValue &Chain,
+                                  const Value *OrigArg,
+                                  unsigned InRegsParamRecordIdx,
+                                  unsigned OffsetFromOrigArg,
+                                  unsigned ArgOffset,
+                                  unsigned ArgSize,
+                                  bool ForceMutable) const {
+
+  // Currently, two use-cases possible:
+  // Case #1. Non var-args function, and we meet first byval parameter.
+  //          Setup first unallocated register as first byval register;
+  //          eat all remained registers
+  //          (these two actions are performed by HandleByVal method).
+  //          Then, here, we initialize stack frame with
+  //          "store-reg" instructions.
+  // Case #2. Var-args function, that doesn't contain byval parameters.
+  //          The same: eat all remained unallocated registers,
+  //          initialize stack frame.
+
    MachineFunction &MF = DAG.getMachineFunction();
    MachineFrameInfo *MFI = MF.getFrameInfo();
    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
-  unsigned firstRegToSaveIndex;
-  if (CCInfo.isFirstByValRegValid())
-    firstRegToSaveIndex = CCInfo.getFirstByValReg() - ARM::R0;
-  else {
+  unsigned firstRegToSaveIndex, lastRegToSaveIndex;
+  unsigned RBegin, REnd;
+  if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
+    CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
+    firstRegToSaveIndex = RBegin - ARM::R0;
+    lastRegToSaveIndex = REnd - ARM::R0;
+  } else {
      firstRegToSaveIndex = CCInfo.getFirstUnallocated
        (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0]));
+    lastRegToSaveIndex = 4;
    }
  
-  unsigned VARegSize, VARegSaveSize;
-  computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize);
-  if (VARegSaveSize) {
-    // If this function is vararg, store any remaining integer argument regs
-    // to their spots on the stack so that they may be loaded by deferencing
-    // the result of va_next.
-    AFI->setVarArgsRegSaveSize(VARegSaveSize);
-    AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(VARegSaveSize,
-                                                     ArgOffset + VARegSaveSize
-                                                     - VARegSize,
-                                                     false));
-    SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(),
-                                    getPointerTy());
+  unsigned ArgRegsSize, ArgRegsSaveSize;
+  computeRegArea(CCInfo, MF, InRegsParamRecordIdx, ArgSize,
+                 ArgRegsSize, ArgRegsSaveSize);
+
+  // Store any by-val regs to their spots on the stack so that they may be
+  // loaded by deferencing the result of formal parameter pointer or va_next.
+  // Note: once stack area for byval/varargs registers
+  // was initialized, it can't be initialized again.
+  if (ArgRegsSaveSize) {
+
+    unsigned Padding = ArgRegsSaveSize - ArgRegsSize;
+
+    if (Padding) {
+      assert(AFI->getStoredByValParamsPadding() == 0 &&
+             "The only parameter may be padded.");
+      AFI->setStoredByValParamsPadding(Padding);
+    }
+
+    int FrameIndex = MFI->CreateFixedObject(
+                      ArgRegsSaveSize,
+                      Padding + ArgOffset,
+                      false);
+    SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy());
  
      SmallVector<SDValue, 4> MemOps;
-    for (unsigned i = 0; firstRegToSaveIndex < 4; ++firstRegToSaveIndex, ++i) {
+    for (unsigned i = 0; firstRegToSaveIndex < lastRegToSaveIndex;
+         ++firstRegToSaveIndex, ++i) {
        const TargetRegisterClass *RC;
        if (AFI->isThumb1OnlyFunction())
          RC = &ARM::tGPRRegClass;
@@ -2598,12 +2781,38 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
        FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
                          DAG.getConstant(4, getPointerTy()));
      }
+
+    AFI->setArgRegsSaveSize(ArgRegsSaveSize + AFI->getArgRegsSaveSize());
+
      if (!MemOps.empty())
        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                            &MemOps[0], MemOps.size());
+    return FrameIndex;
    } else
      // This will point to the next argument passed via stack.
-    AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true));
+    return MFI->CreateFixedObject(
+        4, AFI->getStoredByValParamsPadding() + ArgOffset, !ForceMutable);
+}
+
+// Setup stack frame, the va_list pointer will start from.
+void
+ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
+                                        SDLoc dl, SDValue &Chain,
+                                        unsigned ArgOffset,
+                                        bool ForceMutable) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Try to store any remaining integer argument regs
+  // to their spots on the stack so that they may be loaded by deferencing
+  // the result of va_next.
+  // If there is no regs to be stored, just point address after last
+  // argument passed via stack.
+  int FrameIndex =
+    StoreByValRegs(CCInfo, DAG, dl, Chain, 0, CCInfo.getInRegsParamsCount(),
+                   0, ArgOffset, 0, ForceMutable);
+
+  AFI->setVarArgsFrameIndex(FrameIndex);
  }
  
  SDValue
@@ -2611,7 +2820,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
                                          CallingConv::ID CallConv, bool isVarArg,
                                          const SmallVectorImpl<ISD::InputArg>
                                            &Ins,
-                                        DebugLoc dl, SelectionDAG &DAG,
+                                        SDLoc dl, SelectionDAG &DAG,
                                          SmallVectorImpl<SDValue> &InVals)
                                            const {
    MachineFunction &MF = DAG.getMachineFunction();
@@ -2626,12 +2835,18 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
    CCInfo.AnalyzeFormalArguments(Ins,
                                  CCAssignFnForNode(CallConv, /* Return*/ false,
                                                    isVarArg));
-  
+
    SmallVector<SDValue, 16> ArgValues;
    int lastInsIndex = -1;
    SDValue ArgValue;
    Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
    unsigned CurArgIdx = 0;
+
+  // Initially ArgRegsSaveSize is zero.
+  // Then we increase this value each time we meet byval parameter.
+  // We also increase this value in case of varargs function.
+  AFI->setArgRegsSaveSize(0);
+
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      CCValAssign &VA = ArgLocs[i];
      std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx);
@@ -2729,18 +2944,21 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
            // Since they could be overwritten by lowering of arguments in case of
            // a tail call.
            if (Flags.isByVal()) {
-            unsigned VARegSize, VARegSaveSize;
-            computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize);
-            VarArgStyleRegisters(CCInfo, DAG,
-                                 dl, Chain, CurOrigArg, Ins[VA.getValNo()].PartOffset, 0);
-            unsigned Bytes = Flags.getByValSize() - VARegSize;
-            if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
-            int FI = MFI->CreateFixedObject(Bytes,
-                                            VA.getLocMemOffset(), false);
-            InVals.push_back(DAG.getFrameIndex(FI, getPointerTy()));
+            unsigned CurByValIndex = CCInfo.getInRegsParamsProceed();
+            int FrameIndex = StoreByValRegs(
+                CCInfo, DAG, dl, Chain, CurOrigArg,
+                CurByValIndex,
+                Ins[VA.getValNo()].PartOffset,
+                VA.getLocMemOffset(),
+                Flags.getByValSize(),
+                true /*force mutable frames*/);
+            InVals.push_back(DAG.getFrameIndex(FrameIndex, getPointerTy()));
+            CCInfo.nextInRegsParam();
            } else {
+            unsigned FIOffset = VA.getLocMemOffset() +
+                                AFI->getStoredByValParamsPadding();
              int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
-                                            VA.getLocMemOffset(), true);
+                                            FIOffset, true);
  
              // Create load nodes to retrieve arguments from the stack.
              SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
@@ -2755,7 +2973,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
  
    // varargs
    if (isVarArg)
-    VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0, 0,
+    VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
                           CCInfo.getNextStackOffset());
  
    return Chain;
@@ -2782,7 +3000,7 @@ static bool isFloatingPointZero(SDValue Op) {
  SDValue
  ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                               SDValue &ARMcc, SelectionDAG &DAG,
-                             DebugLoc dl) const {
+                             SDLoc dl) const {
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
      unsigned C = RHSC->getZExtValue();
      if (!isLegalICmpImmediate(C)) {
@@ -2840,7 +3058,7 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
  /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
  SDValue
  ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
-                             DebugLoc dl) const {
+                             SDLoc dl) const {
    SDValue Cmp;
    if (!isFloatingPointZero(RHS))
      Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
@@ -2854,7 +3072,7 @@ ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
  SDValue
  ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
    unsigned Opc = Cmp.getOpcode();
-  DebugLoc DL = Cmp.getDebugLoc();
+  SDLoc DL(Cmp);
    if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
      return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
  
@@ -2874,7 +3092,7 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
    SDValue Cond = Op.getOperand(0);
    SDValue SelectTrue = Op.getOperand(1);
    SDValue SelectFalse = Op.getOperand(2);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
  
    // Convert:
    //
@@ -2929,7 +3147,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
    SDValue TrueVal = Op.getOperand(2);
    SDValue FalseVal = Op.getOperand(3);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
  
    if (LHS.getValueType() == MVT::i32) {
      SDValue ARMcc;
@@ -2984,7 +3202,7 @@ static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
      return DAG.getConstant(0, MVT::i32);
  
    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
-    return DAG.getLoad(MVT::i32, Op.getDebugLoc(),
+    return DAG.getLoad(MVT::i32, SDLoc(Op),
                         Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
                         Ld->isVolatile(), Ld->isNonTemporal(),
                         Ld->isInvariant(), Ld->getAlignment());
@@ -3002,7 +3220,7 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
  
    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
      SDValue Ptr = Ld->getBasePtr();
-    RetVal1 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
+    RetVal1 = DAG.getLoad(MVT::i32, SDLoc(Op),
                            Ld->getChain(), Ptr,
                            Ld->getPointerInfo(),
                            Ld->isVolatile(), Ld->isNonTemporal(),
@@ -3010,9 +3228,9 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
  
      EVT PtrType = Ptr.getValueType();
      unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
-    SDValue NewPtr = DAG.getNode(ISD::ADD, Op.getDebugLoc(),
+    SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(Op),
                                   PtrType, Ptr, DAG.getConstant(4, PtrType));
-    RetVal2 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
+    RetVal2 = DAG.getLoad(MVT::i32, SDLoc(Op),
                            Ld->getChain(), NewPtr,
                            Ld->getPointerInfo().getWithOffset(4),
                            Ld->isVolatile(), Ld->isNonTemporal(),
@@ -3032,7 +3250,7 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
    SDValue LHS = Op.getOperand(2);
    SDValue RHS = Op.getOperand(3);
    SDValue Dest = Op.getOperand(4);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
  
    bool LHSSeenZero = false;
    bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
@@ -3082,7 +3300,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
    SDValue LHS = Op.getOperand(2);
    SDValue RHS = Op.getOperand(3);
    SDValue Dest = Op.getOperand(4);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
  
    if (LHS.getValueType() == MVT::i32) {
      SDValue ARMcc;
@@ -3123,7 +3341,7 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
    SDValue Chain = Op.getOperand(0);
    SDValue Table = Op.getOperand(1);
    SDValue Index = Op.getOperand(2);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
  
    EVT PTy = getPointerTy();
    JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
@@ -3159,7 +3377,7 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
  
  static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
    EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
  
    if (Op.getValueType().getVectorElementType() == MVT::i32) {
      if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
@@ -3181,7 +3399,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
    if (VT.isVector())
      return LowerVectorFP_TO_INT(Op, DAG);
  
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    unsigned Opc;
  
    switch (Op.getOpcode()) {
@@ -3199,7 +3417,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
  
  static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
    EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
  
    if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
      if (VT.getVectorElementType() == MVT::f32)
@@ -3235,7 +3453,7 @@ static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
    if (VT.isVector())
      return LowerVectorINT_TO_FP(Op, DAG);
  
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    unsigned Opc;
  
    switch (Op.getOpcode()) {
@@ -3256,7 +3474,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
    // Implement fcopysign with a fabs and a conditional fneg.
    SDValue Tmp0 = Op.getOperand(0);
    SDValue Tmp1 = Op.getOperand(1);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    EVT VT = Op.getValueType();
    EVT SrcVT = Tmp1.getValueType();
    bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
@@ -3340,7 +3558,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
    MFI->setReturnAddressIsTaken(true);
  
    EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    if (Depth) {
      SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
@@ -3360,7 +3578,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
    MFI->setFrameAddressIsTaken(true);
  
    EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
+  SDLoc dl(Op);  // FIXME probably not meaningful
    unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin())
      ? ARM::R7 : ARM::R11;
@@ -3372,6 +3590,47 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
    return FrameAddr;
  }
  
+/// Custom Expand long vector extensions, where size(DestVec) > 2*size(SrcVec),
+/// and size(DestVec) > 128-bits.
+/// This is achieved by doing the one extension from the SrcVec, splitting the
+/// result, extending these parts, and then concatenating these into the
+/// destination.
+static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op = N->getOperand(0);
+  EVT SrcVT = Op.getValueType();
+  EVT DestVT = N->getValueType(0);
+
+  assert(DestVT.getSizeInBits() > 128 &&
+         "Custom sext/zext expansion needs >128-bit vector.");
+  // If this is a normal length extension, use the default expansion.
+  if (SrcVT.getSizeInBits()*4 != DestVT.getSizeInBits() &&
+      SrcVT.getSizeInBits()*8 != DestVT.getSizeInBits())
+    return SDValue();
+
+  SDLoc dl(N);
+  unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
+  unsigned DestEltSize = DestVT.getVectorElementType().getSizeInBits();
+  unsigned NumElts = SrcVT.getVectorNumElements();
+  LLVMContext &Ctx = *DAG.getContext();
+  SDValue Mid, SplitLo, SplitHi, ExtLo, ExtHi;
+
+  EVT MidVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
+                               NumElts);
+  EVT SplitVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
+                                 NumElts/2);
+  EVT ExtVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, DestEltSize),
+                               NumElts/2);
+
+  Mid = DAG.getNode(N->getOpcode(), dl, MidVT, Op);
+  SplitLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
+                        DAG.getIntPtrConstant(0));
+  SplitHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
+                        DAG.getIntPtrConstant(NumElts/2));
+  ExtLo = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitLo);
+  ExtHi = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitHi);
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, ExtLo, ExtHi);
+}
+
  /// ExpandBITCAST - If the target supports VFP, this function is called to
  /// expand a bit convert where either the source or destination type is i64 to
  /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
@@ -3379,7 +3638,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  /// vectors), since the legalizer won't know what to do with that.
  static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
    SDValue Op = N->getOperand(0);
  
    // This function is only supposed to be called for i64 types, either as the
@@ -3416,7 +3675,7 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
  /// not support i64 elements, so sometimes the zero vectors will need to be
  /// explicitly constructed.  Regardless, use a canonical VMOV to create the
  /// zero vector.
-static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
+static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) {
    assert(VT.isVector() && "Expected a vector type");
    // The canonical modified immediate encoding of a zero vector is....0!
    SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32);
@@ -3432,7 +3691,7 @@ SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
    assert(Op.getNumOperands() == 3 && "Not a double-shift!");
    EVT VT = Op.getValueType();
    unsigned VTBits = VT.getSizeInBits();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    SDValue ShOpLo = Op.getOperand(0);
    SDValue ShOpHi = Op.getOperand(1);
    SDValue ShAmt  = Op.getOperand(2);
@@ -3468,7 +3727,7 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
    assert(Op.getNumOperands() == 3 && "Not a double-shift!");
    EVT VT = Op.getValueType();
    unsigned VTBits = VT.getSizeInBits();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    SDValue ShOpLo = Op.getOperand(0);
    SDValue ShOpHi = Op.getOperand(1);
    SDValue ShAmt  = Op.getOperand(2);
@@ -3501,7 +3760,7 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
    // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
    // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
    // so that the shift + and get folded into a bitfield extract.
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
                                DAG.getConstant(Intrinsic::arm_get_fpscr,
                                                MVT::i32));
@@ -3516,7 +3775,7 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
  static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
                           const ARMSubtarget *ST) {
    EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
  
    if (!ST->hasV6T2Ops())
      return SDValue();
@@ -3525,10 +3784,118 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
    return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
  }
  
+/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
+/// for each 16-bit element from operand, repeated.  The basic idea is to
+/// leverage vcnt to get the 8-bit counts, gather and add the results.
+///
+/// Trace for v4i16:
+/// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
+/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
+/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
+/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
+///            [b0 b1 b2 b3 b4 b5 b6 b7]
+///           +[b1 b0 b3 b2 b5 b4 b7 b6]
+/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
+/// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  each ki is 8-bits)
+static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
+  SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
+  SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
+  SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
+  SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
+  return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
+}
+
+/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
+/// bit-count for each 16-bit element from the operand.  We need slightly
+/// different sequencing for v4i16 and v8i16 to stay within NEON's available
+/// 64/128-bit registers.
+///
+/// Trace for v4i16:
+/// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
+/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
+/// v8i16:Extended  = [k0    k1    k2    k3    k0    k1    k2    k3    ]
+/// v4i16:Extracted = [k0    k1    k2    k3    ]
+static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
+  if (VT.is64BitVector()) {
+    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
+                       DAG.getIntPtrConstant(0));
+  } else {
+    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
+                                    BitCounts, DAG.getIntPtrConstant(0));
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
+  }
+}
+
+/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
+/// bit-count for each 32-bit element from the operand.  The idea here is
+/// to split the vector into 16-bit elements, leverage the 16-bit count
+/// routine, and then combine the results.
+///
+/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
+/// input    = [v0    v1    ] (vi: 32-bit elements)
+/// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
+/// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
+/// vrev: N0 = [k1 k0 k3 k2 ]
+///            [k0 k1 k2 k3 ]
+///       N1 =+[k1 k0 k3 k2 ]
+///            [k0 k2 k1 k3 ]
+///       N2 =+[k1 k3 k0 k2 ]
+///            [k0    k2    k1    k3    ]
+/// Extended =+[k1    k3    k0    k2    ]
+///            [k0    k2    ]
+/// Extracted=+[k1    k3    ]
+///
+static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
+
+  SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
+  SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
+  SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
+  SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
+  SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
+
+  if (VT.is64BitVector()) {
+    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
+                       DAG.getIntPtrConstant(0));
+  } else {
+    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
+                                    DAG.getIntPtrConstant(0));
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
+  }
+}
+
+static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
+                          const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+
+  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
+  assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
+          VT == MVT::v4i16 || VT == MVT::v8i16) &&
+         "Unexpected type for custom ctpop lowering");
+
+  if (VT.getVectorElementType() == MVT::i32)
+    return lowerCTPOP32BitElements(N, DAG);
+  else
+    return lowerCTPOP16BitElements(N, DAG);
+}
+
  static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
                            const ARMSubtarget *ST) {
    EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
  
    if (!VT.isVector())
      return SDValue();
@@ -3563,7 +3930,7 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
  static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
                                  const ARMSubtarget *ST) {
    EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
  
    // We can get here for a node like i32 = ISD::SHL i32, i64
    if (VT != MVT::i64)
@@ -3609,7 +3976,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
    SDValue CC = Op.getOperand(2);
    EVT VT = Op.getValueType();
    ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
  
    if (Op.getOperand(1).getValueType().isFloatingPoint()) {
      switch (SetCCOpcode) {
@@ -3878,7 +4245,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
    APFloat FPVal = CFP->getValueAPF();
    int ImmVal = ARM_AM::getFP32Imm(FPVal);
    if (ImmVal != -1) {
-    DebugLoc DL = Op.getDebugLoc();
+    SDLoc DL(Op);
      SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32);
      SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
                                        NewVal);
@@ -3892,7 +4259,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
    SDValue NewVal = isNEONModifiedImm(iVal, 0, 32, DAG, VMovVT, false,
                                       VMOVModImm);
    if (NewVal != SDValue()) {
-    DebugLoc DL = Op.getDebugLoc();
+    SDLoc DL(Op);
      SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
                                        NewVal);
      SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
@@ -3905,7 +4272,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
    NewVal = isNEONModifiedImm(~iVal & 0xffffffff, 0, 32, DAG, VMovVT, false,
                               VMVNModImm);
    if (NewVal != SDValue()) {
-    DebugLoc DL = Op.getDebugLoc();
+    SDLoc DL(Op);
      SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
      SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                         VecConstant);
@@ -3916,6 +4283,36 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
    return SDValue();
  }
  
+// check if an VEXT instruction can handle the shuffle mask when the
+// vector sources of the shuffle are the same.
+static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
+  if (M[0] < 0)
+    return false;
+
+  Imm = M[0];
+
+  // If this is a VEXT shuffle, the immediate value is the index of the first
+  // element.  The other shuffle indices must be the successive elements after
+  // the first one.
+  unsigned ExpectedElt = Imm;
+  for (unsigned i = 1; i < NumElts; ++i) {
+    // Increment the expected index.  If it wraps around, just follow it
+    // back to index zero and keep going.
+    ++ExpectedElt;
+    if (ExpectedElt == NumElts)
+      ExpectedElt = 0;
+
+    if (M[i] < 0) continue; // ignore UNDEF indices
+    if (ExpectedElt != static_cast<unsigned>(M[i]))
+      return false;
+  }
+
+  return true;
+}
+
  
  static bool isVEXTMask(ArrayRef<int> M, EVT VT,
                         bool &ReverseVEXT, unsigned &Imm) {
@@ -4116,11 +4513,26 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
    return true;
  }
  
+/// \return true if this is a reverse operation on an vector.
+static bool isReverseMask(ArrayRef<int> M, EVT VT) {
+  unsigned NumElts = VT.getVectorNumElements();
+  // Make sure the mask has the right size.
+  if (NumElts != M.size())
+      return false;
+
+  // Look for <15, ..., 3, -1, 1, 0>.
+  for (unsigned i = 0; i != NumElts; ++i)
+    if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
+      return false;
+
+  return true;
+}
+
  // If N is an integer constant that can be moved into a register in one
  // instruction, return an SDValue of such a constant (will become a MOV
  // instruction).  Otherwise return null.
  static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
-                                     const ARMSubtarget *ST, DebugLoc dl) {
+                                     const ARMSubtarget *ST, SDLoc dl) {
    uint64_t Val;
    if (!isa<ConstantSDNode>(N))
      return SDValue();
@@ -4141,7 +4553,7 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
  SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                                               const ARMSubtarget *ST) const {
    BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    EVT VT = Op.getValueType();
  
    APInt SplatBits, SplatUndef;
@@ -4210,7 +4622,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
  
      ValueCounts.insert(std::make_pair(V, 0));
      unsigned &Count = ValueCounts[V];
-    
+
      // Is this value dominant? (takes up more than half of the lanes)
      if (++Count > (NumElts / 2)) {
        hasDominantValue = true;
@@ -4238,8 +4650,11 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
  
        // If we are VDUPing a value that comes directly from a vector, that will
        // cause an unnecessary move to and from a GPR, where instead we could
-      // just use VDUPLANE.
-      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+      // just use VDUPLANE. We can only do this if the lane being extracted
+      // is at a constant index, as the VDUP from lane instructions only have
+      // constant-index forms.
+      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          isa<ConstantSDNode>(Value->getOperand(1))) {
          // We need to create a new undef vector to use for the VDUPLANE if the
          // size of the vector from which we get the value is different than the
          // size of the vector that we need to create. We will insert the element
@@ -4254,12 +4669,10 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                   DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
                          Value, DAG.getConstant(index, MVT::i32)),
                             DAG.getConstant(index, MVT::i32));
-        } else {
+        } else
            N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                          Value->getOperand(0), Value->getOperand(1));
-        }
-      }
-      else
+      } else
          N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
  
        if (!usesOnlyOneValue) {
@@ -4291,7 +4704,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
      if (usesOnlyOneValue) {
        SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
        if (isConstant && Val.getNode())
-        return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 
+        return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
      }
    }
  
@@ -4330,7 +4743,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
  // shuffle in combination with VEXTs.
  SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
                                                SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    EVT VT = Op.getValueType();
    unsigned NumElts = VT.getVectorNumElements();
  
@@ -4511,14 +4924,15 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
            isVZIPMask(M, VT, WhichResult) ||
            isVTRN_v_undef_Mask(M, VT, WhichResult) ||
            isVUZP_v_undef_Mask(M, VT, WhichResult) ||
-          isVZIP_v_undef_Mask(M, VT, WhichResult));
+          isVZIP_v_undef_Mask(M, VT, WhichResult) ||
+          ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)));
  }
  
  /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
  /// the specified operations to build the shuffle.
  static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                        SDValue RHS, SelectionDAG &DAG,
-                                      DebugLoc dl) {
+                                      SDLoc dl) {
    unsigned OpNum = (PFEntry >> 26) & 0x0F;
    unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
    unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
@@ -4598,7 +5012,7 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
    // Check to see if we can use the VTBL instruction.
    SDValue V1 = Op.getOperand(0);
    SDValue V2 = Op.getOperand(1);
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
  
    SmallVector<SDValue, 8> VTBLMask;
    for (ArrayRef<int>::iterator
@@ -4615,10 +5029,27 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
                                   &VTBLMask[0], 8));
  }
  
+static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
+                                                      SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  SDValue OpLHS = Op.getOperand(0);
+  EVT VT = OpLHS.getValueType();
+
+  assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
+         "Expect an v8i16/v16i8 type");
+  OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
+  // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,
+  // extract the first 8 bytes into the top double word and the last 8 bytes
+  // into the bottom double word. The v8i16 case is similar.
+  unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
+  return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
+                     DAG.getConstant(ExtractNum, MVT::i32));
+}
+
  static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
    SDValue V1 = Op.getOperand(0);
    SDValue V2 = Op.getOperand(1);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    EVT VT = Op.getValueType();
    ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
  
@@ -4675,6 +5106,12 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
      if (isVREVMask(ShuffleMask, VT, 16))
        return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
  
+    if (V2->getOpcode() == ISD::UNDEF &&
+        isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
+      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
+                         DAG.getConstant(Imm, MVT::i32));
+    }
+
      // Check for Neon shuffles that modify both input vectors in place.
      // If both results are used, i.e., if there are two shuffles with the same
      // source operands and with masks corresponding to both results of one of
@@ -4746,6 +5183,9 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
      return DAG.getNode(ISD::BITCAST, dl, VT, Val);
    }
  
+  if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
+    return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
+
    if (VT == MVT::v8i8) {
      SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG);
      if (NewOp.getNode())
@@ -4773,7 +5213,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
    SDValue Vec = Op.getOperand(0);
    if (Op.getValueType() == MVT::i32 &&
        Vec.getValueType().getVectorElementType().getSizeInBits() < 32) {
-    DebugLoc dl = Op.getDebugLoc();
+    SDLoc dl(Op);
      return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
    }
  
@@ -4785,7 +5225,7 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
    // two 64-bit vectors are concatenated to a 128-bit vector.
    assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
           "unexpected CONCAT_VECTORS");
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    SDValue Val = DAG.getUNDEF(MVT::v2f64);
    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);
@@ -4874,16 +5314,83 @@ static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
    return false;
  }
  
-/// SkipExtension - For a node that is a SIGN_EXTEND, ZERO_EXTEND, extending
-/// load, or BUILD_VECTOR with extended elements, return the unextended value.
-static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) {
+static EVT getExtensionTo64Bits(const EVT &OrigVT) {
+  if (OrigVT.getSizeInBits() >= 64)
+    return OrigVT;
+
+  assert(OrigVT.isSimple() && "Expecting a simple value type");
+
+  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
+  switch (OrigSimpleTy) {
+  default: llvm_unreachable("Unexpected Vector Type");
+  case MVT::v2i8:
+  case MVT::v2i16:
+     return MVT::v2i32;
+  case MVT::v4i8:
+    return  MVT::v4i16;
+  }
+}
+
+/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
+/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
+/// We insert the required extension here to get the vector to fill a D register.
+static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
+                                            const EVT &OrigTy,
+                                            const EVT &ExtTy,
+                                            unsigned ExtOpcode) {
+  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
+  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
+  // 64-bits we need to insert a new extension so that it will be 64-bits.
+  assert(ExtTy.is128BitVector() && "Unexpected extension size");
+  if (OrigTy.getSizeInBits() >= 64)
+    return N;
+
+  // Must extend size to at least 64 bits to be used as an operand for VMULL.
+  EVT NewVT = getExtensionTo64Bits(OrigTy);
+
+  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
+}
+
+/// SkipLoadExtensionForVMULL - return a load of the original vector size that
+/// does not do any sign/zero extension. If the original vector is less
+/// than 64 bits, an appropriate extension will be added after the load to
+/// reach a total size of 64 bits. We have to add the extension separately
+/// because ARM does not have a sign/zero extending load for vectors.
+static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
+  EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
+
+  // The load already has the right type.
+  if (ExtendedTy == LD->getMemoryVT())
+    return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
+                LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
+                LD->isNonTemporal(), LD->isInvariant(),
+                LD->getAlignment());
+
+  // We need to create a zextload/sextload. We cannot just create a load
+  // followed by a zext/zext node because LowerMUL is also run during normal
+  // operation legalization where we can't create illegal types.
+  return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
+                        LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
+                        LD->getMemoryVT(), LD->isVolatile(),
+                        LD->isNonTemporal(), LD->getAlignment());
+}
+
+/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
+/// extending load, or BUILD_VECTOR with extended elements, return the
+/// unextended value. The unextended vector should be 64 bits so that it can
+/// be used as an operand to a VMULL instruction. If the original vector size
+/// before extension is less than 64 bits we add a an extension to resize
+/// the vector to 64 bits.
+static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
    if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
-    return N->getOperand(0);
+    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
+                                        N->getOperand(0)->getValueType(0),
+                                        N->getValueType(0),
+                                        N->getOpcode());
+
    if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
-    return DAG.getLoad(LD->getMemoryVT(), N->getDebugLoc(), LD->getChain(),
-                       LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
-                       LD->isNonTemporal(), LD->isInvariant(),
-                       LD->getAlignment());
+    return SkipLoadExtensionForVMULL(LD, DAG);
+
    // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
    // have been legalized as a BITCAST from v4i32.
    if (N->getOpcode() == ISD::BITCAST) {
@@ -4891,7 +5398,7 @@ static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) {
      assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
             BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
      unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
-    return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), MVT::v2i32,
+    return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32,
                         BVN->getOperand(LowElt), BVN->getOperand(LowElt+2));
    }
    // Construct a new BUILD_VECTOR with elements truncated to half the size.
@@ -4908,7 +5415,7 @@ static SDValue SkipExtension(SDNode *N, SelectionDAG &DAG) {
      // The values are implicitly truncated so sext vs. zext doesn't matter.
      Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
    }
-  return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
                       MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts);
  }
  
@@ -4938,7 +5445,8 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
    // Multiplications are only custom-lowered for 128-bit vectors so that
    // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
    EVT VT = Op.getValueType();
-  assert(VT.is128BitVector() && "unexpected type for custom-lowering ISD::MUL");
+  assert(VT.is128BitVector() && VT.isInteger() &&
+         "unexpected type for custom-lowering ISD::MUL");
    SDNode *N0 = Op.getOperand(0).getNode();
    SDNode *N1 = Op.getOperand(1).getNode();
    unsigned NewOpc = 0;
@@ -4979,11 +5487,11 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
    }
  
    // Legalize to a VMULL instruction.
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
    SDValue Op0;
-  SDValue Op1 = SkipExtension(N1, DAG);
+  SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
    if (!isMLA) {
-    Op0 = SkipExtension(N0, DAG);
+    Op0 = SkipExtensionForVMULL(N0, DAG);
      assert(Op0.getValueType().is64BitVector() &&
             Op1.getValueType().is64BitVector() &&
             "unexpected types for extended operands to VMULL");
@@ -4998,8 +5506,8 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
    //   vaddl q0, d4, d5
    //   vmovl q1, d6
    //   vmul  q0, q0, q1
-  SDValue N00 = SkipExtension(N0->getOperand(0).getNode(), DAG);
-  SDValue N01 = SkipExtension(N0->getOperand(1).getNode(), DAG);
+  SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
+  SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
    EVT Op1VT = Op1.getValueType();
    return DAG.getNode(N0->getOpcode(), DL, VT,
                       DAG.getNode(NewOpc, DL, VT,
@@ -5009,7 +5517,7 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
  }
  
  static SDValue
-LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
+LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) {
    // Convert to float
    // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
    // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
@@ -5038,7 +5546,7 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
  }
  
  static SDValue
-LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
+LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) {
    SDValue N2;
    // Convert to float.
    // float4 yf = vcvt_f32_s32(vmovl_s16(y));
@@ -5079,7 +5587,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
    assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
           "unexpected type for custom-lowering ISD::SDIV");
  
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    SDValue N2, N3;
@@ -5114,7 +5622,7 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
    assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
           "unexpected type for custom-lowering ISD::UDIV");
  
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    SDValue N2, N3;
@@ -5198,9 +5706,9 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
    }
  
    if (!ExtraOp)
-    return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
                         Op.getOperand(1));
-  return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
                       Op.getOperand(1), Op.getOperand(2));
  }
  
@@ -5214,11 +5722,10 @@ static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
    return SDValue();
  }
  
-
  static void
  ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl<SDValue>& Results,
                      SelectionDAG &DAG, unsigned NewOp) {
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
    assert (Node->getValueType(0) == MVT::i64 &&
            "Only know how to expand i64 atomics");
  
@@ -5248,6 +5755,44 @@ ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl<SDValue>& Results,
    Results.push_back(Result.getValue(2));
  }
  
+static void ReplaceREADCYCLECOUNTER(SDNode *N,
+                                    SmallVectorImpl<SDValue> &Results,
+                                    SelectionDAG &DAG,
+                                    const ARMSubtarget *Subtarget) {
+  SDLoc DL(N);
+  SDValue Cycles32, OutChain;
+
+  if (Subtarget->hasPerfMon()) {
+    // Under Power Management extensions, the cycle-count is:
+    //    mrc p15, #0, <Rt>, c9, c13, #0
+    SDValue Ops[] = { N->getOperand(0), // Chain
+                      DAG.getConstant(Intrinsic::arm_mrc, MVT::i32),
+                      DAG.getConstant(15, MVT::i32),
+                      DAG.getConstant(0, MVT::i32),
+                      DAG.getConstant(9, MVT::i32),
+                      DAG.getConstant(13, MVT::i32),
+                      DAG.getConstant(0, MVT::i32)
+    };
+
+    Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+                           DAG.getVTList(MVT::i32, MVT::Other), &Ops[0],
+                           array_lengthof(Ops));
+    OutChain = Cycles32.getValue(1);
+  } else {
+    // Intrinsic is defined to return 0 on unsupported platforms. Technically
+    // there are older ARM CPUs that have implementation-specific ways of
+    // obtaining this information (FIXME!).
+    Cycles32 = DAG.getConstant(0, MVT::i32);
+    OutChain = DAG.getEntryNode();
+  }
+
+
+  SDValue Cycles64 = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64,
+                                 Cycles32, DAG.getConstant(0, MVT::i32));
+  Results.push_back(Cycles64);
+  Results.push_back(OutChain);
+}
+
  SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    switch (Op.getOpcode()) {
    default: llvm_unreachable("Don't know how to custom lower this!");
@@ -5262,7 +5807,6 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
    case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
    case ISD::VASTART:       return LowerVASTART(Op, DAG);
-  case ISD::MEMBARRIER:    return LowerMEMBARRIER(Op, DAG, Subtarget);
    case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
    case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
    case ISD::SINT_TO_FP:
@@ -5285,6 +5829,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::SRL_PARTS:
    case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
    case ISD::CTTZ:          return LowerCTTZ(Op.getNode(), DAG, Subtarget);
+  case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
    case ISD::SETCC:         return LowerVSETCC(Op, DAG);
    case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
    case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
@@ -5317,10 +5862,17 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
    case ISD::BITCAST:
      Res = ExpandBITCAST(N, DAG);
      break;
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+    Res = ExpandVectorExtension(N, DAG);
+    break;
    case ISD::SRL:
    case ISD::SRA:
      Res = Expand64BitShift(N, DAG, Subtarget);
      break;
+  case ISD::READCYCLECOUNTER:
+    ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
+    return;
    case ISD::ATOMIC_LOAD_ADD:
      ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMADD64_DAG);
      return;
@@ -5345,6 +5897,18 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
    case ISD::ATOMIC_CMP_SWAP:
      ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG);
      return;
+  case ISD::ATOMIC_LOAD_MIN:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMIN64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_UMIN:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMIN64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_MAX:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMAX64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_UMAX:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMAX64_DAG);
+    return;
    }
    if (Res.getNode())
      Results.push_back(Res);
@@ -5684,7 +6248,8 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
  MachineBasicBlock *
  ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
                                        unsigned Op1, unsigned Op2,
-                                      bool NeedsCarry, bool IsCmpxchg) const {
+                                      bool NeedsCarry, bool IsCmpxchg,
+                                      bool IsMinMax, ARMCC::CondCodes CC) const {
    // This also handles ATOMIC_SWAP, indicated by Op1==0.
    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  
@@ -5708,21 +6273,17 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
      MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
    }
  
-  unsigned ldrOpc = isThumb2 ? ARM::t2LDREXD : ARM::LDREXD;
-  unsigned strOpc = isThumb2 ? ARM::t2STREXD : ARM::STREXD;
-
    MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *contBB = 0, *cont2BB = 0;
-  if (IsCmpxchg) {
+  if (IsCmpxchg || IsMinMax)
      contBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  if (IsCmpxchg)
      cont2BB = MF->CreateMachineBasicBlock(LLVM_BB);
-  }
    MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
    MF->insert(It, loopMBB);
-  if (IsCmpxchg) {
-    MF->insert(It, contBB);
-    MF->insert(It, cont2BB);
-  }
+  if (IsCmpxchg || IsMinMax) MF->insert(It, contBB);
+  if (IsCmpxchg) MF->insert(It, cont2BB);
    MF->insert(It, exitMBB);
  
    // Transfer the remainder of BB and its successor edges to exitMBB.
@@ -5749,22 +6310,26 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
    //   cmp storesuccess, #0
    //   bne- loopMBB
    //   fallthrough --> exitMBB
-  //
-  // Note that the registers are explicitly specified because there is not any
-  // way to force the register allocator to allocate a register pair.
-  //
-  // FIXME: The hardcoded registers are not necessary for Thumb2, but we
-  // need to properly enforce the restriction that the two output registers
-  // for ldrexd must be different.
    BB = loopMBB;
+
    // Load
-  AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc))
-                 .addReg(ARM::R2, RegState::Define)
-                 .addReg(ARM::R3, RegState::Define).addReg(ptr));
-  // Copy r2/r3 into dest.  (This copy will normally be coalesced.)
-  BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo).addReg(ARM::R2);
-  BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi).addReg(ARM::R3);
+  if (isThumb2) {
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2LDREXD))
+                   .addReg(destlo, RegState::Define)
+                   .addReg(desthi, RegState::Define)
+                   .addReg(ptr));
+  } else {
+    unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDREXD))
+                   .addReg(GPRPair0, RegState::Define).addReg(ptr));
+    // Copy r2/r3 into dest.  (This copy will normally be coalesced.)
+    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo)
+      .addReg(GPRPair0, 0, ARM::gsub_0);
+    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi)
+      .addReg(GPRPair0, 0, ARM::gsub_1);
+  }
  
+  unsigned StoreLo, StoreHi;
    if (IsCmpxchg) {
      // Add early exit
      for (unsigned i = 0; i < 2; i++) {
@@ -5780,26 +6345,60 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
      }
  
      // Copy to physregs for strexd
-    unsigned setlo = MI->getOperand(5).getReg();
-    unsigned sethi = MI->getOperand(6).getReg();
-    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R0).addReg(setlo);
-    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R1).addReg(sethi);
+    StoreLo = MI->getOperand(5).getReg();
+    StoreHi = MI->getOperand(6).getReg();
    } else if (Op1) {
      // Perform binary operation
-    AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), ARM::R0)
+    unsigned tmpRegLo = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), tmpRegLo)
                     .addReg(destlo).addReg(vallo))
          .addReg(NeedsCarry ? ARM::CPSR : 0, getDefRegState(NeedsCarry));
-    AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), ARM::R1)
-                   .addReg(desthi).addReg(valhi)).addReg(0);
+    unsigned tmpRegHi = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), tmpRegHi)
+                   .addReg(desthi).addReg(valhi))
+        .addReg(IsMinMax ? ARM::CPSR : 0, getDefRegState(IsMinMax));
+
+    StoreLo = tmpRegLo;
+    StoreHi = tmpRegHi;
    } else {
      // Copy to physregs for strexd
-    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R0).addReg(vallo);
-    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R1).addReg(valhi);
+    StoreLo = vallo;
+    StoreHi = valhi;
+  }
+  if (IsMinMax) {
+    // Compare and branch to exit block.
+    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+      .addMBB(exitMBB).addImm(CC).addReg(ARM::CPSR);
+    BB->addSuccessor(exitMBB);
+    BB->addSuccessor(contBB);
+    BB = contBB;
+    StoreLo = vallo;
+    StoreHi = valhi;
    }
  
    // Store
-  AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess)
-                 .addReg(ARM::R0).addReg(ARM::R1).addReg(ptr));
+  if (isThumb2) {
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2STREXD), storesuccess)
+                   .addReg(StoreLo).addReg(StoreHi).addReg(ptr));
+  } else {
+    // Marshal a pair...
+    unsigned StorePair = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+    unsigned UndefPair = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+    unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+    BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair);
+    BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1)
+      .addReg(UndefPair)
+      .addReg(StoreLo)
+      .addImm(ARM::gsub_0);
+    BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), StorePair)
+      .addReg(r1)
+      .addReg(StoreHi)
+      .addImm(ARM::gsub_1);
+
+    // ...and store it
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::STREXD), storesuccess)
+                   .addReg(StorePair).addReg(ptr));
+  }
    // Cmp+jump
    AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
                   .addReg(storesuccess).addImm(0));
@@ -6000,6 +6599,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
      MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
    unsigned MJTI = JTI->createJumpTableIndex(LPadList);
    unsigned UId = AFI->createJumpTableUId();
+  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
  
    // Create the MBBs for the dispatch code.
  
@@ -6008,7 +6608,13 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
    DispatchBB->setIsLandingPad();
  
    MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
-  BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
+  unsigned trap_opcode;
+  if (Subtarget->isThumb())
+    trap_opcode = ARM::tTRAP;
+  else
+    trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
+
+  BuildMI(TrapBB, dl, TII->get(trap_opcode));
    DispatchBB->addSuccessor(TrapBB);
  
    MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
@@ -6028,12 +6634,15 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
                               MachineMemOperand::MOLoad |
                               MachineMemOperand::MOVolatile, 4, 4);
  
-  if (AFI->isThumb1OnlyFunction())
-    BuildMI(DispatchBB, dl, TII->get(ARM::tInt_eh_sjlj_dispatchsetup));
-  else if (!Subtarget->hasVFP2())
-    BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup_nofp));
-  else
-    BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
+
+  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
+  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+
+  // Add a register mask with no preserved registers.  This results in all
+  // registers being marked as clobbered.
+  MIB.addRegMask(RI.getNoPreservedMask());
  
    unsigned NumLPads = LPadList.size();
    if (Subtarget->isThumb2()) {
@@ -6151,11 +6760,14 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
                     .addImm(0)
                     .addMemOperand(JTMMOLd));
  
-    unsigned NewVReg6 = MRI->createVirtualRegister(TRC);
-    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
-                   .addReg(ARM::CPSR, RegState::Define)
-                   .addReg(NewVReg5, RegState::Kill)
-                   .addReg(NewVReg3));
+    unsigned NewVReg6 = NewVReg5;
+    if (RelocM == Reloc::PIC_) {
+      NewVReg6 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
+                     .addReg(ARM::CPSR, RegState::Define)
+                     .addReg(NewVReg5, RegState::Kill)
+                     .addReg(NewVReg3));
+    }
  
      BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
        .addReg(NewVReg6, RegState::Kill)
@@ -6235,11 +6847,18 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
        .addImm(0)
        .addMemOperand(JTMMOLd));
  
-    BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
-      .addReg(NewVReg5, RegState::Kill)
-      .addReg(NewVReg4)
-      .addJumpTableIndex(MJTI)
-      .addImm(UId);
+    if (RelocM == Reloc::PIC_) {
+      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
+        .addReg(NewVReg5, RegState::Kill)
+        .addReg(NewVReg4)
+        .addJumpTableIndex(MJTI)
+        .addImm(UId);
+    } else {
+      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
+        .addReg(NewVReg5, RegState::Kill)
+        .addJumpTableIndex(MJTI)
+        .addImm(UId);
+    }
    }
  
    // Add the jump table entries as successors to the MBB.
@@ -6252,8 +6871,6 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
    }
  
    // N.B. the order the invoke BBs are processed in doesn't matter here.
-  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
-  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
    const uint16_t *SavedRegs = RI.getCalleeSavedRegs(MF);
    SmallVector<MachineBasicBlock*, 64> MBBLPads;
    for (SmallPtrSet<MachineBasicBlock*, 64>::iterator
@@ -6290,7 +6907,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
          DefRegs[OI->getReg()] = true;
        }
  
-      MachineInstrBuilder MIB(&*II);
+      MachineInstrBuilder MIB(*MF, &*II);
  
        for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
          unsigned Reg = SavedRegs[i];
@@ -6367,8 +6984,9 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
      UnitSize = 2;
    } else {
      // Check whether we can use NEON instructions.
-    if (!MF->getFunction()->getFnAttributes().
-          hasAttribute(Attributes::NoImplicitFloat) &&
+    if (!MF->getFunction()->getAttributes().
+          hasAttribute(AttributeSet::FunctionIndex,
+                       Attribute::NoImplicitFloat) &&
          Subtarget->hasNEON()) {
        if ((Align % 16 == 0) && SizeVal >= 16) {
          ldrOpc = ARM::VLD1q32wb_fixed;
@@ -6796,6 +7414,26 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
      return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
                                isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
                                /*NeedsCarry*/ false, /*IsCmpxchg*/true);
+  case ARM::ATOMMIN6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+                              /*NeedsCarry*/ true, /*IsCmpxchg*/false,
+                              /*IsMinMax*/ true, ARMCC::LT);
+  case ARM::ATOMMAX6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+                              /*NeedsCarry*/ true, /*IsCmpxchg*/false,
+                              /*IsMinMax*/ true, ARMCC::GE);
+  case ARM::ATOMUMIN6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+                              /*NeedsCarry*/ true, /*IsCmpxchg*/false,
+                              /*IsMinMax*/ true, ARMCC::LO);
+  case ARM::ATOMUMAX6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+                              /*NeedsCarry*/ true, /*IsCmpxchg*/false,
+                              /*IsMinMax*/ true, ARMCC::HS);
  
    case ARM::tMOVCCr_pseudo: {
      // To "insert" a SELECT_CC instruction, we actually have to insert the
@@ -7169,13 +7807,13 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
  
    // Slct is now know to be the desired identity constant when CC is true.
    SDValue TrueVal = OtherOp;
-  SDValue FalseVal = DAG.getNode(N->getOpcode(), N->getDebugLoc(), VT,
+  SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
                                   OtherOp, NonConstantVal);
    // Unless SwapSelectOps says CC should be false.
    if (SwapSelectOps)
      std::swap(TrueVal, FalseVal);
  
-  return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT,
+  return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
                       CCOp, TrueVal, FalseVal);
  }
  
@@ -7282,9 +7920,9 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
        llvm_unreachable("Invalid vector element type for padd optimization.");
    }
  
-  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
+  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
                              widenType, &Ops[0], Ops.size());
-  return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp);
+  return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, tmp);
  }
  
  static SDValue findMUL_LOHI(SDValue V) {
@@ -7327,8 +7965,11 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
  
    assert(AddcNode->getNumValues() == 2 &&
           AddcNode->getValueType(0) == MVT::i32 &&
-         AddcNode->getValueType(1) == MVT::Glue &&
-         "Expect ADDC with two result values: i32, glue");
+         "Expect ADDC with two result values. First: i32");
+
+  // Check that we have a glued ADDC node.
+  if (AddcNode->getValueType(1) != MVT::Glue)
+    return SDValue();
  
    // Check that the ADDC adds the low result of the S/UMUL_LOHI.
    if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
@@ -7409,7 +8050,7 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
    Ops.push_back(*LowAdd);
    Ops.push_back(*HiAdd);
  
-  SDValue MLALNode =  DAG.getNode(FinalOpc, AddcNode->getDebugLoc(),
+  SDValue MLALNode =  DAG.getNode(FinalOpc, SDLoc(AddcNode),
                                   DAG.getVTList(MVT::i32, MVT::i32),
                                   &Ops[0], Ops.size());
  
@@ -7517,7 +8158,7 @@ static SDValue PerformVMULCombine(SDNode *N,
    }
  
    EVT VT = N->getValueType(0);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
    SDValue N00 = N0->getOperand(0);
    SDValue N01 = N0->getOperand(1);
    return DAG.getNode(Opcode, DL, VT,
@@ -7547,11 +8188,11 @@ static SDValue PerformMULCombine(SDNode *N,
      return SDValue();
  
    int64_t MulAmt = C->getSExtValue();
-  unsigned ShiftAmt = CountTrailingZeros_64(MulAmt);
+  unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
  
    ShiftAmt = ShiftAmt & (32 - 1);
    SDValue V = N->getOperand(0);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
  
    SDValue Res;
    MulAmt >>= ShiftAmt;
@@ -7615,7 +8256,7 @@ static SDValue PerformANDCombine(SDNode *N,
  
    // Attempt to use immediate-form VBIC
    BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
    EVT VT = N->getValueType(0);
    SelectionDAG &DAG = DCI.DAG;
  
@@ -7658,7 +8299,7 @@ static SDValue PerformORCombine(SDNode *N,
                                  const ARMSubtarget *Subtarget) {
    // Attempt to use immediate-form VORR
    BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
    EVT VT = N->getValueType(0);
    SelectionDAG &DAG = DCI.DAG;
  
@@ -7733,7 +8374,7 @@ static SDValue PerformORCombine(SDNode *N,
    if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
      return SDValue();
  
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
    // 1) or (and A, mask), val => ARMbfi A, val, mask
    //      iff (val & mask) == val
    //
@@ -7768,7 +8409,7 @@ static SDValue PerformORCombine(SDNode *N,
        return SDValue();
  
      if (ARM::isBitFieldInvertedMask(Mask)) {
-      Val >>= CountTrailingZeros_32(~Mask);
+      Val >>= countTrailingZeros(~Mask);
  
        Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                          DAG.getConstant(Val, MVT::i32),
@@ -7795,7 +8436,7 @@ static SDValue PerformORCombine(SDNode *N,
            (Mask == 0xffff || Mask == 0xffff0000))
          return SDValue();
        // 2a
-      unsigned amt = CountTrailingZeros_32(Mask2);
+      unsigned amt = countTrailingZeros(Mask2);
        Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                          DAG.getConstant(amt, MVT::i32));
        Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
@@ -7811,7 +8452,7 @@ static SDValue PerformORCombine(SDNode *N,
            (Mask2 == 0xffff || Mask2 == 0xffff0000))
          return SDValue();
        // 2b
-      unsigned lsb = CountTrailingZeros_32(Mask);
+      unsigned lsb = countTrailingZeros(Mask);
        Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                          DAG.getConstant(lsb, MVT::i32));
        Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
@@ -7829,7 +8470,7 @@ static SDValue PerformORCombine(SDNode *N,
      // where lsb(mask) == #shamt and masked bits of B are known zero.
      SDValue ShAmt = N00.getOperand(1);
      unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
-    unsigned LSB = CountTrailingZeros_32(Mask);
+    unsigned LSB = countTrailingZeros(Mask);
      if (ShAmtC != LSB)
        return SDValue();
  
@@ -7872,12 +8513,12 @@ static SDValue PerformBFICombine(SDNode *N,
      if (!N11C)
        return SDValue();
      unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
-    unsigned LSB = CountTrailingZeros_32(~InvMask);
-    unsigned Width = (32 - CountLeadingZeros_32(~InvMask)) - LSB;
+    unsigned LSB = countTrailingZeros(~InvMask);
+    unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
      unsigned Mask = (1 << Width)-1;
      unsigned Mask2 = N11C->getZExtValue();
      if ((Mask & (~Mask2)) == 0)
-      return DCI.DAG.getNode(ARMISD::BFI, N->getDebugLoc(), N->getValueType(0),
+      return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
                               N->getOperand(0), N1.getOperand(0),
                               N->getOperand(2));
    }
@@ -7903,7 +8544,7 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
      LoadSDNode *LD = cast<LoadSDNode>(InNode);
  
      SelectionDAG &DAG = DCI.DAG;
-    DebugLoc DL = LD->getDebugLoc();
+    SDLoc DL(LD);
      SDValue BasePtr = LD->getBasePtr();
      SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr,
                                   LD->getPointerInfo(), LD->isVolatile(),
@@ -7940,7 +8581,7 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
    if (Op0.getOpcode() == ARMISD::VMOVRRD &&
        Op0.getNode() == Op1.getNode() &&
        Op0.getResNo() == 0 && Op1.getResNo() == 1)
-    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
+    return DAG.getNode(ISD::BITCAST, SDLoc(N),
                         N->getValueType(0), Op0.getOperand(0));
    return SDValue();
  }
@@ -7982,7 +8623,7 @@ static SDValue PerformSTORECombine(SDNode *N,
                                       NumElems*SizeRatio);
      assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
  
-    DebugLoc DL = St->getDebugLoc();
+    SDLoc DL(St);
      SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
      for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio;
@@ -8043,7 +8684,7 @@ static SDValue PerformSTORECombine(SDNode *N,
    if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
        StVal.getNode()->hasOneUse()) {
      SelectionDAG  &DAG = DCI.DAG;
-    DebugLoc DL = St->getDebugLoc();
+    SDLoc DL(St);
      SDValue BasePtr = St->getBasePtr();
      SDValue NewST1 = DAG.getStore(St->getChain(), DL,
                                    StVal.getNode()->getOperand(0), BasePtr,
@@ -8065,14 +8706,14 @@ static SDValue PerformSTORECombine(SDNode *N,
    // Bitcast an i64 store extracted from a vector to f64.
    // Otherwise, the i64 value will be legalized to a pair of i32 values.
    SelectionDAG &DAG = DCI.DAG;
-  DebugLoc dl = StVal.getDebugLoc();
+  SDLoc dl(StVal);
    SDValue IntVec = StVal.getOperand(0);
    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                   IntVec.getValueType().getVectorNumElements());
    SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
    SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                 Vec, StVal.getOperand(1));
-  dl = N->getDebugLoc();
+  dl = SDLoc(N);
    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
    // Make the DAGCombiner fold the bitcasts.
    DCI.AddToWorklist(Vec.getNode());
@@ -8118,7 +8759,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
    EVT VT = N->getValueType(0);
    if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
      return SDValue();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
    SmallVector<SDValue, 8> Ops;
    unsigned NumElts = VT.getVectorNumElements();
    for (unsigned i = 0; i < NumElts; ++i) {
@@ -8132,6 +8773,98 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
    return DAG.getNode(ISD::BITCAST, dl, VT, BV);
  }
  
+/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
+static SDValue
+PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
+  // At that time, we may have inserted bitcasts from integer to float.
+  // If these bitcasts have survived DAGCombine, change the lowering of this
+  // BUILD_VECTOR in something more vector friendly, i.e., that does not
+  // force to use floating point types.
+
+  // Make sure we can change the type of the vector.
+  // This is possible iff:
+  // 1. The vector is only used in a bitcast to a integer type. I.e.,
+  //    1.1. Vector is used only once.
+  //    1.2. Use is a bit convert to an integer type.
+  // 2. The size of its operands are 32-bits (64-bits are not legal).
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+
+  // Check 1.1. and 2.
+  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
+    return SDValue();
+
+  // By construction, the input type must be float.
+  assert(EltVT == MVT::f32 && "Unexpected type!");
+
+  // Check 1.2.
+  SDNode *Use = *N->use_begin();
+  if (Use->getOpcode() != ISD::BITCAST ||
+      Use->getValueType(0).isFloatingPoint())
+    return SDValue();
+
+  // Check profitability.
+  // Model is, if more than half of the relevant operands are bitcast from
+  // i32, turn the build_vector into a sequence of insert_vector_elt.
+  // Relevant operands are everything that is not statically
+  // (i.e., at compile time) bitcasted.
+  unsigned NumOfBitCastedElts = 0;
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumOfRelevantElts = NumElts;
+  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
+    SDValue Elt = N->getOperand(Idx);
+    if (Elt->getOpcode() == ISD::BITCAST) {
+      // Assume only bit cast to i32 will go away.
+      if (Elt->getOperand(0).getValueType() == MVT::i32)
+        ++NumOfBitCastedElts;
+    } else if (Elt.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Elt))
+      // Constants are statically casted, thus do not count them as
+      // relevant operands.
+      --NumOfRelevantElts;
+  }
+
+  // Check if more than half of the elements require a non-free bitcast.
+  if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  // Create the new vector type.
+  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+  // Check if the type is legal.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isTypeLegal(VecVT))
+    return SDValue();
+
+  // Combine:
+  // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
+  // => BITCAST INSERT_VECTOR_ELT
+  //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
+  //                      (BITCAST EN), N.
+  SDValue Vec = DAG.getUNDEF(VecVT);
+  SDLoc dl(N);
+  for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
+    SDValue V = N->getOperand(Idx);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (V.getOpcode() == ISD::BITCAST &&
+        V->getOperand(0).getValueType() == MVT::i32)
+      // Fold obvious case.
+      V = V.getOperand(0);
+    else {
+      V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 
+      // Make the DAGCombiner fold the bitcasts.
+      DCI.AddToWorklist(V.getNode());
+    }
+    SDValue LaneIdx = DAG.getConstant(Idx, MVT::i32);
+    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
+  }
+  Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+  // Make the DAGCombiner fold the bitcasts.
+  DCI.AddToWorklist(Vec.getNode());
+  return Vec;
+}
+
  /// PerformInsertEltCombine - Target-specific dag combine xforms for
  /// ISD::INSERT_VECTOR_ELT.
  static SDValue PerformInsertEltCombine(SDNode *N,
@@ -8145,7 +8878,7 @@ static SDValue PerformInsertEltCombine(SDNode *N,
      return SDValue();
  
    SelectionDAG &DAG = DCI.DAG;
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                   VT.getVectorNumElements());
    SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
@@ -8191,7 +8924,7 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
        !TLI.isTypeLegal(Concat1Op1.getValueType()))
      return SDValue();
  
-  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT,
+  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
                                    Op0.getOperand(0), Op1.getOperand(0));
    // Translate the shuffle mask.
    SmallVector<int, 16> NewMask;
@@ -8207,7 +8940,7 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
        NewElt = HalfElts + MaskElt - NumElts;
      NewMask.push_back(NewElt);
    }
-  return DAG.getVectorShuffle(VT, N->getDebugLoc(), NewConcat,
+  return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
                                DAG.getUNDEF(VT), NewMask.data());
  }
  
@@ -8324,7 +9057,7 @@ static SDValue CombineBaseUpdate(SDNode *N,
        Ops.push_back(N->getOperand(i));
      }
      MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
-    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, N->getDebugLoc(), SDTys,
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
                                             Ops.data(), Ops.size(),
                                             MemInt->getMemoryVT(),
                                             MemInt->getMemOperand());
@@ -8398,7 +9131,7 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
    SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1);
    SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
    MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
-  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, VLD->getDebugLoc(), SDTys,
+  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
                                             Ops, 2, VLDMemInt->getMemoryVT(),
                                             VLDMemInt->getMemOperand());
  
@@ -8453,7 +9186,7 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
    if (EltSize > VT.getVectorElementType().getSizeInBits())
      return SDValue();
  
-  return DCI.DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
+  return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
  }
  
  // isConstVecPow2 - Return true if each vector element is a power of 2, all
@@ -8510,12 +9243,27 @@ static SDValue PerformVCVTCombine(SDNode *N,
        !isConstVecPow2(ConstVec, isSigned, C))
      return SDValue();
  
+  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
+  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
+  if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) {
+    // These instructions only exist converting from f32 to i32. We can handle
+    // smaller integers by generating an extra truncate, but larger ones would
+    // be lossy.
+    return SDValue();
+  }
+
    unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
      Intrinsic::arm_neon_vcvtfp2fxu;
-  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
-                     N->getValueType(0),
-                     DAG.getConstant(IntrinsicOpcode, MVT::i32), N0,
-                     DAG.getConstant(Log2_64(C), MVT::i32));
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  SDValue FixConv =  DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
+                                 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
+                                 DAG.getConstant(IntrinsicOpcode, MVT::i32), N0,
+                                 DAG.getConstant(Log2_64(C), MVT::i32));
+
+  if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
+    FixConv = DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), FixConv);
+
+  return FixConv;
  }
  
  /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
@@ -8546,12 +9294,28 @@ static SDValue PerformVDIVCombine(SDNode *N,
        !isConstVecPow2(ConstVec, isSigned, C))
      return SDValue();
  
+  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
+  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
+  if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) {
+    // These instructions only exist converting from i32 to f32. We can handle
+    // smaller integers by generating an extra extend, but larger ones would
+    // be lossy.
+    return SDValue();
+  }
+
+  SDValue ConvInput = Op.getOperand(0);
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
+    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+                            SDLoc(N), NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
+                            ConvInput);
+
    unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
      Intrinsic::arm_neon_vcvtfxu2fp;
-  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
                       Op.getValueType(),
                       DAG.getConstant(IntrinsicOpcode, MVT::i32),
-                     Op.getOperand(0), DAG.getConstant(Log2_64(C), MVT::i32));
+                     ConvInput, DAG.getConstant(Log2_64(C), MVT::i32));
  }
  
  /// Getvshiftimm - Check if this is a valid build_vector for the immediate
@@ -8732,7 +9496,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
        VShiftOpc = ARMISD::VQRSHRNsu; break;
      }
  
-    return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
+    return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
                         N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
    }
  
@@ -8749,7 +9513,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
        llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
      }
  
-    return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
+    return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
                         N->getOperand(1), N->getOperand(2),
                         DAG.getConstant(Cnt, MVT::i32));
    }
@@ -8780,7 +9544,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
        if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
            DAG.MaskedValueIsZero(N0.getOperand(0),
                                  APInt::getHighBitsSet(32, 16)))
-        return DAG.getNode(ISD::ROTR, N->getDebugLoc(), VT, N0, N1);
+        return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
      }
    }
  
@@ -8797,7 +9561,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
  
    case ISD::SHL:
      if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
-      return DAG.getNode(ARMISD::VSHL, N->getDebugLoc(), VT, N->getOperand(0),
+      return DAG.getNode(ARMISD::VSHL, SDLoc(N), VT, N->getOperand(0),
                           DAG.getConstant(Cnt, MVT::i32));
      break;
  
@@ -8806,7 +9570,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
      if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
        unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
                              ARMISD::VSHRs : ARMISD::VSHRu);
-      return DAG.getNode(VShiftOpc, N->getDebugLoc(), VT, N->getOperand(0),
+      return DAG.getNode(VShiftOpc, SDLoc(N), VT, N->getOperand(0),
                           DAG.getConstant(Cnt, MVT::i32));
      }
    }
@@ -8846,7 +9610,7 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
          Opc = ARMISD::VGETLANEu;
          break;
        }
-      return DAG.getNode(Opc, N->getDebugLoc(), VT, Vec, Lane);
+      return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
      }
    }
  
@@ -8935,7 +9699,7 @@ static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG,
  
    if (!Opcode)
      return SDValue();
-  return DAG.getNode(Opcode, N->getDebugLoc(), N->getValueType(0), LHS, RHS);
+  return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS);
  }
  
  /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
@@ -8947,7 +9711,7 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
      return SDValue();
  
    EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
    SDValue LHS = Cmp.getOperand(0);
    SDValue RHS = Cmp.getOperand(1);
    SDValue FalseVal = N->getOperand(0);
@@ -9037,6 +9801,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
    case ARMISD::VLD3DUP:
    case ARMISD::VLD4DUP:
      return CombineBaseUpdate(N, DCI);
+  case ARMISD::BUILD_VECTOR:
+    return PerformARMBUILD_VECTORCombine(N, DCI);
    case ISD::INTRINSIC_VOID:
    case ISD::INTRINSIC_W_CHAIN:
      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -9067,7 +9833,7 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
    return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
  }
  
-bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
+bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
    // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
    bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
  
@@ -9076,15 +9842,27 @@ bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
      return false;
    case MVT::i8:
    case MVT::i16:
-  case MVT::i32:
+  case MVT::i32: {
      // Unaligned access can use (for example) LRDB, LRDH, LDR
-    return AllowsUnaligned;
+    if (AllowsUnaligned) {
+      if (Fast)
+        *Fast = Subtarget->hasV7Ops();
+      return true;
+    }
+    return false;
+  }
    case MVT::f64:
-  case MVT::v2f64:
+  case MVT::v2f64: {
      // For any little-endian targets with neon, we can support unaligned ld/st
      // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
      // A big-endian target may also explictly support unaligned accesses
-    return Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian());
+    if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) {
+      if (Fast)
+        *Fast = true;
+      return true;
+    }
+    return false;
+  }
    }
  }
  
@@ -9096,33 +9874,59 @@ static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
  
  EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
                                             unsigned DstAlign, unsigned SrcAlign,
-                                           bool IsZeroVal,
+                                           bool IsMemset, bool ZeroMemset,
                                             bool MemcpyStrSrc,
                                             MachineFunction &MF) const {
    const Function *F = MF.getFunction();
  
    // See if we can use NEON instructions for this...
-  if (IsZeroVal &&
-      !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat) &&
-      Subtarget->hasNEON()) {
-    if (memOpAlign(SrcAlign, DstAlign, 16) && Size >= 16) {
-      return MVT::v4i32;
-    } else if (memOpAlign(SrcAlign, DstAlign, 8) && Size >= 8) {
-      return MVT::v2i32;
+  if ((!IsMemset || ZeroMemset) &&
+      Subtarget->hasNEON() &&
+      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                       Attribute::NoImplicitFloat)) {
+    bool Fast;
+    if (Size >= 16 &&
+        (memOpAlign(SrcAlign, DstAlign, 16) ||
+         (allowsUnalignedMemoryAccesses(MVT::v2f64, &Fast) && Fast))) {
+      return MVT::v2f64;
+    } else if (Size >= 8 &&
+               (memOpAlign(SrcAlign, DstAlign, 8) ||
+                (allowsUnalignedMemoryAccesses(MVT::f64, &Fast) && Fast))) {
+      return MVT::f64;
      }
    }
  
    // Lowering to i32/i16 if the size permits.
-  if (Size >= 4) {
+  if (Size >= 4)
      return MVT::i32;
-  } else if (Size >= 2) {
+  else if (Size >= 2)
      return MVT::i16;
-  }
  
    // Let the target-independent logic figure it out.
    return MVT::Other;
  }
  
+bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  if (Val.getOpcode() != ISD::LOAD)
+    return false;
+
+  EVT VT1 = Val.getValueType();
+  if (!VT1.isSimple() || !VT1.isInteger() ||
+      !VT2.isSimple() || !VT2.isInteger())
+    return false;
+
+  switch (VT1.getSimpleVT().SimpleTy) {
+  default: break;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+    // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
+    return true;
+  }
+
+  return false;
+}
+
  static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
    if (V < 0)
      return false;
@@ -9522,9 +10326,19 @@ void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
                                                         APInt &KnownOne,
                                                         const SelectionDAG &DAG,
                                                         unsigned Depth) const {
-  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0);
+  unsigned BitWidth = KnownOne.getBitWidth();
+  KnownZero = KnownOne = APInt(BitWidth, 0);
    switch (Op.getOpcode()) {
    default: break;
+  case ARMISD::ADDC:
+  case ARMISD::ADDE:
+  case ARMISD::SUBC:
+  case ARMISD::SUBE:
+    // These nodes' second result is a boolean
+    if (Op.getResNo() == 0)
+      break;
+    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+    break;
    case ARMISD::CMOV: {
      // Bits are known zero/one if known on the LHS and RHS.
      DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
@@ -9638,7 +10452,7 @@ ARMTargetLowering::getSingleConstraintMatchWeight(
  typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
  RCPair
  ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
-                                                EVT VT) const {
+                                                MVT VT) const {
    if (Constraint.size() == 1) {
      // GCC ARM Constraint Letters
      switch (Constraint[0]) {
@@ -9855,17 +10669,15 @@ ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  
  bool ARM::isBitFieldInvertedMask(unsigned v) {
    if (v == 0xffffffff)
-    return 0;
+    return false;
+
    // there can be 1's on either or both "outsides", all the "inside"
    // bits must be 0's
-  unsigned int lsb = 0, msb = 31;
-  while (v & (1 << msb)) --msb;
-  while (v & (1 << lsb)) ++lsb;
-  for (unsigned int i = lsb; i <= msb; ++i) {
-    if (v & (1 << i))
-      return 0;
-  }
-  return 1;
+  unsigned TO = CountTrailingOnes_32(v);
+  unsigned LO = CountLeadingOnes_32(v);
+  v = (v >> TO) << TO;
+  v = (v << LO) >> LO;
+  return v == 0;
  }
  
  /// isFPImmLegal - Returns true if the target can instruction select the