ARM: permit tail calls to weak externals on COFF

[oota-llvm.git] / lib / Target / AArch64 / AArch64ISelLowering.cpp
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp

index 0cd2186eade92022a04097b724c3720b892af114..6bddf46da1915fdf6bbb09ed6d1f85a7bf224de8 100644 (file)
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12,6 +12,7 @@
  //===----------------------------------------------------------------------===//
  
  #include "AArch64ISelLowering.h"
+#include "AArch64CallingConvention.h"
  #include "AArch64MachineFunctionInfo.h"
  #include "AArch64PerfectShuffle.h"
  #include "AArch64Subtarget.h"
@@ -66,18 +67,9 @@ EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
                           cl::desc("Allow AArch64 SLI/SRI formation"),
                           cl::init(false));
  
-//===----------------------------------------------------------------------===//
-// AArch64 Lowering public interface.
-//===----------------------------------------------------------------------===//
-static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
-  if (TT.isOSBinFormatMachO())
-    return new AArch64_MachoTargetObjectFile();
-
-  return new AArch64_ELFTargetObjectFile();
-}
  
-AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
-    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
+AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM)
+    : TargetLowering(TM) {
    Subtarget = &TM.getSubtarget<AArch64Subtarget>();
  
    // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
@@ -108,6 +100,7 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
      addDRTypeForNEON(MVT::v2i32);
      addDRTypeForNEON(MVT::v1i64);
      addDRTypeForNEON(MVT::v1f64);
+    addDRTypeForNEON(MVT::v4f16);
  
      addQRTypeForNEON(MVT::v4f32);
      addQRTypeForNEON(MVT::v2f64);
@@ -115,6 +108,7 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
      addQRTypeForNEON(MVT::v8i16);
      addQRTypeForNEON(MVT::v4i32);
      addQRTypeForNEON(MVT::v2i64);
+    addQRTypeForNEON(MVT::v8f16);
    }
  
    // Compute derived properties from the register classes
@@ -289,6 +283,85 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
    setOperationAction(ISD::FMUL, MVT::f16, Promote);
    setOperationAction(ISD::FSUB, MVT::f16, Promote);
  
+  // v4f16 is also a storage-only type, so promote it to v4f32 when that is
+  // known to be safe.
+  setOperationAction(ISD::FADD, MVT::v4f16, Promote);
+  setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
+  setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
+  setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
+  setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
+  setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
+  AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
+  AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
+
+  // Expand all other v4f16 operations.
+  // FIXME: We could generate better code by promoting some operations to
+  // a pair of v4f32s
+  setOperationAction(ISD::FABS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
+  setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
+  setOperationAction(ISD::FMA, MVT::v4f16, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
+  setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
+  setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
+  setOperationAction(ISD::FREM, MVT::v4f16, Expand);
+  setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
+  setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
+  setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
+  setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
+  setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
+  setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
+  setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
+  setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
+
+
+  // v8f16 is also a storage-only type, so expand it.
+  setOperationAction(ISD::FABS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FADD, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
+  setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
+  setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
+  setOperationAction(ISD::FMA, MVT::v8f16, Expand);
+  setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
+  setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
+  setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
+  setOperationAction(ISD::FREM, MVT::v8f16, Expand);
+  setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
+  setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
+  setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
+  setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
+  setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
+  setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
+  setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
+  setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
+  setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
+
    // AArch64 has implementations of a lot of rounding-like FP operations.
    static MVT RoundingTypes[] = { MVT::f32, MVT::f64};
    for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
@@ -314,6 +387,13 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
      setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
    }
  
+  // Make floating-point constants legal for the large code model, so they don't
+  // become loads from the constant pool.
+  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
+    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+  }
+
    // AArch64 does not have floating-point extending loads, i1 sign-extending
    // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
    setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
@@ -450,6 +530,11 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
  
      // AArch64 doesn't have MUL.2d:
      setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+    // Custom handling for some quad-vector types to detect MULL.
+    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+
      setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
      setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
      // Likewise, narrowing and extending vector loads/stores aren't handled
@@ -495,13 +580,13 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
  }
  
  void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
-  if (VT == MVT::v2f32) {
+  if (VT == MVT::v2f32 || VT == MVT::v4f16) {
      setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
      AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
  
      setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
      AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
-  } else if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+  } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
      setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
      AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
  
@@ -742,6 +827,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case AArch64ISD::TC_RETURN:         return "AArch64ISD::TC_RETURN";
    case AArch64ISD::SITOF:             return "AArch64ISD::SITOF";
    case AArch64ISD::UITOF:             return "AArch64ISD::UITOF";
+  case AArch64ISD::NVCAST:            return "AArch64ISD::NVCAST";
    case AArch64ISD::SQSHL_I:           return "AArch64ISD::SQSHL_I";
    case AArch64ISD::UQSHL_I:           return "AArch64ISD::UQSHL_I";
    case AArch64ISD::SRSHR_I:           return "AArch64ISD::SRSHR_I";
@@ -771,6 +857,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
    case AArch64ISD::ST2LANEpost:       return "AArch64ISD::ST2LANEpost";
    case AArch64ISD::ST3LANEpost:       return "AArch64ISD::ST3LANEpost";
    case AArch64ISD::ST4LANEpost:       return "AArch64ISD::ST4LANEpost";
+  case AArch64ISD::SMULL:             return "AArch64ISD::SMULL";
+  case AArch64ISD::UMULL:             return "AArch64ISD::UMULL";
    }
  }
  
@@ -1036,6 +1124,8 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
  
  static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                               SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
+  SDValue Cmp;
+  AArch64CC::CondCode AArch64CC;
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
      EVT VT = RHS.getValueType();
      uint64_t C = RHSC->getZExtValue();
@@ -1067,9 +1157,9 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
          break;
        case ISD::SETLE:
        case ISD::SETGT:
-        if ((VT == MVT::i32 && C != 0x7fffffff &&
+        if ((VT == MVT::i32 && C != INT32_MAX &&
               isLegalArithImmed((uint32_t)(C + 1))) ||
-            (VT == MVT::i64 && C != 0x7ffffffffffffffULL &&
+            (VT == MVT::i64 && C != INT64_MAX &&
               isLegalArithImmed(C + 1ULL))) {
            CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
            C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
@@ -1078,9 +1168,9 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
          break;
        case ISD::SETULE:
        case ISD::SETUGT:
-        if ((VT == MVT::i32 && C != 0xffffffff &&
+        if ((VT == MVT::i32 && C != UINT32_MAX &&
               isLegalArithImmed((uint32_t)(C + 1))) ||
-            (VT == MVT::i64 && C != 0xfffffffffffffffULL &&
+            (VT == MVT::i64 && C != UINT64_MAX &&
               isLegalArithImmed(C + 1ULL))) {
            CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
            C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
@@ -1090,9 +1180,45 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
        }
      }
    }
-
-  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
-  AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
+  // For the i8 operand, the largest immediate is 255, so this can be easily
+  // encoded in the compare instruction. For the i16 operand, however, the
+  // largest immediate cannot be encoded in the compare.
+  // Therefore, use a sign extending load and cmn to avoid materializing the -1
+  // constant. For example,
+  // movz w1, #65535
+  // ldrh w0, [x0, #0]
+  // cmp w0, w1
+  // becomes:
+  // ldrsh w0, [x0, #0]
+  // cmn w0, #1
+  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
+  // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure
+  // both the LHS and RHS are truly zero extended and to make sure the
+  // transformation is profitable.
+  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
+    if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) &&
+        isa<LoadSDNode>(LHS)) {
+      if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
+          cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
+          LHS.getNode()->hasNUsesOfValue(1, 0)) {
+        int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
+        if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
+          SDValue SExt =
+              DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
+                          DAG.getValueType(MVT::i16));
+          Cmp = emitComparison(SExt,
+                               DAG.getConstant(ValueofRHS, RHS.getValueType()),
+                               CC, dl, DAG);
+          AArch64CC = changeIntCCToAArch64CC(CC);
+          AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
+          return Cmp;
+        }
+      }
+    }
+  }
+  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+  AArch64CC = changeIntCCToAArch64CC(CC);
    AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
    return Cmp;
  }
@@ -1416,7 +1542,10 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
  
    if (VT.getSizeInBits() > InVT.getSizeInBits()) {
      SDLoc dl(Op);
-    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0));
+    MVT ExtVT =
+        MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
+                         VT.getVectorNumElements());
+    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
      return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
    }
  
@@ -1521,7 +1650,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
        (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
    SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
  
-  StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+  StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
    TargetLowering::CallLoweringInfo CLI(DAG);
    CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
      .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0);
@@ -1545,6 +1674,197 @@ static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
        0);
  }
  
+static EVT getExtensionTo64Bits(const EVT &OrigVT) {
+  if (OrigVT.getSizeInBits() >= 64)
+    return OrigVT;
+
+  assert(OrigVT.isSimple() && "Expecting a simple value type");
+
+  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
+  switch (OrigSimpleTy) {
+  default: llvm_unreachable("Unexpected Vector Type");
+  case MVT::v2i8:
+  case MVT::v2i16:
+     return MVT::v2i32;
+  case MVT::v4i8:
+    return  MVT::v4i16;
+  }
+}
+
+static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
+                                                 const EVT &OrigTy,
+                                                 const EVT &ExtTy,
+                                                 unsigned ExtOpcode) {
+  // The vector originally had a type of OrigTy. It was then extended to ExtTy.
+  // We expect the ExtTy to be 128 bits total. If the OrigTy is less than
+  // 64 bits we need to insert a new extension so that it will be 64 bits.
+  assert(ExtTy.is128BitVector() && "Unexpected extension size");
+  if (OrigTy.getSizeInBits() >= 64)
+    return N;
+
+  // Must extend size to at least 64 bits to be used as an operand for S/UMULL.
+  EVT NewVT = getExtensionTo64Bits(OrigTy);
+
+  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
+}
+
+static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
+                                   bool isSigned) {
+  EVT VT = N->getValueType(0);
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    SDNode *Elt = N->getOperand(i).getNode();
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
+      unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+      unsigned HalfSize = EltSize / 2;
+      if (isSigned) {
+        if (!isIntN(HalfSize, C->getSExtValue()))
+          return false;
+      } else {
+        if (!isUIntN(HalfSize, C->getZExtValue()))
+          return false;
+      }
+      continue;
+    }
+    return false;
+  }
+
+  return true;
+}
+
+static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+    return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
+                                             N->getOperand(0)->getValueType(0),
+                                             N->getValueType(0),
+                                             N->getOpcode());
+
+  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
+  EVT VT = N->getValueType(0);
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
+  unsigned NumElts = VT.getVectorNumElements();
+  MVT TruncVT = MVT::getIntegerVT(EltSize);
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
+    const APInt &CInt = C->getAPIntValue();
+    // Element types smaller than 32 bits are not legal, so use i32 elements.
+    // The values are implicitly truncated so sext vs. zext doesn't matter.
+    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
+                     MVT::getVectorVT(TruncVT, NumElts), Ops);
+}
+
+static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND)
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, true))
+    return true;
+  return false;
+}
+
+static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::ZERO_EXTEND)
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, false))
+    return true;
+  return false;
+}
+
+static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
+  }
+  return false;
+}
+
+static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
+  }
+  return false;
+}
+
+static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
+  // Multiplications are only custom-lowered for 128-bit vectors so that
+  // S/UMULL can be detected.  Otherwise v2i64 multiplications are not legal.
+  EVT VT = Op.getValueType();
+  assert(VT.is128BitVector() && VT.isInteger() &&
+         "unexpected type for custom-lowering ISD::MUL");
+  SDNode *N0 = Op.getOperand(0).getNode();
+  SDNode *N1 = Op.getOperand(1).getNode();
+  unsigned NewOpc = 0;
+  bool isMLA = false;
+  bool isN0SExt = isSignExtended(N0, DAG);
+  bool isN1SExt = isSignExtended(N1, DAG);
+  if (isN0SExt && isN1SExt)
+    NewOpc = AArch64ISD::SMULL;
+  else {
+    bool isN0ZExt = isZeroExtended(N0, DAG);
+    bool isN1ZExt = isZeroExtended(N1, DAG);
+    if (isN0ZExt && isN1ZExt)
+      NewOpc = AArch64ISD::UMULL;
+    else if (isN1SExt || isN1ZExt) {
+      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
+      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
+      if (isN1SExt && isAddSubSExt(N0, DAG)) {
+        NewOpc = AArch64ISD::SMULL;
+        isMLA = true;
+      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
+        NewOpc =  AArch64ISD::UMULL;
+        isMLA = true;
+      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
+        std::swap(N0, N1);
+        NewOpc =  AArch64ISD::UMULL;
+        isMLA = true;
+      }
+    }
+
+    if (!NewOpc) {
+      if (VT == MVT::v2i64)
+        // Fall through to expand this.  It is not legal.
+        return SDValue();
+      else
+        // Other vector multiplications are legal.
+        return Op;
+    }
+  }
+
+  // Legalize to a S/UMULL instruction
+  SDLoc DL(Op);
+  SDValue Op0;
+  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
+  if (!isMLA) {
+    Op0 = skipExtensionForVectorMULL(N0, DAG);
+    assert(Op0.getValueType().is64BitVector() &&
+           Op1.getValueType().is64BitVector() &&
+           "unexpected types for extended operands to VMULL");
+    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+  }
+  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
+  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
+  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
+  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
+  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
+  EVT Op1VT = Op1.getValueType();
+  return DAG.getNode(N0->getOpcode(), DL, VT,
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
+}
  
  SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                                SelectionDAG &DAG) const {
@@ -1645,6 +1965,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
      return LowerFP_TO_INT(Op, DAG);
    case ISD::FSINCOS:
      return LowerFSINCOS(Op, DAG);
+  case ISD::MUL:
+    return LowerMUL(Op, DAG);
    }
  }
  
@@ -1659,8 +1981,7 @@ unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const {
  
  #include "AArch64GenCallingConv.inc"
  
-/// Selects the correct CCAssignFn for a the given CallingConvention
-/// value.
+/// Selects the correct CCAssignFn for a given CallingConvention value.
  CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                       bool IsVarArg) const {
    switch (CC) {
@@ -1793,7 +2114,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
        unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
  
        uint32_t BEAlign = 0;
-      if (ArgSize < 8 && !Subtarget->isLittleEndian())
+      if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
+          !Ins[i].Flags.isInConsecutiveRegs())
          BEAlign = 8 - ArgSize;
  
        int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
@@ -1825,7 +2147,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
  
        ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
                                  MachinePointerInfo::getFixedStack(FI),
-                                MemVT, false, false, false, 0, nullptr);
+                                MemVT, false, false, false, 0);
  
        InVals.push_back(ArgValue);
      }
@@ -2036,7 +2358,9 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
    // cannot rely on the linker replacing the tail call with a return.
    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
      const GlobalValue *GV = G->getGlobal();
-    if (GV->hasExternalWeakLinkage())
+    const Triple TT(getTargetMachine().getTargetTriple());
+    if (GV->hasExternalWeakLinkage() &&
+        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
        return false;
    }
  
@@ -2347,7 +2671,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
        unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
                                          : VA.getValVT().getSizeInBits();
        OpSize = (OpSize + 7) / 8;
-      if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
+      if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
+          !Flags.isInConsecutiveRegs()) {
          if (OpSize < 8)
            BEAlign = 8 - OpSize;
        }
@@ -2590,7 +2915,8 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
                                                    SelectionDAG &DAG) const {
    EVT PtrVT = getPointerTy();
    SDLoc DL(Op);
-  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = GN->getGlobal();
    unsigned char OpFlags =
        Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
  
@@ -2605,6 +2931,25 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
      return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
    }
  
+  if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) {
+    assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
+           "use of MO_CONSTPOOL only supported on small model");
+    SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE);
+    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+    unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
+    SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags);
+    SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+    SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr,
+                                     MachinePointerInfo::getConstantPool(),
+                                     /*isVolatile=*/ false,
+                                     /*isNonTemporal=*/ true,
+                                     /*isInvariant=*/ true, 8);
+    if (GN->getOffset() != 0)
+      return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr,
+                         DAG.getConstant(GN->getOffset(), PtrVT));
+    return GlobalAddr;
+  }
+
    if (getTargetMachine().getCodeModel() == CodeModel::Large) {
      const unsigned char MO_NC = AArch64II::MO_NC;
      return DAG.getNode(
@@ -3100,6 +3445,9 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
            AttributeSet::FunctionIndex, Attribute::NoImplicitFloat))
      return SDValue();
  
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
    // While there is no integer popcount instruction, it can
    // be more efficiently lowered to the following sequence that uses
    // AdvSIMD registers/instructions as long as the copies to/from
@@ -4216,7 +4564,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
      SDValue SourceVec = V.getOperand(0);
      auto Source = std::find(Sources.begin(), Sources.end(), SourceVec);
      if (Source == Sources.end())
-      Sources.push_back(ShuffleSourceInfo(SourceVec));
+      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
  
      // Update the minimum and maximum lane number seen.
      unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
@@ -4255,8 +4603,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
      // This stage of the search produces a source with the same element type as
      // the original, but with a total width matching the BUILD_VECTOR output.
      EVT EltVT = SrcVT.getVectorElementType();
-    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
-                                  VT.getSizeInBits() / EltVT.getSizeInBits());
+    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
+    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
  
      if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
        assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
@@ -4270,28 +4618,30 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
  
      assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
  
-    if (Src.MaxElt - Src.MinElt >= NumElts) {
+    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
        // Span too large for a VEXT to cope
        return SDValue();
      }
  
-    if (Src.MinElt >= NumElts) {
+    if (Src.MinElt >= NumSrcElts) {
        // The extraction can just take the second half
        Src.ShuffleVec =
            DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
-                      DAG.getIntPtrConstant(NumElts));
-      Src.WindowBase = -NumElts;
-    } else if (Src.MaxElt < NumElts) {
+                      DAG.getConstant(NumSrcElts, MVT::i64));
+      Src.WindowBase = -NumSrcElts;
+    } else if (Src.MaxElt < NumSrcElts) {
        // The extraction can just take the first half
-      Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT,
-                                   Src.ShuffleVec, DAG.getIntPtrConstant(0));
+      Src.ShuffleVec =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getConstant(0, MVT::i64));
      } else {
        // An actual VEXT is needed
-      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT,
-                                     Src.ShuffleVec, DAG.getIntPtrConstant(0));
+      SDValue VEXTSrc1 =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+                      DAG.getConstant(0, MVT::i64));
        SDValue VEXTSrc2 =
            DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
-                      DAG.getIntPtrConstant(NumElts));
+                      DAG.getConstant(NumSrcElts, MVT::i64));
        unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
  
        Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
@@ -4687,7 +5037,8 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
          VT.getVectorElementType() == MVT::f32)
        return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
      // vrev <4 x i16> -> REV32
-    if (VT.getVectorElementType() == MVT::i16)
+    if (VT.getVectorElementType() == MVT::i16 ||
+        VT.getVectorElementType() == MVT::f16)
        return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
      // vrev <4 x i8> -> REV16
      assert(VT.getVectorElementType() == MVT::i8);
@@ -4807,7 +5158,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
  static unsigned getDUPLANEOp(EVT EltType) {
    if (EltType == MVT::i8)
      return AArch64ISD::DUPLANE8;
-  if (EltType == MVT::i16)
+  if (EltType == MVT::i16 || EltType == MVT::f16)
      return AArch64ISD::DUPLANE16;
    if (EltType == MVT::i32 || EltType == MVT::f32)
      return AArch64ISD::DUPLANE32;
@@ -4937,7 +5288,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
      SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64);
  
      EVT ScalarVT = VT.getVectorElementType();
-    if (ScalarVT.getSizeInBits() < 32)
+
+    if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
        ScalarVT = MVT::i32;
  
      return DAG.getNode(
@@ -5025,7 +5377,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5034,7 +5386,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5043,7 +5395,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5052,7 +5404,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5061,7 +5413,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5070,7 +5422,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
      }
  
@@ -5225,7 +5577,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5234,7 +5586,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5243,7 +5595,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5252,7 +5604,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5261,7 +5613,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5270,7 +5622,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
      }
  
@@ -5343,13 +5695,13 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          if (VT.getSizeInBits() == 128) {
            SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
                                      DAG.getConstant(CnstVal, MVT::i32));
-          return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+          return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
          }
  
          // Support the V64 version via subregister insertion.
          SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
                                    DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
@@ -5358,7 +5710,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5367,7 +5719,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5376,7 +5728,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5385,7 +5737,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5394,7 +5746,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5403,7 +5755,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
@@ -5412,7 +5764,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(264, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
@@ -5421,7 +5773,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(272, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
@@ -5429,7 +5781,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
          SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        // The few faces of FMOV...
@@ -5438,7 +5790,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
          SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
@@ -5446,7 +5798,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
          SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
                                    DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        // The many faces of MVNI...
@@ -5457,7 +5809,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5466,7 +5818,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5475,7 +5827,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5484,7 +5836,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5493,7 +5845,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5502,7 +5854,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
@@ -5511,7 +5863,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(264, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
  
        if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
@@ -5520,7 +5872,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
          SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
                                    DAG.getConstant(CnstVal, MVT::i32),
                                    DAG.getConstant(272, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }
      }
  
@@ -5696,11 +6048,12 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
  
    // Insertion/extraction are legal for V128 types.
    if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+      VT == MVT::v8f16)
      return Op;
  
    if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32)
+      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
      return SDValue();
  
    // For V64 types, we perform insertion by expanding the value
@@ -5729,11 +6082,12 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
  
    // Insertion/extraction are legal for V128 types.
    if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+      VT == MVT::v8f16)
      return Op;
  
    if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32)
+      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
      return SDValue();
  
    // For V64 types, we perform extraction by expanding the value
@@ -6476,7 +6830,7 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
    SDValue N0 = N->getOperand(0);
    unsigned Lg2 = Divisor.countTrailingZeros();
    SDValue Zero = DAG.getConstant(0, VT);
-  SDValue Pow2MinusOne = DAG.getConstant((1 << Lg2) - 1, VT);
+  SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, VT);
  
    // Add (N0 < 0) ? Pow2 - 1 : 0;
    SDValue CCVal;
@@ -6608,7 +6962,8 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
    return SDValue();
  }
  
-static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
+                                     const AArch64Subtarget *Subtarget) {
    // First try to optimize away the conversion when it's conditionally from
    // a constant. Vectors only.
    SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
@@ -6627,7 +6982,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
    // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
    // This eliminates an "integer-to-vector-move UOP and improve throughput.
    SDValue N0 = N->getOperand(0);
-  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+  if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
        // Do not change the width of a volatile load.
        !cast<LoadSDNode>(N0)->isVolatile()) {
      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
@@ -7388,11 +7743,11 @@ static SDValue performExtendCombine(SDNode *N,
    // If the vector type isn't a simple VT, it's beyond the scope of what
    // we're  worried about here. Let legalization do its thing and hope for
    // the best.
-  if (!ResVT.isSimple())
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src->getValueType(0);
+  if (!ResVT.isSimple() || !SrcVT.isSimple())
      return SDValue();
  
-  SDValue Src = N->getOperand(0);
-  MVT SrcVT = Src->getValueType(0).getSimpleVT();
    // If the source VT is a 64-bit vector, we can play games and get the
    // better results we want.
    if (SrcVT.getSizeInBits() != 64)
@@ -7416,9 +7771,9 @@ static SDValue performExtendCombine(SDNode *N,
    EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
                                 LoVT.getVectorNumElements());
    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
-                   DAG.getIntPtrConstant(0));
+                   DAG.getConstant(0, MVT::i64));
    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
-                   DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+                   DAG.getConstant(InNVT.getVectorNumElements(), MVT::i64));
    Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
    Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
  
@@ -7540,9 +7895,9 @@ static SDValue performSTORECombine(SDNode *N,
    EVT HalfVT =
        EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
    SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
-                                   DAG.getIntPtrConstant(0));
+                                   DAG.getConstant(0, MVT::i64));
    SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
-                                   DAG.getIntPtrConstant(NumElts));
+                                   DAG.getConstant(NumElts, MVT::i64));
    SDValue BasePtr = S->getBasePtr();
    SDValue NewST1 =
        DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
@@ -7626,7 +7981,7 @@ static SDValue performPostLD1Combine(SDNode *N,
      Ops.push_back(Inc);
  
      EVT Tys[3] = { VT, MVT::i64, MVT::Other };
-    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, 3));
+    SDVTList SDTys = DAG.getVTList(Tys);
      unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
      SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
                                             MemVT,
@@ -7756,7 +8111,7 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
        Tys[n] = VecTy;
      Tys[n++] = MVT::i64;  // Type of write back register
      Tys[n] = MVT::Other;  // Type of the chain
-    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs + 2));
+    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
  
      MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
      SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
@@ -7777,10 +8132,272 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
    return SDValue();
  }
  
+// Checks whether V is known to carry a value of the prescribed bit width and
+// reports how that value was extended.
+//
+//   V       - the value to inspect.
+//   width   - the expected width in bits. Loads and AssertSext/AssertZext
+//             nodes are only matched for 8 (MVT::i8) and 16 (MVT::i16).
+//   ExtType - out parameter. It is always reset to ISD::NON_EXTLOAD on entry;
+//             on a match it becomes the load's own extension type,
+//             ISD::SEXTLOAD for AssertSext, or ISD::ZEXTLOAD for AssertZext.
+//             Constants leave it as ISD::NON_EXTLOAD.
+//
+// Returns true if V is a load, AssertSext or AssertZext of the matching narrow
+// type, or a (target) constant whose signed value lies strictly between
+// -2^(width-1) and 2^(width-1). Returns false for every other node.
+static
+bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
+  ExtType = ISD::NON_EXTLOAD;
+  switch(V.getNode()->getOpcode()) {
+  default:
+    break;
+  case ISD::LOAD: {
+    LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
+    if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
+       || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
+      ExtType = LoadNode->getExtensionType();
+      return true;
+    }
+    break;
+  }
+  case ISD::AssertSext: {
+    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
+    if ((TypeNode->getVT() == MVT::i8 && width == 8)
+       || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+      ExtType = ISD::SEXTLOAD;
+      return true;
+    }
+    break;
+  }
+  case ISD::AssertZext: {
+    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
+    if ((TypeNode->getVT() == MVT::i8 && width == 8)
+       || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+      ExtType = ISD::ZEXTLOAD;
+      return true;
+    }
+    break;
+  }
+  case ISD::Constant:
+  case ISD::TargetConstant: {
+    // Compare against both bounds explicitly instead of taking the absolute
+    // value: negating the most negative int64_t is undefined behaviour, and
+    // the explicit form accepts exactly the same set of constants otherwise.
+    const int64_t Val = cast<ConstantSDNode>(V.getNode())->getSExtValue();
+    const int64_t Limit = 1LL << (width - 1);
+    if (Val > -Limit && Val < Limit)
+      return true;
+    break;
+  }
+  }
+
+  // Either the opcode is not one we understand or the width did not match.
+  return false;
+}
+
+// This function does a whole lot of voodoo to determine if the tests are
+// equivalent without and with a mask. Essentially what happens is that given a
+// DAG resembling:
+//
+//  +-------------+ +-------------+ +-------------+ +-------------+
+//  |    Input    | | AddConstant | | CompConstant| |     CC      |
+//  +-------------+ +-------------+ +-------------+ +-------------+
+//           |           |           |               |
+//           V           V           |    +----------+
+//          +-------------+  +----+  |    |
+//          |     ADD     |  |0xff|  |    |
+//          +-------------+  +----+  |    |
+//                  |           |    |    |
+//                  V           V    |    |
+//                 +-------------+   |    |
+//                 |     AND     |   |    |
+//                 +-------------+   |    |
+//                      |            |    |
+//                      +-----+      |    |
+//                            |      |    |
+//                            V      V    V
+//                           +-------------+
+//                           |     CMP     |
+//                           +-------------+
+//
+// The AND node may be safely removed for some combinations of inputs. In
+// particular we need to take into account the extension type of the Input,
+// the exact values of AddConstant, CompConstant, and CC, along with the nominal
+// width of the input (this can work for any width inputs, the above graph is
+// specific to 8 bits).
+//
+// The specific equations were worked out by generating output tables for each
+// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
+// problem was simplified by working with 4 bit inputs, which means we only
+// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
+// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
+// patterns present in both extensions (0,7). For every distinct set of
+// AddConstant and CompConstant bit patterns we can consider the masked and
+// unmasked versions to be equivalent if the result of the function below is
+// true for all 16 distinct bit patterns for the current extension type of
+// Input (w0).
+//
+//   sub      w8, w0, w1
+//   and      w10, w8, #0x0f
+//   cmp      w8, w2
+//   cset     w9, AArch64CC
+//   cmp      w10, w2
+//   cset     w11, AArch64CC
+//   cmp      w9, w11
+//   cset     w0, eq
+//   ret
+//
+// Since the above function shows when the outputs are equivalent it defines
+// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
+// would be expensive to run during compiles. The equations below were written
+// in a test harness that confirmed they gave equivalent outputs to the above
+// function for all inputs, so they can be used to determine if the removal is
+// legal instead.
+//
+// isEquivalentMaskless() is the code for testing if the AND can be removed
+// factored out of the DAG recognition as the DAG can take several forms.
+//
+//   CC           - the condition code, matched against AArch64CC values.
+//   width        - nominal bit width of the masked value.
+//   ExtType      - extension type of Input; only ISD::SEXTLOAD is treated
+//                  specially, every other value takes the unadjusted path.
+//   AddConstant  - the constant operand of the ADD.
+//   CompConstant - the constant the result is compared against.
+//
+// Returns true when the equations say the mask is redundant, false otherwise
+// (including for AArch64CC::Invalid).
+
+static
+bool isEquivalentMaskless(unsigned CC, unsigned width,
+                          ISD::LoadExtType ExtType, signed AddConstant,
+                          signed CompConstant) {
+  // By being careful about our equations and only writing them in terms of
+  // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
+  // make them generally applicable to all bit widths.
+  // Note: despite its name, MaxUInt holds 1 << width, i.e. one more than the
+  // largest unsigned value representable in `width` bits.
+  signed MaxUInt = (1 << width);
+
+  // For the purposes of these comparisons sign extending the type is
+  // equivalent to zero extending the add and displacing it by half the integer
+  // width. Provided we are careful and make sure our equations are valid over
+  // the whole range we can just adjust the input and avoid writing equations
+  // for sign extended inputs.
+  if (ExtType == ISD::SEXTLOAD)
+    AddConstant -= (1 << (width-1));
+
+  // Each pair of condition codes below shares one set of equations; a case
+  // whose equations do not hold breaks out to the final `return false`.
+  switch(CC) {
+  case AArch64CC::LE:
+  case AArch64CC::GT: {
+    if ((AddConstant == 0) ||
+        (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
+        (AddConstant >= 0 && CompConstant < 0) ||
+        (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
+      return true;
+  } break;
+  case AArch64CC::LT:
+  case AArch64CC::GE: {
+    if ((AddConstant == 0) ||
+        (AddConstant >= 0 && CompConstant <= 0) ||
+        (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
+      return true;
+  } break;
+  case AArch64CC::HI:
+  case AArch64CC::LS: {
+    if ((AddConstant >= 0 && CompConstant < 0) ||
+       (AddConstant <= 0 && CompConstant >= -1 &&
+        CompConstant < AddConstant + MaxUInt))
+      return true;
+  } break;
+  case AArch64CC::PL:
+  case AArch64CC::MI: {
+    if ((AddConstant == 0) ||
+        (AddConstant > 0 && CompConstant <= 0) ||
+        (AddConstant < 0 && CompConstant <= AddConstant))
+      return true;
+  } break;
+  case AArch64CC::LO:
+  case AArch64CC::HS: {
+    if ((AddConstant >= 0 && CompConstant <= 0) ||
+        (AddConstant <= 0 && CompConstant >= 0 &&
+         CompConstant <= AddConstant + MaxUInt))
+      return true;
+  } break;
+  case AArch64CC::EQ:
+  case AArch64CC::NE: {
+    if ((AddConstant > 0 && CompConstant < 0) ||
+        (AddConstant < 0 && CompConstant >= 0 &&
+         CompConstant < AddConstant + MaxUInt) ||
+        (AddConstant >= 0 && CompConstant >= 0 &&
+         CompConstant >= AddConstant) ||
+        (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
+
+      return true;
+  } break;
+  // These condition codes are accepted unconditionally.
+  case AArch64CC::VS:
+  case AArch64CC::VC:
+  case AArch64CC::AL:
+  case AArch64CC::NV:
+    return true;
+  case AArch64CC::Invalid:
+    break;
+  }
+
+  return false;
+}
+
+// Tries to drop a redundant 8- or 16-bit AND mask that feeds the SUBS whose
+// result is consumed by N. The shape matched is:
+//
+//   SUBS(AND(ADD(Input, AddConstant), 0xff or 0xffff), CompConstant)
+//
+//   N        - the node being combined.
+//   DCI      - combiner info; not used by this function.
+//   DAG      - the DAG in which the replacement SUBS is created.
+//   CCIndex  - index of the operand of N holding the condition code constant.
+//   CmpIndex - index of the operand of N produced by the compare node.
+//
+// When checkValueWidth() accepts all three inputs and isEquivalentMaskless()
+// reports the mask as redundant, every use of the old SUBS is replaced by a
+// new SUBS fed directly from the ADD and SDValue(N, 0) is returned. In all
+// other cases an empty SDValue is returned and the DAG is left untouched.
+static
+SDValue performCONDCombine(SDNode *N,
+                           TargetLowering::DAGCombinerInfo &DCI,
+                           SelectionDAG &DAG, unsigned CCIndex,
+                           unsigned CmpIndex) {
+  unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
+  SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
+  unsigned CondOpcode = SubsNode->getOpcode();
+
+  if (CondOpcode != AArch64ISD::SUBS)
+    return SDValue();
+
+  // There is a SUBS feeding this condition. Is it fed by a mask we can
+  // use?
+
+  SDNode *AndNode = SubsNode->getOperand(0).getNode();
+  unsigned MaskBits = 0;
+
+  if (AndNode->getOpcode() != ISD::AND)
+    return SDValue();
+
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
+    // Keep the mask at its full 64-bit width. Narrowing it to 32 bits would
+    // make a wider mask such as 0x1000000ff look like 0xff and let the AND be
+    // removed even though it does not mask to 8 bits.
+    uint64_t CNV = CN->getZExtValue();
+    if (CNV == 255)
+      MaskBits = 8;
+    else if (CNV == 65535)
+      MaskBits = 16;
+  }
+
+  if (!MaskBits)
+    return SDValue();
+
+  SDValue AddValue = AndNode->getOperand(0);
+
+  if (AddValue.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // The basic dag structure is correct, grab the inputs and validate them.
+
+  SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
+  SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
+  SDValue SubsInputValue = SubsNode->getOperand(1);
+
+  // The mask is present and the provenance of all the values is a smaller type,
+  // lets see if the mask is superfluous.
+
+  if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
+      !isa<ConstantSDNode>(SubsInputValue.getNode()))
+    return SDValue();
+
+  ISD::LoadExtType ExtType;
+
+  // checkValueWidth() overwrites ExtType on every call, so the non-constant
+  // ADD input is checked last: its extension type is the one that must reach
+  // isEquivalentMaskless().
+  if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
+      !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
+      !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
+    return SDValue();
+
+  if(!isEquivalentMaskless(CC, MaskBits, ExtType,
+                cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
+                cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
+    return SDValue();
+
+  // The AND is not necessary, remove it.
+
+  SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
+                               SubsNode->getValueType(1));
+  SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
+
+  SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
+  DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
+
+  return SDValue(N, 0);
+}
+
  // Optimize compare with zero and branch.
  static SDValue performBRCONDCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      SelectionDAG &DAG) {
+  SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3);
+  if (NV.getNode())
+    N = NV.getNode();
    SDValue Chain = N->getOperand(0);
    SDValue Dest = N->getOperand(1);
    SDValue CCVal = N->getOperand(2);
@@ -7869,21 +8486,29 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
    SDValue N0 = N->getOperand(0);
    EVT ResVT = N->getValueType(0);
  
-  if (!N->getOperand(1).getValueType().isVector())
+  if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
      return SDValue();
  
-  if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
+  // If NumMaskElts == 0, the comparison is larger than select result. The
+  // largest real NEON comparison is 64-bits per lane, which means the result is
+  // at most 32-bits and an illegal vector. Just bail out for now.
+  EVT SrcVT = N0.getOperand(0).getValueType();
+
+  // Don't try to do this optimization when the setcc itself has i1 operands.
+  // There are no legal vectors of i1, so this would be pointless.
+  if (SrcVT == MVT::i1)
      return SDValue();
  
-  SDLoc DL(N0);
+  int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
+  if (!ResVT.isVector() || NumMaskElts == 0)
+    return SDValue();
  
-  EVT SrcVT = N0.getOperand(0).getValueType();
-  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT,
-                           ResVT.getSizeInBits() / SrcVT.getSizeInBits());
+  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
    EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
  
    // First perform a vector comparison, where lane 0 is the one we're interested
    // in.
+  SDLoc DL(N0);
    SDValue LHS =
        DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
    SDValue RHS =
@@ -7893,8 +8518,8 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
    // Now duplicate the comparison mask we want across all other lanes.
    SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
    SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
-  Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(),
-                     Mask);
+  Mask = DAG.getNode(ISD::BITCAST, DL,
+                     ResVT.changeVectorElementTypeToInteger(), Mask);
  
    return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
  }
@@ -7914,7 +8539,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
      return performMulCombine(N, DAG, DCI, Subtarget);
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP:
-    return performIntToFpCombine(N, DAG);
+    return performIntToFpCombine(N, DAG, Subtarget);
    case ISD::OR:
      return performORCombine(N, DCI, Subtarget);
    case ISD::INTRINSIC_WO_CHAIN:
@@ -7935,6 +8560,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
      return performSTORECombine(N, DCI, DAG, Subtarget);
    case AArch64ISD::BRCOND:
      return performBRCONDCombine(N, DCI, DAG);
+  case AArch64ISD::CSEL:
+    return performCONDCombine(N, DCI, DAG, 2, 3);
    case AArch64ISD::DUP:
      return performPostLD1Combine(N, DCI, false);
    case ISD::INSERT_VECTOR_ELT:
@@ -8090,13 +8717,12 @@ bool AArch64TargetLowering::getPostIndexedAddressParts(
  
  static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                    SelectionDAG &DAG) {
-  if (N->getValueType(0) != MVT::i16)
-    return;
-
    SDLoc DL(N);
    SDValue Op = N->getOperand(0);
-  assert(Op.getValueType() == MVT::f16 &&
-         "Inconsistent bitcast? Only 16-bit types should be i16 or f16");
+
+  if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
+    return;
+
    Op = SDValue(
        DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
                           DAG.getUNDEF(MVT::i32), Op,
@@ -8122,23 +8748,16 @@ void AArch64TargetLowering::ReplaceNodeResults(
    }
  }
  
-bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
-  // Loads and stores less than 128-bits are already atomic; ones above that
-  // are doomed anyway, so defer to the default libcall and blame the OS when
-  // things go wrong:
-  if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
-    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128;
-  else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
-    return LI->getType()->getPrimitiveSizeInBits() == 128;
-
-  // For the real atomic operations, we have ldxr/stxr up to 128 bits.
-  return Inst->getType()->getPrimitiveSizeInBits() <= 128;
-}
-
  bool AArch64TargetLowering::useLoadStackGuardNode() const {
    return true; // Unconditional: nothing about the subtarget is consulted.
  }
  
+bool AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
+  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
+  // reciprocal if there are three or more FDIVs.
+  return NumUsers > 2; // Strictly greater: one or two users yield false.
+}
+
  TargetLoweringBase::LegalizeTypeAction
  AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
    MVT SVT = VT.getSimpleVT();
@@ -8151,6 +8770,32 @@ AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
    return TargetLoweringBase::getPreferredVectorAction(VT);
  }
  
+// Expand in IR only when the stored value is exactly 128 bits wide. Narrower
+// stores are already atomic; wider ones are doomed anyway, so defer to the
+// default libcall and blame the OS when things go wrong.
+bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
+  return Size == 128;
+}
+
+// Expand in IR only when the loaded type is exactly 128 bits wide. Narrower
+// loads are already atomic; wider ones are doomed anyway, so defer to the
+// default libcall and blame the OS when things go wrong.
+bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
+  return Size == 128;
+}
+
+// For the real atomic operations, we have ldxr/stxr up to 128 bits.
+bool AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
+  return Size <= 128; // Unlike loads/stores, narrower sizes qualify too.
+}
+
+bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const {
+  return true; // Unconditional; emitLoadLinked/emitStoreConditional follow.
+}
+
  Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                               AtomicOrdering Ord) const {
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
@@ -8217,3 +8862,8 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
                  Val, Stxr->getFunctionType()->getParamType(0)),
        Addr);
  }
+
+bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
+    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
+  return Ty->isArrayTy(); // Arrays only; CallConv/isVarArg are unused.
+}