#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
cl::init(false));
+/// Value type used for condition codes.
+static const MVT MVT_CC = MVT::i32;
+
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::VSELECT);
+ setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
+ // [SU][MIN|MAX] and [SU]ABSDIFF are available for all NEON types apart from
+ // i64.
+ if (!VT.isFloatingPoint() &&
+ VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64)
+ for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
+ ISD::SABSDIFF, ISD::UABSDIFF})
+ setOperationAction(Opcode, VT.getSimpleVT(), Legal);
+
if (Subtarget->isLittleEndian()) {
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
addTypeForNEON(VT, MVT::v4i32);
}
-EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
+ EVT VT) const {
if (!VT.isVector())
return MVT::i32;
return VT.changeVectorElementTypeToInteger();
}
}
-MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
+MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
+ EVT) const {
return MVT::i64;
}
case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
+ case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
+ case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
+ case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
case AArch64ISD::FMIN: return "AArch64ISD::FMIN";
case AArch64ISD::FMAX: return "AArch64ISD::FMAX";
LHS = LHS.getOperand(0);
}
- return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
+ return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
.getValue(1);
}
+/// \defgroup AArch64CCMP CMP;CCMP matching
+///
+/// These functions deal with the formation of CMP;CCMP;... sequences.
+/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
+/// a comparison. They set the NZCV flags to a predefined value if their
+/// predicate is false. This allows us to express arbitrary conjunctions, for
+/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
+/// expressed as:
+/// cmp A
+/// ccmp B, inv(CB), CA
+/// check for CB flags
+///
+/// In general we can create code for arbitrary "... (and (and A B) C)"
+/// sequences. We can also implement some "or" expressions, because "(or A B)"
+/// is equivalent to "not (and (not A) (not B))" and we can implement some
+/// negation operations:
+/// We can negate the results of a single comparison by inverting the flags
+/// used when the predicate fails and inverting the flags tested in the next
+/// instruction; we can also negate the results of the whole previous
+/// conditional compare sequence by inverting the flags tested in the next
+/// instruction. However there is no way to negate the result of a partial
+/// sequence.
+///
+/// Therefore, on encountering an "or" expression we can negate the subtree on
+/// one side and have to be able to push the negate to the leaves of the
+/// subtree on the other side (see also the comments in code). As a complete
+/// example:
+/// "or (or (setCA (cmp A)) (setCB (cmp B)))
+/// (and (setCC (cmp C)) (setCD (cmp D)))"
+/// is transformed to
+///     "not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
+///               (and (not (setCA (cmp A))) (not (setCB (cmp B)))))"
+/// and implemented as:
+/// cmp C
+/// ccmp D, inv(CD), CC
+/// ccmp A, CA, inv(CD)
+/// ccmp B, CB, inv(CA)
+/// check for CB flags
+/// A counterexample is "or (and A B) (and C D)" which cannot be implemented
+/// by conditional compare sequences.
+/// @{
+
+/// Create a conditional comparison; use CCMP, CCMN or FCCMP as appropriate.
+static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC, SDValue CCOp,
+ SDValue Condition, unsigned NZCV,
+ SDLoc DL, SelectionDAG &DAG) {
+ unsigned Opcode = 0;
+ if (LHS.getValueType().isFloatingPoint())
+ Opcode = AArch64ISD::FCCMP;
+ else if (RHS.getOpcode() == ISD::SUB) {
+ SDValue SubOp0 = RHS.getOperand(0);
+ if (const ConstantSDNode *SubOp0C = dyn_cast<ConstantSDNode>(SubOp0))
+ if (SubOp0C->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ // See emitComparison() on why we can only do this for SETEQ and SETNE.
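+        // CCMN, like CMN, sets the flags based on LHS + RHS; for EQ/NE this
+        // matches comparing LHS against (0 - RHS) without materializing the
+        // negated constant.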
+ Opcode = AArch64ISD::CCMN;
+ RHS = RHS.getOperand(1);
+ }
+ }
+ if (Opcode == 0)
+ Opcode = AArch64ISD::CCMP;
+
+ SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
+ return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
+}
+
+/// Returns true if @p Val is a tree of AND/OR/SETCC operations.
+/// CanPushNegate is set to true if we can push a negate operation through
+/// the tree in a way that we are left with AND operations and negate operations
+/// at the leaves only, i.e. "not (or (or x y) z)" can be changed to
+/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
+/// brought into such a form.
+static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate,
+ unsigned Depth = 0) {
+ if (!Val.hasOneUse())
+ return false;
+ unsigned Opcode = Val->getOpcode();
+ if (Opcode == ISD::SETCC) {
+ CanPushNegate = true;
+ return true;
+ }
+ // Protect against stack overflow.
+ if (Depth > 15)
+ return false;
+ if (Opcode == ISD::AND || Opcode == ISD::OR) {
+ SDValue O0 = Val->getOperand(0);
+ SDValue O1 = Val->getOperand(1);
+ bool CanPushNegateL;
+ if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1))
+ return false;
+ bool CanPushNegateR;
+ if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1))
+ return false;
+    // We cannot push a negate through an AND operation (it would become an
+    // OR); we can, however, change a (not (or x y)) to (and (not x) (not y))
+    // if we can push the negate through the x/y subtrees.
+ CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR;
+ return true;
+ }
+ return false;
+}
+
+/// Emit a conjunction or disjunction tree with the CMP/FCMP followed by a
+/// chain of CCMP/FCCMP ops. See @ref AArch64CCMP.
+/// Tries to transform the given i1 producing node @p Val to a series of
+/// compare and conditional compare operations. @returns an NZCV flags
+/// producing node and sets @p OutCC to the flags that should be tested or
+/// returns SDValue() if the transformation was not possible.
+/// On recursive invocations @p PushNegate may be set to true to have negation
+/// effects pushed to the tree leaves; @p Predicate is an NZCV flag predicate
+/// for the comparisons in the current subtree; @p Depth limits the search
+/// depth to avoid stack overflow.
+static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
+ AArch64CC::CondCode &OutCC, bool PushNegate = false,
+ SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL,
+ unsigned Depth = 0) {
+ // We're at a tree leaf, produce a conditional comparison operation.
+ unsigned Opcode = Val->getOpcode();
+ if (Opcode == ISD::SETCC) {
+ SDValue LHS = Val->getOperand(0);
+ SDValue RHS = Val->getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
+ bool isInteger = LHS.getValueType().isInteger();
+ if (PushNegate)
+ CC = getSetCCInverse(CC, isInteger);
+ SDLoc DL(Val);
+ // Determine OutCC and handle FP special case.
+ if (isInteger) {
+ OutCC = changeIntCCToAArch64CC(CC);
+ } else {
+ assert(LHS.getValueType().isFloatingPoint());
+ AArch64CC::CondCode ExtraCC;
+ changeFPCCToAArch64CC(CC, OutCC, ExtraCC);
+      // Surprisingly, some floating point conditions can't be tested with a
+ // single condition code. Construct an additional comparison in this case.
+ // See comment below on how we deal with OR conditions.
+ if (ExtraCC != AArch64CC::AL) {
+ SDValue ExtraCmp;
+ if (!CCOp.getNode())
+ ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
+ else {
+ SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
+        // Note that we want the inverse of ExtraCC, so NZCV is not inverted.
+ unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC);
+ ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp,
+ NZCV, DL, DAG);
+ }
+ CCOp = ExtraCmp;
+ Predicate = AArch64CC::getInvertedCondCode(ExtraCC);
+ OutCC = AArch64CC::getInvertedCondCode(OutCC);
+ }
+ }
+
+    // Produce a normal comparison if we are first in the chain.
+ if (!CCOp.getNode())
+ return emitComparison(LHS, RHS, CC, DL, DAG);
+ // Otherwise produce a ccmp.
+ SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC);
+ AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
+ unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
+ return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL,
+ DAG);
+ } else if (Opcode != ISD::AND && Opcode != ISD::OR)
+ return SDValue();
+
+ assert((Opcode == ISD::OR || !PushNegate)
+ && "Can only push negate through OR operation");
+
+ // Check if both sides can be transformed.
+ SDValue LHS = Val->getOperand(0);
+ SDValue RHS = Val->getOperand(1);
+ bool CanPushNegateL;
+ if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1))
+ return SDValue();
+ bool CanPushNegateR;
+ if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1))
+ return SDValue();
+
+ // Do we need to negate our operands?
+ bool NegateOperands = Opcode == ISD::OR;
+ // We can negate the results of all previous operations by inverting the
+  // predicate flags, giving us a free negation for one side. For the other
+  // side we need to be able to push the negation to the leaves of the tree.
+ if (NegateOperands) {
+ if (!CanPushNegateL && !CanPushNegateR)
+ return SDValue();
+ // Order the side where we can push the negate through to LHS.
+ if (!CanPushNegateL && CanPushNegateR) {
+ std::swap(LHS, RHS);
+ CanPushNegateL = true;
+ }
+ }
+
+ // Emit RHS. If we want to negate the tree we only need to push a negate
+ // through if we are already in a PushNegate case, otherwise we can negate
+ // the "flags to test" afterwards.
+ AArch64CC::CondCode RHSCC;
+ SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate,
+ CCOp, Predicate, Depth+1);
+ if (NegateOperands && !PushNegate)
+ RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
+ // Emit LHS. We must push the negate through if we need to negate it.
+ SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands,
+ CmpR, RHSCC, Depth+1);
+  // If we transformed an OR to an AND then we have to negate the result
+ // (or absorb a PushNegate resulting in a double negation).
+ if (Opcode == ISD::OR && !PushNegate)
+ OutCC = AArch64CC::getInvertedCondCode(OutCC);
+ return CmpL;
+}
+
+/// @}
+
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
- SDValue Cmp;
- AArch64CC::CondCode AArch64CC;
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
EVT VT = RHS.getValueType();
uint64_t C = RHSC->getZExtValue();
}
}
}
- // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
- // For the i8 operand, the largest immediate is 255, so this can be easily
- // encoded in the compare instruction. For the i16 operand, however, the
- // largest immediate cannot be encoded in the compare.
- // Therefore, use a sign extending load and cmn to avoid materializing the -1
- // constant. For example,
- // movz w1, #65535
- // ldrh w0, [x0, #0]
- // cmp w0, w1
- // >
- // ldrsh w0, [x0, #0]
- // cmn w0, #1
- // Fundamental, we're relying on the property that (zext LHS) == (zext RHS)
- // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure
- // both the LHS and RHS are truely zero extended and to make sure the
- // transformation is profitable.
+ SDValue Cmp;
+ AArch64CC::CondCode AArch64CC;
if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
- if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) &&
- isa<LoadSDNode>(LHS)) {
- if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
- cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
- LHS.getNode()->hasNUsesOfValue(1, 0)) {
- int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
- if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
- SDValue SExt =
- DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
- DAG.getValueType(MVT::i16));
- Cmp = emitComparison(SExt,
- DAG.getConstant(ValueofRHS, dl,
- RHS.getValueType()),
- CC, dl, DAG);
- AArch64CC = changeIntCCToAArch64CC(CC);
- AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32);
- return Cmp;
- }
+ const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
+
+ // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
+ // For the i8 operand, the largest immediate is 255, so this can be easily
+ // encoded in the compare instruction. For the i16 operand, however, the
+ // largest immediate cannot be encoded in the compare.
+ // Therefore, use a sign extending load and cmn to avoid materializing the
+ // -1 constant. For example,
+ // movz w1, #65535
+ // ldrh w0, [x0, #0]
+ // cmp w0, w1
+ // >
+ // ldrsh w0, [x0, #0]
+ // cmn w0, #1
+    // Fundamentally, we're relying on the property that (zext LHS) ==
+    // (zext RHS) if and only if (sext LHS) == (sext RHS). The checks are in
+    // place to ensure both the LHS and RHS are truly zero extended and to make
+    // sure the transformation is profitable.
+ if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
+ cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
+ cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
+ LHS.getNode()->hasNUsesOfValue(1, 0)) {
+ int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
+ if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
+ SDValue SExt =
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
+ DAG.getValueType(MVT::i16));
+ Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
+ RHS.getValueType()),
+ CC, dl, DAG);
+ AArch64CC = changeIntCCToAArch64CC(CC);
+ }
+ }
+
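+    // If the RHS is 0 or 1 and we are testing for (in)equality, try to fold an
+    // and/or tree of setccs on the LHS into a cmp followed by a chain of ccmps
+    // (see emitConjunctionDisjunctionTree above).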
+ if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
+ if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) {
+ if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
+ AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
}
}
}
- Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
- AArch64CC = changeIntCCToAArch64CC(CC);
- AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32);
+
+ if (!Cmp) {
+ Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+ AArch64CC = changeIntCCToAArch64CC(CC);
+ }
+ AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
return Cmp;
}
ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
- // The the values aren't constants, this isn't the pattern we're looking for.
+ // The values aren't constants, this isn't the pattern we're looking for.
if (!CFVal || !CTVal)
return Op;
const char *LibcallName =
(ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
- SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+ SDValue Callee =
+ DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
TargetLowering::CallLoweringInfo CLI(DAG);
if (N->getOpcode() != ISD::BUILD_VECTOR)
return false;
- for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
- SDNode *Elt = N->getOperand(i).getNode();
+ for (const SDValue &Elt : N->op_values()) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
unsigned EltSize = VT.getVectorElementType().getSizeInBits();
unsigned HalfSize = EltSize / 2;
CurArgIdx = Ins[i].getOrigArgIndex();
// Get type of the original argument.
- EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
+ EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
+ /*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
if (Ins[i].Flags.isByVal()) {
// Byval is used for HFAs in the PCS, but the system should work in a
// non-compliant manner for larger structs.
- EVT PtrTy = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
int Size = Ins[i].Flags.getByValSize();
unsigned NumRegs = (Size + 7) / 8;
// case. It should also work for fundamental types too.
unsigned FrameIdx =
MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
- SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
+ SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
InVals.push_back(FrameIdxN);
continue;
int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
// Create load nodes to retrieve arguments from the stack.
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue ArgValue;
// For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
SmallVector<SDValue, 8> MemOps;
if (GPRSaveSize != 0) {
GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
- SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
+ SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
DAG.getStore(Val.getValue(1), DL, Val, FIN,
MachinePointerInfo::getStack(i * 8), false, false, 0);
MemOps.push_back(Store);
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
- DAG.getConstant(8, DL, getPointerTy()));
+ FIN =
+ DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
}
}
FuncInfo->setVarArgsGPRIndex(GPRIdx);
if (FPRSaveSize != 0) {
FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
- SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
+ SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
DAG.getStore(Val.getValue(1), DL, Val, FIN,
MachinePointerInfo::getStack(i * 16), false, false, 0);
MemOps.push_back(Store);
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
- DAG.getConstant(16, DL, getPointerTy()));
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
+ DAG.getConstant(16, DL, PtrVT));
}
}
FuncInfo->setVarArgsFPRIndex(FPRIdx);
// cannot rely on the linker replacing the tail call with a return.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
- const Triple TT(getTargetMachine().getTargetTriple());
+ const Triple &TT = getTargetMachine().getTargetTriple();
if (GV->hasExternalWeakLinkage() &&
(!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
return false;
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ValVT = Outs[i].VT;
// Get type of the original argument.
- EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
+ EVT ActualVT = getValueType(DAG.getDataLayout(),
+ CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
/*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
true),
DL);
- SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy());
+ SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
+ getPointerTy(DAG.getDataLayout()));
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
unsigned LocMemOffset = VA.getLocMemOffset();
int32_t Offset = LocMemOffset + BEAlign;
SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
- PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+ PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
if (IsTailCall) {
Offset = Offset + FPDiff;
int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
- DstAddr = DAG.getFrameIndex(FI, getPointerTy());
+ DstAddr = DAG.getFrameIndex(FI, PtrVT);
DstInfo = MachinePointerInfo::getFixedStack(FI);
// Make sure any stack arguments overlapping with where we're storing
} else {
SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
- DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+ DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
DstInfo = MachinePointerInfo::getStack(LocMemOffset);
}
const GlobalValue *GV = G->getGlobal();
bool InternalLinkage = GV->hasInternalLinkage();
if (InternalLinkage)
- Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
+ Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
else {
- Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0,
- AArch64II::MO_GOT);
- Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
+ Callee =
+ DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
+ Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
}
} else if (ExternalSymbolSDNode *S =
dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *Sym = S->getSymbol();
- Callee =
- DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT);
- Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
+ Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
+ Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
}
} else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
- Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
+ Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *Sym = S->getSymbol();
- Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
+ Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
}
// We don't usually want to end the call-sequence here because we would tidy
SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GN->getGlobal();
assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
SDLoc DL(Op);
- MVT PtrVT = getPointerTy();
+ MVT PtrVT = getPointerTy(DAG.getDataLayout());
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
SDValue TLVPAddr =
/// the sequence is produced as per above.
SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL,
SelectionDAG &DAG) const {
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
}
SDValue TPOff;
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
const GlobalValue *GV = GA->getGlobal();
EltVT = MVT::i64;
VecVT = MVT::v2i64;
- // We want to materialize a mask with the the high bit set, but the AdvSIMD
+ // We want to materialize a mask with the high bit set, but the AdvSIMD
// immediate moves cannot materialize that in a single instruction for
// 64-bit elements. Instead, materialize zero and then negate it.
EltMask = 0;
/// operations would *not* be semantically equivalent.
static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) {
if (Cmp == Result)
- return true;
+ return (Cmp.getValueType() == MVT::f32 ||
+ Cmp.getValueType() == MVT::f64);
ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp);
ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result);
assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
assert(LHS.getValueType() == RHS.getValueType());
EVT VT = TVal.getValueType();
-
- // Try to match this select into a max/min operation, which have dedicated
- // opcode in the instruction set.
- // FIXME: This is not correct in the presence of NaNs, so we only enable this
- // in no-NaNs mode.
- if (getTargetMachine().Options.NoNaNsFPMath) {
- SDValue MinMaxLHS = TVal, MinMaxRHS = FVal;
- if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) &&
- selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) {
- CC = ISD::getSetCCSwappedOperands(CC);
- std::swap(MinMaxLHS, MinMaxRHS);
- }
-
- if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) &&
- selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) {
- switch (CC) {
- default:
- break;
- case ISD::SETGT:
- case ISD::SETGE:
- case ISD::SETUGT:
- case ISD::SETUGE:
- case ISD::SETOGT:
- case ISD::SETOGE:
- return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS);
- break;
- case ISD::SETLT:
- case ISD::SETLE:
- case ISD::SETULT:
- case ISD::SETULE:
- case ISD::SETOLT:
- case ISD::SETOLE:
- return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS);
- break;
- }
- }
- }
-
- // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
- // and do the comparison.
SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// Jump table entries as PC relative offsets. No additional tweaking
// is necessary here. Just get the address of the jump table.
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
SelectionDAG &DAG) const {
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
- EVT PtrVT = getPointerTy();
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
!Subtarget->isTargetMachO()) {
DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
SDLoc DL(Op);
- SDValue FR =
- DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
+ getPointerTy(DAG.getDataLayout()));
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV), false, false, 0);
// Standard, section B.3.
MachineFunction &MF = DAG.getMachineFunction();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
SmallVector<SDValue, 4> MemOps;
// void *__stack at offset 0
- SDValue Stack =
- DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
+ SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
MachinePointerInfo(SV), false, false, 8));
if (GPRSize > 0) {
SDValue GRTop, GRTopAddr;
- GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
- DAG.getConstant(8, DL, getPointerTy()));
+ GRTopAddr =
+ DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT));
- GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy());
- GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop,
- DAG.getConstant(GPRSize, DL, getPointerTy()));
+ GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
+ GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
+ DAG.getConstant(GPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
MachinePointerInfo(SV, 8), false, false, 8));
int FPRSize = FuncInfo->getVarArgsFPRSize();
if (FPRSize > 0) {
SDValue VRTop, VRTopAddr;
- VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
- DAG.getConstant(16, DL, getPointerTy()));
+ VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+ DAG.getConstant(16, DL, PtrVT));
- VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy());
- VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop,
- DAG.getConstant(FPRSize, DL, getPointerTy()));
+ VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
+ VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
+ DAG.getConstant(FPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
MachinePointerInfo(SV, 16), false, false, 8));
}
// int __gr_offs at offset 24
- SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
- DAG.getConstant(24, DL, getPointerTy()));
+ SDValue GROffsAddr =
+ DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL,
DAG.getConstant(-GPRSize, DL, MVT::i32),
GROffsAddr, MachinePointerInfo(SV, 24), false,
false, 4));
// int __vr_offs at offset 28
- SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
- DAG.getConstant(28, DL, getPointerTy()));
+ SDValue VROffsAddr =
+ DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL,
DAG.getConstant(-FPRSize, DL, MVT::i32),
VROffsAddr, MachinePointerInfo(SV, 28), false,
SDValue Chain = Op.getOperand(0);
SDValue Addr = Op.getOperand(1);
unsigned Align = Op.getConstantOperandVal(3);
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr,
- MachinePointerInfo(V), false, false, false, 0);
+ SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V),
+ false, false, false, 0);
Chain = VAList.getValue(1);
if (Align > 8) {
assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
- VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
- DAG.getConstant(Align - 1, DL, getPointerTy()));
- VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList,
- DAG.getConstant(-(int64_t)Align, DL, getPointerTy()));
+ VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+ DAG.getConstant(Align - 1, DL, PtrVT));
+ VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
+ DAG.getConstant(-(int64_t)Align, DL, PtrVT));
}
Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
- uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
+ uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
// Scalar integer and FP values smaller than 64 bits are implicitly extended
// up to 64 bits. At the very least, we have to increase the striding of the
}
// Increment the pointer, VAList, to the next vaarg
- SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
- DAG.getConstant(ArgSize, DL, getPointerTy()));
+ SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+ DAG.getConstant(ArgSize, DL, PtrVT));
// Store the incremented VAList to the legalized pointer
SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V),
false, false, 0);
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-unsigned AArch64TargetLowering::getRegisterByName(const char* RegName,
- EVT VT) const {
+unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const {
unsigned Reg = StringSwitch<unsigned>(RegName)
.Case("sp", AArch64::SP)
.Default(0);
if (Reg)
return Reg;
- report_fatal_error("Invalid register name global variable");
+ report_fatal_error(Twine("Invalid register name \""
+ + StringRef(RegName) + "\"."));
}
SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
if (Depth) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
- SDValue Offset = DAG.getConstant(8, DL, getPointerTy());
+ SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
return DAG.getLoad(VT, DL, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
MachinePointerInfo(), false, false, false, 0);
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
AArch64TargetLowering::ConstraintType
-AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
+AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default:
std::pair<unsigned, const TargetRegisterClass *>
AArch64TargetLowering::getRegForInlineAsmConstraint(
- const TargetRegisterInfo *TRI, const std::string &Constraint,
- MVT VT) const {
+ const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
unsigned Size = Constraint.size();
if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
- const std::string Reg =
- std::string(&Constraint[2], &Constraint[Size - 1]);
- int RegNo = atoi(Reg.c_str());
- if (RegNo >= 0 && RegNo <= 31) {
+ int RegNo;
+ bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
+ if (!Failed && RegNo >= 0 && RegNo <= 31) {
// v0 - v31 are aliases of q0 - q31.
// By default we'll emit v0-v31 for this unless there's a modifier where
// we'll emit the correct register as well.
bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
unsigned Intrinsic) const {
+ auto &DL = I.getModule()->getDataLayout();
switch (Intrinsic) {
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4r: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
// Conservatively set memVT to the entire set of vectors loaded.
- uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
+ uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8;
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
Type *ArgTy = I.getArgOperand(ArgI)->getType();
if (!ArgTy->isVectorTy())
break;
- NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
+ NumElts += DL.getTypeAllocSize(ArgTy) / 8;
}
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+ Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
Info.vol = true;
Info.readMem = true;
Info.writeMem = false;
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+ Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
Info.vol = true;
Info.readMem = false;
Info.writeMem = true;
return true;
const TargetOptions &Options = getTargetMachine().Options;
- EVT VT = getValueType(User->getOperand(0)->getType());
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ EVT VT = getValueType(DL, User->getOperand(0)->getType());
if (isFMAFasterThanFMulAndFAdd(VT) &&
isOperationLegalOrCustom(ISD::FMA, VT) &&
break;
case Instruction::GetElementPtr: {
gep_type_iterator GTI = gep_type_begin(Instr);
+ auto &DL = Ext->getModule()->getDataLayout();
std::advance(GTI, U.getOperandNo());
Type *IdxTy = *GTI;
// This extension will end up with a shift because of the scaling factor.
// Get the shift amount based on the scaling factor:
// log2(sizeof(IdxTy)) - log2(8).
uint64_t ShiftAmt =
- countTrailingZeros(getDataLayout()->getTypeStoreSizeInBits(IdxTy)) - 3;
+ countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3;
// Is the constant foldable in the shift of the addressing mode?
// I.e., shift amount is between 1 and 4 inclusive.
if (ShiftAmt == 0 || ShiftAmt > 4)
return NumBits == 32 || NumBits == 64;
}
+/// \brief Lower an interleaved load into a ldN intrinsic.
+///
+/// E.g. Lower an interleaved load (Factor = 2):
+/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
+/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
+///
+/// Into:
+/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
+/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
+/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
+bool AArch64TargetLowering::lowerInterleavedLoad(
+ LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ ArrayRef<unsigned> Indices, unsigned Factor) const {
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+ assert(!Shuffles.empty() && "Empty shufflevector input");
+ assert(Shuffles.size() == Indices.size() &&
+ "Unmatched number of shufflevectors and indices");
+
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+
+ VectorType *VecTy = Shuffles[0]->getType();
+ unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy);
+
+ // Skip illegal vector types.
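+  // (The ldN intrinsics load whole 64-bit or 128-bit NEON registers.)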
+ if (VecSize != 64 && VecSize != 128)
+ return false;
+
+  // A pointer vector cannot be the return type of the ldN intrinsics. Need to
+ // load integer vectors first and then convert to pointer vectors.
+ Type *EltTy = VecTy->getVectorElementType();
+ if (EltTy->isPointerTy())
+ VecTy =
+ VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
+
+ Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
+ Type *Tys[2] = {VecTy, PtrTy};
+ static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
+ Intrinsic::aarch64_neon_ld3,
+ Intrinsic::aarch64_neon_ld4};
+ Function *LdNFunc =
+ Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
+
+ IRBuilder<> Builder(LI);
+ Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);
+
+ CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");
+
+ // Replace uses of each shufflevector with the corresponding vector loaded
+ // by ldN.
+ for (unsigned i = 0; i < Shuffles.size(); i++) {
+ ShuffleVectorInst *SVI = Shuffles[i];
+ unsigned Index = Indices[i];
+
+ Value *SubVec = Builder.CreateExtractValue(LdN, Index);
+
+ // Convert the integer vector to pointer vector if the element is pointer.
+ if (EltTy->isPointerTy())
+ SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
+
+ SVI->replaceAllUsesWith(SubVec);
+ }
+
+ return true;
+}
+
+/// \brief Get a mask consisting of sequential integers starting from \p Start.
+///
+/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
+static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
+ unsigned NumElts) {
+ SmallVector<Constant *, 16> Mask;
+ for (unsigned i = 0; i < NumElts; i++)
+ Mask.push_back(Builder.getInt32(Start + i));
+
+ return ConstantVector::get(Mask);
+}
+
+/// \brief Lower an interleaved store into a stN intrinsic.
+///
+/// E.g. Lower an interleaved store (Factor = 3):
+/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
+/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
+/// store <12 x i32> %i.vec, <12 x i32>* %ptr
+///
+/// Into:
+/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
+/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
+/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
+/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
+///
+/// Note that the new shufflevectors will be removed and we'll only generate one
+/// st3 instruction in CodeGen.
+bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
+ ShuffleVectorInst *SVI,
+ unsigned Factor) const {
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+
+ VectorType *VecTy = SVI->getType();
+ assert(VecTy->getVectorNumElements() % Factor == 0 &&
+ "Invalid interleaved store");
+
+ unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
+ Type *EltTy = VecTy->getVectorElementType();
+ VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
+
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+ unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy);
+
+ // Skip illegal vector types.
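+  // (The stN intrinsics store whole 64-bit or 128-bit NEON registers.)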
+ if (SubVecSize != 64 && SubVecSize != 128)
+ return false;
+
+ Value *Op0 = SVI->getOperand(0);
+ Value *Op1 = SVI->getOperand(1);
+ IRBuilder<> Builder(SI);
+
+ // StN intrinsics don't support pointer vectors as arguments. Convert pointer
+ // vectors to integer vectors.
+ if (EltTy->isPointerTy()) {
+ Type *IntTy = DL.getIntPtrType(EltTy);
+ unsigned NumOpElts =
+ dyn_cast<VectorType>(Op0->getType())->getVectorNumElements();
+
+ // Convert to the corresponding integer vector.
+ Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
+ Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
+ Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
+
+ SubVecTy = VectorType::get(IntTy, NumSubElts);
+ }
+
+ Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
+ Type *Tys[2] = {SubVecTy, PtrTy};
+ static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
+ Intrinsic::aarch64_neon_st3,
+ Intrinsic::aarch64_neon_st4};
+ Function *StNFunc =
+ Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+
+ SmallVector<Value *, 5> Ops;
+
+ // Split the shufflevector operands into sub vectors for the new stN call.
+ for (unsigned i = 0; i < Factor; i++)
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
+
+ Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
+ Builder.CreateCall(StNFunc, Ops);
+ return true;
+}
+
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
unsigned AlignCheck) {
return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
-bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
- Type *Ty) const {
+bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS) const {
// AArch64 has five basic addressing modes:
// reg
// reg + 9-bit signed offset
// i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
uint64_t NumBytes = 0;
if (Ty->isSized()) {
- uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty);
+ uint64_t NumBits = DL.getTypeSizeInBits(Ty);
NumBytes = NumBits / 8;
if (!isPowerOf2_64(NumBits))
NumBytes = 0;
return false;
}
-int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM,
- Type *Ty) const {
+int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS) const {
// Scaling factors are not free at all.
// Operands | Rt Latency
// -------------------------------------------
// -------------------------------------------
// Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
// Rt, [Xn, Wm, <extend> #imm] |
- if (isLegalAddressingMode(AM, Ty))
+ if (isLegalAddressingMode(DL, AM, Ty, AS))
// Scale represents reg2 * scale, thus account for 1 if
// it is not equal to 0 or 1.
return AM.Scale != 0 && AM.Scale != 1;
//
// This routine does the actual conversion of such DUPs, once outer routines
// have determined that everything else is in order.
+// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
+// similarly here.
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
- // We can handle most types of duplicate, but the lane ones have an extra
- // operand saying *which* lane, so we need to know.
- bool IsDUPLANE;
switch (N.getOpcode()) {
case AArch64ISD::DUP:
- IsDUPLANE = false;
- break;
case AArch64ISD::DUPLANE8:
case AArch64ISD::DUPLANE16:
case AArch64ISD::DUPLANE32:
case AArch64ISD::DUPLANE64:
- IsDUPLANE = true;
+ case AArch64ISD::MOVI:
+ case AArch64ISD::MOVIshift:
+ case AArch64ISD::MOVIedit:
+ case AArch64ISD::MOVImsl:
+ case AArch64ISD::MVNIshift:
+ case AArch64ISD::MVNImsl:
break;
default:
+ // FMOV could be supported, but isn't very useful, as it would only occur
+    // if you passed a bitcast'd floating point immediate to an eligible long
+ // integer op (addl, smull, ...).
return SDValue();
}
MVT ElementTy = NarrowTy.getVectorElementType();
unsigned NumElems = NarrowTy.getVectorNumElements();
- MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2);
+ MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
SDLoc dl(N);
- SDValue NewDUP;
- if (IsDUPLANE)
- NewDUP = DAG.getNode(N.getOpcode(), dl, NewDUPVT, N.getOperand(0),
- N.getOperand(1));
- else
- NewDUP = DAG.getNode(AArch64ISD::DUP, dl, NewDUPVT, N.getOperand(0));
-
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy, NewDUP,
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
+ DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
DAG.getConstant(NumElems, dl, MVT::i64));
}
// (aarch64_neon_umull (extract_high (v2i64 vec)))
// (extract_high (v2i64 (dup128 scalar)))))
//
-static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
+static SDValue tryCombineLongOpWithDup(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
- SDValue LHS = N->getOperand(1);
- SDValue RHS = N->getOperand(2);
+ bool IsIntrinsic = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
+ SDValue LHS = N->getOperand(IsIntrinsic ? 1 : 0);
+ SDValue RHS = N->getOperand(IsIntrinsic ? 2 : 1);
assert(LHS.getValueType().is64BitVector() &&
RHS.getValueType().is64BitVector() &&
"unexpected shape for long operation");
return SDValue();
}
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
- N->getOperand(0), LHS, RHS);
+ // N could either be an intrinsic or a sabsdiff/uabsdiff node.
+ if (IsIntrinsic)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), LHS, RHS);
+ else
+ return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
+ LHS, RHS);
}
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
case Intrinsic::aarch64_neon_fmin:
return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
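+  // Lower the absolute-difference intrinsics to the generic [SU]ABSDIFF nodes
+  // so that later combines (e.g. forming sabdl2/uabdl2 from an extract_high)
+  // can apply to them.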
+ case Intrinsic::aarch64_neon_sabd:
+ return DAG.getNode(ISD::SABSDIFF, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_neon_uabd:
+ return DAG.getNode(ISD::UABSDIFF, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_smull:
case Intrinsic::aarch64_neon_umull:
case Intrinsic::aarch64_neon_pmull:
case Intrinsic::aarch64_neon_sqdmull:
- return tryCombineLongOpWithDup(IID, N, DCI, DAG);
+ return tryCombineLongOpWithDup(N, DCI, DAG);
case Intrinsic::aarch64_neon_sqshl:
case Intrinsic::aarch64_neon_uqshl:
case Intrinsic::aarch64_neon_sqshlu:
// helps the backend to decide that an sabdl2 would be useful, saving a real
// extract_high operation.
if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
- N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+ (N->getOperand(0).getOpcode() == ISD::SABSDIFF ||
+ N->getOperand(0).getOpcode() == ISD::UABSDIFF)) {
SDNode *ABDNode = N->getOperand(0).getNode();
- unsigned IID = getIntrinsicID(ABDNode);
- if (IID == Intrinsic::aarch64_neon_sabd ||
- IID == Intrinsic::aarch64_neon_uabd) {
- SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
- if (!NewABD.getNode())
- return SDValue();
+ SDValue NewABD = tryCombineLongOpWithDup(ABDNode, DCI, DAG);
+ if (!NewABD.getNode())
+ return SDValue();
- return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
- NewABD);
- }
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
+ NewABD);
}
// This is effectively a custom type legalization for AArch64.
return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
+/// performSelectCCCombine - Target-specific DAG combining for ISD::SELECT_CC
+/// to match FMIN/FMAX patterns.
+static SDValue performSelectCCCombine(SDNode *N, SelectionDAG &DAG) {
+ // Try to use FMIN/FMAX instructions for FP selects like "x < y ? x : y".
+ // Unless the NoNaNsFPMath option is set, be careful about NaNs:
+  // fmax/fmin return NaN if either operand is a NaN;
+ // only do the transformation when it matches that behavior.
+
+ SDValue CondLHS = N->getOperand(0);
+ SDValue CondRHS = N->getOperand(1);
+ SDValue LHS = N->getOperand(2);
+ SDValue RHS = N->getOperand(3);
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+
+ unsigned Opcode;
+ bool IsReversed;
+ if (selectCCOpsAreFMaxCompatible(CondLHS, LHS) &&
+ selectCCOpsAreFMaxCompatible(CondRHS, RHS)) {
+ IsReversed = false; // x CC y ? x : y
+ } else if (selectCCOpsAreFMaxCompatible(CondRHS, LHS) &&
+ selectCCOpsAreFMaxCompatible(CondLHS, RHS)) {
+    IsReversed = true; // x CC y ? y : x
+ } else {
+ return SDValue();
+ }
+
+ bool IsUnordered = false, IsOrEqual;
+ switch (CC) {
+ default:
+ return SDValue();
+ case ISD::SETULT:
+ case ISD::SETULE:
+ IsUnordered = true;
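+    // Fall through: treat like the ordered less-than cases.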
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ IsOrEqual = (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE);
+ Opcode = IsReversed ? AArch64ISD::FMAX : AArch64ISD::FMIN;
+ break;
+
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ IsUnordered = true;
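+    // Fall through: treat like the ordered greater-than cases.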
+ case ISD::SETOGT:
+ case ISD::SETOGE:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ IsOrEqual = (CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE);
+ Opcode = IsReversed ? AArch64ISD::FMIN : AArch64ISD::FMAX;
+ break;
+ }
+
+ // If LHS is NaN, an ordered comparison will be false and the result will be
+ // the RHS, but FMIN(NaN, RHS) = FMAX(NaN, RHS) = NaN. Avoid this by checking
+ // that LHS != NaN. Likewise, for unordered comparisons, check for RHS != NaN.
+ if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
+ return SDValue();
+
+ // For xxx-or-equal comparisons, "+0 <= -0" and "-0 >= +0" will both be true,
+ // but FMIN will return -0, and FMAX will return +0. So FMIN/FMAX can only be
+ // used for unsafe math or if one of the operands is known to be nonzero.
+ if (IsOrEqual && !DAG.getTarget().Options.UnsafeFPMath &&
+ !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+ return SDValue();
+
+ return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS);
+}
+
+/// Get rid of unnecessary NVCASTs (that don't change the type).
+static SDValue performNVCASTCombine(SDNode *N) {
+ if (N->getValueType(0) == N->getOperand(0).getValueType())
+ return N->getOperand(0);
+
+ return SDValue();
+}
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
return performSelectCombine(N, DCI);
case ISD::VSELECT:
return performVSelectCombine(N, DCI.DAG);
+ case ISD::SELECT_CC:
+ return performSelectCCCombine(N, DCI.DAG);
case ISD::STORE:
return performSTORECombine(N, DCI, DAG, Subtarget);
case AArch64ISD::BRCOND:
return performCONDCombine(N, DCI, DAG, 2, 3);
case AArch64ISD::DUP:
return performPostLD1Combine(N, DCI, false);
+ case AArch64ISD::NVCAST:
+ return performNVCASTCombine(N);
case ISD::INSERT_VECTOR_ELT:
return performPostLD1Combine(N, DCI, true);
case ISD::INTRINSIC_VOID:
Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
- return Builder.CreateCall3(Stxr, Lo, Hi, Addr);
+ return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
}
Intrinsic::ID Int =
Type *Tys[] = { Addr->getType() };
Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
- return Builder.CreateCall2(
- Stxr, Builder.CreateZExtOrBitCast(
- Val, Stxr->getFunctionType()->getParamType(0)),
- Addr);
+ return Builder.CreateCall(Stxr,
+ {Builder.CreateZExtOrBitCast(
+ Val, Stxr->getFunctionType()->getParamType(0)),
+ Addr});
}
bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
return Ty->isArrayTy();
}
+
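+// Returning false here keeps (select (and/or setcc, setcc), X, Y) nodes
+// intact, so the and/or trees can be matched into CMP;CCMP sequences (see
+// emitConjunctionDisjunctionTree) rather than being expanded into chained
+// selects.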
+bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
+ EVT) const {
+ return false;
+}