Revert "[AArch64] Add DAG combine for extract extend pattern"

[oota-llvm.git] / lib / Target / AArch64 / AArch64ISelLowering.cpp
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp

index 3c1251e26959f49708d9c603ac1da9ad88b35610..9f5beff121002a5e9103f31b57af3f3762c0406f 100644 (file)
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -196,11 +196,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
    setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  
    setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  
-  // Exception handling.
-  // FIXME: These are guesses. Has this been defined yet?
-  setExceptionPointerRegister(AArch64::X0);
-  setExceptionSelectorRegister(AArch64::X1);
-
    // Constant pool entries
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
  
    // Constant pool entries
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
  
@@ -220,6 +215,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
    // AArch64 lacks both left-rotate and popcount instructions.
    setOperationAction(ISD::ROTL, MVT::i32, Expand);
    setOperationAction(ISD::ROTL, MVT::i64, Expand);
    // AArch64 lacks both left-rotate and popcount instructions.
    setOperationAction(ISD::ROTL, MVT::i32, Expand);
    setOperationAction(ISD::ROTL, MVT::i64, Expand);
+  for (MVT VT : MVT::vector_valuetypes()) {
+    setOperationAction(ISD::ROTL, VT, Expand);
+    setOperationAction(ISD::ROTR, VT, Expand);
+  }
  
    // AArch64 doesn't have {U|S}MUL_LOHI.
    setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  
    // AArch64 doesn't have {U|S}MUL_LOHI.
    setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
@@ -238,6 +237,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
  
    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
    setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  
    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
    setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+  for (MVT VT : MVT::vector_valuetypes()) {
+    setOperationAction(ISD::SDIVREM, VT, Expand);
+    setOperationAction(ISD::UDIVREM, VT, Expand);
+  }
    setOperationAction(ISD::SREM, MVT::i32, Expand);
    setOperationAction(ISD::SREM, MVT::i64, Expand);
    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
    setOperationAction(ISD::SREM, MVT::i32, Expand);
    setOperationAction(ISD::SREM, MVT::i64, Expand);
    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
@@ -478,6 +481,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
    setTargetDAGCombine(ISD::SINT_TO_FP);
    setTargetDAGCombine(ISD::UINT_TO_FP);
  
    setTargetDAGCombine(ISD::SINT_TO_FP);
    setTargetDAGCombine(ISD::UINT_TO_FP);
  
+  setTargetDAGCombine(ISD::FP_TO_SINT);
+  setTargetDAGCombine(ISD::FP_TO_UINT);
+  setTargetDAGCombine(ISD::FDIV);
+
    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
  
    setTargetDAGCombine(ISD::ANY_EXTEND);
    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
  
    setTargetDAGCombine(ISD::ANY_EXTEND);
@@ -486,6 +493,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
    setTargetDAGCombine(ISD::BITCAST);
    setTargetDAGCombine(ISD::CONCAT_VECTORS);
    setTargetDAGCombine(ISD::STORE);
    setTargetDAGCombine(ISD::BITCAST);
    setTargetDAGCombine(ISD::CONCAT_VECTORS);
    setTargetDAGCombine(ISD::STORE);
+  if (Subtarget->supportsAddressTopByteIgnored())
+    setTargetDAGCombine(ISD::LOAD);
  
    setTargetDAGCombine(ISD::MUL);
  
  
    setTargetDAGCombine(ISD::MUL);
  
@@ -682,12 +691,10 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
    setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
    setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
  
    setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
    setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
  
-  // [SU][MIN|MAX] and [SU]ABSDIFF are available for all NEON types apart from
-  // i64.
+  // [SU][MIN|MAX] are available for all NEON types apart from i64.
    if (!VT.isFloatingPoint() &&
        VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64)
    if (!VT.isFloatingPoint() &&
        VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64)
-    for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
-                            ISD::SABSDIFF, ISD::UABSDIFF})
+    for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
        setOperationAction(Opcode, VT.getSimpleVT(), Legal);
  
    // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!).
        setOperationAction(Opcode, VT.getSimpleVT(), Legal);
  
    // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!).
@@ -796,9 +803,25 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                             bool *Fast) const {
    if (Subtarget->requiresStrictAlign())
      return false;
                                                             bool *Fast) const {
    if (Subtarget->requiresStrictAlign())
      return false;
-  // FIXME: True for Cyclone, but not necessary others.
-  if (Fast)
-    *Fast = true;
+
+  // FIXME: This is mostly true for Cyclone, but not necessarily others.
+  if (Fast) {
+    // FIXME: Define an attribute for slow unaligned accesses instead of
+    // relying on the CPU type as a proxy.
+    // On Cyclone, unaligned 128-bit stores are slow.
+    *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
+            // See comments in performSTORECombine() for more details about
+            // these conditions.
+
+            // Code that uses clang vector extensions can mark that it
+            // wants unaligned accesses to be treated as fast by
+            // underspecifying alignment to be 1 or 2.
+            Align <= 2 ||
+
+            // Disregard v2i64. Memcpy lowering produces those and splitting
+            // them regresses performance on micro-benchmarks and olden/bh.
+            VT == MVT::v2i64;
+  }
    return true;
  }
  
    return true;
  }
  
@@ -954,8 +977,7 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
    const TargetInstrInfo *TII = Subtarget->getInstrInfo();
    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    DebugLoc DL = MI->getDebugLoc();
    const TargetInstrInfo *TII = Subtarget->getInstrInfo();
    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    DebugLoc DL = MI->getDebugLoc();
-  MachineFunction::iterator It = MBB;
-  ++It;
+  MachineFunction::iterator It = ++MBB->getIterator();
  
    unsigned DestReg = MI->getOperand(0).getReg();
    unsigned IfTrueReg = MI->getOperand(1).getReg();
  
    unsigned DestReg = MI->getOperand(0).getReg();
    unsigned IfTrueReg = MI->getOperand(1).getReg();
@@ -1164,8 +1186,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
    // register to WZR/XZR if it ends up being unused.
    unsigned Opcode = AArch64ISD::SUBS;
  
    // register to WZR/XZR if it ends up being unused.
    unsigned Opcode = AArch64ISD::SUBS;
  
-  if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) &&
-      cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 &&
+  if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
        (CC == ISD::SETEQ || CC == ISD::SETNE)) {
      // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on
      // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
        (CC == ISD::SETEQ || CC == ISD::SETNE)) {
      // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on
      // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
@@ -1179,8 +1200,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
      // the absence of information about op2.
      Opcode = AArch64ISD::ADDS;
      RHS = RHS.getOperand(1);
      // the absence of information about op2.
      Opcode = AArch64ISD::ADDS;
      RHS = RHS.getOperand(1);
-  } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) &&
-             cast<ConstantSDNode>(RHS)->getZExtValue() == 0 &&
+  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
               !isUnsignedIntSetCC(CC)) {
      // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
      // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
               !isUnsignedIntSetCC(CC)) {
      // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
      // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
@@ -1245,8 +1265,7 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
      Opcode = AArch64ISD::FCCMP;
    else if (RHS.getOpcode() == ISD::SUB) {
      SDValue SubOp0 = RHS.getOperand(0);
      Opcode = AArch64ISD::FCCMP;
    else if (RHS.getOpcode() == ISD::SUB) {
      SDValue SubOp0 = RHS.getOperand(0);
-    if (const ConstantSDNode *SubOp0C = dyn_cast<ConstantSDNode>(SubOp0))
-      if (SubOp0C->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
          // See emitComparison() on why we can only do this for SETEQ and SETNE.
          Opcode = AArch64ISD::CCMN;
          RHS = RHS.getOperand(1);
          // See emitComparison() on why we can only do this for SETEQ and SETNE.
          Opcode = AArch64ISD::CCMN;
          RHS = RHS.getOperand(1);
@@ -1640,8 +1659,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
  SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
                                               RTLIB::Libcall Call) const {
    SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
  SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
                                               RTLIB::Libcall Call) const {
    SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
-  return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false,
-                     SDLoc(Op)).first;
+  return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
  }
  
  static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
  }
  
  static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
@@ -1820,8 +1838,8 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
    // precise. That doesn't take part in the LibCall so we can't directly use
    // LowerF128Call.
    SDValue SrcVal = Op.getOperand(0);
    // precise. That doesn't take part in the LibCall so we can't directly use
    // LowerF128Call.
    SDValue SrcVal = Op.getOperand(0);
-  return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
-                     /*isSigned*/ false, SDLoc(Op)).first;
+  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
+                     SDLoc(Op)).first;
  }
  
  static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
  }
  
  static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
@@ -1830,6 +1848,16 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
    // in the cost tables.
    EVT InVT = Op.getOperand(0).getValueType();
    EVT VT = Op.getValueType();
    // in the cost tables.
    EVT InVT = Op.getOperand(0).getValueType();
    EVT VT = Op.getValueType();
+  unsigned NumElts = InVT.getVectorNumElements();
+
+  // f16 vectors are promoted to f32 before a conversion.
+  if (InVT.getVectorElementType() == MVT::f16) {
+    MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
+    SDLoc dl(Op);
+    return DAG.getNode(
+        Op.getOpcode(), dl, Op.getValueType(),
+        DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
+  }
  
    if (VT.getSizeInBits() < InVT.getSizeInBits()) {
      SDLoc dl(Op);
  
    if (VT.getSizeInBits() < InVT.getSizeInBits()) {
      SDLoc dl(Op);
@@ -1877,8 +1905,7 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
      LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
  
    SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
      LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
  
    SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
-  return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false,
-                     SDLoc(Op)).first;
+  return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
  }
  
  static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  }
  
  static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
@@ -2311,11 +2338,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
    }
  }
  
    }
  }
  
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const {
-  return 2;
-}
-
  //===----------------------------------------------------------------------===//
  //                      Calling Convention Implementation
  //===----------------------------------------------------------------------===//
  //===----------------------------------------------------------------------===//
  //                      Calling Convention Implementation
  //===----------------------------------------------------------------------===//
@@ -3249,6 +3271,19 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
    }
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
    }
+  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const MCPhysReg *I =
+      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+  if (I) {
+    for (; *I; ++I) {
+      if (AArch64::GPR64RegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+      else if (AArch64::FPR64RegClass.contains(*I))
+        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
+      else
+        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+    }
+  }
  
    RetOps[0] = Chain; // Update chain.
  
  
    RetOps[0] = Chain; // Update chain.
  
@@ -3564,8 +3599,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
    // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
    // instruction.
    unsigned Opc = LHS.getOpcode();
    // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
    // instruction.
    unsigned Opc = LHS.getOpcode();
-  if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) &&
-      cast<ConstantSDNode>(RHS)->isOne() &&
+  if (LHS.getResNo() == 1 && isOneConstant(RHS) &&
        (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
         Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
      assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
         Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
      assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
@@ -3869,7 +3903,13 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
      }
    }
  
      }
    }
  
-  // Handle integers first.
+  // Also handle f16, for which we need to do a f32 comparison.
+  if (LHS.getValueType() == MVT::f16) {
+    LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
+    RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
+  }
+
+  // Next, handle integers.
    if (LHS.getValueType().isInteger()) {
      assert((LHS.getValueType() == RHS.getValueType()) &&
             (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
    if (LHS.getValueType().isInteger()) {
      assert((LHS.getValueType() == RHS.getValueType()) &&
             (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
@@ -3892,9 +3932,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
      } else if (TVal.getOpcode() == ISD::XOR) {
        // If TVal is a NOT we want to swap TVal and FVal so that we can match
        // with a CSINV rather than a CSEL.
      } else if (TVal.getOpcode() == ISD::XOR) {
        // If TVal is a NOT we want to swap TVal and FVal so that we can match
        // with a CSINV rather than a CSEL.
-      ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1));
-
-      if (CVal && CVal->isAllOnesValue()) {
+      if (isAllOnesConstant(TVal.getOperand(1))) {
          std::swap(TVal, FVal);
          std::swap(CTVal, CFVal);
          CC = ISD::getSetCCInverse(CC, true);
          std::swap(TVal, FVal);
          std::swap(CTVal, CFVal);
          CC = ISD::getSetCCInverse(CC, true);
@@ -3902,9 +3940,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
      } else if (TVal.getOpcode() == ISD::SUB) {
        // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
        // that we can match with a CSNEG rather than a CSEL.
      } else if (TVal.getOpcode() == ISD::SUB) {
        // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
        // that we can match with a CSNEG rather than a CSEL.
-      ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0));
-
-      if (CVal && CVal->isNullValue()) {
+      if (isNullConstant(TVal.getOperand(0))) {
          std::swap(TVal, FVal);
          std::swap(CTVal, CFVal);
          CC = ISD::getSetCCInverse(CC, true);
          std::swap(TVal, FVal);
          std::swap(CTVal, CFVal);
          CC = ISD::getSetCCInverse(CC, true);
@@ -4364,46 +4400,57 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
    SDValue ShOpLo = Op.getOperand(0);
    SDValue ShOpHi = Op.getOperand(1);
    SDValue ShAmt = Op.getOperand(2);
    SDValue ShOpLo = Op.getOperand(0);
    SDValue ShOpHi = Op.getOperand(1);
    SDValue ShAmt = Op.getOperand(2);
-  SDValue ARMcc;
    unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
  
    assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
  
    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
                                   DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
    unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
  
    assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
  
    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
                                   DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
-  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+  SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+
+  // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
+  // is "undef". We wanted 0, so CSEL it directly.
+  SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
+                               ISD::SETEQ, dl, DAG);
+  SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
+  HiBitsForLo =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
+                  HiBitsForLo, CCVal, Cmp);
+
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
                                     DAG.getConstant(VTBits, dl, MVT::i64));
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
                                     DAG.getConstant(VTBits, dl, MVT::i64));
-  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
  
  
-  SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64),
-                               ISD::SETGE, dl, DAG);
-  SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
+  SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+  SDValue LoForNormalShift =
+      DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);
  
  
-  SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
-  SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
-  SDValue Lo =
-      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+  Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
+                       dl, DAG);
+  CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
+  SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+  SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
+                           LoForNormalShift, CCVal, Cmp);
  
    // AArch64 shifts larger than the register width are wrapped rather than
    // clamped, so we can't just emit "hi >> x".
  
    // AArch64 shifts larger than the register width are wrapped rather than
    // clamped, so we can't just emit "hi >> x".
-  SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
-  SDValue TrueValHi = Opc == ISD::SRA
-                          ? DAG.getNode(Opc, dl, VT, ShOpHi,
-                                        DAG.getConstant(VTBits - 1, dl,
-                                                        MVT::i64))
-                          : DAG.getConstant(0, dl, VT);
-  SDValue Hi =
-      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp);
+  SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+  SDValue HiForBigShift =
+      Opc == ISD::SRA
+          ? DAG.getNode(Opc, dl, VT, ShOpHi,
+                        DAG.getConstant(VTBits - 1, dl, MVT::i64))
+          : DAG.getConstant(0, dl, VT);
+  SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
+                           HiForNormalShift, CCVal, Cmp);
  
    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
  
  
    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  }
  
+
  /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
  /// i64 values and take a 2 x i64 value to shift plus a shift amount.
  SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
  /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
  /// i64 values and take a 2 x i64 value to shift plus a shift amount.
  SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
-                                                 SelectionDAG &DAG) const {
+                                                   SelectionDAG &DAG) const {
    assert(Op.getNumOperands() == 3 && "Not a double-shift!");
    EVT VT = Op.getValueType();
    unsigned VTBits = VT.getSizeInBits();
    assert(Op.getNumOperands() == 3 && "Not a double-shift!");
    EVT VT = Op.getValueType();
    unsigned VTBits = VT.getSizeInBits();
@@ -4411,31 +4458,41 @@ SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
    SDValue ShOpLo = Op.getOperand(0);
    SDValue ShOpHi = Op.getOperand(1);
    SDValue ShAmt = Op.getOperand(2);
    SDValue ShOpLo = Op.getOperand(0);
    SDValue ShOpHi = Op.getOperand(1);
    SDValue ShAmt = Op.getOperand(2);
-  SDValue ARMcc;
  
    assert(Op.getOpcode() == ISD::SHL_PARTS);
    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
                                   DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
  
    assert(Op.getOpcode() == ISD::SHL_PARTS);
    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
                                   DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
-  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+  SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+
+  // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
+  // is "undef". We wanted 0, so CSEL it directly.
+  SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
+                               ISD::SETEQ, dl, DAG);
+  SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
+  LoBitsForHi =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
+                  LoBitsForHi, CCVal, Cmp);
+
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
                                     DAG.getConstant(VTBits, dl, MVT::i64));
    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
                                     DAG.getConstant(VTBits, dl, MVT::i64));
-  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
-  SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+  SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+  SDValue HiForNormalShift =
+      DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
  
  
-  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+  SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
  
  
-  SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64),
-                               ISD::SETGE, dl, DAG);
-  SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
-  SDValue Hi =
-      DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp);
+  Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
+                       dl, DAG);
+  CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
+  SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
+                           HiForNormalShift, CCVal, Cmp);
  
    // AArch64 shifts of larger than register sizes are wrapped rather than
    // clamped, so we can't just emit "lo << a" if a is too big.
  
    // AArch64 shifts of larger than register sizes are wrapped rather than
    // clamped, so we can't just emit "lo << a" if a is too big.
-  SDValue TrueValLo = DAG.getConstant(0, dl, VT);
-  SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
-  SDValue Lo =
-      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+  SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
+  SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+  SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
+                           LoForNormalShift, CCVal, Cmp);
  
    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
  
    SDValue Ops[2] = { Lo, Hi };
    return DAG.getMergeValues(Ops, dl);
@@ -4617,8 +4674,7 @@ void AArch64TargetLowering::LowerAsmOperandForConstraint(
    // Validate and return a target constant for them if we can.
    case 'z': {
      // 'z' maps to xzr or wzr so it needs an input of 0.
    // Validate and return a target constant for them if we can.
    case 'z': {
      // 'z' maps to xzr or wzr so it needs an input of 0.
-    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
-    if (!C || C->getZExtValue() != 0)
+    if (!isNullConstant(Op))
        return;
  
      if (Op.getValueType() == MVT::i64)
        return;
  
      if (Op.getValueType() == MVT::i64)
@@ -6379,24 +6435,11 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
    unsigned Val = Cst->getZExtValue();
  
    unsigned Size = Op.getValueType().getSizeInBits();
    unsigned Val = Cst->getZExtValue();
  
    unsigned Size = Op.getValueType().getSizeInBits();
-  if (Val == 0) {
-    switch (Size) {
-    case 8:
-      return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(),
-                                        Op.getOperand(0));
-    case 16:
-      return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(),
-                                        Op.getOperand(0));
-    case 32:
-      return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(),
-                                        Op.getOperand(0));
-    case 64:
-      return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(),
-                                        Op.getOperand(0));
-    default:
-      llvm_unreachable("Unexpected vector type in extract_subvector!");
-    }
-  }
+
+  // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
+  if (Val == 0)
+    return Op;
+
    // If this is extracting the upper 64-bits of a 128-bit vector, we match
    // that directly.
    if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64)
    // If this is extracting the upper 64-bits of a 128-bit vector, we match
    // that directly.
    if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64)
@@ -6700,7 +6743,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
    case Intrinsic::aarch64_neon_ld4r: {
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      // Conservatively set memVT to the entire set of vectors loaded.
    case Intrinsic::aarch64_neon_ld4r: {
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      // Conservatively set memVT to the entire set of vectors loaded.
-    uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8;
+    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
      Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
      Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
      Info.offset = 0;
      Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
      Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
      Info.offset = 0;
@@ -6726,7 +6769,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
        Type *ArgTy = I.getArgOperand(ArgI)->getType();
        if (!ArgTy->isVectorTy())
          break;
        Type *ArgTy = I.getArgOperand(ArgI)->getType();
        if (!ArgTy->isVectorTy())
          break;
-      NumElts += DL.getTypeAllocSize(ArgTy) / 8;
+      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
      }
      Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
      Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
      }
      Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
      Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
@@ -6969,10 +7012,10 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
    const DataLayout &DL = LI->getModule()->getDataLayout();
  
    VectorType *VecTy = Shuffles[0]->getType();
    const DataLayout &DL = LI->getModule()->getDataLayout();
  
    VectorType *VecTy = Shuffles[0]->getType();
-  unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy);
+  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
  
  
-  // Skip illegal vector types.
-  if (VecSize != 64 && VecSize != 128)
+  // Skip if we do not have NEON and skip illegal vector types.
+  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128))
      return false;
  
    // A pointer vector can not be the return type of the ldN intrinsics. Need to
      return false;
  
    // A pointer vector can not be the return type of the ldN intrinsics. Need to
@@ -7055,10 +7098,10 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
    VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
  
    const DataLayout &DL = SI->getModule()->getDataLayout();
    VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
  
    const DataLayout &DL = SI->getModule()->getDataLayout();
-  unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy);
+  unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
  
  
-  // Skip illegal vector types.
-  if (SubVecSize != 64 && SubVecSize != 128)
+  // Skip if we do not have NEON and skip illegal vector types.
+  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128))
      return false;
  
    Value *Op0 = SVI->getOperand(0);
      return false;
  
    Value *Op0 = SVI->getOperand(0);
@@ -7490,7 +7533,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
  
    // If the result of an integer load is only used by an integer-to-float
    // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
  
    // If the result of an integer load is only used by an integer-to-float
    // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
-  // This eliminates an "integer-to-vector-move UOP and improve throughput.
+  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
    SDValue N0 = N->getOperand(0);
    if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
        // Do not change the width of a volatile load.
    SDValue N0 = N->getOperand(0);
    if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
        // Do not change the width of a volatile load.
@@ -7513,6 +7556,134 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
    return SDValue();
  }
  
    return SDValue();
  }
  
+/// Fold a floating-point multiply by a power of two into a floating-point to
+/// fixed-point conversion.
+static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
+                                     const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  SDValue Op = N->getOperand(0);
+  if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL)
+    return SDValue();
+
+  SDValue ConstVec = Op->getOperand(1);
+  if (!isa<BuildVectorSDNode>(ConstVec))
+    return SDValue();
+
+  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
+  uint32_t FloatBits = FloatTy.getSizeInBits();
+  if (FloatBits != 32 && FloatBits != 64)
+    return SDValue();
+
+  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
+  uint32_t IntBits = IntTy.getSizeInBits();
+  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
+    return SDValue();
+
+  // Avoid conversions where iN is larger than the float (e.g., float -> i64).
+  if (IntBits > FloatBits)
+    return SDValue();
+
+  BitVector UndefElements;
+  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+  int32_t Bits = IntBits == 64 ? 64 : 32;
+  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
+  if (C == -1 || C == 0 || C > Bits)
+    return SDValue();
+
+  MVT ResTy;
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  switch (NumLanes) {
+  default:
+    return SDValue();
+  case 2:
+    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
+    break;
+  case 4:
+    ResTy = MVT::v4i32;
+    break;
+  }
+
+  SDLoc DL(N);
+  bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
+                                      : Intrinsic::aarch64_neon_vcvtfp2fxu;
+  SDValue FixConv =
+      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
+                  DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
+                  Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
+  // We can handle smaller integers by generating an extra trunc.
+  if (IntBits < FloatBits)
+    FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
+
+  return FixConv;
+}
+
+/// Fold a floating-point divide by a power of two into a fixed-point to
+/// floating-point conversion.
+static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
+                                  const AArch64Subtarget *Subtarget) {
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  SDValue Op = N->getOperand(0);
+  unsigned Opc = Op->getOpcode();
+  if (!Op.getValueType().isVector() ||
+      (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
+    return SDValue();
+
+  SDValue ConstVec = N->getOperand(1);
+  if (!isa<BuildVectorSDNode>(ConstVec))
+    return SDValue();
+
+  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
+  int32_t IntBits = IntTy.getSizeInBits();
+  if (IntBits != 16 && IntBits != 32 && IntBits != 64)
+    return SDValue();
+
+  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
+  int32_t FloatBits = FloatTy.getSizeInBits();
+  if (FloatBits != 32 && FloatBits != 64)
+    return SDValue();
+
+  // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
+  if (IntBits > FloatBits)
+    return SDValue();
+
+  BitVector UndefElements;
+  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
+  int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
+  if (C == -1 || C == 0 || C > FloatBits)
+    return SDValue();
+
+  MVT ResTy;
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  switch (NumLanes) {
+  default:
+    return SDValue();
+  case 2:
+    ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
+    break;
+  case 4:
+    ResTy = MVT::v4i32;
+    break;
+  }
+
+  SDLoc DL(N);
+  SDValue ConvInput = Op.getOperand(0);
+  bool IsSigned = Opc == ISD::SINT_TO_FP;
+  if (IntBits < FloatBits)
+    ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
+                            ResTy, ConvInput);
+
+  unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
+                                      : Intrinsic::aarch64_neon_vcvtfxu2fp;
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
+                     DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
+                     DAG.getConstant(C, DL, MVT::i32));
+}
+
  /// An EXTR instruction is made up of two shifts, ORed together. This helper
  /// searches for and classifies those shifts.
  static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
  /// An EXTR instruction is made up of two shifts, ORed together. This helper
  /// searches for and classifies those shifts.
  static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
@@ -8086,15 +8257,14 @@ static SDValue performAddSubLongCombine(SDNode *N,
  //   (aarch64_neon_umull (extract_high (v2i64 vec)))
  //                     (extract_high (v2i64 (dup128 scalar)))))
  //
  //   (aarch64_neon_umull (extract_high (v2i64 vec)))
  //                     (extract_high (v2i64 (dup128 scalar)))))
  //
-static SDValue tryCombineLongOpWithDup(SDNode *N,
+static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         SelectionDAG &DAG) {
    if (DCI.isBeforeLegalizeOps())
      return SDValue();
  
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         SelectionDAG &DAG) {
    if (DCI.isBeforeLegalizeOps())
      return SDValue();
  
-  bool IsIntrinsic = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
-  SDValue LHS = N->getOperand(IsIntrinsic ? 1 : 0);
-  SDValue RHS = N->getOperand(IsIntrinsic ? 2 : 1);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
    assert(LHS.getValueType().is64BitVector() &&
           RHS.getValueType().is64BitVector() &&
           "unexpected shape for long operation");
    assert(LHS.getValueType().is64BitVector() &&
           RHS.getValueType().is64BitVector() &&
           "unexpected shape for long operation");
@@ -8112,13 +8282,8 @@ static SDValue tryCombineLongOpWithDup(SDNode *N,
        return SDValue();
    }
  
        return SDValue();
    }
  
-  // N could either be an intrinsic or a sabsdiff/uabsdiff node.
-  if (IsIntrinsic)
-    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
-                       N->getOperand(0), LHS, RHS);
-  else
-    return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
-                       LHS, RHS);
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
+                     N->getOperand(0), LHS, RHS);
  }
  
  static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
  }
  
  static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
@@ -8236,12 +8401,6 @@ static SDValue performIntrinsicCombine(SDNode *N,
    case Intrinsic::aarch64_neon_fmin:
      return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0),
                         N->getOperand(1), N->getOperand(2));
    case Intrinsic::aarch64_neon_fmin:
      return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0),
                         N->getOperand(1), N->getOperand(2));
-  case Intrinsic::aarch64_neon_sabd:
-    return DAG.getNode(ISD::SABSDIFF, SDLoc(N), N->getValueType(0),
-                       N->getOperand(1), N->getOperand(2));
-  case Intrinsic::aarch64_neon_uabd:
-    return DAG.getNode(ISD::UABSDIFF, SDLoc(N), N->getValueType(0),
-                       N->getOperand(1), N->getOperand(2));
    case Intrinsic::aarch64_neon_fmaxnm:
      return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
                         N->getOperand(1), N->getOperand(2));
    case Intrinsic::aarch64_neon_fmaxnm:
      return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
                         N->getOperand(1), N->getOperand(2));
@@ -8252,7 +8411,7 @@ static SDValue performIntrinsicCombine(SDNode *N,
    case Intrinsic::aarch64_neon_umull:
    case Intrinsic::aarch64_neon_pmull:
    case Intrinsic::aarch64_neon_sqdmull:
    case Intrinsic::aarch64_neon_umull:
    case Intrinsic::aarch64_neon_pmull:
    case Intrinsic::aarch64_neon_sqdmull:
-    return tryCombineLongOpWithDup(N, DCI, DAG);
+    return tryCombineLongOpWithDup(IID, N, DCI, DAG);
    case Intrinsic::aarch64_neon_sqshl:
    case Intrinsic::aarch64_neon_uqshl:
    case Intrinsic::aarch64_neon_sqshlu:
    case Intrinsic::aarch64_neon_sqshl:
    case Intrinsic::aarch64_neon_uqshl:
    case Intrinsic::aarch64_neon_sqshlu:
@@ -8277,15 +8436,18 @@ static SDValue performExtendCombine(SDNode *N,
    // helps the backend to decide that an sabdl2 would be useful, saving a real
    // extract_high operation.
    if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
    // helps the backend to decide that an sabdl2 would be useful, saving a real
    // extract_high operation.
    if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
-      (N->getOperand(0).getOpcode() == ISD::SABSDIFF ||
-       N->getOperand(0).getOpcode() == ISD::UABSDIFF)) {
+      N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
      SDNode *ABDNode = N->getOperand(0).getNode();
      SDNode *ABDNode = N->getOperand(0).getNode();
-    SDValue NewABD = tryCombineLongOpWithDup(ABDNode, DCI, DAG);
-    if (!NewABD.getNode())
-      return SDValue();
+    unsigned IID = getIntrinsicID(ABDNode);
+    if (IID == Intrinsic::aarch64_neon_sabd ||
+        IID == Intrinsic::aarch64_neon_uabd) {
+      SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
+      if (!NewABD.getNode())
+        return SDValue();
  
  
-    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
-                       NewABD);
+      return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
+                         NewABD);
+    }
    }
  
    // This is effectively a custom type legalization for AArch64.
    }
  
    // This is effectively a custom type legalization for AArch64.
@@ -8424,10 +8586,9 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
    return NewST1;
  }
  
    return NewST1;
  }
  
-static SDValue performSTORECombine(SDNode *N,
-                                   TargetLowering::DAGCombinerInfo &DCI,
-                                   SelectionDAG &DAG,
-                                   const AArch64Subtarget *Subtarget) {
+static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                              SelectionDAG &DAG,
+                              const AArch64Subtarget *Subtarget) {
    if (!DCI.isBeforeLegalize())
      return SDValue();
  
    if (!DCI.isBeforeLegalize())
      return SDValue();
  
@@ -8435,6 +8596,10 @@ static SDValue performSTORECombine(SDNode *N,
    if (S->isVolatile())
      return SDValue();
  
    if (S->isVolatile())
      return SDValue();
  
+  // FIXME: The logic for deciding if an unaligned store should be split should
+  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
+  // a call to that function here.
+
    // Cyclone has bad performance on unaligned 16B stores when crossing line and
    // page boundaries. We want to split such stores.
    if (!Subtarget->isCyclone())
    // Cyclone has bad performance on unaligned 16B stores when crossing line and
    // page boundaries. We want to split such stores.
    if (!Subtarget->isCyclone())
@@ -8585,7 +8750,39 @@ static SDValue performPostLD1Combine(SDNode *N,
    return SDValue();
  }
  
    return SDValue();
  }
  
-/// This function handles the log2-shuffle pattern produced by the
+/// Simplify \p Addr given that the top byte of it is ignored by HW during
+/// address translation.
+static bool performTBISimplification(SDValue Addr,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     SelectionDAG &DAG) {
+  APInt DemandedMask = APInt::getLowBitsSet(64, 56);
+  APInt KnownZero, KnownOne;
+  TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
+                                        DCI.isBeforeLegalizeOps());
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) {
+    DCI.CommitTargetLoweringOpt(TLO);
+    return true;
+  }
+  return false;
+}
+
+static SDValue performSTORECombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   SelectionDAG &DAG,
+                                   const AArch64Subtarget *Subtarget) {
+  SDValue Split = split16BStores(N, DCI, DAG, Subtarget);
+  if (Split.getNode())
+    return Split;
+
+  if (Subtarget->supportsAddressTopByteIgnored() &&
+      performTBISimplification(N->getOperand(2), DCI, DAG))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
+  /// This function handles the log2-shuffle pattern produced by the
  /// LoopVectorizer for the across vector reduction. It consists of
  /// log2(NumVectorElements) steps and, in each step, 2^(s) elements
  /// are reduced, where s is an induction variable from 0 to
  /// LoopVectorizer for the across vector reduction. It consists of
  /// log2(NumVectorElements) steps and, in each step, 2^(s) elements
  /// are reduced, where s is an induction variable from 0 to
@@ -8598,8 +8795,13 @@ static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
      return SDValue();
  
    int NumVecElts = VTy.getVectorNumElements();
      return SDValue();
  
    int NumVecElts = VTy.getVectorNumElements();
-  if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
-    return SDValue();
+  if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
+    if (NumVecElts != 4)
+      return SDValue();
+  } else {
+    if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
+      return SDValue();
+  }
  
    int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
    SDValue PreOp = OpV;
  
    int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
    SDValue PreOp = OpV;
@@ -8650,6 +8852,8 @@ static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
      PreOp = CurOp;
    }
    unsigned Opcode;
      PreOp = CurOp;
    }
    unsigned Opcode;
+  bool IsIntrinsic = false;
+
    switch (Op) {
    default:
      llvm_unreachable("Unexpected operator for across vector reduction");
    switch (Op) {
    default:
      llvm_unreachable("Unexpected operator for across vector reduction");
@@ -8668,11 +8872,24 @@ static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
    case ISD::UMIN:
      Opcode = AArch64ISD::UMINV;
      break;
    case ISD::UMIN:
      Opcode = AArch64ISD::UMINV;
      break;
+  case ISD::FMAXNUM:
+    Opcode = Intrinsic::aarch64_neon_fmaxnmv;
+    IsIntrinsic = true;
+    break;
+  case ISD::FMINNUM:
+    Opcode = Intrinsic::aarch64_neon_fminnmv;
+    IsIntrinsic = true;
+    break;
    }
    SDLoc DL(N);
    }
    SDLoc DL(N);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
-                     DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
-                     DAG.getConstant(0, DL, MVT::i64));
+
+  return IsIntrinsic
+             ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0),
+                           DAG.getConstant(Opcode, DL, MVT::i32), PreOp)
+             : DAG.getNode(
+                   ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
+                   DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
+                   DAG.getConstant(0, DL, MVT::i64));
  }
  
  /// Target-specific DAG combine for the across vector min/max reductions.
  }
  
  /// Target-specific DAG combine for the across vector min/max reductions.
@@ -8696,9 +8913,6 @@ static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
  ///     becomes :
  ///   %1 = smaxv %0
  ///   %result = extract_vector_elt %1, 0
  ///     becomes :
  ///   %1 = smaxv %0
  ///   %result = extract_vector_elt %1, 0
-/// FIXME: Currently this function matches only SMAXV, UMAXV, SMINV, and UMINV.
-/// We could also support other types of across lane reduction available
-/// in AArch64, including FMAXNMV, FMAXV, FMINNMV, and FMINV.
  static SDValue
  performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
                                          const AArch64Subtarget *Subtarget) {
  static SDValue
  performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
                                          const AArch64Subtarget *Subtarget) {
@@ -8726,17 +8940,26 @@ performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
    SDValue VectorOp = SetCC.getOperand(0);
    unsigned Op = VectorOp->getOpcode();
    // Check if the input vector is fed by the operator we want to handle.
    SDValue VectorOp = SetCC.getOperand(0);
    unsigned Op = VectorOp->getOpcode();
    // Check if the input vector is fed by the operator we want to handle.
-  if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && Op != ISD::UMIN)
+  if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN &&
+      Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM)
      return SDValue();
  
    EVT VTy = VectorOp.getValueType();
    if (!VTy.isVector())
      return SDValue();
  
      return SDValue();
  
    EVT VTy = VectorOp.getValueType();
    if (!VTy.isVector())
      return SDValue();
  
-  EVT EltTy = VTy.getVectorElementType();
-  if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+  if (VTy.getSizeInBits() < 64)
      return SDValue();
  
      return SDValue();
  
+  EVT EltTy = VTy.getVectorElementType();
+  if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
+    if (EltTy != MVT::f32)
+      return SDValue();
+  } else {
+    if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
+      return SDValue();
+  }
+
    // Check if extracting from the same vector.
    // For example,
    //   %sc = setcc %vector, %svn1, gt
    // Check if extracting from the same vector.
    // For example,
    //   %sc = setcc %vector, %svn1, gt
@@ -8752,22 +8975,25 @@ performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
    if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) ||
        (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) ||
        (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) ||
    if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) ||
        (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) ||
        (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) ||
-      (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE))
+      (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) ||
+      (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE &&
+       CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT &&
+       CC != ISD::SETGE) ||
+      (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE &&
+       CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT &&
+       CC != ISD::SETLE))
      return SDValue();
  
    // Expect to check only lane 0 from the vector SETCC.
      return SDValue();
  
    // Expect to check only lane 0 from the vector SETCC.
-  if (!isa<ConstantSDNode>(N0.getOperand(1)) ||
-      cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue() != 0)
+  if (!isNullConstant(N0.getOperand(1)))
      return SDValue();
  
    // Expect to extract the true value from lane 0.
      return SDValue();
  
    // Expect to extract the true value from lane 0.
-  if (!isa<ConstantSDNode>(IfTrue.getOperand(1)) ||
-      cast<ConstantSDNode>(IfTrue.getOperand(1))->getZExtValue() != 0)
+  if (!isNullConstant(IfTrue.getOperand(1)))
      return SDValue();
  
    // Expect to extract the false value from lane 1.
      return SDValue();
  
    // Expect to extract the false value from lane 1.
-  if (!isa<ConstantSDNode>(IfFalse.getOperand(1)) ||
-      cast<ConstantSDNode>(IfFalse.getOperand(1))->getZExtValue() != 1)
+  if (!isOneConstant(IfFalse.getOperand(1)))
      return SDValue();
  
    return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG);
      return SDValue();
  
    return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG);
@@ -8800,7 +9026,7 @@ performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG,
  
    // The vector extract idx must constant zero because we only expect the final
    // result of the reduction is placed in lane 0.
  
    // The vector extract idx must constant zero because we only expect the final
    // result of the reduction is placed in lane 0.
-  if (!isa<ConstantSDNode>(N1) || cast<ConstantSDNode>(N1)->getZExtValue() != 0)
+  if (!isNullConstant(N1))
      return SDValue();
  
    EVT VTy = N0.getValueType();
      return SDValue();
  
    EVT VTy = N0.getValueType();
@@ -8811,6 +9037,9 @@ performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG,
    if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
      return SDValue();
  
    if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
      return SDValue();
  
+  if (VTy.getSizeInBits() < 64)
+    return SDValue();
+
    return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG);
  }
  
    return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG);
  }
  
@@ -9239,10 +9468,10 @@ static SDValue performBRCONDCombine(SDNode *N,
    if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
      return SDValue();
  
    if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
      return SDValue();
  
-  if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue())
+  if (isNullConstant(LHS))
      std::swap(LHS, RHS);
  
      std::swap(LHS, RHS);
  
-  if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue())
+  if (!isNullConstant(RHS))
      return SDValue();
  
    if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
      return SDValue();
  
    if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
@@ -9380,6 +9609,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP:
      return performIntToFpCombine(N, DAG, Subtarget);
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP:
      return performIntToFpCombine(N, DAG, Subtarget);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    return performFpToIntCombine(N, DAG, Subtarget);
+  case ISD::FDIV:
+    return performFDivCombine(N, DAG, Subtarget);
    case ISD::OR:
      return performORCombine(N, DCI, Subtarget);
    case ISD::INTRINSIC_WO_CHAIN:
    case ISD::OR:
      return performORCombine(N, DCI, Subtarget);
    case ISD::INTRINSIC_WO_CHAIN:
@@ -9400,6 +9634,10 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
    }
    case ISD::VSELECT:
      return performVSelectCombine(N, DCI.DAG);
    }
    case ISD::VSELECT:
      return performVSelectCombine(N, DCI.DAG);
+  case ISD::LOAD:
+    if (performTBISimplification(N->getOperand(1), DCI, DAG))
+      return SDValue(N, 0);
+    break;
    case ISD::STORE:
      return performSTORECombine(N, DCI, DAG, Subtarget);
    case AArch64ISD::BRCOND:
    case ISD::STORE:
      return performSTORECombine(N, DCI, DAG, Subtarget);
    case AArch64ISD::BRCOND:
@@ -9580,6 +9818,20 @@ static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
    Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
  }
  
    Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
  }
  
+static void ReplaceReductionResults(SDNode *N,
+                                    SmallVectorImpl<SDValue> &Results,
+                                    SelectionDAG &DAG, unsigned InterOp,
+                                    unsigned AcrossOp) {
+  EVT LoVT, HiVT;
+  SDValue Lo, Hi;
+  SDLoc dl(N);
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+  std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+  SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
+  SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
+  Results.push_back(SplitVal);
+}
+
  void AArch64TargetLowering::ReplaceNodeResults(
      SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
    switch (N->getOpcode()) {
  void AArch64TargetLowering::ReplaceNodeResults(
      SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
    switch (N->getOpcode()) {
@@ -9588,6 +9840,24 @@ void AArch64TargetLowering::ReplaceNodeResults(
    case ISD::BITCAST:
      ReplaceBITCASTResults(N, Results, DAG);
      return;
    case ISD::BITCAST:
      ReplaceBITCASTResults(N, Results, DAG);
      return;
+  case AArch64ISD::SADDV:
+    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
+    return;
+  case AArch64ISD::UADDV:
+    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
+    return;
+  case AArch64ISD::SMINV:
+    ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
+    return;
+  case AArch64ISD::UMINV:
+    ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
+    return;
+  case AArch64ISD::SMAXV:
+    ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
+    return;
+  case AArch64ISD::UMAXV:
+    ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
+    return;
    case ISD::FP_TO_UINT:
    case ISD::FP_TO_SINT:
      assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
    case ISD::FP_TO_UINT:
    case ISD::FP_TO_SINT:
      assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
@@ -9682,6 +9952,13 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
        cast<PointerType>(Addr->getType())->getElementType());
  }
  
        cast<PointerType>(Addr->getType())->getElementType());
  }
  
+void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
+    IRBuilder<> &Builder) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Builder.CreateCall(
+      llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
+}
+
  Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
                                                     Value *Val, Value *Addr,
                                                     AtomicOrdering Ord) const {
  Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
                                                     Value *Val, Value *Addr,
                                                     AtomicOrdering Ord) const {
@@ -9723,3 +10000,65 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
                                                              EVT) const {
    return false;
  }
                                                              EVT) const {
    return false;
  }
+
+Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
+  if (!Subtarget->isTargetAndroid())
+    return TargetLowering::getSafeStackPointerLocation(IRB);
+
+  // Android provides a fixed TLS slot for the SafeStack pointer. See the
+  // definition of TLS_SLOT_SAFESTACK in
+  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+  const unsigned TlsOffset = 0x48;
+  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+  Function *ThreadPointerFunc =
+      Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer);
+  return IRB.CreatePointerCast(
+      IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
+      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
+}
+
+void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+  // Update IsSplitCSR in AArch64FunctionInfo.
+  AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
+  AFI->setIsSplitCSR(true);
+}
+
+void AArch64TargetLowering::insertCopiesSplitCSR(
+    MachineBasicBlock *Entry,
+    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+  if (!IStart)
+    return;
+
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+  for (const MCPhysReg *I = IStart; *I; ++I) {
+    const TargetRegisterClass *RC = nullptr;
+    if (AArch64::GPR64RegClass.contains(*I))
+      RC = &AArch64::GPR64RegClass;
+    else if (AArch64::FPR64RegClass.contains(*I))
+      RC = &AArch64::FPR64RegClass;
+    else
+      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+    unsigned NewVR = MRI->createVirtualRegister(RC);
+    // Create copy from CSR to a virtual register.
+    // FIXME: this currently does not emit CFI pseudo-instructions; it works
+    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
+    // nounwind. If we want to generalize this later, we may need to emit
+    // CFI pseudo-instructions.
+    assert(Entry->getParent()->getFunction()->hasFnAttribute(
+               Attribute::NoUnwind) &&
+           "Function should be nounwind in insertCopiesSplitCSR!");
+    Entry->addLiveIn(*I);
+    BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
+            NewVR)
+        .addReg(*I);
+
+    for (auto *Exit : Exits)
+      BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
+              *I)
+          .addReg(NewVR);
+  }
+}