Add some testing for thumb1 and thumb2 inline asm immediate constraints

[oota-llvm.git] / lib / Target / ARM / ARMISelLowering.cpp
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp

index da8ecfa1102b4b4d9cfc2823b93b72e9a46c1f6e..fc57d83ce1eaec0708a82778b62d617357b03eda 100644 (file)
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -143,15 +143,10 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
  
-  if (VT.isInteger()) {
-    setOperationAction(ISD::SABSDIFF, VT, Legal);
-    setOperationAction(ISD::UABSDIFF, VT, Legal);
-  }
    if (!VT.isFloatingPoint() &&
        VT != MVT::v2i64 && VT != MVT::v1i64)
      for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
        setOperationAction(Opcode, VT, Legal);
-
  }
  
  void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
@@ -256,8 +251,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
    setLibcallName(RTLIB::SRL_I128, nullptr);
    setLibcallName(RTLIB::SRA_I128, nullptr);
  
-  if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetMachO() &&
-      !Subtarget->isTargetWindows()) {
+  // RTLIB
+  if (Subtarget->isAAPCS_ABI() &&
+      (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
+       Subtarget->isTargetAndroid())) {
      static const struct {
        const RTLIB::Libcall Op;
        const char * const Name;
@@ -345,12 +342,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
        { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
        { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-
-      // Memory operations
-      // RTABI chapter 4.3.4
-      { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
      };
  
      for (const auto &LC : LibraryCalls) {
@@ -359,6 +350,30 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
        if (LC.Cond != ISD::SETCC_INVALID)
          setCmpLibcallCC(LC.Op, LC.Cond);
      }
+
+    // EABI dependent RTLIB
+    if (TM.Options.EABIVersion == EABI::EABI4 ||
+        TM.Options.EABIVersion == EABI::EABI5) {
+      static const struct {
+        const RTLIB::Libcall Op;
+        const char *const Name;
+        const CallingConv::ID CC;
+        const ISD::CondCode Cond;
+      } MemOpsLibraryCalls[] = {
+        // Memory operations
+        // RTABI chapter 4.3.4
+        { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+        { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+        { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      };
+
+      for (const auto &LC : MemOpsLibraryCalls) {
+        setLibcallName(LC.Op, LC.Name);
+        setLibcallCallingConv(LC.Op, LC.CC);
+        if (LC.Cond != ISD::SETCC_INVALID)
+          setCmpLibcallCC(LC.Op, LC.Cond);
+      }
+    }
    }
  
    if (Subtarget->isTargetWindows()) {
@@ -375,6 +390,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
        { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
        { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
        { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::SDIV_I32, "__rt_sdiv",   CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::UDIV_I32, "__rt_udiv",   CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::SDIV_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::UDIV_I64, "__rt_udiv64", CallingConv::ARM_AAPCS_VFP },
      };
  
      for (const auto &LC : LibraryCalls) {
@@ -725,6 +744,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
      setOperationAction(ISD::SUBE,    MVT::i32, Custom);
    }
  
+  if (!Subtarget->isThumb1Only())
+    setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+
    // ARM does not have ROTL.
    setOperationAction(ISD::ROTL, MVT::i32, Expand);
    for (MVT VT : MVT::vector_valuetypes()) {
@@ -758,14 +780,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
      setOperationAction(ISD::UDIV,  MVT::i32, LibCall);
    }
  
-  if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) {
-    setOperationAction(ISD::SDIV, MVT::i32, Custom);
-    setOperationAction(ISD::UDIV, MVT::i32, Custom);
-
-    setOperationAction(ISD::SDIV, MVT::i64, Custom);
-    setOperationAction(ISD::UDIV, MVT::i64, Custom);
-  }
-
    setOperationAction(ISD::SREM,  MVT::i32, Expand);
    setOperationAction(ISD::UREM,  MVT::i32, Expand);
    // Register based DivRem for AEABI (RTABI 4.2)
@@ -1109,8 +1123,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
  
    case ARMISD::CMOV:          return "ARMISD::CMOV";
  
-  case ARMISD::RBIT:          return "ARMISD::RBIT";
-
    case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
    case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
    case ARMISD::RRX:           return "ARMISD::RRX";
@@ -2518,6 +2530,72 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
    return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
  }
  
+/// \brief Convert a TLS address reference into the correct sequence of loads
+/// and calls to compute the variable's address for Darwin, and return an
+/// SDValue containing the final node.
+
+/// Darwin only has one TLS scheme which must be capable of dealing with the
+/// fully general situation, in the worst case. This means:
+///     + "extern __thread" declaration.
+///     + Defined in a possibly unknown dynamic library.
+///
+/// The general system is that each __thread variable has a [3 x i32] descriptor
+/// which contains information used by the runtime to calculate the address. The
+/// only part of this the compiler needs to know about is the first word, which
+/// contains a function pointer that must be called with the address of the
+/// entire descriptor in "r0".
+///
+/// Since this descriptor may be in a different unit, in general access must
+/// proceed along the usual ARM rules. A common sequence to produce is:
+///
+///     movw rT1, :lower16:_var$non_lazy_ptr
+///     movt rT1, :upper16:_var$non_lazy_ptr
+///     ldr r0, [rT1]
+///     ldr rT2, [r0]
+///     blx rT2
+///     [...address now in r0...]
+SDValue
+ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
+  SDLoc DL(Op);
+
+  // First step is to get the address of the actua global symbol. This is where
+  // the TLS descriptor lives.
+  SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
+
+  // The first entry in the descriptor is a function pointer that we must call
+  // to obtain the address of the variable.
+  SDValue Chain = DAG.getEntryNode();
+  SDValue FuncTLVGet =
+      DAG.getLoad(MVT::i32, DL, Chain, DescAddr,
+                  MachinePointerInfo::getGOT(DAG.getMachineFunction()),
+                  false, true, true, 4);
+  Chain = FuncTLVGet.getValue(1);
+
+  MachineFunction &F = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = F.getFrameInfo();
+  MFI->setAdjustsStack(true);
+
+  // TLS calls preserve all registers except those that absolutely must be
+  // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
+  // silly).
+  auto TRI =
+      getTargetMachine().getSubtargetImpl(*F.getFunction())->getRegisterInfo();
+  auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
+  const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
+
+  // Finally, we can make the call. This is just a degenerate version of a
+  // normal AArch64 call node: r0 takes the address of the descriptor, and
+  // returns the address of the variable in this thread.
+  Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
+  Chain =
+      DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
+                  Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
+                  DAG.getRegisterMask(Mask), Chain.getValue(1));
+  return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
+}
+
  // Lower ISD::GlobalTLSAddress using the "general dynamic" model
  SDValue
  ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
@@ -2619,9 +2697,11 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
  
  SDValue
  ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+  if (Subtarget->isTargetDarwin())
+    return LowerGlobalTLSAddressDarwin(Op, DAG);
+
    // TODO: implement the "local dynamic" model
-  assert(Subtarget->isTargetELF() &&
-         "TLS not implemented for non-ELF targets");
+  assert(Subtarget->isTargetELF() && "Only ELF implemented here");
    GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
    if (DAG.getTarget().Options.EmulatedTLS)
      return LowerToTLSEmulatedModel(GA, DAG);
@@ -2777,7 +2857,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
    case Intrinsic::arm_rbit: {
      assert(Op.getOperand(1).getValueType() == MVT::i32 &&
             "RBIT intrinsic must have i32 type!");
-    return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(1));
+    return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1));
    }
    case Intrinsic::arm_thread_pointer: {
      EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -3266,13 +3346,9 @@ static bool isFloatingPointZero(SDValue Op) {
      // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
      // created by LowerConstantFP().
      SDValue BitcastOp = Op->getOperand(0);
-    if (BitcastOp->getOpcode() == ARMISD::VMOVIMM) {
-      SDValue MoveOp = BitcastOp->getOperand(0);
-      if (MoveOp->getOpcode() == ISD::TargetConstant &&
-          cast<ConstantSDNode>(MoveOp)->getZExtValue() == 0) {
-        return true;
-      }
-    }
+    if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
+        isNullConstant(BitcastOp->getOperand(0)))
+      return true;
    }
    return false;
  }
@@ -4126,6 +4202,56 @@ static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
    Results.push_back(Read.getOperand(0));
  }
  
+/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
+/// When \p DstVT, the destination type of \p BC, is on the vector
+/// register bank and the source of bitcast, \p Op, operates on the same bank,
+/// it might be possible to combine them, such that everything stays on the
+/// vector register bank.
+/// \p return The node that would replace \p BT, if the combine
+/// is possible.
+static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
+                                                SelectionDAG &DAG) {
+  SDValue Op = BC->getOperand(0);
+  EVT DstVT = BC->getValueType(0);
+
+  // The only vector instruction that can produce a scalar (remember,
+  // since the bitcast was about to be turned into VMOVDRR, the source
+  // type is i64) from a vector is EXTRACT_VECTOR_ELT.
+  // Moreover, we can do this combine only if there is one use.
+  // Finally, if the destination type is not a vector, there is not
+  // much point on forcing everything on the vector bank.
+  if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      !Op.hasOneUse())
+    return SDValue();
+
+  // If the index is not constant, we will introduce an additional
+  // multiply that will stick.
+  // Give up in that case.
+  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!Index)
+    return SDValue();
+  unsigned DstNumElt = DstVT.getVectorNumElements();
+
+  // Compute the new index.
+  const APInt &APIntIndex = Index->getAPIntValue();
+  APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
+  NewIndex *= APIntIndex;
+  // Check if the new constant index fits into i32.
+  if (NewIndex.getBitWidth() > 32)
+    return SDValue();
+
+  // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
+  // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
+  SDLoc dl(Op);
+  SDValue ExtractSrc = Op.getOperand(0);
+  EVT VecVT = EVT::getVectorVT(
+      *DAG.getContext(), DstVT.getScalarType(),
+      ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
+  SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
+                     DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
+}
+
  /// ExpandBITCAST - If the target supports VFP, this function is called to
  /// expand a bit convert where either the source or destination type is i64 to
  /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
@@ -4145,6 +4271,11 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
  
    // Turn i64->f64 into VMOVDRR.
    if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
+    // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
+    // if we can combine the bitcast with its source.
+    if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
+      return Val;
+
      SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
                               DAG.getConstant(0, dl, MVT::i32));
      SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
@@ -4356,7 +4487,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
    if (!ST->hasV6T2Ops())
      return SDValue();
  
-  SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0));
+  SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
    return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
  }
  
@@ -4517,8 +4648,7 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
           "Unknown shift to lower!");
  
    // We only lower SRA, SRL of 1 here, all others use generic lowering.
-  if (!isa<ConstantSDNode>(N->getOperand(1)) ||
-      cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1)
+  if (!isOneConstant(N->getOperand(1)))
      return SDValue();
  
    // If we are in thumb mode, we don't have RRX.
@@ -5400,16 +5530,14 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
        // just use VDUPLANE. We can only do this if the lane being extracted
        // is at a constant index, as the VDUP from lane instructions only have
        // constant-index forms.
+      ConstantSDNode *constIndex;
        if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
-          isa<ConstantSDNode>(Value->getOperand(1))) {
+          (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
          // We need to create a new undef vector to use for the VDUPLANE if the
          // size of the vector from which we get the value is different than the
          // size of the vector that we need to create. We will insert the element
          // such that the register coalescer will remove unnecessary copies.
          if (VT != Value->getOperand(0).getValueType()) {
-          ConstantSDNode *constIndex;
-          constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1));
-          assert(constIndex && "The index is not a constant!");
            unsigned index = constIndex->getAPIntValue().getLimitedValue() %
                               VT.getVectorNumElements();
            N =  DAG.getNode(ARMISD::VDUPLANE, dl, VT,
@@ -6814,14 +6942,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
    case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
    case ISD::MUL:           return LowerMUL(Op, DAG);
-  case ISD::SDIV:
-    if (Subtarget->isTargetWindows())
-      return LowerDIV_Windows(Op, DAG, /* Signed */ true);
-    return LowerSDIV(Op, DAG);
-  case ISD::UDIV:
-    if (Subtarget->isTargetWindows())
-      return LowerDIV_Windows(Op, DAG, /* Signed */ false);
-    return LowerUDIV(Op, DAG);
+  case ISD::SDIV:          return LowerSDIV(Op, DAG);
+  case ISD::UDIV:          return LowerUDIV(Op, DAG);
    case ISD::ADDC:
    case ISD::ADDE:
    case ISD::SUBC:
@@ -7342,7 +7464,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI,
        }
      }
  
-    BB->addSuccessor(DispatchBB);
+    BB->addSuccessor(DispatchBB, BranchProbability::getZero());
+    BB->normalizeSuccProbs();
  
      // Find the invoke call and mark all of the callee-saved registers as
      // 'implicit defined' so that they're spilled. This prevents code from
@@ -8186,10 +8309,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
  
  // Helper function that checks if N is a null or all ones constant.
  static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
-  if (!C)
-    return false;
-  return AllOnes ? C->isAllOnesValue() : C->isNullValue();
+  return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
  }
  
  // Return true if N is conditionally 0 or all ones.
@@ -9011,12 +9131,88 @@ static SDValue PerformXORCombine(SDNode *N,
    return SDValue();
  }
  
-/// PerformBFICombine - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
-/// the bits being cleared by the AND are not demanded by the BFI.
+// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
+// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
+// their position in "to" (Rd).
+static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
+  assert(N->getOpcode() == ARMISD::BFI);
+
+  SDValue From = N->getOperand(1);
+  ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
+  FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());
+
+  // If the Base came from a SHR #C, we can deduce that it is really testing bit
+  // #C in the base of the SHR.
+  if (From->getOpcode() == ISD::SRL &&
+      isa<ConstantSDNode>(From->getOperand(1))) {
+    APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
+    assert(Shift.getLimitedValue() < 32 && "Shift too large!");
+    FromMask <<= Shift.getLimitedValue(31);
+    From = From->getOperand(0);
+  }
+
+  return From;
+}
+
+// If A and B contain one contiguous set of bits, does A | B == A . B?
+//
+// Neither A nor B must be zero.
+static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
+  unsigned LastActiveBitInA =  A.countTrailingZeros();
+  unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
+  return LastActiveBitInA - 1 == FirstActiveBitInB;
+}
+
+static SDValue FindBFIToCombineWith(SDNode *N) {
+  // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with,
+  // if one exists.
+  APInt ToMask, FromMask;
+  SDValue From = ParseBFI(N, ToMask, FromMask);
+  SDValue To = N->getOperand(0);
+
+  // Now check for a compatible BFI to merge with. We can pass through BFIs that
+  // aren't compatible, but not if they set the same bit in their destination as
+  // we do (or that of any BFI we're going to combine with).
+  SDValue V = To;
+  APInt CombinedToMask = ToMask;
+  while (V.getOpcode() == ARMISD::BFI) {
+    APInt NewToMask, NewFromMask;
+    SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
+    if (NewFrom != From) {
+      // This BFI has a different base. Keep going.
+      CombinedToMask |= NewToMask;
+      V = V.getOperand(0);
+      continue;
+    }
+
+    // Do the written bits conflict with any we've seen so far?
+    if ((NewToMask & CombinedToMask).getBoolValue())
+      // Conflicting bits - bail out because going further is unsafe.
+      return SDValue();
+
+    // Are the new bits contiguous when combined with the old bits?
+    if (BitsProperlyConcatenate(ToMask, NewToMask) &&
+        BitsProperlyConcatenate(FromMask, NewFromMask))
+      return V;
+    if (BitsProperlyConcatenate(NewToMask, ToMask) &&
+        BitsProperlyConcatenate(NewFromMask, FromMask))
+      return V;
+
+    // We've seen a write to some bits, so track it.
+    CombinedToMask |= NewToMask;
+    // Keep going...
+    V = V.getOperand(0);
+  }
+
+  return SDValue();
+}
+
  static SDValue PerformBFICombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
    SDValue N1 = N->getOperand(1);
    if (N1.getOpcode() == ISD::AND) {
+    // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
+    // the bits being cleared by the AND are not demanded by the BFI.
      ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
      if (!N11C)
        return SDValue();
@@ -9032,6 +9228,38 @@ static SDValue PerformBFICombine(SDNode *N,
        return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
                               N->getOperand(0), N1.getOperand(0),
                               N->getOperand(2));
+  } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
+    // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
+    // Keep track of any consecutive bits set that all come from the same base
+    // value. We can combine these together into a single BFI.
+    SDValue CombineBFI = FindBFIToCombineWith(N);
+    if (CombineBFI == SDValue())
+      return SDValue();
+
+    // We've found a BFI.
+    APInt ToMask1, FromMask1;
+    SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
+
+    APInt ToMask2, FromMask2;
+    SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
+    assert(From1 == From2);
+    (void)From2;
+
+    // First, unlink CombineBFI.
+    DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
+    // Then create a new BFI, combining the two together.
+    APInt NewFromMask = FromMask1 | FromMask2;
+    APInt NewToMask = ToMask1 | ToMask2;
+
+    EVT VT = N->getValueType(0);
+    SDLoc dl(N);
+
+    if (NewFromMask[0] == 0)
+      From1 = DCI.DAG.getNode(
+        ISD::SRL, dl, VT, From1,
+        DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
+    return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
+                           DCI.DAG.getConstant(~NewToMask, dl, VT));
    }
    return SDValue();
  }
@@ -9984,15 +10212,6 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
      // Don't do anything for most intrinsics.
      break;
  
-  case Intrinsic::arm_neon_vabds:
-    if (!N->getValueType(0).isInteger())
-      return SDValue();
-    return DAG.getNode(ISD::SABSDIFF, SDLoc(N), N->getValueType(0),
-                       N->getOperand(1), N->getOperand(2));
-  case Intrinsic::arm_neon_vabdu:
-    return DAG.getNode(ISD::UABSDIFF, SDLoc(N), N->getValueType(0),
-                       N->getOperand(1), N->getOperand(2));
-
    // Vector shifts: check for immediate versions and lower them.
    // Note: This is done during DAG combining instead of DAG legalizing because
    // the build_vectors for 64-bit vector element shift counts are generally
@@ -10265,8 +10484,14 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D
  
    SDValue Op0 = CMOV->getOperand(0);
    SDValue Op1 = CMOV->getOperand(1);
+  auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
+  auto CC = CCNode->getAPIntValue().getLimitedValue();
    SDValue CmpZ = CMOV->getOperand(4);
  
+  // The compare must be against zero.
+  if (!isNullConstant(CmpZ->getOperand(1)))
+    return SDValue();
+
    assert(CmpZ->getOpcode() == ARMISD::CMPZ);
    SDValue And = CmpZ->getOperand(0);
    if (And->getOpcode() != ISD::AND)
@@ -10276,18 +10501,23 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D
      return SDValue();
    SDValue X = And->getOperand(0);
  
-  // Canonicalize so that the OR is on the left.
-  if (Op1->getOpcode() == ISD::OR)
+  if (CC == ARMCC::EQ) {
+    // We're performing an "equal to zero" compare. Swap the operands so we
+    // canonicalize on a "not equal to zero" compare.
      std::swap(Op0, Op1);
-  if (Op0->getOpcode() != ISD::OR)
+  } else {
+    assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
+  }
+  
+  if (Op1->getOpcode() != ISD::OR)
      return SDValue();
  
-  ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op0->getOperand(1));
+  ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
    if (!OrC)
      return SDValue();
-  SDValue Y = Op0->getOperand(0);
+  SDValue Y = Op1->getOperand(0);
  
-  if (Op1 != Y)
+  if (Op0 != Y)
      return SDValue();
  
    // Now, is it profitable to continue?
@@ -11245,7 +11475,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
          return;
  
        case 'J':
-        if (Subtarget->isThumb()) {  // FIXME thumb2
+        if (Subtarget->isThumb1Only()) {
            // This must be a constant between -255 and -1, for negated ADD
            // immediates. This can be used in GCC with an "n" modifier that
            // prints the negated value, for use with SUB instructions. It is
@@ -11314,7 +11544,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
          return;
  
        case 'M':
-        if (Subtarget->isThumb()) { // FIXME thumb2
+        if (Subtarget->isThumb1Only()) {
            // This must be a multiple of 4 between 0 and 1020, for
            // ADD sp + immediate.
            if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
@@ -11564,7 +11794,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      // Conservatively set memVT to the entire set of vectors loaded.
      auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
-    uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8;
+    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
      Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
      Info.ptrVal = I.getArgOperand(0);
      Info.offset = 0;
@@ -11590,7 +11820,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
        Type *ArgTy = I.getArgOperand(ArgI)->getType();
        if (!ArgTy->isVectorTy())
          break;
-      NumElts += DL.getTypeAllocSize(ArgTy) / 8;
+      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
      }
      Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
      Info.ptrVal = I.getArgOperand(0);
@@ -11771,7 +12001,7 @@ bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  TargetLowering::AtomicExpansionKind
  ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
    unsigned Size = LI->getType()->getPrimitiveSizeInBits();
-  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLSC
+  return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
                                                    : AtomicExpansionKind::None;
  }
  
@@ -11824,6 +12054,14 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
    return false;
  }
  
+bool ARMTargetLowering::isCheapToSpeculateCttz() const {
+  return Subtarget->hasV6T2Ops();
+}
+
+bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
+  return Subtarget->hasV6T2Ops();
+}
+
  Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                           AtomicOrdering Ord) const {
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
@@ -11925,8 +12163,8 @@ bool ARMTargetLowering::lowerInterleavedLoad(
    Type *EltTy = VecTy->getVectorElementType();
  
    const DataLayout &DL = LI->getModule()->getDataLayout();
-  unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy);
-  bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64;
+  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
+  bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
  
    // Skip if we do not have NEON and skip illegal vector types and vector types
    // with i64/f64 elements (vldN doesn't support i64/f64 elements).
@@ -12015,8 +12253,8 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
    VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
  
    const DataLayout &DL = SI->getModule()->getDataLayout();
-  unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy);
-  bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64;
+  unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+  bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
  
    // Skip if we do not have NEON and skip illegal vector types and vector types
    // with i64/f64 elements (vstN doesn't support i64/f64 elements).