AMDGPU/SI: Test commit

[oota-llvm.git] / lib / Target / AArch64 / AArch64TargetTransformInfo.cpp
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

index f1e9c6adb5312691843cf1652ac4ec870f24b05a..9af0e6444789a0ee031a995ac0ae8fdeefe2a055 100644 (file)
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass --------===//
+//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
  //
  //                     The LLVM Compiler Infrastructure
  //
@@ -6,18 +6,11 @@
  // License. See LICENSE.TXT for details.
  //
  //===----------------------------------------------------------------------===//
-/// \file
-/// This file implements a TargetTransformInfo analysis pass specific to the
-/// AArch64 target machine. It uses the target's detailed information to provide
-/// more precise answers to certain TTI queries, while letting the target
-/// independent and default TTI implementations handle the rest.
-///
-//===----------------------------------------------------------------------===//
  
-#include "AArch64.h"
-#include "AArch64TargetMachine.h"
+#include "AArch64TargetTransformInfo.h"
  #include "MCTargetDesc/AArch64AddressingModes.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
  #include "llvm/CodeGen/BasicTTIImpl.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Target/CostTable.h"
@@ -27,128 +20,10 @@ using namespace llvm;
  
  #define DEBUG_TYPE "aarch64tti"
  
-namespace {
-
-class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
-  typedef BasicTTIImplBase<AArch64TTIImpl> BaseT;
-  typedef TargetTransformInfo TTI;
-
-  const AArch64Subtarget *ST;
-  const AArch64TargetLowering *TLI;
-
-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
-
-  enum MemIntrinsicType {
-    VECTOR_LDST_TWO_ELEMENTS,
-    VECTOR_LDST_THREE_ELEMENTS,
-    VECTOR_LDST_FOUR_ELEMENTS
-  };
-
-public:
-  explicit AArch64TTIImpl(const AArch64TargetMachine *TM = nullptr)
-      : BaseT(TM), ST(TM ? TM->getSubtargetImpl() : nullptr),
-        TLI(ST ? ST->getTargetLowering() : nullptr) {}
-
-  // Provide value semantics. MSVC requires that we spell all of these out.
-  AArch64TTIImpl(const AArch64TTIImpl &Arg)
-      : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
-  AArch64TTIImpl(AArch64TTIImpl &&Arg)
-      : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
-        TLI(std::move(Arg.TLI)) {}
-  AArch64TTIImpl &operator=(const AArch64TTIImpl &RHS) {
-    BaseT::operator=(static_cast<const BaseT &>(RHS));
-    ST = RHS.ST;
-    TLI = RHS.TLI;
-    return *this;
-  }
-  AArch64TTIImpl &operator=(AArch64TTIImpl &&RHS) {
-    BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
-    ST = std::move(RHS.ST);
-    TLI = std::move(RHS.TLI);
-    return *this;
-  }
-
-  /// \name Scalar TTI Implementations
-  /// @{
-
-  using BaseT::getIntImmCost;
-  unsigned getIntImmCost(int64_t Val);
-  unsigned getIntImmCost(const APInt &Imm, Type *Ty);
-  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
-                         Type *Ty);
-  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
-                         Type *Ty);
-  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
-
-  /// @}
-
-  /// \name Vector TTI Implementations
-  /// @{
-
-  unsigned getNumberOfRegisters(bool Vector) {
-    if (Vector) {
-      if (ST->hasNEON())
-        return 32;
-      return 0;
-    }
-    return 31;
-  }
-
-  unsigned getRegisterBitWidth(bool Vector) {
-    if (Vector) {
-      if (ST->hasNEON())
-        return 128;
-      return 0;
-    }
-    return 64;
-  }
-
-  unsigned getMaxInterleaveFactor();
-
-  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
-
-  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
-
-  unsigned getArithmeticInstrCost(
-      unsigned Opcode, Type *Ty,
-      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
-      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
-      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
-      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
-
-  unsigned getAddressComputationCost(Type *Ty, bool IsComplex);
-
-  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
-
-  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                           unsigned AddressSpace);
-
-  unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
-
-  void getUnrollingPreferences(const Function *F, Loop *L,
-                               TTI::UnrollingPreferences &UP);
-
-  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
-                                           Type *ExpectedType);
-
-  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
-
-  /// @}
-};
-
-} // end anonymous namespace
-
-ImmutablePass *
-llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) {
-  return new TargetTransformInfoWrapperPass(AArch64TTIImpl(TM));
-}
-
  /// \brief Calculate the cost of materializing a 64-bit value. This helper
  /// method might only calculate a fraction of a larger immediate. Therefore it
  /// is valid to return a cost of ZERO.
-unsigned AArch64TTIImpl::getIntImmCost(int64_t Val) {
+int AArch64TTIImpl::getIntImmCost(int64_t Val) {
    // Check if the immediate can be encoded within an instruction.
    if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
      return 0;
@@ -162,7 +37,7 @@ unsigned AArch64TTIImpl::getIntImmCost(int64_t Val) {
  }
  
  /// \brief Calculate the cost of materializing the given constant.
-unsigned AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
    assert(Ty->isIntegerTy());
  
    unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -176,18 +51,18 @@ unsigned AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  
    // Split the constant into 64-bit chunks and calculate the cost for each
    // chunk.
-  unsigned Cost = 0;
+  int Cost = 0;
    for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
      APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
      int64_t Val = Tmp.getSExtValue();
      Cost += getIntImmCost(Val);
    }
    // We need at least one instruction to materialze the constant.
-  return std::max(1U, Cost);
+  return std::max(1, Cost);
  }
  
-unsigned AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
-                                       const APInt &Imm, Type *Ty) {
+int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
+                                  const APInt &Imm, Type *Ty) {
    assert(Ty->isIntegerTy());
  
    unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -243,17 +118,17 @@ unsigned AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
    }
  
    if (Idx == ImmIdx) {
-    unsigned NumConstants = (BitSize + 63) / 64;
-    unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+    int NumConstants = (BitSize + 63) / 64;
+    int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
      return (Cost <= NumConstants * TTI::TCC_Basic)
-               ? static_cast<unsigned>(TTI::TCC_Free)
+               ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    return AArch64TTIImpl::getIntImmCost(Imm, Ty);
  }
  
-unsigned AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
-                                       const APInt &Imm, Type *Ty) {
+int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                                  const APInt &Imm, Type *Ty) {
    assert(Ty->isIntegerTy());
  
    unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -272,10 +147,10 @@ unsigned AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
    case Intrinsic::smul_with_overflow:
    case Intrinsic::umul_with_overflow:
      if (Idx == 1) {
-      unsigned NumConstants = (BitSize + 63) / 64;
-      unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+      int NumConstants = (BitSize + 63) / 64;
+      int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
        return (Cost <= NumConstants * TTI::TCC_Basic)
-                 ? static_cast<unsigned>(TTI::TCC_Free)
+                 ? static_cast<int>(TTI::TCC_Free)
                   : Cost;
      }
      break;
@@ -301,18 +176,41 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
    return TTI::PSK_Software;
  }
  
-unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
-                                          Type *Src) {
+int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
  
-  EVT SrcTy = TLI->getValueType(Src);
-  EVT DstTy = TLI->getValueType(Dst);
+  EVT SrcTy = TLI->getValueType(DL, Src);
+  EVT DstTy = TLI->getValueType(DL, Dst);
  
    if (!SrcTy.isSimple() || !DstTy.isSimple())
      return BaseT::getCastInstrCost(Opcode, Dst, Src);
  
-  static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
+  static const TypeConversionCostTblEntry
+  ConversionTbl[] = {
+    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32,  1 },
+    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64,  0 },
+    { ISD::TRUNCATE, MVT::v8i8,  MVT::v8i32,  3 },
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
+
+    // The number of shll instructions for the extension.
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+
      // LowerVectorINT_TO_FP:
      { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
      { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
@@ -335,6 +233,16 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
      { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8,  3 },
      { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
  
+    // Complex: to v8f32
+    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
+    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
+    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+
+    // Complex: to v16f32
+    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
+    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
+
      // Complex: to v2f64
      { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
      { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
@@ -375,22 +283,21 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
      { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f64, 2 },
    };
  
-  int Idx = ConvertCostTableLookup<MVT>(
-      ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
-      SrcTy.getSimpleVT());
-  if (Idx != -1)
-    return ConversionTbl[Idx].Cost;
+  if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
+                                                 DstTy.getSimpleVT(),
+                                                 SrcTy.getSimpleVT()))
+    return Entry->Cost;
  
    return BaseT::getCastInstrCost(Opcode, Dst, Src);
  }
  
-unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
-                                            unsigned Index) {
+int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                       unsigned Index) {
    assert(Val->isVectorTy() && "This must be a vector type");
  
    if (Index != -1U) {
      // Legalize the type.
-    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
  
      // This type is legalized to a scalar type.
      if (!LT.second.isVector())
@@ -406,15 +313,15 @@ unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
    }
  
    // All other insert/extracts cost this much.
-  return 2;
+  return 3;
  }
  
-unsigned AArch64TTIImpl::getArithmeticInstrCost(
+int AArch64TTIImpl::getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
      TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
      TTI::OperandValueProperties Opd2PropInfo) {
    // Legalize the type.
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
  
@@ -425,10 +332,9 @@ unsigned AArch64TTIImpl::getArithmeticInstrCost(
      // normally expanded to the sequence ADD + CMP + SELECT + SRA.
      // The OperandValue properties many not be same as that of previous
      // operation; conservatively assume OP_None.
-    unsigned Cost =
-      getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
-                             TargetTransformInfo::OP_None,
-                             TargetTransformInfo::OP_None);
+    int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+                                      TargetTransformInfo::OP_None,
+                                      TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
@@ -456,7 +362,7 @@ unsigned AArch64TTIImpl::getArithmeticInstrCost(
    }
  }
  
-unsigned AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
+int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
    // Address computations in vectorized code with non-consecutive addresses will
    // likely result in more instructions compared to scalar code where the
    // computation can more often be merged into the index mode. The resulting
@@ -471,41 +377,40 @@ unsigned AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
    return 1;
  }
  
-unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                            Type *CondTy) {
+int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                       Type *CondTy) {
  
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  // We don't lower vector selects well that are wider than the register width.
+  // We don't lower some vector selects well that are wider than the register
+  // width.
    if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
      // We would need this many instructions to hide the scalarization happening.
-    unsigned AmortizationCost = 20;
-    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+    const int AmortizationCost = 20;
+    static const TypeConversionCostTblEntry
      VectorSelectTbl[] = {
-      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
-      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
-      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
        { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
        { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
        { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
      };
  
-    EVT SelCondTy = TLI->getValueType(CondTy);
-    EVT SelValTy = TLI->getValueType(ValTy);
+    EVT SelCondTy = TLI->getValueType(DL, CondTy);
+    EVT SelValTy = TLI->getValueType(DL, ValTy);
      if (SelCondTy.isSimple() && SelValTy.isSimple()) {
-      int Idx =
-          ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
-                                 SelValTy.getSimpleVT());
-      if (Idx != -1)
-        return VectorSelectTbl[Idx].Cost;
+      if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
+                                                     SelCondTy.getSimpleVT(),
+                                                     SelValTy.getSimpleVT()))
+        return Entry->Cost;
      }
    }
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
  }
  
-unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
-                                         unsigned Alignment,
-                                         unsigned AddressSpace) {
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                    unsigned Alignment, unsigned AddressSpace) {
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  
    if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
        Src->getVectorElementType()->isIntegerTy(64)) {
@@ -514,7 +419,7 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
      // practice on inlined memcpy code.
      // We make v2i64 stores expensive so that we will only vectorize if there
      // are 6 other instructions getting vectorized.
-    unsigned AmortizationCost = 6;
+    int AmortizationCost = 6;
  
      return LT.first * 2 * AmortizationCost;
    }
@@ -532,8 +437,30 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
    return LT.first;
  }
  
-unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
-  unsigned Cost = 0;
+int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                               unsigned Factor,
+                                               ArrayRef<unsigned> Indices,
+                                               unsigned Alignment,
+                                               unsigned AddressSpace) {
+  assert(Factor >= 2 && "Invalid interleave factor");
+  assert(isa<VectorType>(VecTy) && "Expect a vector type");
+
+  if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+    unsigned NumElts = VecTy->getVectorNumElements();
+    Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
+    unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+
+    // ldN/stN only support legal vector types of size 64 or 128 in bits.
+    if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
+      return Factor;
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
+}
+
+int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
+  int Cost = 0;
    for (auto *I : Tys) {
      if (!I->isVectorTy())
        continue;
@@ -544,14 +471,23 @@ unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
    return Cost;
  }
  
-unsigned AArch64TTIImpl::getMaxInterleaveFactor() {
+unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
    if (ST->isCortexA57())
      return 4;
    return 2;
  }
  
-void AArch64TTIImpl::getUnrollingPreferences(const Function *F, Loop *L,
+void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
                                               TTI::UnrollingPreferences &UP) {
+  // Enable partial unrolling and runtime unrolling.
+  BaseT::getUnrollingPreferences(L, UP);
+
+  // For inner loop, it is more likely to be a hot one, and the runtime check
+  // can be promoted out from LICM pass, so the overhead is less, let's try
+  // a larger threshold to unroll more loops.
+  if (L->getLoopDepth() > 1)
+    UP.PartialThreshold *= 2;
+
    // Disable partial & runtime unrolling on -Os.
    UP.PartialOptSizeThreshold = 0;
  }
@@ -602,7 +538,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
    case Intrinsic::aarch64_neon_ld4:
      Info.ReadMem = true;
      Info.WriteMem = false;
-    Info.Vol = false;
+    Info.IsSimple = true;
      Info.NumMemRefs = 1;
      Info.PtrVal = Inst->getArgOperand(0);
      break;
@@ -611,7 +547,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
    case Intrinsic::aarch64_neon_st4:
      Info.ReadMem = false;
      Info.WriteMem = true;
-    Info.Vol = false;
+    Info.IsSimple = true;
      Info.NumMemRefs = 1;
      Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
      break;