AMDGPU/SI: Test commit

[oota-llvm.git] / lib / Target / AArch64 / AArch64TargetTransformInfo.cpp
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

index 653ba83e4cfb811baf297e04eb8467d7b5c06049..9af0e6444789a0ee031a995ac0ae8fdeefe2a055 100644 (file)
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass --------===//
+//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
  //
  //                     The LLVM Compiler Infrastructure
  //
@@ -6,18 +6,12 @@
  // License. See LICENSE.TXT for details.
  //
  //===----------------------------------------------------------------------===//
-/// \file
-/// This file implements a TargetTransformInfo analysis pass specific to the
-/// AArch64 target machine. It uses the target's detailed information to provide
-/// more precise answers to certain TTI queries, while letting the target
-/// independent and default TTI implementations handle the rest.
-///
-//===----------------------------------------------------------------------===//
  
-#include "AArch64.h"
-#include "AArch64TargetMachine.h"
+#include "AArch64TargetTransformInfo.h"
  #include "MCTargetDesc/AArch64AddressingModes.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
  #include "llvm/Support/Debug.h"
  #include "llvm/Target/CostTable.h"
  #include "llvm/Target/TargetLowering.h"
@@ -26,141 +20,10 @@ using namespace llvm;
  
  #define DEBUG_TYPE "aarch64tti"
  
-// Declare the pass initialization routine locally as target-specific passes
-// don't have a target-wide initialization entry point, and so we rely on the
-// pass constructor initialization.
-namespace llvm {
-void initializeAArch64TTIPass(PassRegistry &);
-}
-
-namespace {
-
-class AArch64TTI final : public ImmutablePass, public TargetTransformInfo {
-  const AArch64TargetMachine *TM;
-  const AArch64Subtarget *ST;
-  const AArch64TargetLowering *TLI;
-
-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
-
-  enum MemIntrinsicType {
-    VECTOR_LDST_TWO_ELEMENTS,
-    VECTOR_LDST_THREE_ELEMENTS,
-    VECTOR_LDST_FOUR_ELEMENTS
-  };
-
-public:
-  AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
-    llvm_unreachable("This pass cannot be directly constructed");
-  }
-
-  AArch64TTI(const AArch64TargetMachine *TM)
-      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
-        TLI(TM->getSubtargetImpl()->getTargetLowering()) {
-    initializeAArch64TTIPass(*PassRegistry::getPassRegistry());
-  }
-
-  void initializePass() override { pushTTIStack(this); }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    TargetTransformInfo::getAnalysisUsage(AU);
-  }
-
-  /// Pass identification.
-  static char ID;
-
-  /// Provide necessary pointer adjustments for the two base classes.
-  void *getAdjustedAnalysisPointer(const void *ID) override {
-    if (ID == &TargetTransformInfo::ID)
-      return (TargetTransformInfo *)this;
-    return this;
-  }
-
-  /// \name Scalar TTI Implementations
-  /// @{
-  unsigned getIntImmCost(int64_t Val) const;
-  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
-  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
-                         Type *Ty) const override;
-  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
-                         Type *Ty) const override;
-  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
-
-  /// @}
-
-  /// \name Vector TTI Implementations
-  /// @{
-
-  unsigned getNumberOfRegisters(bool Vector) const override {
-    if (Vector) {
-      if (ST->hasNEON())
-        return 32;
-      return 0;
-    }
-    return 31;
-  }
-
-  unsigned getRegisterBitWidth(bool Vector) const override {
-    if (Vector) {
-      if (ST->hasNEON())
-        return 128;
-      return 0;
-    }
-    return 64;
-  }
-
-  unsigned getMaxInterleaveFactor() const override;
-
-  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
-      override;
-
-  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
-      override;
-
-  unsigned getArithmeticInstrCost(
-      unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue,
-      OperandValueKind Opd2Info = OK_AnyValue,
-      OperandValueProperties Opd1PropInfo = OP_None,
-      OperandValueProperties Opd2PropInfo = OP_None) const override;
-
-  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
-
-  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const
-      override;
-
-  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                           unsigned AddressSpace) const override;
-
-  unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const override;
-
-  void getUnrollingPreferences(const Function *F, Loop *L,
-                               UnrollingPreferences &UP) const override;
-
-  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
-                                           Type *ExpectedType) const override;
-
-  bool getTgtMemIntrinsic(IntrinsicInst *Inst,
-                          MemIntrinsicInfo &Info) const override;
-
-  /// @}
-};
-
-} // end anonymous namespace
-
-INITIALIZE_AG_PASS(AArch64TTI, TargetTransformInfo, "aarch64tti",
-                   "AArch64 Target Transform Info", true, true, false)
-char AArch64TTI::ID = 0;
-
-ImmutablePass *
-llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) {
-  return new AArch64TTI(TM);
-}
-
  /// \brief Calculate the cost of materializing a 64-bit value. This helper
  /// method might only calculate a fraction of a larger immediate. Therefore it
  /// is valid to return a cost of ZERO.
-unsigned AArch64TTI::getIntImmCost(int64_t Val) const {
+int AArch64TTIImpl::getIntImmCost(int64_t Val) {
    // Check if the immediate can be encoded within an instruction.
    if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
      return 0;
@@ -174,7 +37,7 @@ unsigned AArch64TTI::getIntImmCost(int64_t Val) const {
  }
  
  /// \brief Calculate the cost of materializing the given constant.
-unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
    assert(Ty->isIntegerTy());
  
    unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -188,35 +51,35 @@ unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
  
    // Split the constant into 64-bit chunks and calculate the cost for each
    // chunk.
-  unsigned Cost = 0;
+  int Cost = 0;
    for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
      APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
      int64_t Val = Tmp.getSExtValue();
      Cost += getIntImmCost(Val);
    }
    // We need at least one instruction to materialize the constant.
-  return std::max(1U, Cost);
+  return std::max(1, Cost);
  }
  
-unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
-                                 const APInt &Imm, Type *Ty) const {
+int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
+                                  const APInt &Imm, Type *Ty) {
    assert(Ty->isIntegerTy());
  
    unsigned BitSize = Ty->getPrimitiveSizeInBits();
    // There is no cost model for constants with a bit size of 0. Return TCC_Free
    // here, so that constant hoisting will ignore this constant.
    if (BitSize == 0)
-    return TCC_Free;
+    return TTI::TCC_Free;
  
    unsigned ImmIdx = ~0U;
    switch (Opcode) {
    default:
-    return TCC_Free;
+    return TTI::TCC_Free;
    case Instruction::GetElementPtr:
      // Always hoist the base address of a GetElementPtr.
      if (Idx == 0)
-      return 2 * TCC_Basic;
-    return TCC_Free;
+      return 2 * TTI::TCC_Basic;
+    return TTI::TCC_Free;
    case Instruction::Store:
      ImmIdx = 0;
      break;
@@ -238,7 +101,7 @@ unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
    case Instruction::LShr:
    case Instruction::AShr:
      if (Idx == 1)
-      return TCC_Free;
+      return TTI::TCC_Free;
      break;
    case Instruction::Trunc:
    case Instruction::ZExt:
@@ -255,27 +118,28 @@ unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
    }
  
    if (Idx == ImmIdx) {
-    unsigned NumConstants = (BitSize + 63) / 64;
-    unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
-    return (Cost <= NumConstants * TCC_Basic)
-      ? static_cast<unsigned>(TCC_Free) : Cost;
+    int NumConstants = (BitSize + 63) / 64;
+    int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+    return (Cost <= NumConstants * TTI::TCC_Basic)
+               ? static_cast<int>(TTI::TCC_Free)
+               : Cost;
    }
-  return AArch64TTI::getIntImmCost(Imm, Ty);
+  return AArch64TTIImpl::getIntImmCost(Imm, Ty);
  }
  
-unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
-                                 const APInt &Imm, Type *Ty) const {
+int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                                  const APInt &Imm, Type *Ty) {
    assert(Ty->isIntegerTy());
  
    unsigned BitSize = Ty->getPrimitiveSizeInBits();
    // There is no cost model for constants with a bit size of 0. Return TCC_Free
    // here, so that constant hoisting will ignore this constant.
    if (BitSize == 0)
-    return TCC_Free;
+    return TTI::TCC_Free;
  
    switch (IID) {
    default:
-    return TCC_Free;
+    return TTI::TCC_Free;
    case Intrinsic::sadd_with_overflow:
    case Intrinsic::uadd_with_overflow:
    case Intrinsic::ssub_with_overflow:
@@ -283,46 +147,70 @@ unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
    case Intrinsic::smul_with_overflow:
    case Intrinsic::umul_with_overflow:
      if (Idx == 1) {
-      unsigned NumConstants = (BitSize + 63) / 64;
-      unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
-      return (Cost <= NumConstants * TCC_Basic)
-        ? static_cast<unsigned>(TCC_Free) : Cost;
+      int NumConstants = (BitSize + 63) / 64;
+      int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+      return (Cost <= NumConstants * TTI::TCC_Basic)
+                 ? static_cast<int>(TTI::TCC_Free)
+                 : Cost;
      }
      break;
    case Intrinsic::experimental_stackmap:
      if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
-      return TCC_Free;
+      return TTI::TCC_Free;
      break;
    case Intrinsic::experimental_patchpoint_void:
    case Intrinsic::experimental_patchpoint_i64:
      if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
-      return TCC_Free;
+      return TTI::TCC_Free;
      break;
    }
-  return AArch64TTI::getIntImmCost(Imm, Ty);
+  return AArch64TTIImpl::getIntImmCost(Imm, Ty);
  }
  
-AArch64TTI::PopcntSupportKind
-AArch64TTI::getPopcntSupport(unsigned TyWidth) const {
+TargetTransformInfo::PopcntSupportKind
+AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    if (TyWidth == 32 || TyWidth == 64)
-    return PSK_FastHardware;
+    return TTI::PSK_FastHardware;
    // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
-  return PSK_Software;
+  return TTI::PSK_Software;
  }
  
-unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
-                                    Type *Src) const {
+int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
  
-  EVT SrcTy = TLI->getValueType(Src);
-  EVT DstTy = TLI->getValueType(Dst);
+  EVT SrcTy = TLI->getValueType(DL, Src);
+  EVT DstTy = TLI->getValueType(DL, Dst);
  
    if (!SrcTy.isSimple() || !DstTy.isSimple())
-    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+    return BaseT::getCastInstrCost(Opcode, Dst, Src);
+
+  static const TypeConversionCostTblEntry
+  ConversionTbl[] = {
+    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32,  1 },
+    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64,  0 },
+    { ISD::TRUNCATE, MVT::v8i8,  MVT::v8i32,  3 },
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
+
+    // The number of shll instructions for the extension.
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
+    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
+    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
+    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
+    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
  
-  static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
      // LowerVectorINT_TO_FP:
      { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
      { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
@@ -345,6 +233,16 @@ unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
      { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8,  3 },
      { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
  
+    // Complex: to v8f32
+    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
+    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
+    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+
+    // Complex: to v16f32
+    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
+    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
+
      // Complex: to v2f64
      { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
      { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
@@ -385,22 +283,21 @@ unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
      { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f64, 2 },
    };
  
-  int Idx = ConvertCostTableLookup<MVT>(
-      ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
-      SrcTy.getSimpleVT());
-  if (Idx != -1)
-    return ConversionTbl[Idx].Cost;
+  if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
+                                                 DstTy.getSimpleVT(),
+                                                 SrcTy.getSimpleVT()))
+    return Entry->Cost;
  
-  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+  return BaseT::getCastInstrCost(Opcode, Dst, Src);
  }
  
-unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
-                                      unsigned Index) const {
+int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                       unsigned Index) {
    assert(Val->isVectorTy() && "This must be a vector type");
  
    if (Index != -1U) {
      // Legalize the type.
-    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
  
      // This type is legalized to a scalar type.
      if (!LT.second.isVector())
@@ -416,15 +313,15 @@ unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
    }
  
    // All other insert/extracts cost this much.
-  return 2;
+  return 3;
  }
  
-unsigned AArch64TTI::getArithmeticInstrCost(
-    unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
-    OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
-    OperandValueProperties Opd2PropInfo) const {
+int AArch64TTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo) {
    // Legalize the type.
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
  
@@ -435,10 +332,9 @@ unsigned AArch64TTI::getArithmeticInstrCost(
      // normally expanded to the sequence ADD + CMP + SELECT + SRA.
    // The OperandValue properties may not be the same as those of the previous
      // operation; conservatively assume OP_None.
-    unsigned Cost =
-      getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
-                             TargetTransformInfo::OP_None,
-                             TargetTransformInfo::OP_None);
+    int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+                                      TargetTransformInfo::OP_None,
+                                      TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
                                     TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
@@ -453,8 +349,8 @@ unsigned AArch64TTI::getArithmeticInstrCost(
  
    switch (ISD) {
    default:
-    return TargetTransformInfo::getArithmeticInstrCost(
-        Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
+    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+                                         Opd1PropInfo, Opd2PropInfo);
    case ISD::ADD:
    case ISD::MUL:
    case ISD::XOR:
@@ -466,7 +362,7 @@ unsigned AArch64TTI::getArithmeticInstrCost(
    }
  }
  
-unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) {
    // Address computations in vectorized code with non-consecutive addresses will
    // likely result in more instructions compared to scalar code where the
    // computation can more often be merged into the index mode. The resulting
@@ -481,41 +377,40 @@ unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
    return 1;
  }
  
-unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                      Type *CondTy) const {
+int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                       Type *CondTy) {
  
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  // We don't lower vector selects well that are wider than the register width.
+  // We don't lower some vector selects well that are wider than the register
+  // width.
    if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
      // We would need this many instructions to hide the scalarization happening.
-    unsigned AmortizationCost = 20;
-    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+    const int AmortizationCost = 20;
+    static const TypeConversionCostTblEntry
      VectorSelectTbl[] = {
-      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
-      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
-      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
        { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
        { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
        { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
      };
  
-    EVT SelCondTy = TLI->getValueType(CondTy);
-    EVT SelValTy = TLI->getValueType(ValTy);
+    EVT SelCondTy = TLI->getValueType(DL, CondTy);
+    EVT SelValTy = TLI->getValueType(DL, ValTy);
      if (SelCondTy.isSimple() && SelValTy.isSimple()) {
-      int Idx =
-          ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
-                                 SelValTy.getSimpleVT());
-      if (Idx != -1)
-        return VectorSelectTbl[Idx].Cost;
+      if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
+                                                     SelCondTy.getSimpleVT(),
+                                                     SelValTy.getSimpleVT()))
+        return Entry->Cost;
      }
    }
-  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
  }
  
-unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
-                                   unsigned Alignment,
-                                   unsigned AddressSpace) const {
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                    unsigned Alignment, unsigned AddressSpace) {
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
  
    if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
        Src->getVectorElementType()->isIntegerTy(64)) {
@@ -524,7 +419,7 @@ unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
      // practice on inlined memcpy code.
      // We make v2i64 stores expensive so that we will only vectorize if there
      // are 6 other instructions getting vectorized.
-    unsigned AmortizationCost = 6;
+    int AmortizationCost = 6;
  
      return LT.first * 2 * AmortizationCost;
    }
@@ -542,8 +437,30 @@ unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
    return LT.first;
  }
  
-unsigned AArch64TTI::getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const {
-  unsigned Cost = 0;
+int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                               unsigned Factor,
+                                               ArrayRef<unsigned> Indices,
+                                               unsigned Alignment,
+                                               unsigned AddressSpace) {
+  assert(Factor >= 2 && "Invalid interleave factor");
+  assert(isa<VectorType>(VecTy) && "Expect a vector type");
+
+  if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+    unsigned NumElts = VecTy->getVectorNumElements();
+    Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
+    unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+
+    // ldN/stN only support legal vector types of size 64 or 128 in bits.
+    if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
+      return Factor;
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                           Alignment, AddressSpace);
+}
+
+int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
+  int Cost = 0;
    for (auto *I : Tys) {
      if (!I->isVectorTy())
        continue;
@@ -554,20 +471,29 @@ unsigned AArch64TTI::getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const {
    return Cost;
  }
  
-unsigned AArch64TTI::getMaxInterleaveFactor() const {
+unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
    if (ST->isCortexA57())
      return 4;
    return 2;
  }
  
-void AArch64TTI::getUnrollingPreferences(const Function *F, Loop *L,
-                                         UnrollingPreferences &UP) const {
+void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
+                                             TTI::UnrollingPreferences &UP) {
+  // Enable partial unrolling and runtime unrolling.
+  BaseT::getUnrollingPreferences(L, UP);
+
+  // An inner loop is more likely to be hot, and the LICM pass can hoist the
+  // runtime check out, so the overhead is lower. Try a larger threshold so
+  // that more loops are unrolled.
+  if (L->getLoopDepth() > 1)
+    UP.PartialThreshold *= 2;
+
    // Disable partial & runtime unrolling on -Os.
    UP.PartialOptSizeThreshold = 0;
  }
  
-Value *AArch64TTI::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
-                                                     Type *ExpectedType) const {
+Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                                         Type *ExpectedType) {
    switch (Inst->getIntrinsicID()) {
    default:
      return nullptr;
@@ -602,8 +528,8 @@ Value *AArch64TTI::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
    }
  }
  
-bool AArch64TTI::getTgtMemIntrinsic(IntrinsicInst *Inst,
-                                    MemIntrinsicInfo &Info) const {
+bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
+                                        MemIntrinsicInfo &Info) {
    switch (Inst->getIntrinsicID()) {
    default:
      break;
@@ -612,7 +538,7 @@ bool AArch64TTI::getTgtMemIntrinsic(IntrinsicInst *Inst,
    case Intrinsic::aarch64_neon_ld4:
      Info.ReadMem = true;
      Info.WriteMem = false;
-    Info.Vol = false;
+    Info.IsSimple = true;
      Info.NumMemRefs = 1;
      Info.PtrVal = Inst->getArgOperand(0);
      break;
@@ -621,7 +547,7 @@ bool AArch64TTI::getTgtMemIntrinsic(IntrinsicInst *Inst,
    case Intrinsic::aarch64_neon_st4:
      Info.ReadMem = false;
      Info.WriteMem = true;
-    Info.Vol = false;
+    Info.IsSimple = true;
      Info.NumMemRefs = 1;
      Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
      break;