lib/Target/ARM64/ARM64TargetTransformInfo.cpp

   1 //===-- ARM64TargetTransformInfo.cpp - ARM64 specific TTI pass ------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 /// \file
  10 /// This file implements a TargetTransformInfo analysis pass specific to the
  11 /// ARM64 target machine. It uses the target's detailed information to provide
  12 /// more precise answers to certain TTI queries, while letting the target
  13 /// independent and default TTI implementations handle the rest.
  14 ///
  15 //===----------------------------------------------------------------------===//
  16
  17 #define DEBUG_TYPE "arm64tti"
  18 #include "ARM64.h"
  19 #include "ARM64TargetMachine.h"
  20 #include "MCTargetDesc/ARM64AddressingModes.h"
  21 #include "llvm/Analysis/TargetTransformInfo.h"
  22 #include "llvm/Support/Debug.h"
  23 #include "llvm/Target/CostTable.h"
  24 #include "llvm/Target/TargetLowering.h"
  25 using namespace llvm;
  26
  27 // Declare the pass initialization routine locally as target-specific passes
  28 // don't have a target-wide initialization entry point, and so we rely on the
  29 // pass constructor initialization.
  30 namespace llvm {
  31 void initializeARM64TTIPass(PassRegistry &);
  32 }
  33
  34 namespace {
  35
  36 class ARM64TTI final : public ImmutablePass, public TargetTransformInfo {
  37   const ARM64TargetMachine *TM;
  38   const ARM64Subtarget *ST;
  39   const ARM64TargetLowering *TLI;
  40
  41   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  42   /// are set if the result needs to be inserted and/or extracted from vectors.
  43   unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
  44
  45 public:
  46   ARM64TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
  47     llvm_unreachable("This pass cannot be directly constructed");
  48   }
  49
  50   ARM64TTI(const ARM64TargetMachine *TM)
  51       : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
  52         TLI(TM->getTargetLowering()) {
  53     initializeARM64TTIPass(*PassRegistry::getPassRegistry());
  54   }
  55
  56   void initializePass() override { pushTTIStack(this); }
  57
  58   void getAnalysisUsage(AnalysisUsage &AU) const override {
  59     TargetTransformInfo::getAnalysisUsage(AU);
  60   }
  61
  62   /// Pass identification.
  63   static char ID;
  64
  65   /// Provide necessary pointer adjustments for the two base classes.
  66   void *getAdjustedAnalysisPointer(const void *ID) override {
  67     if (ID == &TargetTransformInfo::ID)
  68       return (TargetTransformInfo *)this;
  69     return this;
  70   }
  71
  72   /// \name Scalar TTI Implementations
  73   /// @{
  74
  75   unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
  76   PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
  77
  78   /// @}
  79
  80   /// \name Vector TTI Implementations
  81   /// @{
  82
  83   unsigned getNumberOfRegisters(bool Vector) const override {
  84     if (Vector)
  85       return 32;
  86
  87     return 31;
  88   }
  89
  90   unsigned getRegisterBitWidth(bool Vector) const override {
  91     if (Vector)
  92       return 128;
  93
  94     return 64;
  95   }
  96
  97   unsigned getMaximumUnrollFactor() const override { return 2; }
  98
  99   unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
 100       override;
 101
 102   unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
 103       override;
 104
 105   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
 106                                   OperandValueKind Opd1Info = OK_AnyValue,
 107                                   OperandValueKind Opd2Info = OK_AnyValue) const
 108       override;
 109
 110   unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
 111
 112   unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const
 113       override;
 114
 115   unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
 116                            unsigned AddressSpace) const override;
 117   /// @}
 118 };
 119
 120 } // end anonymous namespace
 121
 // Register ARM64TTI under the argument name "arm64tti" as an implementation
 // of the TargetTransformInfo analysis group.
 // NOTE(review): the three trailing flags are presumably CFG-only, is-analysis
 // and is-default, in that order -- confirm against the macro definition.
 122 INITIALIZE_AG_PASS(ARM64TTI, TargetTransformInfo, "arm64tti",
 123                    "ARM64 Target Transform Info", true, true, false)
 // Out-of-class definition of the pass identifier declared in ARM64TTI.
 124 char ARM64TTI::ID = 0;
 125
 126 ImmutablePass *
 127 llvm::createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM) {
 128   return new ARM64TTI(TM);
 129 }
 130
 131 unsigned ARM64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
 132   assert(Ty->isIntegerTy());
 133
 134   unsigned BitSize = Ty->getPrimitiveSizeInBits();
 135   if (BitSize == 0)
 136     return ~0U;
 137
 138   int64_t Val = Imm.getSExtValue();
 139   if (Val == 0 || ARM64_AM::isLogicalImmediate(Val, BitSize))
 140     return 1;
 141
 142   if ((int64_t)Val < 0)
 143     Val = ~Val;
 144   if (BitSize == 32)
 145     Val &= (1LL << 32) - 1;
 146
 147   unsigned LZ = countLeadingZeros((uint64_t)Val);
 148   unsigned Shift = (63 - LZ) / 16;
 149   // MOVZ is free so return true for one or fewer MOVK.
 150   return (Shift == 0) ? 1 : Shift;
 151 }
 152
 153 ARM64TTI::PopcntSupportKind ARM64TTI::getPopcntSupport(unsigned TyWidth) const {
 154   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
 155   if (TyWidth == 32 || TyWidth == 64)
 156     return PSK_FastHardware;
 157   // TODO: ARM64TargetLowering::LowerCTPOP() supports 128bit popcount.
 158   return PSK_Software;
 159 }
 160
 161 unsigned ARM64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
 162                                     Type *Src) const {
 163   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 164   assert(ISD && "Invalid opcode");
 165
 166   EVT SrcTy = TLI->getValueType(Src);
 167   EVT DstTy = TLI->getValueType(Dst);
 168
 169   if (!SrcTy.isSimple() || !DstTy.isSimple())
 170     return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
 171
 172   static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
 173     // LowerVectorINT_TO_FP:
 174     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
 175     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
 176     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
 177     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
 178     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
 179     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
 180     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
 181     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
 182     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
 183     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
 184     // LowerVectorFP_TO_INT
 185     { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
 186     { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
 187     { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
 188     { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
 189     { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
 190     { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
 191     { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 },
 192     { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 },
 193   };
 194
 195   int Idx = ConvertCostTableLookup<MVT>(
 196       ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
 197       SrcTy.getSimpleVT());
 198   if (Idx != -1)
 199     return ConversionTbl[Idx].Cost;
 200
 201   return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
 202 }
 203
 204 unsigned ARM64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
 205                                       unsigned Index) const {
 206   assert(Val->isVectorTy() && "This must be a vector type");
 207
 208   if (Index != -1U) {
 209     // Legalize the type.
 210     std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
 211
 212     // This type is legalized to a scalar type.
 213     if (!LT.second.isVector())
 214       return 0;
 215
 216     // The type may be split. Normalize the index to the new type.
 217     unsigned Width = LT.second.getVectorNumElements();
 218     Index = Index % Width;
 219
 220     // The element at index zero is already inside the vector.
 221     if (Index == 0)
 222       return 0;
 223   }
 224
 225   // All other insert/extracts cost this much.
 226   return 2;
 227 }
 228
 229 unsigned ARM64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
 230                                           OperandValueKind Opd1Info,
 231                                           OperandValueKind Opd2Info) const {
 232   // Legalize the type.
 233   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
 234
 235   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 236
 237   switch (ISD) {
 238   default:
 239     return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
 240                                                        Opd2Info);
 241   case ISD::ADD:
 242   case ISD::MUL:
 243   case ISD::XOR:
 244   case ISD::OR:
 245   case ISD::AND:
 246     // These nodes are marked as 'custom' for combining purposes only.
 247     // We know that they are legal. See LowerAdd in ISelLowering.
 248     return 1 * LT.first;
 249   }
 250 }
 251
 252 unsigned ARM64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
 253   // Address computations in vectorized code with non-consecutive addresses will
 254   // likely result in more instructions compared to scalar code where the
 255   // computation can more often be merged into the index mode. The resulting
 256   // extra micro-ops can significantly decrease throughput.
 257   unsigned NumVectorInstToHideOverhead = 10;
 258
 259   if (Ty->isVectorTy() && IsComplex)
 260     return NumVectorInstToHideOverhead;
 261
 262   // In many cases the address computation is not merged into the instruction
 263   // addressing mode.
 264   return 1;
 265 }
 266
 267 unsigned ARM64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
 268                                       Type *CondTy) const {
 269
 270   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 271   // We don't lower vector selects well that are wider than the register width.
 272   if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
 273     // We would need this many instructions to hide the scalarization happening.
 274     unsigned AmortizationCost = 20;
 275     static const TypeConversionCostTblEntry<MVT::SimpleValueType>
 276     VectorSelectTbl[] = {
 277       { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
 278       { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
 279       { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
 280       { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
 281       { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
 282       { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
 283     };
 284
 285     EVT SelCondTy = TLI->getValueType(CondTy);
 286     EVT SelValTy = TLI->getValueType(ValTy);
 287     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
 288       int Idx =
 289           ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
 290                                  SelValTy.getSimpleVT());
 291       if (Idx != -1)
 292         return VectorSelectTbl[Idx].Cost;
 293     }
 294   }
 295   return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
 296 }
 297
 298 unsigned ARM64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
 299                                    unsigned Alignment,
 300                                    unsigned AddressSpace) const {
 301   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
 302
 303   if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
 304       Src->getVectorElementType()->isIntegerTy(64)) {
 305     // Unaligned stores are extremely inefficient. We don't split
 306     // unaligned v2i64 stores because the negative impact that has shown in
 307     // practice on inlined memcpy code.
 308     // We make v2i64 stores expensive so that we will only vectorize if there
 309     // are 6 other instructions getting vectorized.
 310     unsigned AmortizationCost = 6;
 311
 312     return LT.first * 2 * AmortizationCost;
 313   }
 314
 315   if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
 316       Src->getVectorNumElements() < 8) {
 317     // We scalarize the loads/stores because there is not v.4b register and we
 318     // have to promote the elements to v.4h.
 319     unsigned NumVecElts = Src->getVectorNumElements();
 320     unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
 321     // We generate 2 instructions per vector element.
 322     return NumVectorizableInstsToAmortize * NumVecElts * 2;
 323   }
 324
 325   return LT.first;
 326 }