lib/Target/PowerPC/PPCTargetTransformInfo.cpp

   1 //===-- PPCTargetTransformInfo.cpp - PPC specific TTI pass ----------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 /// \file
  10 /// This file implements a TargetTransformInfo analysis pass specific to the
  11 /// PPC target machine. It uses the target's detailed information to provide
  12 /// more precise answers to certain TTI queries, while letting the target
  13 /// independent and default TTI implementations handle the rest.
  14 ///
  15 //===----------------------------------------------------------------------===//
  16
  17 #include "PPC.h"
  18 #include "PPCTargetMachine.h"
  19 #include "llvm/Analysis/TargetTransformInfo.h"
  20 #include "llvm/CodeGen/BasicTTIImpl.h"
  21 #include "llvm/Support/CommandLine.h"
  22 #include "llvm/Support/Debug.h"
  23 #include "llvm/Target/CostTable.h"
  24 #include "llvm/Target/TargetLowering.h"
  25 using namespace llvm;
  26
  27 #define DEBUG_TYPE "ppctti"
  28
  29 static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
  30 cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
  31
  32 namespace {
  33
  34 class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
  35   typedef BasicTTIImplBase<PPCTTIImpl> BaseT;
  36   typedef TargetTransformInfo TTI;
  37
  38   const PPCSubtarget *ST;
  39   const PPCTargetLowering *TLI;
  40
  41 public:
  42   explicit PPCTTIImpl(const PPCTargetMachine *TM = nullptr)
  43       : BaseT(TM), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {}
  44
  45   // Provide value semantics. MSVC requires that we spell all of these out.
  46   PPCTTIImpl(const PPCTTIImpl &Arg)
  47       : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {}
  48   PPCTTIImpl(PPCTTIImpl &&Arg)
  49       : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)),
  50         TLI(std::move(Arg.TLI)) {}
  51   PPCTTIImpl &operator=(const PPCTTIImpl &RHS) {
  52     BaseT::operator=(static_cast<const BaseT &>(RHS));
  53     ST = RHS.ST;
  54     TLI = RHS.TLI;
  55     return *this;
  56   }
  57   PPCTTIImpl &operator=(PPCTTIImpl &&RHS) {
  58     BaseT::operator=(std::move(static_cast<BaseT &>(RHS)));
  59     ST = std::move(RHS.ST);
  60     TLI = std::move(RHS.TLI);
  61     return *this;
  62   }
  63
  64   /// \name Scalar TTI Implementations
  65   /// @{
  66
  67   using BaseT::getIntImmCost;
  68   unsigned getIntImmCost(const APInt &Imm, Type *Ty);
  69
  70   unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
  71                          Type *Ty);
  72   unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
  73                          Type *Ty);
  74
  75   TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
  76   void getUnrollingPreferences(const Function *F, Loop *L,
  77                                TTI::UnrollingPreferences &UP);
  78
  79   /// @}
  80
  81   /// \name Vector TTI Implementations
  82   /// @{
  83
  84   unsigned getNumberOfRegisters(bool Vector);
  85   unsigned getRegisterBitWidth(bool Vector);
  86   unsigned getMaxInterleaveFactor();
  87   unsigned getArithmeticInstrCost(
  88       unsigned Opcode, Type *Ty,
  89       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
  90       TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
  91       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
  92       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
  93   unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
  94                           Type *SubTp);
  95   unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
  96   unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
  97   unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
  98   unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
  99                            unsigned AddressSpace);
 100
 101   /// @}
 102 };
 103
 104 } // end anonymous namespace
 105
 106 ImmutablePass *
 107 llvm::createPPCTargetTransformInfoPass(const PPCTargetMachine *TM) {
 108   return new TargetTransformInfoWrapperPass(PPCTTIImpl(TM));
 109 }
 110
 111
 112 //===----------------------------------------------------------------------===//
 113 //
 114 // PPC cost model.
 115 //
 116 //===----------------------------------------------------------------------===//
 117
 118 TargetTransformInfo::PopcntSupportKind
 119 PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
 120   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
 121   if (ST->hasPOPCNTD() && TyWidth <= 64)
 122     return TTI::PSK_FastHardware;
 123   return TTI::PSK_Software;
 124 }
 125
 126 unsigned PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
 127   if (DisablePPCConstHoist)
 128     return BaseT::getIntImmCost(Imm, Ty);
 129
 130   assert(Ty->isIntegerTy());
 131
 132   unsigned BitSize = Ty->getPrimitiveSizeInBits();
 133   if (BitSize == 0)
 134     return ~0U;
 135
 136   if (Imm == 0)
 137     return TTI::TCC_Free;
 138
 139   if (Imm.getBitWidth() <= 64) {
 140     if (isInt<16>(Imm.getSExtValue()))
 141       return TTI::TCC_Basic;
 142
 143     if (isInt<32>(Imm.getSExtValue())) {
 144       // A constant that can be materialized using lis.
 145       if ((Imm.getZExtValue() & 0xFFFF) == 0)
 146         return TTI::TCC_Basic;
 147
 148       return 2 * TTI::TCC_Basic;
 149     }
 150   }
 151
 152   return 4 * TTI::TCC_Basic;
 153 }
 154
 155 unsigned PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
 156                                    const APInt &Imm, Type *Ty) {
 157   if (DisablePPCConstHoist)
 158     return BaseT::getIntImmCost(IID, Idx, Imm, Ty);
 159
 160   assert(Ty->isIntegerTy());
 161
 162   unsigned BitSize = Ty->getPrimitiveSizeInBits();
 163   if (BitSize == 0)
 164     return ~0U;
 165
 166   switch (IID) {
 167   default:
 168     return TTI::TCC_Free;
 169   case Intrinsic::sadd_with_overflow:
 170   case Intrinsic::uadd_with_overflow:
 171   case Intrinsic::ssub_with_overflow:
 172   case Intrinsic::usub_with_overflow:
 173     if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
 174       return TTI::TCC_Free;
 175     break;
 176   case Intrinsic::experimental_stackmap:
 177     if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
 178       return TTI::TCC_Free;
 179     break;
 180   case Intrinsic::experimental_patchpoint_void:
 181   case Intrinsic::experimental_patchpoint_i64:
 182     if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
 183       return TTI::TCC_Free;
 184     break;
 185   }
 186   return PPCTTIImpl::getIntImmCost(Imm, Ty);
 187 }
 188
 189 unsigned PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
 190                                    const APInt &Imm, Type *Ty) {
 191   if (DisablePPCConstHoist)
 192     return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);
 193
 194   assert(Ty->isIntegerTy());
 195
 196   unsigned BitSize = Ty->getPrimitiveSizeInBits();
 197   if (BitSize == 0)
 198     return ~0U;
 199
 200   unsigned ImmIdx = ~0U;
 201   bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
 202        ZeroFree = false;
 203   switch (Opcode) {
 204   default:
 205     return TTI::TCC_Free;
 206   case Instruction::GetElementPtr:
 207     // Always hoist the base address of a GetElementPtr. This prevents the
 208     // creation of new constants for every base constant that gets constant
 209     // folded with the offset.
 210     if (Idx == 0)
 211       return 2 * TTI::TCC_Basic;
 212     return TTI::TCC_Free;
 213   case Instruction::And:
 214     RunFree = true; // (for the rotate-and-mask instructions)
 215     // Fallthrough...
 216   case Instruction::Add:
 217   case Instruction::Or:
 218   case Instruction::Xor:
 219     ShiftedFree = true;
 220     // Fallthrough...
 221   case Instruction::Sub:
 222   case Instruction::Mul:
 223   case Instruction::Shl:
 224   case Instruction::LShr:
 225   case Instruction::AShr:
 226     ImmIdx = 1;
 227     break;
 228   case Instruction::ICmp:
 229     UnsignedFree = true;
 230     ImmIdx = 1;
 231     // Fallthrough... (zero comparisons can use record-form instructions)
 232   case Instruction::Select:
 233     ZeroFree = true;
 234     break;
 235   case Instruction::PHI:
 236   case Instruction::Call:
 237   case Instruction::Ret:
 238   case Instruction::Load:
 239   case Instruction::Store:
 240     break;
 241   }
 242
 243   if (ZeroFree && Imm == 0)
 244     return TTI::TCC_Free;
 245
 246   if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
 247     if (isInt<16>(Imm.getSExtValue()))
 248       return TTI::TCC_Free;
 249
 250     if (RunFree) {
 251       if (Imm.getBitWidth() <= 32 &&
 252           (isShiftedMask_32(Imm.getZExtValue()) ||
 253            isShiftedMask_32(~Imm.getZExtValue())))
 254         return TTI::TCC_Free;
 255
 256       if (ST->isPPC64() &&
 257           (isShiftedMask_64(Imm.getZExtValue()) ||
 258            isShiftedMask_64(~Imm.getZExtValue())))
 259         return TTI::TCC_Free;
 260     }
 261
 262     if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
 263       return TTI::TCC_Free;
 264
 265     if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
 266       return TTI::TCC_Free;
 267   }
 268
 269   return PPCTTIImpl::getIntImmCost(Imm, Ty);
 270 }
 271
 272 void PPCTTIImpl::getUnrollingPreferences(const Function *F, Loop *L,
 273                                          TTI::UnrollingPreferences &UP) {
 274   if (TM->getSubtarget<PPCSubtarget>(F).getDarwinDirective() == PPC::DIR_A2) {
 275     // The A2 is in-order with a deep pipeline, and concatenation unrolling
 276     // helps expose latency-hiding opportunities to the instruction scheduler.
 277     UP.Partial = UP.Runtime = true;
 278   }
 279
 280   BaseT::getUnrollingPreferences(F, L, UP);
 281 }
 282
 283 unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
 284   if (Vector && !ST->hasAltivec())
 285     return 0;
 286   return ST->hasVSX() ? 64 : 32;
 287 }
 288
 289 unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
 290   if (Vector) {
 291     if (ST->hasAltivec()) return 128;
 292     return 0;
 293   }
 294
 295   if (ST->isPPC64())
 296     return 64;
 297   return 32;
 298
 299 }
 300
 301 unsigned PPCTTIImpl::getMaxInterleaveFactor() {
 302   unsigned Directive = ST->getDarwinDirective();
 303   // The 440 has no SIMD support, but floating-point instructions
 304   // have a 5-cycle latency, so unroll by 5x for latency hiding.
 305   if (Directive == PPC::DIR_440)
 306     return 5;
 307
 308   // The A2 has no SIMD support, but floating-point instructions
 309   // have a 6-cycle latency, so unroll by 6x for latency hiding.
 310   if (Directive == PPC::DIR_A2)
 311     return 6;
 312
 313   // FIXME: For lack of any better information, do no harm...
 314   if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
 315     return 1;
 316
 317   // For most things, modern systems have two execution units (and
 318   // out-of-order execution).
 319   return 2;
 320 }
 321
 322 unsigned PPCTTIImpl::getArithmeticInstrCost(
 323     unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
 324     TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
 325     TTI::OperandValueProperties Opd2PropInfo) {
 326   assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
 327
 328   // Fallback to the default implementation.
 329   return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
 330                                        Opd1PropInfo, Opd2PropInfo);
 331 }
 332
 333 unsigned PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
 334                                     Type *SubTp) {
 335   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 336 }
 337
 338 unsigned PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
 339   assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
 340
 341   return BaseT::getCastInstrCost(Opcode, Dst, Src);
 342 }
 343
 344 unsigned PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
 345                                         Type *CondTy) {
 346   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
 347 }
 348
 349 unsigned PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
 350                                         unsigned Index) {
 351   assert(Val->isVectorTy() && "This must be a vector type");
 352
 353   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 354   assert(ISD && "Invalid opcode");
 355
 356   if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
 357     // Double-precision scalars are already located in index #0.
 358     if (Index == 0)
 359       return 0;
 360
 361     return BaseT::getVectorInstrCost(Opcode, Val, Index);
 362   }
 363
 364   // Estimated cost of a load-hit-store delay.  This was obtained
 365   // experimentally as a minimum needed to prevent unprofitable
 366   // vectorization for the paq8p benchmark.  It may need to be
 367   // raised further if other unprofitable cases remain.
 368   unsigned LHSPenalty = 2;
 369   if (ISD == ISD::INSERT_VECTOR_ELT)
 370     LHSPenalty += 7;
 371
 372   // Vector element insert/extract with Altivec is very expensive,
 373   // because they require store and reload with the attendant
 374   // processor stall for load-hit-store.  Until VSX is available,
 375   // these need to be estimated as very costly.
 376   if (ISD == ISD::EXTRACT_VECTOR_ELT ||
 377       ISD == ISD::INSERT_VECTOR_ELT)
 378     return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);
 379
 380   return BaseT::getVectorInstrCost(Opcode, Val, Index);
 381 }
 382
 383 unsigned PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
 384                                      unsigned Alignment,
 385                                      unsigned AddressSpace) {
 386   // Legalize the type.
 387   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
 388   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
 389          "Invalid Opcode");
 390
 391   unsigned Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
 392
 393   // VSX loads/stores support unaligned access.
 394   if (ST->hasVSX()) {
 395     if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64)
 396       return Cost;
 397   }
 398
 399   bool UnalignedAltivec =
 400     Src->isVectorTy() &&
 401     Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() &&
 402     LT.second.getSizeInBits() == 128 &&
 403     Opcode == Instruction::Load;
 404
 405   // PPC in general does not support unaligned loads and stores. They'll need
 406   // to be decomposed based on the alignment factor.
 407   unsigned SrcBytes = LT.second.getStoreSize();
 408   if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) {
 409     Cost += LT.first*(SrcBytes/Alignment-1);
 410
 411     // For a vector type, there is also scalarization overhead (only for
 412     // stores, loads are expanded using the vector-load + permutation sequence,
 413     // which is much less expensive).
 414     if (Src->isVectorTy() && Opcode == Instruction::Store)
 415       for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
 416         Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
 417   }
 418
 419   return Cost;
 420 }
 421