///
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "x86tti"
#include "X86.h"
#include "X86TargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;
+#define DEBUG_TYPE "x86tti"
+
// Declare the pass initialization routine locally as target-specific passes
// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
void initializeX86TTIPass(PassRegistry &);
}
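+// Command-line knobs for the partial-unrolling heuristics implemented in
+// X86TTI::getUnrollingPreferences below. Being cl::opt flags, they can be set
+// directly on llc/opt invocations, or via -mllvm when driving clang.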
+static cl::opt<bool>
+UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true),
+ cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden);
+static cl::opt<unsigned>
+PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0),
+ cl::desc("Threshold for X86 partial unrolling"), cl::Hidden);
+static cl::opt<unsigned>
+PartialUnrollingMaxBranches("x86-partial-max-branches", cl::init(2),
+ cl::desc("Threshold for taken branches in X86 partial unrolling"),
+ cl::Hidden);
+
namespace {
class X86TTI final : public ImmutablePass, public TargetTransformInfo {
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
public:
- X86TTI() : ImmutablePass(ID), ST(0), TLI(0) {
+ X86TTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) {
llvm_unreachable("This pass cannot be directly constructed");
}
initializeX86TTIPass(*PassRegistry::getPassRegistry());
}
- virtual void initializePass() override {
+ void initializePass() override {
pushTTIStack(this);
}
- virtual void finalizePass() {
- popTTIStack();
- }
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
TargetTransformInfo::getAnalysisUsage(AU);
}
static char ID;
/// Provide necessary pointer adjustments for the two base classes.
- virtual void *getAdjustedAnalysisPointer(const void *ID) override {
+ void *getAdjustedAnalysisPointer(const void *ID) override {
if (ID == &TargetTransformInfo::ID)
return (TargetTransformInfo*)this;
return this;
/// \name Scalar TTI Implementations
/// @{
- virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
+ PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
+ void getUnrollingPreferences(Loop *L,
+ UnrollingPreferences &UP) const override;
/// @}
/// \name Vector TTI Implementations
/// @{
- virtual unsigned getNumberOfRegisters(bool Vector) const override;
- virtual unsigned getRegisterBitWidth(bool Vector) const override;
- virtual unsigned getMaximumUnrollFactor() const override;
- virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
- OperandValueKind,
- OperandValueKind) const override;
- virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
- int Index, Type *SubTp) const override;
- virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
- Type *Src) const override;
- virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) const override;
- virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
- unsigned Index) const override;
- virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
- unsigned Alignment,
- unsigned AddressSpace) const override;
-
- virtual unsigned
- getAddressComputationCost(Type *PtrTy, bool IsComplex) const override;
-
- virtual unsigned getReductionCost(unsigned Opcode, Type *Ty,
- bool IsPairwiseForm) const override;
-
- virtual unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
-
- virtual unsigned getIntImmCost(unsigned Opcode, const APInt &Imm,
- Type *Ty) const override;
- virtual unsigned getIntImmCost(Intrinsic::ID IID, const APInt &Imm,
- Type *Ty) const override;
+ unsigned getNumberOfRegisters(bool Vector) const override;
+ unsigned getRegisterBitWidth(bool Vector) const override;
+ unsigned getMaximumUnrollFactor() const override;
+ unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
+ OperandValueKind) const override;
+ unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+ int Index, Type *SubTp) const override;
+ unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src) const override;
+ unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy) const override;
+ unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) const override;
+ unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace) const override;
+
+ unsigned getAddressComputationCost(Type *PtrTy,
+ bool IsComplex) const override;
+
+ unsigned getReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwiseForm) const override;
+
+ unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
+
+ unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty) const override;
+ unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty) const override;
/// @}
};
return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
}
+void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
+ if (!UsePartialUnrolling)
+ return;
+ // According to the Intel 64 and IA-32 Architectures Optimization Reference
+ // Manual, Intel Core models and later have a loop stream detector
+ // (and associated uop queue) that can benefit from partial unrolling.
+ // The relevant requirements are:
+ // - The loop must have no more than 4 (8 for Nehalem and later) branches
+ // taken, and none of them may be calls.
+ // - The loop can have no more than 18 (28 for Nehalem and later) uops.
+
+ // According to the Software Optimization Guide for AMD Family 15h Processors,
+ // models 30h-4fh (Steamroller and later) have a loop predictor and loop
+ // buffer which can benefit from partial unrolling.
+ // The relevant requirements are:
+ // - The loop must have fewer than 16 branches
+ // - The loop must have less than 40 uops in all executed loop branches
+
+ unsigned MaxBranches, MaxOps;
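+  // If a threshold was given explicitly on the command line, honor it (along
+  // with the branch-count flag) instead of the per-subtarget defaults below.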
+ if (PartialUnrollingThreshold.getNumOccurrences() > 0) {
+ MaxBranches = PartialUnrollingMaxBranches;
+ MaxOps = PartialUnrollingThreshold;
+ } else if (ST->isAtom()) {
+ // On the Atom, the throughput for taken branches is 2 cycles. For small
+ // simple loops, expand by a small factor to hide the backedge cost.
+ MaxBranches = 2;
+ MaxOps = 10;
+ } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) {
+ MaxBranches = 16;
+ MaxOps = 40;
+ } else if (ST->hasFMA4() /* Any other recent AMD */) {
+ return;
+ } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) {
+ MaxBranches = 8;
+ MaxOps = 28;
+ } else if (ST->hasSSSE3() /* Intel Core */) {
+ MaxBranches = 4;
+ MaxOps = 18;
+ } else {
+ return;
+ }
+
+ // Scan the loop: don't unroll loops with calls, and count the potential
+ // number of taken branches (this is somewhat conservative because we're
+ // counting all block transitions as potential branches while in reality some
+ // of these will become implicit via block placement).
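+  // The depth-first walk below only visits blocks inside the loop:
+  // skipChildren() prunes DFS paths once they leave the loop, and
+  // getPathLength() is the number of blocks on the current path from the
+  // loop header.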
+ unsigned MaxDepth = 0;
+ for (df_iterator<BasicBlock*> DI = df_begin(L->getHeader()),
+ DE = df_end(L->getHeader()); DI != DE;) {
+ if (!L->contains(*DI)) {
+ DI.skipChildren();
+ continue;
+ }
+
+ MaxDepth = std::max(MaxDepth, DI.getPathLength());
+ if (MaxDepth > MaxBranches)
+ return;
+
+ for (BasicBlock::iterator I = DI->begin(), IE = DI->end(); I != IE; ++I)
+ if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+ ImmutableCallSite CS(I);
+ if (const Function *F = CS.getCalledFunction()) {
+ if (!isLoweredToCall(F))
+ continue;
+ }
+
+ return;
+ }
+
+ ++DI;
+ }
+
+ // Enable runtime and partial unrolling up to the specified size.
+ UP.Partial = UP.Runtime = true;
+ UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
+
+ // Set the maximum count based on the loop depth. The maximum number of
+ // branches taken in a loop (including the backedge) is equal to the maximum
+ // loop depth (the DFS path length from the loop header to any block in the
+ // loop). When the loop is unrolled, this depth (except for the backedge
+ // itself) is multiplied by the unrolling factor. This new unrolled depth
+ // must be less than the target-specific maximum branch count (which limits
+ // the number of taken branches in the uop buffer).
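+  // For example, with MaxBranches == 8 (Nehalem and later) and a loop whose
+  // maximum DFS depth is 3, MaxCount becomes (8 - 1) / (3 - 1) == 3.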
+ if (MaxDepth > 1)
+ UP.MaxCount = (MaxBranches-1)/(MaxDepth-1);
+}
+
unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
if (Vector && !ST->hasSSE1())
return 0;
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
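+  // Vector integer division by a uniform constant is lowered to a
+  // multiply-by-magic-constant sequence; the entries below approximate the
+  // length of those sequences on AVX2.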
+ static const CostTblEntry<MVT::SimpleValueType>
+ AVX2UniformConstCostTable[] = {
+ { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
+ { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
+ { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
+ { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
+ };
+
+ if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ ST->hasAVX2()) {
+ int Idx = CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second);
+ if (Idx != -1)
+ return LT.first * AVX2UniformConstCostTable[Idx].Cost;
+ }
+
static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
    // Shifts on v4i64/v8i32 on AVX2 are legal even though we declare them as
    // custom so that we can detect the cases where the shift amount is a
    // scalar.
{ ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
{ ISD::SRA, MVT::v8i16, 1 }, // psraw.
{ ISD::SRA, MVT::v4i32, 1 }, // psrad.
+
+ { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
+ { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
+ { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+ { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
ST->hasSSE2()) {
+ // pmuldq sequence.
+ if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
+ return LT.first * 15;
+
int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second);
if (Idx != -1)
return LT.first * SSE2UniformConstCostTable[Idx].Cost;
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
-
- { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 },
+  // The generic code to compute the scalar overhead is currently broken.
+  // Work around this limitation by estimating the scalarization overhead
+  // here: roughly 10 instructions per scalar element, multiplied by the
+  // vector width.
+  // FIXME: remove this when PR19268 is fixed.
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 4*10 },
+
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 },
{ ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
+  // This node is expanded into scalarized operations, but BasicTTI is overly
+  // optimistic when estimating its cost: it assumes 3 operations per element
+  // (one vector-extract, one scalar conversion, and one vector-insert). The
+  // problem is that the inserts form a read-modify-write chain, so latency
+  // should be factored in too; inflate the cost per element by 1 to account
+  // for it.
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
};
if (ST->hasAVX2()) {
if (BitSize == 0)
return ~0U;
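+  // A zero immediate never needs to be materialized separately (it can be
+  // produced with a register-zeroing idiom), so treat it as free.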
+ if (Imm == 0)
+ return TCC_Free;
+
if (Imm.getBitWidth() <= 64 &&
(isInt<32>(Imm.getSExtValue()) || isUInt<32>(Imm.getZExtValue())))
return TCC_Basic;
return 2 * TCC_Basic;
}
-unsigned X86TTI::getIntImmCost(unsigned Opcode, const APInt &Imm,
+unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
Type *Ty) const {
  assert(Ty->isIntegerTy());
  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;
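+  // ImmIdx is the operand index at which this opcode can fold a 32-bit
+  // immediate into its encoding; ~0U means no operand can.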
+ unsigned ImmIdx = ~0U;
switch (Opcode) {
+ default: return TCC_Free;
+ case Instruction::GetElementPtr:
+ // Always hoist the base address of a GetElementPtr. This prevents the
+ // creation of new constants for every base constant that gets constant
+ // folded with the offset.
+ if (Idx == 0)
+ return 2 * TCC_Basic;
+ return TCC_Free;
+ case Instruction::Store:
+ ImmIdx = 0;
+ break;
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
case Instruction::ICmp:
- if (Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
+ ImmIdx = 1;
+ break;
+ // Always return TCC_Free for the shift value of a shift instruction.
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ if (Idx == 1)
return TCC_Free;
- else
- return X86TTI::getIntImmCost(Imm, Ty);
+ break;
case Instruction::Trunc:
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::IntToPtr:
case Instruction::PtrToInt:
case Instruction::BitCast:
+ case Instruction::PHI:
case Instruction::Call:
case Instruction::Select:
case Instruction::Ret:
case Instruction::Load:
- case Instruction::Store:
- return X86TTI::getIntImmCost(Imm, Ty);
+ break;
}
- return TargetTransformInfo::getIntImmCost(Opcode, Imm, Ty);
+
+ if ((Idx == ImmIdx) &&
+ Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
+ return TCC_Free;
+
+ return X86TTI::getIntImmCost(Imm, Ty);
}
-unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, const APInt &Imm,
- Type *Ty) const {
+unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());
  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;
switch (IID) {
- default: return TargetTransformInfo::getIntImmCost(IID, Imm, Ty);
+ default: return TCC_Free;
case Intrinsic::sadd_with_overflow:
case Intrinsic::uadd_with_overflow:
case Intrinsic::ssub_with_overflow:
case Intrinsic::usub_with_overflow:
case Intrinsic::smul_with_overflow:
case Intrinsic::umul_with_overflow:
- if (Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
+ if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
return TCC_Free;
- else
- return X86TTI::getIntImmCost(Imm, Ty);
+ break;
case Intrinsic::experimental_stackmap:
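+    // The first two stackmap operands (ID and shadow byte count) are always
+    // free; any other operand is free as long as it fits in a 64-bit
+    // immediate.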
+ if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+ return TCC_Free;
+ break;
case Intrinsic::experimental_patchpoint_void:
case Intrinsic::experimental_patchpoint_i64:
- if (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))
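+    // For patchpoints, the first four operands (ID, byte count, target, and
+    // argument count) are always free, as is any other operand that fits in
+    // a 64-bit immediate.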
+ if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
return TCC_Free;
- else
- return X86TTI::getIntImmCost(Imm, Ty);
+ break;
}
+ return X86TTI::getIntImmCost(Imm, Ty);
}