}
-unsigned X86TTIImpl::getMaxInterleaveFactor() {
+unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
+ // If the loop will not be vectorized, don't interleave the loop.
+ // Let the regular unroller unroll the loop instead, which saves the cost
+ // of the overflow check and the runtime memory checks.
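+ // (For example, interleaving a loop over two may-alias pointers would
+ // still emit the vectorizer's runtime alias checks; the generic unroller
+ // unrolls it without them.)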
+ if (VF == 1)
+ return 1;
+
if (ST->isAtom())
return 1;
TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
static const CostTblEntry<MVT::SimpleValueType>
AVX2UniformConstCostTable[] = {
+ { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
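+ // (No packed 64-bit arithmetic shift exists before AVX-512, so a
+ // uniform-constant v4i64 ashr is emulated with 32-bit arithmetic shifts
+ // plus a shuffle to recombine the halves -- hence the cost of 4.)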
+
{ ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
{ ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
{ ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
{ ISD::SHL, MVT::v4i64, 1 },
{ ISD::SRL, MVT::v4i64, 1 },
- { ISD::SHL, MVT::v32i8, 42 }, // cmpeqb sequence.
- { ISD::SHL, MVT::v16i16, 16*10 }, // Scalarized.
+ { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsllvd/pack sequence.
- { ISD::SRL, MVT::v32i8, 32*10 }, // Scalarized.
- { ISD::SRL, MVT::v16i16, 8*10 }, // Scalarized.
+ { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
- { ISD::SRA, MVT::v32i8, 32*10 }, // Scalarized.
- { ISD::SRA, MVT::v16i16, 16*10 }, // Scalarized.
- { ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized.
+ { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized.
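+
+ // (Sketch of the sequences above: for each bit of the per-byte shift
+ // amount, shift the whole vector by that power of two and vpblendvb the
+ // shifted bytes in under the corresponding mask bit -- three shift/blend
+ // steps for i8 elements. The i16 cases widen to i32, use the variable
+ // dword shifts, and pack the results back.)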
// Vectorizing division is a bad idea. See the SSE2 table for more comments.
{ ISD::SDIV, MVT::v32i8, 32*20 },
{ ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
{ ISD::SRA, MVT::v8i16, 1 }, // psraw.
{ ISD::SRA, MVT::v4i32, 1 }, // psrad.
+ { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle.
{ ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
{ ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
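+ // (Division by a uniform constant lowers to a multiply by a "magic"
+ // constant plus shift/fix-up steps, so the pmulhw/pmulhuw sequences of
+ // roughly six operations stand in for true division.)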
// to ISel. The cost model must return worst case assumptions because it is
// used for vectorization and we don't want to make vectorized code worse
// than scalar code.
- { ISD::SHL, MVT::v16i8, 30 }, // cmpeqb sequence.
- { ISD::SHL, MVT::v8i16, 8*10 }, // Scalarized.
- { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
+ { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
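+ // (A variable v4i32 shl becomes a multiply: each lane's power of two is
+ // materialized from the shift amounts and fed to a vector multiply.)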
{ ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized.
{ ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized.
- { ISD::SRL, MVT::v16i8, 16*10 }, // Scalarized.
- { ISD::SRL, MVT::v8i16, 8*10 }, // Scalarized.
- { ISD::SRL, MVT::v4i32, 4*10 }, // Scalarized.
+ { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
{ ISD::SRL, MVT::v2i64, 2*10 }, // Scalarized.
- { ISD::SRA, MVT::v16i8, 16*10 }, // Scalarized.
- { ISD::SRA, MVT::v8i16, 8*10 }, // Scalarized.
- { ISD::SRA, MVT::v4i32, 4*10 }, // Scalarized.
+ { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
+ { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
{ ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized.
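+
+ // ("Shift each lane + blend": SSE2 has no per-element variable shifts,
+ // so each lane's shift amount is moved into the scalar shift count, the
+ // whole vector is shifted once per lane, and the live lanes are blended
+ // back together -- roughly four shifts plus the recombining shuffles.)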
// It is not a good idea to vectorize division. We have to scalarize it and
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
if (Kind == TTI::SK_Reverse) {
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
unsigned Cost = 1;
if (LT.second.getSizeInBits() > 128)
Cost = 3; // Extract + insert + copy.
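+ // (A sketch on AVX: a 256-bit reverse has to cross the 128-bit lane
+ // boundary, e.g. an extract of the high half, an insert that swaps the
+ // halves, and an in-lane shuffle -- three operations.)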
if (Kind == TTI::SK_Alternate) {
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
// 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
// The backend knows how to generate a single VEX.256 version of
// instruction VPBLENDW if the target supports AVX2.
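+ // (For example, an alternating mask such as <0, 5, 2, 7> over two v4i32
+ // inputs maps onto a single immediate-mask blend.)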
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src);
- std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst);
+ std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
+ std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
static const TypeConversionCostTblEntry<MVT::SimpleValueType>
SSE2ConvTbl[] = {
if (Idx != -1)
return AVX512ConversionTbl[Idx].Cost;
}
- EVT SrcTy = TLI->getValueType(Src);
- EVT DstTy = TLI->getValueType(Dst);
+ EVT SrcTy = TLI->getValueType(DL, Src);
+ EVT DstTy = TLI->getValueType(DL, Dst);
// The function getSimpleVT only handles simple value types.
if (!SrcTy.isSimple() || !DstTy.isSimple())
unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
Type *CondTy) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
if (Index != -1U) {
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
// This type is legalized to a scalar type.
if (!LT.second.isVector())
}
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
"Invalid Opcode");
}
// Legalize the type.
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(SrcVTy);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
unsigned Cost = 0;
- if (LT.second != TLI->getValueType(SrcVTy).getSimpleVT() &&
+ if (LT.second != TLI->getValueType(DL, SrcVTy).getSimpleVT() &&
LT.second.getVectorNumElements() == NumElem)
// Promotion requires expand/truncate for data and a shuffle for mask.
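+ // (E.g. a source type promoted to wider elements keeps its element count,
+ // so the data pays an extend/truncate and the i1 mask a widening shuffle.)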
Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, 0) +
unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
bool IsPairwise) {
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, int Consecutive) {
int DataWidth = DataTy->getPrimitiveSizeInBits();
-
+
// TODO: AVX512 allows gather/scatter, works with strided and random as well
if ((DataWidth < 32) || (Consecutive == 0))
return false;
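+ // (The AVX/AVX2 masked-move instructions -- vmaskmovps/pd and
+ // vpmaskmovd/q -- exist only for 32- and 64-bit elements, which is why
+ // narrower data widths are rejected above.)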
- if (ST->hasAVX512() || ST->hasAVX2())
+ if (ST->hasAVX512() || ST->hasAVX2())
return true;
return false;
}
return isLegalMaskedLoad(DataType, Consecutive);
}
+bool X86TTIImpl::hasCompatibleFunctionAttributes(const Function *Caller,
+ const Function *Callee) const {
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+
+ // Model this as a subset check over the subtarget feature bits.
+ const FeatureBitset &CallerBits =
+ TM.getSubtargetImpl(*Caller)->getFeatureBits();
+ const FeatureBitset &CalleeBits =
+ TM.getSubtargetImpl(*Callee)->getFeatureBits();
+
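+ // (For example, a caller compiled for plain x86-64 and a callee compiled
+ // with +avx2: the callee's feature bits are not a subset of the caller's,
+ // so inlining is refused and AVX2 code never leaks into a function that
+ // may run without AVX2.)
+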
+ // FIXME: This is likely too limiting as it will include subtarget features
+ // that we might not care about for inlining, but it is conservatively
+ // correct.
+ return (CallerBits & CalleeBits) == CalleeBits;
+}