1 //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 /// This file implements a TargetTransformInfo analysis pass specific to the
11 /// X86 target machine. It uses the target's detailed information to provide
12 /// more precise answers to certain TTI queries, while letting the target
13 /// independent and default TTI implementations handle the rest.
15 //===----------------------------------------------------------------------===//
17 #define DEBUG_TYPE "x86tti"
19 #include "X86TargetMachine.h"
20 #include "llvm/Analysis/TargetTransformInfo.h"
21 #include "llvm/Support/Debug.h"
22 #include "llvm/Target/TargetLowering.h"
25 // Declare the pass initialization routine locally as target-specific passes
26 // don't have a target-wide initialization entry point, and so we rely on the
27 // pass constructor initialization.
29 void initializeX86TTIPass(PassRegistry &);
// X86TTI: the X86 implementation of TargetTransformInfo, exposed as an
// ImmutablePass so cost queries can consult the cached subtarget/lowering info.
// NOTE(review): this excerpt is missing several original lines (anonymous
// namespace opener, closing braces, the static `char ID`, hook bodies) —
// confirm the complete definition against the upstream file.
34 class X86TTI : public ImmutablePass, public TargetTransformInfo {
// Cached target handles, set from the X86TargetMachine in the ctor below.
35 const X86TargetMachine *TM;
36 const X86Subtarget *ST;
37 const X86TargetLowering *TLI;
39 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
40 /// are set if the result needs to be inserted and/or extracted from vectors.
41 unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
// Default ctor exists only to satisfy the pass machinery; reaching it at
// runtime is a programming error (hence llvm_unreachable).
44 X86TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
45 llvm_unreachable("This pass cannot be directly constructed");
// Real ctor: caches the subtarget and lowering objects, then runs the
// target-local pass initialization declared above the class.
48 X86TTI(const X86TargetMachine *TM)
49 : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
50 TLI(TM->getTargetLowering()) {
51 initializeX86TTIPass(*PassRegistry::getPassRegistry());
// Pass lifecycle hooks; their bodies are not visible in this excerpt
// (presumably they push/pop this TTI on the analysis-group stack — verify).
54 virtual void initializePass() {
58 virtual void finalizePass() {
62 virtual void getAnalysisUsage(AnalysisUsage &AU) const {
63 TargetTransformInfo::getAnalysisUsage(AU);
66 /// Pass identification.
69 /// Provide necessary pointer adjustments for the two base classes.
70 virtual void *getAdjustedAnalysisPointer(const void *ID) {
71 if (ID == &TargetTransformInfo::ID)
72 return (TargetTransformInfo*)this;
76 /// \name Scalar TTI Implementations
78 virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
82 /// \name Vector TTI Implementations
85 virtual unsigned getNumberOfRegisters(bool Vector) const;
86 virtual unsigned getRegisterBitWidth(bool Vector) const;
87 virtual unsigned getMaximumUnrollFactor() const;
88 virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
89 virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
90 int Index, Type *SubTp) const;
91 virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
93 virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
95 virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
96 unsigned Index) const;
97 virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
99 unsigned AddressSpace) const;
104 } // end anonymous namespace
// Register X86TTI as a member of the TargetTransformInfo analysis group
// under the name "x86tti".
106 INITIALIZE_AG_PASS(X86TTI, TargetTransformInfo, "x86tti",
107 "X86 Target Transform Info", true, true, false)
// Factory used by the X86 target to create this pass.
// NOTE(review): the return-type line (presumably `ImmutablePass *`) is
// missing from this excerpt — confirm upstream.
111 llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) {
112 return new X86TTI(TM);
116 //===----------------------------------------------------------------------===//
120 //===----------------------------------------------------------------------===//
// Report whether popcount on a TyWidth-bit integer is fast in hardware.
// SSE4.1 (and later) provides POPCNT-class support; otherwise fall back to
// a software sequence.
122 X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
123 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
124 // TODO: Currently the __builtin_popcount() implementation using SSE3
125 // instructions is inefficient. Once the problem is fixed, we should
126 // call ST->hasSSE3() instead of ST->hasSSE41().
127 return ST->hasSSE41() ? PSK_FastHardware : PSK_Software;
// Number of registers available for the given register class (scalar or
// vector). Vector registers require at least SSE1.
// NOTE(review): the return statements (lines 132+) are missing from this
// excerpt — confirm the actual counts upstream.
130 unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
131 if (Vector && !ST->hasSSE1())
// Widest available register in bits: 256 with AVX (YMM), 128 with SSE (XMM).
// NOTE(review): line 140 (presumably `if (Vector) {`) and the scalar/default
// returns are missing from this excerpt — confirm upstream.
139 unsigned X86TTI::getRegisterBitWidth(bool Vector) const {
141 if (ST->hasAVX()) return 256;
142 if (ST->hasSSE1()) return 128;
// Suggested maximum loop-unroll factor for the vectorizer.
// NOTE(review): the body (lines 153-155, 157+) is missing from this
// excerpt — only the microarchitecture rationale comment survives.
152 unsigned X86TTI::getMaximumUnrollFactor() const {
156 // Sandybridge and Haswell have multiple execution ports and pipelined
// Cost of a (vector) arithmetic instruction after type legalization.
// Looks up AVX1-specific lowering tricks before deferring to the default.
164 unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const {
165 // Legalize the type.
166 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
168 int ISD = TLI->InstructionOpcodeToISD(Opcode);
169 assert(ISD && "Invalid opcode");
171 // We don't have to scalarize unsupported ops. We can issue two half-sized
172 // operations and we only need to extract the upper YMM half.
173 // Two ops + 1 extract + 1 insert = 4.
174 static const CostTableEntry AVX1CostTable[] = {
175 { ISD::MUL, { MVT::v8i32 }, 4 },
176 { ISD::SUB, { MVT::v8i32 }, 4 },
177 { ISD::ADD, { MVT::v8i32 }, 4 },
178 { ISD::MUL, { MVT::v4i64 }, 4 },
179 { ISD::SUB, { MVT::v4i64 }, 4 },
180 { ISD::ADD, { MVT::v4i64 }, 4 },
182 UnaryCostTable costTable (AVX1CostTable, array_lengthof(AVX1CostTable));
184 // Look for AVX1 lowering tricks.
// NOTE(review): the subtarget guard (line 185, presumably
// `if (ST->hasAVX())`) is missing from this excerpt — confirm upstream.
186 unsigned cost = costTable.findCost(ISD, LT.second);
187 if (cost != BinaryCostTable::COST_NOT_FOUND)
// Scale the per-op cost by the number of legalized parts.
188 return LT.first * cost;
190 // Fallback to the default implementation.
191 return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty);
// Cost of a vector shuffle. Only reverse shuffles are modeled here; anything
// else is delegated to the default implementation.
194 unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
196 // We only estimate the cost of reverse shuffles.
197 if (Kind != SK_Reverse)
198 return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
200 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
// NOTE(review): the declaration/initialization of `Cost` (line 201) is
// missing from this excerpt — confirm its default value upstream.
// Reversing a >128-bit (YMM) vector costs extra cross-lane work:
202 if (LT.second.getSizeInBits() > 128)
203 Cost = 3; // Extract + insert + copy.
205 // Multiply by the number of parts.
206 return Cost * LT.first;
// Cost of a cast (extend/truncate/int<->fp) from Src to Dst, using an
// AVX-specific conversion table when both types are simple MVTs.
209 unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
210 int ISD = TLI->InstructionOpcodeToISD(Opcode);
211 assert(ISD && "Invalid opcode");
213 EVT SrcTy = TLI->getValueType(Src);
214 EVT DstTy = TLI->getValueType(Dst);
// Non-simple (not directly representable as MVT) types: defer to default.
216 if (!SrcTy.isSimple() || !DstTy.isSimple())
217 return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
219 static const CostTableEntry AVXConversionTbl[] = {
220 { ISD::SIGN_EXTEND, { MVT::v8i32, MVT::v8i16 }, 1 },
221 { ISD::ZERO_EXTEND, { MVT::v8i32, MVT::v8i16 }, 1 },
222 { ISD::SIGN_EXTEND, { MVT::v4i64, MVT::v4i32 }, 1 },
223 { ISD::ZERO_EXTEND, { MVT::v4i64, MVT::v4i32 }, 1 },
224 { ISD::TRUNCATE, { MVT::v4i32, MVT::v4i64 }, 1 },
225 { ISD::TRUNCATE, { MVT::v8i16, MVT::v8i32 }, 1 },
226 { ISD::SINT_TO_FP, { MVT::v8f32, MVT::v8i8 }, 1 },
227 { ISD::SINT_TO_FP, { MVT::v4f32, MVT::v4i8 }, 1 },
228 { ISD::UINT_TO_FP, { MVT::v8f32, MVT::v8i8 }, 1 },
229 { ISD::UINT_TO_FP, { MVT::v4f32, MVT::v4i8 }, 1 },
230 { ISD::FP_TO_SINT, { MVT::v8i8, MVT::v8f32 }, 1 },
231 { ISD::FP_TO_SINT, { MVT::v4i8, MVT::v4f32 }, 1 },
// Mask (i1) extensions and 64->32 truncation need multi-instruction
// sequences on AVX, hence the larger costs.
232 { ISD::ZERO_EXTEND, { MVT::v8i32, MVT::v8i1 }, 6 },
233 { ISD::SIGN_EXTEND, { MVT::v8i32, MVT::v8i1 }, 9 },
234 { ISD::TRUNCATE, { MVT::v8i32, MVT::v8i64 }, 3 }
236 BinaryCostTable costTable (AVXConversionTbl, array_lengthof(AVXConversionTbl));
// NOTE(review): lines 237-238 (presumably an `if (ST->hasAVX()) {` guard)
// and lines 241-243 (the `return cost;` on a table hit) are missing from
// this excerpt — confirm upstream.
239 unsigned cost = costTable.findCost(ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
240 if (cost != BinaryCostTable::COST_NOT_FOUND)
244 return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
// Cost of a compare/select on ValTy. Tables are consulted from the newest
// feature set down (AVX2, then AVX1, then SSE4.2) before the default.
247 unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
248 Type *CondTy) const {
249 // Legalize the type.
250 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
// NOTE(review): the declaration of `MTy` (presumably `MVT MTy = LT.second;`
// on lines 251-252) is missing from this excerpt — confirm upstream.
254 int ISD = TLI->InstructionOpcodeToISD(Opcode);
255 assert(ISD && "Invalid opcode");
// SSE4.2: all 128-bit compares are single instructions.
257 static const CostTableEntry SSE42CostTbl[] = {
258 { ISD::SETCC, { MVT::v2f64 }, 1 },
259 { ISD::SETCC, { MVT::v4f32 }, 1 },
260 { ISD::SETCC, { MVT::v2i64 }, 1 },
261 { ISD::SETCC, { MVT::v4i32 }, 1 },
262 { ISD::SETCC, { MVT::v8i16 }, 1 },
263 { ISD::SETCC, { MVT::v16i8 }, 1 },
265 UnaryCostTable costTableSSE4 (SSE42CostTbl, array_lengthof(SSE42CostTbl));
267 static const CostTableEntry AVX1CostTbl[] = {
268 { ISD::SETCC, { MVT::v4f64 }, 1 },
269 { ISD::SETCC, { MVT::v8f32 }, 1 },
270 // AVX1 does not support 8-wide integer compare.
271 { ISD::SETCC, { MVT::v4i64 }, 4 },
272 { ISD::SETCC, { MVT::v8i32 }, 4 },
273 { ISD::SETCC, { MVT::v16i16 }, 4 },
274 { ISD::SETCC, { MVT::v32i8 }, 4 },
276 UnaryCostTable costTableAVX1 (AVX1CostTbl, array_lengthof(AVX1CostTbl));
// AVX2 adds native 256-bit integer compares.
278 static const CostTableEntry AVX2CostTbl[] = {
279 { ISD::SETCC, { MVT::v4i64 }, 1 },
280 { ISD::SETCC, { MVT::v8i32 }, 1 },
281 { ISD::SETCC, { MVT::v16i16 }, 1 },
282 { ISD::SETCC, { MVT::v32i8 }, 1 },
284 UnaryCostTable costTableAVX2 (AVX2CostTbl, array_lengthof(AVX2CostTbl));
// NOTE(review): the subtarget guards before the AVX2/AVX1 lookups
// (presumably `if (ST->hasAVX2()) {` / `if (ST->hasAVX()) {` on the missing
// lines 285-286 and 291-292) are absent from this excerpt — confirm upstream.
287 unsigned cost = costTableAVX2.findCost(ISD, MTy);
288 if (cost != BinaryCostTable::COST_NOT_FOUND)
289 return LT.first * cost;
293 unsigned cost = costTableAVX1.findCost(ISD, MTy);
294 if (cost != BinaryCostTable::COST_NOT_FOUND)
295 return LT.first * cost;
298 if (ST->hasSSE42()) {
299 unsigned cost = costTableSSE4.findCost(ISD, MTy);
300 if (cost != BinaryCostTable::COST_NOT_FOUND)
301 return LT.first * cost;
304 return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
// Cost of an insert/extract of element `Index` in vector type Val.
307 unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
308 unsigned Index) const {
309 assert(Val->isVectorTy() && "This must be a vector type");
312 // Legalize the type.
313 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
315 // This type is legalized to a scalar type.
316 if (!LT.second.isVector())
// NOTE(review): the return statements on the missing lines (317, 325) are
// absent from this excerpt — confirm the returned costs upstream.
319 // The type may be split. Normalize the index to the new type.
320 unsigned Width = LT.second.getVectorNumElements();
321 Index = Index % Width;
323 // Floating point scalars are already located in index #0.
324 if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
328 return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
331 unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
332 unsigned AddressSpace) const {
333 // Legalize the type.
334 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
335 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
338 // Each load/store unit costs 1.
339 unsigned Cost = LT.first * 1;
341 // On Sandybridge 256bit load/stores are double pumped
342 // (but not on Haswell).
343 if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2())