From: Nadav Rotem
Date: Sat, 3 Nov 2012 00:39:56 +0000 (+0000)
Subject: X86 CostModel: Add support for some of the common arithmetic instructions for SSE4, AVX and AVX2.
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=b4b04c3fa0a5da15424de7818e9f72811495c65b;p=oota-llvm.git

X86 CostModel: Add support for some of the common arithmetic instructions for SSE4, AVX and AVX2.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167347 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/Target/TargetTransformImpl.h b/include/llvm/Target/TargetTransformImpl.h
index fa1acbea087..625be7208ad 100644
--- a/include/llvm/Target/TargetTransformImpl.h
+++ b/include/llvm/Target/TargetTransformImpl.h
@@ -51,7 +51,7 @@ public:
 };
 
 class VectorTargetTransformImpl : public VectorTargetTransformInfo {
-private:
+protected:
   const TargetLowering *TLI;
 
   /// Estimate the cost of type-legalization and the legalized type.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9eea44349a8..0d38ba236e6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -17504,3 +17504,73 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
 
   return Res;
 }
+
+unsigned
+X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
+                                                     Type *Ty) const {
+  const X86Subtarget &ST =
+    TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+  // Fix some of the inaccuracies of the target independent estimation.
+  if (Ty->isVectorTy() && ST.hasSSE41()) {
+    unsigned NumElem = Ty->getVectorNumElements();
+    unsigned SizeInBits = Ty->getScalarType()->getScalarSizeInBits();
+
+    bool Is2 = (NumElem == 2);
+    bool Is4 = (NumElem == 4);
+    bool Is8 = (NumElem == 8);
+    bool Is32bits = (SizeInBits == 32);
+    bool Is64bits = (SizeInBits == 64);
+    bool HasAvx = ST.hasAVX();
+    bool HasAvx2 = ST.hasAVX2();
+
+    switch (Opcode) {
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul: {
+      // Only AVX2 has support for 8-wide integer operations.
+      if (Is32bits && (Is4 || (Is8 && HasAvx2))) return 1;
+      if (Is64bits && (Is2 || (Is4 && HasAvx2))) return 1;
+
+      // We don't have to completely scalarize unsupported ops. We can
+      // issue two half-sized operations (with some overhead).
+      // We don't need to extract the lower part of the YMM to the XMM.
+      // Extract the upper, two ops, insert the upper = 4.
+      if (Is32bits && Is8 && HasAvx) return 4;
+      if (Is64bits && Is4 && HasAvx) return 4;
+      break;
+    }
+    case Instruction::FAdd:
+    case Instruction::FSub:
+    case Instruction::FMul: {
+      // AVX has support for 8-wide float operations.
+      if (Is32bits && (Is4 || (Is8 && HasAvx))) return 1;
+      if (Is64bits && (Is2 || (Is4 && HasAvx))) return 1;
+      break;
+    }
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor: {
+      // AVX has support for 8-wide integer bitwise operations.
+      if (Is32bits && (Is4 || (Is8 && HasAvx))) return 1;
+      if (Is64bits && (Is2 || (Is4 && HasAvx))) return 1;
+      break;
+    }
+    }
+  }
+
+  return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty);
+}
+
+unsigned
+X86VectorTargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                                 unsigned Index) const {
+  // Floating point scalars are already located in index #0.
+  if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
+    return 0;
+  return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index);
+}
+
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index d4c30369b74..3ecef983bd3 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -953,13 +953,10 @@ namespace llvm {
     explicit X86VectorTargetTransformInfo(const TargetLowering *TL) :
       VectorTargetTransformImpl(TL) {}
 
+    virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
+
     virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
-                                        unsigned Index) const {
-      // Floating point scalars are already located in index #0.
-      if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
-        return 0;
-      return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index);
-    }
+                                        unsigned Index) const;
   };
 }
diff --git a/test/Analysis/CostModel/X86/arith.ll b/test/Analysis/CostModel/X86/arith.ll
new file mode 100644
index 00000000000..58b4a7c4265
--- /dev/null
+++ b/test/Analysis/CostModel/X86/arith.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @add(i32 %arg) {
+  ;CHECK: cost of 1 {{.*}} add
+  %A = add <4 x i32> undef, undef
+  ;CHECK: cost of 4 {{.*}} add
+  %B = add <8 x i32> undef, undef
+  ;CHECK: cost of 1 {{.*}} add
+  %C = add <2 x i64> undef, undef
+  ;CHECK: cost of 4 {{.*}} add
+  %D = add <4 x i64> undef, undef
+  ;CHECK: cost of 1 {{.*}} ret
+  ret i32 undef
+}
+
+
+define i32 @xor(i32 %arg) {
+  ;CHECK: cost of 1 {{.*}} xor
+  %A = xor <4 x i32> undef, undef
+  ;CHECK: cost of 1 {{.*}} xor
+  %B = xor <8 x i32> undef, undef
+  ;CHECK: cost of 1 {{.*}} xor
+  %C = xor <2 x i64> undef, undef
+  ;CHECK: cost of 1 {{.*}} xor
+  %D = xor <4 x i64> undef, undef
+  ;CHECK: cost of 1 {{.*}} ret
+  ret i32 undef
+}
+
+
+define i32 @fmul(i32 %arg) {
+  ;CHECK: cost of 1 {{.*}} fmul
+  %A = fmul <4 x float> undef, undef
+  ;CHECK: cost of 1 {{.*}} fmul
+  %B = fmul <8 x float> undef, undef
+  ret i32 undef
+}
diff --git a/test/Analysis/CostModel/X86/vectorized-loop.ll b/test/Analysis/CostModel/X86/vectorized-loop.ll
index fbf20de5153..7919a9ca9a6 100644
--- a/test/Analysis/CostModel/X86/vectorized-loop.ll
+++ b/test/Analysis/CostModel/X86/vectorized-loop.ll
@@ -30,10 +30,12 @@ vector.body: ; preds = %for.body.lr.ph, %ve
   %5 = bitcast i32* %4 to <8 x i32>*
   ;CHECK: cost of 1 {{.*}} load
   %6 = load <8 x i32>* %5, align 4
+  ;CHECK: cost of 4 {{.*}} mul
   %7 = mul nsw <8 x i32> %6, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
   %8 = getelementptr inbounds i32* %A, i64 %index
   %9 = bitcast i32* %8 to <8 x i32>*
   %10 = load <8 x i32>* %9, align 4
+  ;CHECK: cost of 4 {{.*}} add
   %11 = add nsw <8 x i32> %10, %7
   ;CHECK: cost of 1 {{.*}} store
   store <8 x i32> %11, <8 x i32>* %9, align 4
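
For illustration only, a minimal sketch of how a client pass (for example the
loop vectorizer) might consult the hook added above. The helper name
costOfWideIntegerAdd and the VTTI parameter are hypothetical, not part of this
patch; the includes follow the pre-3.2 header layout in use at this revision.
On a plain AVX subtarget the <8 x i32> query returns 4, matching the
"extract the upper, two ops, insert the upper" lowering described in the
comments (roughly vextractf128 + two vpaddd + vinsertf128); on AVX2 it
returns 1, since a single ymm vpaddd suffices.

#include "llvm/DerivedTypes.h"
#include "llvm/Instruction.h"
#include "llvm/LLVMContext.h"
#include "llvm/TargetTransformInfo.h"

// Hypothetical helper: ask the target for the cost of an <8 x i32> add
// through the VectorTargetTransformInfo interface extended by this patch.
// 'VTTI' is assumed to have been obtained from an initialized X86 target.
unsigned costOfWideIntegerAdd(const llvm::VectorTargetTransformInfo *VTTI,
                              llvm::LLVMContext &Ctx) {
  // Build the <8 x i32> type and query the cost of an integer add on it.
  llvm::Type *VecTy = llvm::VectorType::get(llvm::Type::getInt32Ty(Ctx), 8);
  return VTTI->getArithmeticInstrCost(llvm::Instruction::Add, VecTy);
}

The same numbers can be inspected without writing any C++ via the RUN line in
the new arith.ll test: opt -cost-model -analyze prints a per-instruction cost
for each vector operation.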