From f5637c399711e37287e01f9d9ca9ce7cd2f3d14f Mon Sep 17 00:00:00 2001
From: Nadav Rotem
Date: Fri, 21 Dec 2012 01:33:59 +0000
Subject: [PATCH] Improve the X86 cost model for loads and stores.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170830 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            | 29 ++++++++
 lib/Target/X86/X86ISelLowering.h              |  4 ++
 test/Analysis/CostModel/X86/load_store.ll     | 64 +++++++++++++++++++
 .../Analysis/CostModel/X86/vectorized-loop.ll |  5 +-
 4 files changed, 100 insertions(+), 2 deletions(-)
 create mode 100644 test/Analysis/CostModel/X86/load_store.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1e64741c257..6b650726b62 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -17818,6 +17818,35 @@ X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
 
   return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty);
 }
+
+/// Estimate the cost of a load or store of type \p Src.
+///
+/// The cost is one unit per legalized load/store (LT.first), doubled when
+/// the legalized type is wider than 128 bits and the subtarget lacks AVX2.
+/// \p Alignment and \p AddressSpace are not consulted by this estimate.
+unsigned
+X86VectorTargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                              unsigned Alignment,
+                                              unsigned AddressSpace) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Src);
+  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+         "Invalid Opcode");
+
+  const X86Subtarget &ST =
+    TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+  // Each load/store unit costs 1.
+  unsigned Cost = LT.first * 1;
+
+  // On Sandybridge 256bit load/stores are double pumped
+  // (but not on Haswell).
+  if (LT.second.getSizeInBits() > 128 && !ST.hasAVX2())
+    Cost *= 2;
+
+  return Cost;
+}
+
 unsigned
 X86VectorTargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                  unsigned Index) const {
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 5be7f095a47..72cd3b3f5bc 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -953,6 +953,10 @@ namespace llvm {
 
     virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
 
+    virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
+                                     unsigned Alignment,
+                                     unsigned AddressSpace) const;
+
     virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
                                         unsigned Index) const;
 
diff --git a/test/Analysis/CostModel/X86/load_store.ll b/test/Analysis/CostModel/X86/load_store.ll
new file mode 100644
index 00000000000..4195b1d879a
--- /dev/null
+++ b/test/Analysis/CostModel/X86/load_store.ll
@@ -0,0 +1,64 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @stores(i32 %arg) {
+
+  ;CHECK: cost of 1 {{.*}} store
+  store i8 undef, i8* undef, align 4
+  ;CHECK: cost of 1 {{.*}} store
+  store i16 undef, i16* undef, align 4
+  ;CHECK: cost of 1 {{.*}} store
+  store i32 undef, i32* undef, align 4
+  ;CHECK: cost of 1 {{.*}} store
+  store i64 undef, i64* undef, align 4
+  ;CHECK: cost of 2 {{.*}} store
+  store i128 undef, i128* undef, align 4
+
+  ;CHECK: cost of 1 {{.*}} store
+  store <4 x i16> undef, <4 x i16>* undef, align 4
+  ;CHECK: cost of 1 {{.*}} store
+  store <4 x i32> undef, <4 x i32>* undef, align 4
+  ;CHECK: cost of 2 {{.*}} store
+  store <4 x i64> undef, <4 x i64>* undef, align 4
+
+  ;CHECK: cost of 1 {{.*}} store
+  store <8 x i16> undef, <8 x i16>* undef, align 4
+  ;CHECK: cost of 2 {{.*}} store
+  store <8 x i32> undef, <8 x i32>* undef, align 4
+  ;CHECK: cost of 4 {{.*}} store
+  store <8 x i64> undef, <8 x i64>* undef, align 4
+
+  ret i32 undef
+}
+define i32 @loads(i32 %arg) {
+  ;CHECK: cost of 1 {{.*}} load
+  load i8* undef, align 4
+  ;CHECK: cost of 1 {{.*}} load
+  load i16* undef, align 4
+  ;CHECK: cost of 1 {{.*}} load
+  load i32* undef, align 4
+  ;CHECK: cost of 1 {{.*}} load
+  load i64* undef, align 4
+  ;CHECK: cost of 2 {{.*}} load
+  load i128* undef, align 4
+
+  ;CHECK: cost of 1 {{.*}} load
+  load <2 x i32>* undef, align 4
+  ;CHECK: cost of 1 {{.*}} load
+  load <4 x i32>* undef, align 4
+  ;CHECK: cost of 2 {{.*}} load
+  load <8 x i32>* undef, align 4
+
+
+  ;CHECK: cost of 1 {{.*}} load
+  load <2 x i64>* undef, align 4
+  ;CHECK: cost of 2 {{.*}} load
+  load <4 x i64>* undef, align 4
+  ;CHECK: cost of 4 {{.*}} load
+  load <8 x i64>* undef, align 4
+
+  ret i32 undef
+}
+
diff --git a/test/Analysis/CostModel/X86/vectorized-loop.ll b/test/Analysis/CostModel/X86/vectorized-loop.ll
index 6c9e111bb10..25b11145c66 100644
--- a/test/Analysis/CostModel/X86/vectorized-loop.ll
+++ b/test/Analysis/CostModel/X86/vectorized-loop.ll
@@ -28,16 +28,17 @@ vector.body: ; preds = %for.body.lr.ph, %ve
   %4 = getelementptr inbounds i32* %B, i64 %3
   ;CHECK: cost of 0 {{.*}} bitcast
   %5 = bitcast i32* %4 to <8 x i32>*
-  ;CHECK: cost of 1 {{.*}} load
+  ;CHECK: cost of 2 {{.*}} load
   %6 = load <8 x i32>* %5, align 4
   ;CHECK: cost of 4 {{.*}} mul
   %7 = mul nsw <8 x i32> %6,
   %8 = getelementptr inbounds i32* %A, i64 %index
   %9 = bitcast i32* %8 to <8 x i32>*
+  ;CHECK: cost of 2 {{.*}} load
   %10 = load <8 x i32>* %9, align 4
   ;CHECK: cost of 4 {{.*}} add
   %11 = add nsw <8 x i32> %10, %7
-  ;CHECK: cost of 1 {{.*}} store
+  ;CHECK: cost of 2 {{.*}} store
   store <8 x i32> %11, <8 x i32>* %9, align 4
   %index.next = add i64 %index, 8
   %12 = icmp eq i64 %index.next, %end.idx.rnd.down
-- 
2.34.1