ARM cost model: Address computation in vector mem ops not free

author Arnold Schwaighofer <aschwaighofer@apple.com>

Fri, 8 Feb 2013 14:50:48 +0000 (14:50 +0000)

committer Arnold Schwaighofer <aschwaighofer@apple.com>

Fri, 8 Feb 2013 14:50:48 +0000 (14:50 +0000)
author Arnold Schwaighofer <aschwaighofer@apple.com>
Fri, 8 Feb 2013 14:50:48 +0000 (14:50 +0000)
committer Arnold Schwaighofer <aschwaighofer@apple.com>
Fri, 8 Feb 2013 14:50:48 +0000 (14:50 +0000)
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h

index 681b838bc734d0dcb87dc803d0af4ea8627f63fe..e1331a16b3bc15ff4bc905cb37cb94b9d3550385 100644 (file)
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -314,6 +314,12 @@ public:
    /// split during legalization. Zero is returned when the answer is unknown.
    virtual unsigned getNumberOfParts(Type *Tp) const;
  
+  /// \returns The cost of the address computation. For most targets this can be
+  /// merged into the instruction addressing mode. Some targets might want to
+  /// distinguish between address computation for memory operations on vector
+  /// types and scalar types. Such targets should override this function.
+  virtual unsigned getAddressComputationCost(Type *Ty) const;
+
    /// @}
  
    /// Analysis group identification.
diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp

index 1784512bce77a8f652b72fe0ce4a0cdf91c7719d..8435e397077f48f2857f2b24034f66c2ba1d7cbc 100644 (file)
--- a/lib/Analysis/CostModel.cpp
+++ b/lib/Analysis/CostModel.cpp
@@ -85,6 +85,11 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
      return -1;
  
    switch (I->getOpcode()) {
+  case Instruction::GetElementPtr:{
+    Type *ValTy = I->getOperand(0)->getType()->getPointerElementType();
+    return TTI->getAddressComputationCost(ValTy);
+  }
+
    case Instruction::Ret:
    case Instruction::PHI:
    case Instruction::Br: {
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp

index 9fc21fdb9230d2d13023c56c1f506e56f14a6b6c..72421a00c767061f19e47029cf8cabffe0895160 100644 (file)
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -196,6 +196,9 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
    return PrevTTI->getNumberOfParts(Tp);
  }
  
+unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp) const {
+  return PrevTTI->getAddressComputationCost(Tp);
+}
  
  namespace {
  
@@ -535,6 +538,10 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
    unsigned getNumberOfParts(Type *Tp) const {
      return 0;
    }
+
+  unsigned getAddressComputationCost(Type *Tp) const {
+    return 0;
+  }
  };
  
  } // end anonymous namespace
diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp

index ea5e93747dd02554c37f54c740875c4b1ed24b48..e8b5b4fe8d1b9e709befe8835870a8001b87befa 100644 (file)
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -101,6 +101,7 @@ public:
    virtual unsigned getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
                                           ArrayRef<Type*> Tys) const;
    virtual unsigned getNumberOfParts(Type *Tp) const;
+  virtual unsigned getAddressComputationCost(Type *Ty) const;
  
    /// @}
  };
@@ -400,3 +401,7 @@ unsigned BasicTTI::getNumberOfParts(Type *Tp) const {
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
    return LT.first;
  }
+
+unsigned BasicTTI::getAddressComputationCost(Type *Ty) const {
+  return 0;
+}
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp

index 1f91e0ee362761dafd08e7f0d7a5427982a26c9d..f6fa319970953d2312899b8957e43af5f02b0b7b 100644 (file)
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -120,6 +120,8 @@ public:
    unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const;
  
    unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const;
+
+  unsigned getAddressComputationCost(Type *Val) const;
    /// @}
  };
  
@@ -304,12 +306,13 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
  
  unsigned ARMTTI::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                      unsigned Index) const {
-  // Penalize inserting into an D-subregister.
+  // Penalize inserting into a D-subregister. We end up with a three times
+  // lower estimated throughput on swift.
    if (ST->isSwift() &&
        Opcode == Instruction::InsertElement &&
        ValTy->isVectorTy() &&
        ValTy->getScalarSizeInBits() <= 32)
-    return 2;
+    return 3;
  
    return TargetTransformInfo::getVectorInstrCost(Opcode, ValTy, Index);
  }
@@ -326,3 +329,9 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
  
    return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
  }
+
+unsigned ARMTTI::getAddressComputationCost(Type *Ty) const {
+  // In many cases the address computation is not merged into the instruction
+  // addressing mode.
+  return 1;
+}
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index 91d565976ad258cc6fff83768de10009cdc807e5..f12b0bf0f39e7593a8e07435799ffdd8f1ca43ad 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3056,9 +3056,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
    // TODO: We need to estimate the cost of intrinsic calls.
    switch (I->getOpcode()) {
    case Instruction::GetElementPtr:
-    // We mark this instruction as zero-cost because scalar GEPs are usually
-    // lowered to the intruction addressing mode. At the moment we don't
-    // generate vector geps.
+    // We mark this instruction as zero-cost because the cost of GEPs in
+    // vectorized code depends on whether the corresponding memory instruction
+    // is scalarized or not. Therefore, we handle GEPs with the memory
+    // instruction cost.
      return 0;
    case Instruction::Br: {
      return TTI.getCFInstrCost(I->getOpcode());
@@ -3113,9 +3114,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
      unsigned AS = SI ? SI->getPointerAddressSpace() :
        LI->getPointerAddressSpace();
      Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
-
+    // We add the cost of address computation here instead of with the gep
+    // instruction because only here we know whether the operation is
+    // scalarized.
      if (VF == 1)
-      return TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+      return TTI.getAddressComputationCost(VectorTy) +
+        TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
  
      // Scalarized loads/stores.
      int Stride = Legal->isConsecutivePtr(Ptr);
@@ -3135,15 +3139,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
                                              VectorTy, i);
        }
  
-      // The cost of the scalar stores.
+      // The cost of the scalar loads/stores.
+      Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType());
        Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                         Alignment, AS);
        return Cost;
      }
  
      // Wide load/stores.
-    unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
-                                        Alignment, AS);
+    unsigned Cost = TTI.getAddressComputationCost(VectorTy);
+    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+
      if (Reverse)
        Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                    VectorTy, 0);
diff --git a/test/Analysis/CostModel/ARM/gep.ll b/test/Analysis/CostModel/ARM/gep.ll

new file mode 100644 (file)

index 0000000..a63b87d
--- /dev/null
+++ b/test/Analysis/CostModel/ARM/gep.ll
@@ -0,0 +1,43 @@
+; RUN: opt -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios6.0.0"
+
+define void @test_geps() {
+  ; Cost of scalar integer geps should be one. We can't always expect it to be
+  ; folded into the instruction addressing mode.
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8*
+  %a0 = getelementptr inbounds i8* undef, i32 0
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16*
+  %a1 = getelementptr inbounds i16* undef, i32 0
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32*
+  %a2 = getelementptr inbounds i32* undef, i32 0
+
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64*
+  %a3 = getelementptr inbounds i64* undef, i32 0
+
+  ; Cost of scalar floating point geps should be one. We cannot fold the address
+  ; computation.
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds float*
+  %a4 = getelementptr inbounds float* undef, i32 0
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds double*
+  %a5 = getelementptr inbounds double* undef, i32 0
+
+
+  ; Cost of vector geps should be one. We cannot fold the address computation.
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i8>*
+  %a7 = getelementptr inbounds <4 x i8>* undef, i32 0
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i16>*
+  %a8 = getelementptr inbounds <4 x i16>* undef, i32 0
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i32>*
+  %a9 = getelementptr inbounds <4 x i32>* undef, i32 0
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i64>*
+  %a10 = getelementptr inbounds <4 x i64>* undef, i32 0
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x float>*
+  %a11 = getelementptr inbounds <4 x float>* undef, i32 0
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x double>*
+  %a12 = getelementptr inbounds <4 x double>* undef, i32 0
+
+
+  ret void
+}
diff --git a/test/Analysis/CostModel/ARM/insertelement.ll b/test/Analysis/CostModel/ARM/insertelement.ll

index 2d43e4ddd9be2d8bdd62929e367f4198033e67bd..f951b08f9baa8aad01845133d5fdeb3143d8b644 100644 (file)
--- a/test/Analysis/CostModel/ARM/insertelement.ll
+++ b/test/Analysis/CostModel/ARM/insertelement.ll
@@ -12,7 +12,7 @@ define void @insertelement_i8(%T_i8* %saddr,
                             %T_i8v* %vaddr) {
    %v0 = load %T_i8v* %vaddr
    %v1 = load %T_i8* %saddr
-;CHECK: estimated cost of 2 for {{.*}} insertelement <8 x i8>
+;CHECK: estimated cost of 3 for {{.*}} insertelement <8 x i8>
    %v2 = insertelement %T_i8v %v0, %T_i8 %v1, i32 1
    store %T_i8v %v2, %T_i8v* %vaddr
    ret void
@@ -26,7 +26,7 @@ define void @insertelement_i16(%T_i16* %saddr,
                             %T_i16v* %vaddr) {
    %v0 = load %T_i16v* %vaddr
    %v1 = load %T_i16* %saddr
-;CHECK: estimated cost of 2 for {{.*}} insertelement <4 x i16>
+;CHECK: estimated cost of 3 for {{.*}} insertelement <4 x i16>
    %v2 = insertelement %T_i16v %v0, %T_i16 %v1, i32 1
    store %T_i16v %v2, %T_i16v* %vaddr
    ret void
@@ -39,7 +39,7 @@ define void @insertelement_i32(%T_i32* %saddr,
                             %T_i32v* %vaddr) {
    %v0 = load %T_i32v* %vaddr
    %v1 = load %T_i32* %saddr
-;CHECK: estimated cost of 2 for {{.*}} insertelement <2 x i32>
+;CHECK: estimated cost of 3 for {{.*}} insertelement <2 x i32>
    %v2 = insertelement %T_i32v %v0, %T_i32 %v1, i32 1
    store %T_i32v %v2, %T_i32v* %vaddr
    ret void
diff --git a/test/Analysis/CostModel/X86/gep.ll b/test/Analysis/CostModel/X86/gep.ll

new file mode 100644 (file)

index 0000000..877184a
--- /dev/null
+++ b/test/Analysis/CostModel/X86/gep.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+
+define void @test_geps() {
+  ; Cost of geps should be zero. We expect it to be folded into
+  ; the instruction addressing mode.
+;CHECK:  cost of 0 for instruction: {{.*}} getelementptr inbounds i8*
+  %a0 = getelementptr inbounds i8* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16*
+  %a1 = getelementptr inbounds i16* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32*
+  %a2 = getelementptr inbounds i32* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64*
+  %a3 = getelementptr inbounds i64* undef, i32 0
+
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds float*
+  %a4 = getelementptr inbounds float* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds double*
+  %a5 = getelementptr inbounds double* undef, i32 0
+
+ ; Vector geps should also have zero cost.
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i8>*
+  %a7 = getelementptr inbounds <4 x i8>* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i16>*
+  %a8 = getelementptr inbounds <4 x i16>* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i32>*
+  %a9 = getelementptr inbounds <4 x i32>* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i64>*
+  %a10 = getelementptr inbounds <4 x i64>* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x float>*
+  %a11 = getelementptr inbounds <4 x float>* undef, i32 0
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x double>*
+  %a12 = getelementptr inbounds <4 x double>* undef, i32 0
+
+
+  ret void
+}
author	Arnold Schwaighofer <aschwaighofer@apple.com>
	Fri, 8 Feb 2013 14:50:48 +0000 (14:50 +0000)
committer	Arnold Schwaighofer <aschwaighofer@apple.com>
	Fri, 8 Feb 2013 14:50:48 +0000 (14:50 +0000)
include/llvm/Analysis/TargetTransformInfo.h		patch \| blob \| history
lib/Analysis/CostModel.cpp		patch \| blob \| history
lib/Analysis/TargetTransformInfo.cpp		patch \| blob \| history
lib/CodeGen/BasicTargetTransformInfo.cpp		patch \| blob \| history
lib/Target/ARM/ARMTargetTransformInfo.cpp		patch \| blob \| history
lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
test/Analysis/CostModel/ARM/gep.ll	[new file with mode: 0644]	patch \| blob
test/Analysis/CostModel/ARM/insertelement.ll		patch \| blob \| history
test/Analysis/CostModel/X86/gep.ll	[new file with mode: 0644]	patch \| blob