[PowerPC] Adjust load/store costs in PPCTTI

[oota-llvm.git] / lib / Target / PowerPC / PPCTargetTransformInfo.cpp
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 22cdd66dbb5632fd09eeb21c5ebeefd71dfe6d03..ed849b5bc85948c826b3c2b528d7fa84fc393341 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -37,10 +37,6 @@ class PPCTTI final : public ImmutablePass, public TargetTransformInfo {
    const PPCSubtarget *ST;
    const PPCTargetLowering *TLI;
  
-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
-
  public:
    PPCTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
      llvm_unreachable("This pass cannot be directly constructed");
@@ -56,10 +52,6 @@ public:
      pushTTIStack(this);
    }
  
-  virtual void finalizePass() {
-    popTTIStack();
-  }
-
    virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
      TargetTransformInfo::getAnalysisUsage(AU);
    }
@@ -143,7 +135,7 @@ void PPCTTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
  unsigned PPCTTI::getNumberOfRegisters(bool Vector) const {
    if (Vector && !ST->hasAltivec())
      return 0;
-  return 32;
+  return ST->hasVSX() ? 64 : 32;
  }
  
  unsigned PPCTTI::getRegisterBitWidth(bool Vector) const {
@@ -212,11 +204,21 @@ unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
  
+  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
+    // Double-precision scalars are already located in index #0.
+    if (Index == 0)
+      return 0;
+
+    return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
+  }
+
    // Estimated cost of a load-hit-store delay.  This was obtained
    // experimentally as a minimum needed to prevent unprofitable
    // vectorization for the paq8p benchmark.  It may need to be
    // raised further if other unprofitable cases remain.
-  unsigned LHSPenalty = 12;
+  unsigned LHSPenalty = 2;
+  if (ISD == ISD::INSERT_VECTOR_ELT)
+    LHSPenalty += 7;
  
    // Vector element insert/extract with Altivec is very expensive,
    // because they require store and reload with the attendant
@@ -237,14 +239,34 @@ unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
    assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
           "Invalid Opcode");
  
-  // Each load/store unit costs 1.
-  unsigned Cost = LT.first * 1;
+  unsigned Cost =
+    TargetTransformInfo::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+
+  // VSX loads/stores support unaligned access.
+  if (ST->hasVSX()) {
+    if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64)
+      return Cost;
+  }
+
+  bool UnalignedAltivec =
+    Src->isVectorTy() &&
+    Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() &&
+    LT.second.getSizeInBits() == 128 &&
+    Opcode == Instruction::Load;
  
    // PPC in general does not support unaligned loads and stores. They'll need
    // to be decomposed based on the alignment factor.
    unsigned SrcBytes = LT.second.getStoreSize();
-  if (SrcBytes && Alignment && Alignment < SrcBytes)
-    Cost *= (SrcBytes/Alignment);
+  if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) {
+    Cost += LT.first*(SrcBytes/Alignment-1);
+
+    // For a vector type, there is also scalarization overhead (only for
+    // stores, loads are expanded using the vector-load + permutation sequence,
+    // which is much less expensive).
+    if (Src->isVectorTy() && Opcode == Instruction::Store)
+      for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
+        Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
+  }
  
    return Cost;
  }