[PowerPC] Enable interleaved-access vectorization

author Hal Finkel <hfinkel@anl.gov>

Fri, 4 Sep 2015 00:10:41 +0000 (00:10 +0000)

committer Hal Finkel <hfinkel@anl.gov>

Fri, 4 Sep 2015 00:10:41 +0000 (00:10 +0000)
author Hal Finkel <hfinkel@anl.gov>
Fri, 4 Sep 2015 00:10:41 +0000 (00:10 +0000)
committer Hal Finkel <hfinkel@anl.gov>
Fri, 4 Sep 2015 00:10:41 +0000 (00:10 +0000)
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp

index 124556e2d17182bc6bc1e31ec1bb8b605e5d8812..cd86dabd5abea7dbdb63a606196c79b74b35df25 100644 (file)
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -207,6 +207,10 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
    return LoopHasReductions;
  }
  
+bool PPCTTIImpl::enableInterleavedAccessVectorization() {
+  return true; // Unconditional: no subtarget or option check is made here.
+}
+
  unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
    if (Vector && !ST->hasAltivec() && !ST->hasQPX())
      return 0;
@@ -266,7 +270,15 @@ int PPCTTIImpl::getArithmeticInstrCost(
  
  int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                 Type *SubTp) {
-  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+  // Legalize the type; LT.first is used below as the per-register count.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
+  // (at least in the sense that there need only be one non-loop-invariant
+  // instruction). We need one such shuffle instruction for each actual
+  // register (this is not true for arbitrary shuffles, but is true for the
+  // structured types of shuffles covered by TTI::ShuffleKind).
+  return LT.first; // Kind, Index and SubTp are not consulted.
  }
  
  int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
@@ -375,3 +387,27 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
    return Cost;
  }
  
+int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                           unsigned Factor,
+                                           ArrayRef<unsigned> Indices,
+                                           unsigned Alignment,
+                                           unsigned AddressSpace) {
+  assert(isa<VectorType>(VecTy) &&
+         "Expect a vector type for interleaved memory op");
+
+  // Legalize the type; LT.first feeds the shuffle count added below.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);
+
+  // First, the cost of the load/store operation on the whole vector type.
+  int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);
+
+  // PPC, for both Altivec/VSX and QPX, supports cheap arbitrary permutations
+  // (at least in the sense that there need only be one non-loop-invariant
+  // instruction). For each result vector, we need one shuffle per incoming
+  // vector (except that the first shuffle can take two incoming vectors
+  // because it does not need to take itself).
+  Cost += Factor*(LT.first-1); // Indices is not consulted by this estimate.
+
+  return Cost;
+}
+
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h

index 70f24f023d416b4870acc00da65a0e0f5aaf1c64..7f03223c8eaec57e41729a104c32efc64d9cd0db 100644 (file)
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -67,6 +67,7 @@ public:
    /// @{
  
    bool enableAggressiveInterleaving(bool LoopHasReductions);
+  bool enableInterleavedAccessVectorization();
    unsigned getNumberOfRegisters(bool Vector);
    unsigned getRegisterBitWidth(bool Vector);
    unsigned getMaxInterleaveFactor(unsigned VF);
@@ -82,6 +83,11 @@ public:
    int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
    int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                        unsigned AddressSpace);
+  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                 unsigned Factor,
+                                 ArrayRef<unsigned> Indices,
+                                 unsigned Alignment,
+                                 unsigned AddressSpace);
  
    /// @}
  };
diff --git a/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll b/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll

new file mode 100644 (file)

index 0000000..0cb8455
--- /dev/null
+++ b/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll
@@ -0,0 +1,30 @@
+; RUN: opt -S -basicaa -loop-vectorize < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind. Loop body: a[i] = b[2*i] + 1.0 for i in [0, 1600).
+define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 {
+entry:
+  br label %for.body
+
+; CHECK-LABEL: @foo
+; CHECK: <2 x double>
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = shl nsw i64 %indvars.iv, 1                 ; %0 = 2 * %indvars.iv
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %0
+  %1 = load double, double* %arrayidx, align 8    ; stride-2 read from %b
+  %add = fadd double %1, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8  ; unit-stride write to %a
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind "target-cpu"="pwr8" }
+
author	Hal Finkel <hfinkel@anl.gov>
	Fri, 4 Sep 2015 00:10:41 +0000 (00:10 +0000)
committer	Hal Finkel <hfinkel@anl.gov>
	Fri, 4 Sep 2015 00:10:41 +0000 (00:10 +0000)
lib/Target/PowerPC/PPCTargetTransformInfo.cpp		patch \| blob \| history
lib/Target/PowerPC/PPCTargetTransformInfo.h		patch \| blob \| history
test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll	[new file with mode: 0644]	patch \| blob