From f606a6ed992d5e4e2877419b51e2a9b540b5e3f0 Mon Sep 17 00:00:00 2001
From: Hal Finkel
Date: Fri, 4 Sep 2015 00:10:41 +0000
Subject: [PATCH 1/1] [PowerPC] Enable interleaved-access vectorization

This adds a basic cost model for interleaved-access vectorization (and a better
default for shuffles), and enables interleaved-access vectorization by default.
The relevant difference from the default cost model for interleaved-access
vectorization, is that on PPC, the shuffles that end up being used are *much*
cheaper than modeling the process with insert/extract pairs (which are quite
expensive, especially on older cores).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@246824 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 38 ++++++++++++++++++-
 lib/Target/PowerPC/PPCTargetTransformInfo.h   |  6 +++
 .../PowerPC/stride-vectorization.ll           | 30 +++++++++++++++
 3 files changed, 73 insertions(+), 1 deletion(-)
 create mode 100644 test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll

diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 124556e2d17..cd86dabd5ab 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -207,6 +207,10 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
   return LoopHasReductions;
 }
 
+bool PPCTTIImpl::enableInterleavedAccessVectorization() {
+  return true;
+}
+
 unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
   if (Vector && !ST->hasAltivec() && !ST->hasQPX())
     return 0;
@@ -266,7 +270,15 @@ int PPCTTIImpl::getArithmeticInstrCost(
 
 int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) {
-  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+  // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations
+  // (at least in the sense that there need only be one non-loop-invariant
+  // instruction). We need one such shuffle instruction for each actual
+  // register (this is not true for arbitrary shuffles, but is true for the
+  // structured types of shuffles covered by TTI::ShuffleKind).
+  return LT.first;
 }
 
 int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
@@ -375,3 +387,27 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
   return Cost;
 }
 
+int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                           unsigned Factor,
+                                           ArrayRef<unsigned> Indices,
+                                           unsigned Alignment,
+                                           unsigned AddressSpace) {
+  assert(isa<VectorType>(VecTy) &&
+         "Expect a vector type for interleaved memory op");
+
+  // Legalize the type.
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);
+
+  // Firstly, the cost of load/store operation.
+  int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);
+
+  // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations
+  // (at least in the sense that there need only be one non-loop-invariant
+  // instruction). For each result vector, we need one shuffle per incoming
+  // vector (except that the first shuffle can take two incoming vectors
+  // because it does not need to take itself).
+  Cost += Factor*(LT.first-1);
+
+  return Cost;
+}
+
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 70f24f023d4..7f03223c8ea 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -67,6 +67,7 @@ public:
   /// @{
 
   bool enableAggressiveInterleaving(bool LoopHasReductions);
+  bool enableInterleavedAccessVectorization();
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
   unsigned getMaxInterleaveFactor(unsigned VF);
@@ -82,6 +83,11 @@ public:
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
   int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                       unsigned AddressSpace);
+  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                 unsigned Factor,
+                                 ArrayRef<unsigned> Indices,
+                                 unsigned Alignment,
+                                 unsigned AddressSpace);
 
   /// @}
 };
diff --git a/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll b/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll
new file mode 100644
index 00000000000..0cb84552024
--- /dev/null
+++ b/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll
@@ -0,0 +1,30 @@
+; RUN: opt -S -basicaa -loop-vectorize < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 {
+entry:
+  br label %for.body
+
+; CHECK-LABEL: @foo
+; CHECK: <2 x double>
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = shl nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %0
+  %1 = load double, double* %arrayidx, align 8
+  %add = fadd double %1, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind "target-cpu"="pwr8" }
+
-- 
2.34.1