From e637d65af355ef24ef2bd8d6072861d7354d5e91 Mon Sep 17 00:00:00 2001 From: Karthik Bhat Date: Mon, 25 Aug 2014 04:56:54 +0000 Subject: [PATCH] Allow vectorization of division by uniform power of 2. This patch adds support to recognize division by uniform power of 2 and modifies the cost table to vectorize division by uniform power of 2 whenever possible. Updates Cost model for Loop and SLP Vectorizer.The cost table is currently only updated for X86 backend. Thanks to Hal, Andrea, Sanjay for the review. (http://reviews.llvm.org/D4971) git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@216371 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/TargetTransformInfo.h | 12 ++++-- lib/Analysis/TargetTransformInfo.cpp | 14 +++--- lib/CodeGen/BasicTargetTransformInfo.cpp | 8 ++-- .../AArch64/AArch64TargetTransformInfo.cpp | 20 +++++---- lib/Target/ARM/ARMTargetTransformInfo.cpp | 20 +++++---- lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 18 ++++---- lib/Target/X86/X86TargetTransformInfo.cpp | 31 +++++++++++-- lib/Transforms/Vectorize/LoopVectorize.cpp | 21 +++++++-- lib/Transforms/Vectorize/SLPVectorizer.cpp | 18 ++++++-- .../Transforms/LoopVectorize/X86/powof2div.ll | 32 ++++++++++++++ .../Transforms/SLPVectorizer/X86/powof2div.ll | 43 +++++++++++++++++++ 11 files changed, 187 insertions(+), 50 deletions(-) create mode 100644 test/Transforms/LoopVectorize/X86/powof2div.ll create mode 100644 test/Transforms/SLPVectorizer/X86/powof2div.ll diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index 0ec49c2b18a..087ab302bae 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -335,6 +335,9 @@ public: OK_NonUniformConstantValue // Operand is a non uniform constant value. }; + /// \brief Additional properties of an operand's values. + enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 }; + /// \return The number of scalar or vector registers that the target has. /// If 'Vectors' is true, it returns the number of vector registers. If it is /// set to false, it returns the number of scalar registers. @@ -349,9 +352,12 @@ public: virtual unsigned getMaximumUnrollFactor() const; /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc. - virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Opd1Info = OK_AnyValue, - OperandValueKind Opd2Info = OK_AnyValue) const; + virtual unsigned + getArithmeticInstrCost(unsigned Opcode, Type *Ty, + OperandValueKind Opd1Info = OK_AnyValue, + OperandValueKind Opd2Info = OK_AnyValue, + OperandValueProperties Opd1PropInfo = OP_None, + OperandValueProperties Opd2PropInfo = OP_None) const; /// \return The cost of a shuffle instruction of kind Kind and of type Tp. /// The index and subtype parameters are used by the subvector insertion and diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 7ac22303deb..b8f3630df8d 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -171,11 +171,12 @@ unsigned TargetTransformInfo::getMaximumUnrollFactor() const { return PrevTTI->getMaximumUnrollFactor(); } -unsigned TargetTransformInfo::getArithmeticInstrCost(unsigned Opcode, - Type *Ty, - OperandValueKind Op1Info, - OperandValueKind Op2Info) const { - return PrevTTI->getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); +unsigned TargetTransformInfo::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Op1Info, + OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo, + OperandValueProperties Opd2PropInfo) const { + return PrevTTI->getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, + Opd1PropInfo, Opd2PropInfo); } unsigned TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Tp, @@ -569,7 +570,8 @@ struct NoTTI final : ImmutablePass, TargetTransformInfo { } unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind, - OperandValueKind) const override { + OperandValueKind, OperandValueProperties, + OperandValueProperties) const override { return 1; } diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp index 24bc570f44a..85045eaa6df 100644 --- a/lib/CodeGen/BasicTargetTransformInfo.cpp +++ b/lib/CodeGen/BasicTargetTransformInfo.cpp @@ -104,7 +104,8 @@ public: unsigned getMaximumUnrollFactor() const override; unsigned getRegisterBitWidth(bool Vector) const override; unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind, - OperandValueKind) const override; + OperandValueKind, OperandValueProperties, + OperandValueProperties) const override; unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const override; unsigned getCastInstrCost(unsigned Opcode, Type *Dst, @@ -289,8 +290,9 @@ unsigned BasicTTI::getMaximumUnrollFactor() const { } unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind, - OperandValueKind) const { + OperandValueKind, OperandValueKind, + OperandValueProperties, + OperandValueProperties) const { // Check if any of the operands are vector operands. const TargetLoweringBase *TLI = getTLI(); int ISD = TLI->InstructionOpcodeToISD(Opcode); diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 24cb129b0b7..7ef1fdbfff8 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -112,10 +112,11 @@ public: unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const override; - unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Opd1Info = OK_AnyValue, - OperandValueKind Opd2Info = OK_AnyValue) const - override; + unsigned getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue, + OperandValueKind Opd2Info = OK_AnyValue, + OperandValueProperties Opd1PropInfo = OP_None, + OperandValueProperties Opd2PropInfo = OP_None) const override; unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override; @@ -403,9 +404,10 @@ unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val, return 2; } -unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Opd1Info, - OperandValueKind Opd2Info) const { +unsigned AArch64TTI::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, + OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, + OperandValueProperties Opd2PropInfo) const { // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(Ty); @@ -413,8 +415,8 @@ unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, switch (ISD) { default: - return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info, - Opd2Info); + return TargetTransformInfo::getArithmeticInstrCost( + Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); case ISD::ADD: case ISD::MUL: case ISD::XOR: diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index f7a8e4aa10a..4635e4308f7 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -126,10 +126,11 @@ public: unsigned getAddressComputationCost(Type *Val, bool IsComplex) const override; - unsigned - getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Op1Info = OK_AnyValue, - OperandValueKind Op2Info = OK_AnyValue) const override; + unsigned getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Op1Info = OK_AnyValue, + OperandValueKind Op2Info = OK_AnyValue, + OperandValueProperties Opd1PropInfo = OP_None, + OperandValueProperties Opd2PropInfo = OP_None) const override; unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const override; @@ -497,9 +498,10 @@ unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); } -unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Op1Info, - OperandValueKind Op2Info) const { +unsigned ARMTTI::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Op1Info, + OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo, + OperandValueProperties Opd2PropInfo) const { int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); std::pair LT = TLI->getTypeLegalizationCost(Ty); @@ -555,8 +557,8 @@ unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, if (Idx != -1) return LT.first * CostTbl[Idx].Cost; - unsigned Cost = - TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); + unsigned Cost = TargetTransformInfo::getArithmeticInstrCost( + Opcode, Ty, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo); // This is somewhat of a hack. The problem that we are facing is that SROA // creates a sequence of shift, and, or instructions to construct values. diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 27ca7b2afed..f5ce735bf94 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -92,9 +92,10 @@ public: virtual unsigned getNumberOfRegisters(bool Vector) const override; virtual unsigned getRegisterBitWidth(bool Vector) const override; virtual unsigned getMaximumUnrollFactor() const override; - virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind, - OperandValueKind) const override; + virtual unsigned + getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind, + OperandValueKind, OperandValueProperties, + OperandValueProperties) const override; virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const override; virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst, @@ -318,14 +319,15 @@ unsigned PPCTTI::getMaximumUnrollFactor() const { return 2; } -unsigned PPCTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Op1Info, - OperandValueKind Op2Info) const { +unsigned PPCTTI::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Op1Info, + OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo, + OperandValueProperties Opd2PropInfo) const { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); // Fallback to the default implementation. - return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, - Op2Info); + return TargetTransformInfo::getArithmeticInstrCost( + Opcode, Ty, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo); } unsigned PPCTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 173bb4e6ec8..cd0336fa92f 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -84,7 +84,8 @@ public: unsigned getRegisterBitWidth(bool Vector) const override; unsigned getMaximumUnrollFactor() const override; unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind, - OperandValueKind) const override; + OperandValueKind, OperandValueProperties, + OperandValueProperties) const override; unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const override; unsigned getCastInstrCost(unsigned Opcode, Type *Dst, @@ -178,15 +179,37 @@ unsigned X86TTI::getMaximumUnrollFactor() const { return 2; } -unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Op1Info, - OperandValueKind Op2Info) const { +unsigned X86TTI::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Op1Info, + OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo, + OperandValueProperties Opd2PropInfo) const { // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + if (ISD == ISD::SDIV && + Op2Info == TargetTransformInfo::OK_UniformConstantValue && + Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { + // On X86, vector signed division by constants power-of-two are + // normally expanded to the sequence SRA + SRL + ADD + SRA. + // The OperandValue properties many not be same as that of previous + // operation;conservatively assume OP_None. + unsigned Cost = + 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + + return Cost; + } + static const CostTblEntry AVX2UniformConstCostTable[] = { { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 0c2b3d200cc..42fb1242f67 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5837,18 +5837,31 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueProperties Op1VP = + TargetTransformInfo::OP_None; + TargetTransformInfo::OperandValueProperties Op2VP = + TargetTransformInfo::OP_None; Value *Op2 = I->getOperand(1); // Check for a splat of a constant or for a non uniform vector of constants. - if (isa(Op2)) + if (isa(Op2)) { + ConstantInt *CInt = cast(Op2); + if (CInt && CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_PowerOf2; Op2VK = TargetTransformInfo::OK_UniformConstantValue; - else if (isa(Op2) || isa(Op2)) { + } else if (isa(Op2) || isa(Op2)) { Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; - if (cast(Op2)->getSplatValue() != nullptr) + Constant *SplatValue = cast(Op2)->getSplatValue(); + if (SplatValue) { + ConstantInt *CInt = dyn_cast(SplatValue); + if (CInt && CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_PowerOf2; Op2VK = TargetTransformInfo::OK_UniformConstantValue; + } } - return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK); + return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, + Op1VP, Op2VP); } case Instruction::Select: { SelectInst *SI = cast(I); diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 3aced0b4c40..2602feb2f2c 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1425,6 +1425,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_UniformConstantValue; + TargetTransformInfo::OperandValueProperties Op1VP = + TargetTransformInfo::OP_None; + TargetTransformInfo::OperandValueProperties Op2VP = + TargetTransformInfo::OP_None; // If all operands are exactly the same ConstantInt then set the // operand kind to OK_UniformConstantValue. @@ -1446,11 +1450,17 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { CInt != cast(I->getOperand(1))) Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } + // FIXME: Currently cost of model modification for division by + // power of 2 is handled only for X86. Add support for other targets. + if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt && + CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_PowerOf2; - ScalarCost = - VecTy->getNumElements() * - TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK); - VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK); + ScalarCost = VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, + Op1VP, Op2VP); + VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, + Op1VP, Op2VP); } return VecCost - ScalarCost; } diff --git a/test/Transforms/LoopVectorize/X86/powof2div.ll b/test/Transforms/LoopVectorize/X86/powof2div.ll new file mode 100644 index 00000000000..054da8ed20a --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/powof2div.ll @@ -0,0 +1,32 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.anon = type { [100 x i32], i32, [100 x i32] } + +@Foo = common global %struct.anon zeroinitializer, align 4 + +;CHECK-LABEL: @foo( +;CHECK: load <4 x i32>* +;CHECK: sdiv <4 x i32> +;CHECK: store <4 x i32> + +define void @foo(){ +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds %struct.anon* @Foo, i64 0, i32 2, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %div = sdiv i32 %0, 2 + %arrayidx2 = getelementptr inbounds %struct.anon* @Foo, i64 0, i32 0, i64 %indvars.iv + store i32 %div, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + diff --git a/test/Transforms/SLPVectorizer/X86/powof2div.ll b/test/Transforms/SLPVectorizer/X86/powof2div.ll new file mode 100644 index 00000000000..b82cd4dd236 --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/powof2div.ll @@ -0,0 +1,43 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +;CHECK-LABEL: @powof2div( +;CHECK: load <4 x i32>* +;CHECK: add <4 x i32> +;CHECK: sdiv <4 x i32> +define void @powof2div(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){ +entry: + %0 = load i32* %b, align 4 + %1 = load i32* %c, align 4 + %add = add nsw i32 %1, %0 + %div = sdiv i32 %add, 2 + store i32 %div, i32* %a, align 4 + %arrayidx3 = getelementptr inbounds i32* %b, i64 1 + %2 = load i32* %arrayidx3, align 4 + %arrayidx4 = getelementptr inbounds i32* %c, i64 1 + %3 = load i32* %arrayidx4, align 4 + %add5 = add nsw i32 %3, %2 + %div6 = sdiv i32 %add5, 2 + %arrayidx7 = getelementptr inbounds i32* %a, i64 1 + store i32 %div6, i32* %arrayidx7, align 4 + %arrayidx8 = getelementptr inbounds i32* %b, i64 2 + %4 = load i32* %arrayidx8, align 4 + %arrayidx9 = getelementptr inbounds i32* %c, i64 2 + %5 = load i32* %arrayidx9, align 4 + %add10 = add nsw i32 %5, %4 + %div11 = sdiv i32 %add10, 2 + %arrayidx12 = getelementptr inbounds i32* %a, i64 2 + store i32 %div11, i32* %arrayidx12, align 4 + %arrayidx13 = getelementptr inbounds i32* %b, i64 3 + %6 = load i32* %arrayidx13, align 4 + %arrayidx14 = getelementptr inbounds i32* %c, i64 3 + %7 = load i32* %arrayidx14, align 4 + %add15 = add nsw i32 %7, %6 + %div16 = sdiv i32 %add15, 2 + %arrayidx17 = getelementptr inbounds i32* %a, i64 3 + store i32 %div16, i32* %arrayidx17, align 4 + ret void +} + -- 2.34.1