From: Hal Finkel Date: Thu, 25 Oct 2012 21:12:23 +0000 (+0000) Subject: Begin incorporating target information into BBVectorize. X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=65309660fa61a837cc05323f69c618a7d8134d56;p=oota-llvm.git Begin incorporating target information into BBVectorize. This is the first of several steps to incorporate information from the new TargetTransformInfo infrastructure into BBVectorize. Two things are done here: 1. Target information is used to determine whether it is profitable to fuse two instructions. This means that the vector operation must not cost more than the two original operations combined. Pairs that are not profitable are no longer considered; however, because current cost information is incomplete (for intrinsics, for example), equal-cost pairs are still considered. 2. The 'cost savings' computed for the profitability check are also used to rank the DAGs that represent the potential vectorization plans. Specifically, for nodes of non-trivial depth, the cost savings are used as the node weight. The next step will be to incorporate the shuffle costs into the DAG weighting; this will assign weights to the edges of the DAG as well. Once that is done, when target information is available, we should be able to dispense with the depth heuristic. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166716 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index 81125f22a69..61e8d735e41 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -43,12 +43,17 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Support/ValueHandle.h" #include "llvm/DataLayout.h" +#include "llvm/TargetTransformInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" #include #include using namespace llvm; +static cl::opt +IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), + cl::Hidden, cl::desc("Ignore target information")); + static cl::opt ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, cl::desc("The required chain depth for vectorization")); @@ -181,9 +186,13 @@ namespace { DT = &P->getAnalysis(); SE = &P->getAnalysis(); TD = P->getAnalysisIfAvailable(); + TTI = IgnoreTargetInfo ? 0 : + P->getAnalysisIfAvailable(); + VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0; } typedef std::pair ValuePair; + typedef std::pair ValuePairWithCost; typedef std::pair ValuePairWithDepth; typedef std::pair VPPair; // A ValuePair pair typedef std::pair::iterator, @@ -196,6 +205,8 @@ namespace { DominatorTree *DT; ScalarEvolution *SE; DataLayout *TD; + TargetTransformInfo *TTI; + const VectorTargetTransformInfo *VTTI; // FIXME: const correct? 
@@ -204,6 +215,7 @@ namespace { bool getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, std::multimap &CandidatePairs, + DenseMap &CandidatePairCostSavings, std::vector &PairableInsts, bool NonPow2Len); void computeConnectedPairs(std::multimap &CandidatePairs, @@ -216,6 +228,7 @@ namespace { DenseSet &PairableInstUsers); void choosePairs(std::multimap &CandidatePairs, + DenseMap &CandidatePairCostSavings, std::vector &PairableInsts, std::multimap &ConnectedPairs, DenseSet &PairableInstUsers, @@ -228,7 +241,8 @@ namespace { bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); bool areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore, bool NonPow2Len); + bool IsSimpleLoadStore, bool NonPow2Len, + int &CostSavings); bool trackUsesOfI(DenseSet &Users, AliasSetTracker &WriteSet, Instruction *I, @@ -270,13 +284,14 @@ namespace { void findBestTreeFor( std::multimap &CandidatePairs, + DenseMap &CandidatePairCostSavings, std::vector &PairableInsts, std::multimap &ConnectedPairs, DenseSet &PairableInstUsers, std::multimap &PairableInstUserMap, DenseMap &ChosenPairs, DenseSet &BestTree, size_t &BestMaxDepth, - size_t &BestEffSize, VPIteratorPair ChoiceRange, + int &BestEffSize, VPIteratorPair ChoiceRange, bool UseCycleCheck); Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, @@ -339,13 +354,16 @@ namespace { return false; } + DEBUG(if (VTTI) dbgs() << "BBV: using target information\n"); + bool changed = false; // Iterate a sufficient number of times to merge types of size 1 bit, // then 2 bits, then 4, etc. up to half of the target vector width of the // target vector register. 
unsigned n = 1; for (unsigned v = 2; - v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter); + (VTTI || v <= Config.VectorBits) && + (!Config.MaxIter || n <= Config.MaxIter); v *= 2, ++n) { DEBUG(dbgs() << "BBV: fusing loop #" << n << " for " << BB.getName() << " in " << @@ -375,6 +393,9 @@ namespace { DT = &getAnalysis(); SE = &getAnalysis(); TD = getAnalysisIfAvailable(); + TTI = IgnoreTargetInfo ? 0 : + getAnalysisIfAvailable(); + VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0; return vectorizeBB(BB); } @@ -427,6 +448,10 @@ namespace { T2 = cast(I)->getSrcTy(); else T2 = T1; + + if (SelectInst *SI = dyn_cast(I)) { + T2 = SI->getCondition()->getType(); + } } // Returns the weight associated with the provided value. A chain of @@ -465,18 +490,25 @@ namespace { // directly after J. bool getPairPtrInfo(Instruction *I, Instruction *J, Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, + unsigned &IAddressSpace, unsigned &JAddressSpace, int64_t &OffsetInElmts) { OffsetInElmts = 0; - if (isa(I)) { - IPtr = cast(I)->getPointerOperand(); - JPtr = cast(J)->getPointerOperand(); - IAlignment = cast(I)->getAlignment(); - JAlignment = cast(J)->getAlignment(); + if (LoadInst *LI = dyn_cast(I)) { + LoadInst *LJ = cast(J); + IPtr = LI->getPointerOperand(); + JPtr = LJ->getPointerOperand(); + IAlignment = LI->getAlignment(); + JAlignment = LJ->getAlignment(); + IAddressSpace = LI->getPointerAddressSpace(); + JAddressSpace = LJ->getPointerAddressSpace(); } else { - IPtr = cast(I)->getPointerOperand(); - JPtr = cast(J)->getPointerOperand(); - IAlignment = cast(I)->getAlignment(); - JAlignment = cast(J)->getAlignment(); + StoreInst *SI = cast(I), *SJ = cast(J); + IPtr = SI->getPointerOperand(); + JPtr = SJ->getPointerOperand(); + IAlignment = SI->getAlignment(); + JAlignment = SJ->getAlignment(); + IAddressSpace = SI->getPointerAddressSpace(); + JAddressSpace = SJ->getPointerAddressSpace(); } const SCEV *IPtrSCEV = SE->getSCEV(IPtr); @@ 
-562,7 +594,9 @@ namespace { do { std::vector PairableInsts; std::multimap CandidatePairs; + DenseMap CandidatePairCostSavings; ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, + CandidatePairCostSavings, PairableInsts, NonPow2Len); if (PairableInsts.empty()) continue; @@ -590,7 +624,8 @@ namespace { // variables. DenseMap ChosenPairs; - choosePairs(CandidatePairs, PairableInsts, ConnectedPairs, + choosePairs(CandidatePairs, CandidatePairCostSavings, + PairableInsts, ConnectedPairs, PairableInstUsers, ChosenPairs); if (ChosenPairs.empty()) continue; @@ -679,15 +714,22 @@ namespace { !(VectorType::isValidElementType(T2) || T2->isVectorTy())) return false; - if (T1->getScalarSizeInBits() == 1 && T2->getScalarSizeInBits() == 1) { + if (T1->getScalarSizeInBits() == 1) { if (!Config.VectorizeBools) return false; } else { - if (!Config.VectorizeInts - && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) + if (!Config.VectorizeInts && T1->isIntOrIntVectorTy()) return false; } - + + if (T2->getScalarSizeInBits() == 1) { + if (!Config.VectorizeBools) + return false; + } else { + if (!Config.VectorizeInts && T2->isIntOrIntVectorTy()) + return false; + } + if (!Config.VectorizeFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) return false; @@ -703,8 +745,8 @@ namespace { T2->getScalarType()->isPointerTy())) return false; - if (T1->getPrimitiveSizeInBits() >= Config.VectorBits || - T2->getPrimitiveSizeInBits() >= Config.VectorBits) + if (!VTTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits || + T2->getPrimitiveSizeInBits() >= Config.VectorBits)) return false; return true; @@ -715,10 +757,13 @@ namespace { // that I has already been determined to be vectorizable and that J is not // in the use tree of I. 
bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore, bool NonPow2Len) { + bool IsSimpleLoadStore, bool NonPow2Len, + int &CostSavings) { DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << " <-> " << *J << "\n"); + CostSavings = 0; + // Loads and stores can be merged if they have different alignments, // but are otherwise the same. if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment | @@ -731,38 +776,62 @@ namespace { unsigned MaxTypeBits = std::max( IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); - if (MaxTypeBits > Config.VectorBits) + if (!VTTI && MaxTypeBits > Config.VectorBits) return false; // FIXME: handle addsub-type operations! if (IsSimpleLoadStore) { Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment; + unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; int64_t OffsetInElmts = 0; if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, + IAddressSpace, JAddressSpace, OffsetInElmts) && abs64(OffsetInElmts) == 1) { - if (Config.AlignedOnly) { - Type *aTypeI = isa(I) ? - cast(I)->getValueOperand()->getType() : I->getType(); - Type *aTypeJ = isa(J) ? - cast(J)->getValueOperand()->getType() : J->getType(); + unsigned BottomAlignment = IAlignment; + if (OffsetInElmts < 0) BottomAlignment = JAlignment; + + Type *aTypeI = isa(I) ? + cast(I)->getValueOperand()->getType() : I->getType(); + Type *aTypeJ = isa(J) ? + cast(J)->getValueOperand()->getType() : J->getType(); + Type *VType = getVecTypeForPair(aTypeI, aTypeJ); + if (Config.AlignedOnly) { // An aligned load or store is possible only if the instruction // with the lower offset has an alignment suitable for the // vector type. 
- unsigned BottomAlignment = IAlignment; - if (OffsetInElmts < 0) BottomAlignment = JAlignment; - - Type *VType = getVecTypeForPair(aTypeI, aTypeJ); unsigned VecAlignment = TD->getPrefTypeAlignment(VType); if (BottomAlignment < VecAlignment) return false; } + + if (VTTI) { + unsigned ICost = VTTI->getMemoryOpCost(I->getOpcode(), I->getType(), + IAlignment, IAddressSpace); + unsigned JCost = VTTI->getMemoryOpCost(J->getOpcode(), J->getType(), + JAlignment, JAddressSpace); + unsigned VCost = VTTI->getMemoryOpCost(I->getOpcode(), VType, + BottomAlignment, + IAddressSpace); + if (VCost > ICost + JCost) + return false; + CostSavings = ICost + JCost - VCost; + } } else { return false; } + } else if (VTTI) { + unsigned ICost = VTTI->getInstrCost(I->getOpcode(), IT1, IT2); + unsigned JCost = VTTI->getInstrCost(J->getOpcode(), JT1, JT2); + Type *VT1 = getVecTypeForPair(IT1, JT1), + *VT2 = getVecTypeForPair(IT2, JT2); + unsigned VCost = VTTI->getInstrCost(I->getOpcode(), VT1, VT2); + + if (VCost > ICost + JCost) + return false; + CostSavings = ICost + JCost - VCost; } // The powi intrinsic is special because only the first argument is @@ -845,6 +914,7 @@ namespace { bool BBVectorize::getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, std::multimap &CandidatePairs, + DenseMap &CandidatePairCostSavings, std::vector &PairableInsts, bool NonPow2Len) { BasicBlock::iterator E = BB.end(); if (Start == E) return false; @@ -881,7 +951,9 @@ namespace { // J does not use I, and comes before the first use of I, so it can be // merged with I if the instructions are compatible. - if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len)) continue; + int CostSavings; + if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len, + CostSavings)) continue; // J is a candidate for merging with I. 
if (!PairableInsts.size() || @@ -890,6 +962,9 @@ namespace { } CandidatePairs.insert(ValuePair(I, J)); + if (VTTI) + CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J), + CostSavings)); // The next call to this function must start after the last instruction // selected during this invocation. @@ -899,7 +974,8 @@ namespace { } DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " - << *I << " <-> " << *J << "\n"); + << *I << " <-> " << *J << " (cost savings: " << + CostSavings << ")\n"); // If we have already found too many pairs, break here and this function // will be called again starting after the last instruction selected @@ -1353,13 +1429,14 @@ namespace { // pairs, given the choice of root pairs as an iterator range. void BBVectorize::findBestTreeFor( std::multimap &CandidatePairs, + DenseMap &CandidatePairCostSavings, std::vector &PairableInsts, std::multimap &ConnectedPairs, DenseSet &PairableInstUsers, std::multimap &PairableInstUserMap, DenseMap &ChosenPairs, DenseSet &BestTree, size_t &BestMaxDepth, - size_t &BestEffSize, VPIteratorPair ChoiceRange, + int &BestEffSize, VPIteratorPair ChoiceRange, bool UseCycleCheck) { for (std::multimap::iterator J = ChoiceRange.first; J != ChoiceRange.second; ++J) { @@ -1409,17 +1486,26 @@ namespace { PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree, PrunedTree, *J, UseCycleCheck); - size_t EffSize = 0; - for (DenseSet::iterator S = PrunedTree.begin(), - E = PrunedTree.end(); S != E; ++S) - EffSize += getDepthFactor(S->first); + int EffSize = 0; + if (VTTI) { + for (DenseSet::iterator S = PrunedTree.begin(), + E = PrunedTree.end(); S != E; ++S) { + if (getDepthFactor(S->first)) + EffSize += CandidatePairCostSavings.find(*S)->second; + } + } else { + for (DenseSet::iterator S = PrunedTree.begin(), + E = PrunedTree.end(); S != E; ++S) + EffSize += (int) getDepthFactor(S->first); + } DEBUG(if (DebugPairSelection) dbgs() << "BBV: found pruned Tree for pair {" << *J->first << " <-> " << 
*J->second << "} of depth " << MaxDepth << " and size " << PrunedTree.size() << " (effective size: " << EffSize << ")\n"); - if (MaxDepth >= Config.ReqChainDepth && EffSize > BestEffSize) { + if (MaxDepth >= Config.ReqChainDepth && + EffSize > 0 && EffSize > BestEffSize) { BestMaxDepth = MaxDepth; BestEffSize = EffSize; BestTree = PrunedTree; @@ -1431,6 +1517,7 @@ namespace { // that will be fused into vector instructions. void BBVectorize::choosePairs( std::multimap &CandidatePairs, + DenseMap &CandidatePairCostSavings, std::vector &PairableInsts, std::multimap &ConnectedPairs, DenseSet &PairableInstUsers, @@ -1447,9 +1534,11 @@ namespace { VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I); // The best pair to choose and its tree: - size_t BestMaxDepth = 0, BestEffSize = 0; + size_t BestMaxDepth = 0; + int BestEffSize = 0; DenseSet BestTree; - findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, + findBestTreeFor(CandidatePairs, CandidatePairCostSavings, + PairableInsts, ConnectedPairs, PairableInstUsers, PairableInstUserMap, ChosenPairs, BestTree, BestMaxDepth, BestEffSize, ChoiceRange, UseCycleCheck); @@ -1505,12 +1594,13 @@ namespace { Instruction *I, Instruction *J, unsigned o, bool FlipMemInputs) { Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment; + unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; int64_t OffsetInElmts; // Note: the analysis might fail here, that is why FlipMemInputs has // been precomputed (OffsetInElmts must be unused here). (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, + IAddressSpace, JAddressSpace, OffsetInElmts); // The pointer value is taken to be the one with the lowest offset. 
@@ -2212,9 +2302,10 @@ namespace { continue; Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment; + unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; int64_t OffsetInElmts; if (!getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, + IAddressSpace, JAddressSpace, OffsetInElmts) || abs64(OffsetInElmts) != 1) llvm_unreachable("Pre-fusion pointer analysis failed"); diff --git a/test/Transforms/BBVectorize/loop1.ll b/test/Transforms/BBVectorize/loop1.ll index bebc91ad91a..a30af5091fe 100644 --- a/test/Transforms/BBVectorize/loop1.ll +++ b/test/Transforms/BBVectorize/loop1.ll @@ -1,8 +1,11 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s ; RUN: opt < %s -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL ; The second check covers the use of alias analysis (with loop unrolling). +; Both checks are run with and without target information. 
define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable { entry: diff --git a/test/Transforms/BBVectorize/simple.ll b/test/Transforms/BBVectorize/simple.ll index 88eb9c90f7e..702796b57d0 100644 --- a/test/Transforms/BBVectorize/simple.ll +++ b/test/Transforms/BBVectorize/simple.ll @@ -1,5 +1,6 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-TI ; Basic depth-3 chain define double @test1(double %A1, double %A2, double %B1, double %B2) { @@ -23,6 +24,9 @@ define double @test1(double %A1, double %A2, double %B1, double %B2) { ; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 ret double %R ; CHECK: ret double %R +; CHECK-TI: @test1 +; CHECK-TI: fsub <2 x double> +; CHECK-TI: ret double } ; Basic depth-3 chain (last pair permuted) @@ -146,6 +150,9 @@ define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) { ; CHECK: %R = mul <8 x i8> %Q1.v.r1, %Q1.v.r2 ret <8 x i8> %R ; CHECK: ret <8 x i8> %R +; CHECK-TI: @test6 +; CHECK-TI-NOT: sub <16 x i8> +; CHECK-TI: ret <8 x i8> }