#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
return Opcode;
}
+/// Attaches to \p I the non-debug-location metadata shared by the bundle \p VL.
+///
+/// The metadata found on VL[0] is the starting point. For each kind, the node
+/// is merged with the node of the same kind on every further member of \p VL:
+/// TBAA nodes through MDNode::getMostGenericTBAA, fpmath nodes through
+/// MDNode::getMostGenericFPMath. Any other kind is reset to null as soon as a
+/// second member is examined, so unknown kinds only survive when \p VL holds a
+/// single element. The merged node (possibly null) is then set on \p I.
+///
+/// Every element of \p VL is converted with cast<Instruction>, so \p VL must
+/// be non-empty and contain only instructions.
+///
+/// \returns \p I after propagating metadata from \p VL.
+static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
+  Instruction *I0 = cast<Instruction>(VL[0]);
+  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
+  I0->getAllMetadataOtherThanDebugLoc(Metadata);
+
+  for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
+    unsigned Kind = Metadata[i].first;
+    MDNode *MD = Metadata[i].second;
+
+    // The inner index and instruction use their own names so that they do not
+    // shadow the outer index or the parameter I that is written to below.
+    // The walk over the bundle stops once the merged node has become null.
+    for (int j = 1, e = VL.size(); MD && j != e; j++) {
+      Instruction *Member = cast<Instruction>(VL[j]);
+      MDNode *MemberMD = Member->getMetadata(Kind);
+
+      switch (Kind) {
+      default:
+        MD = 0; // Remove unknown metadata
+        break;
+      case LLVMContext::MD_tbaa:
+        MD = MDNode::getMostGenericTBAA(MD, MemberMD);
+        break;
+      case LLVMContext::MD_fpmath:
+        MD = MDNode::getMostGenericFPMath(MD, MemberMD);
+        break;
+      }
+    }
+    I->setMetadata(Kind, MD);
+  }
+  return I;
+}
+
/// \returns The type that all of the values in \p VL have or null if there
/// are different types.
static Type* getSameType(ArrayRef<Value *> VL) {
return true;
}
-static bool all_equal(SmallVectorImpl<Value *> &V) {
- Value *First = V[0];
- for (int i = 1, e = V.size(); i != e; ++i)
- if (V[i] != First)
- return false;
- return true;
-}
-
static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right) {
Right.push_back(V1);
}
- bool LeftBroadcast = all_equal(Left);
- bool RightBroadcast = all_equal(Right);
+ bool LeftBroadcast = isSplat(Left);
+ bool RightBroadcast = isSplat(Right);
// Don't reorder if the operands were good to begin with.
if (!(LeftBroadcast || RightBroadcast) &&
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even if the tree height is tiny.
- bool isFullyVectorizableTinyTree();
+ bool isFullyVectorizableTinyTree();
struct TreeEntry {
TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0),
/// Holds all of the instructions that we gathered.
SetVector<Instruction *> GatherSeq;
+ /// A list of blocks that we are going to CSE.
+ SetVector<BasicBlock *> CSEBlocks;
/// Numbers instructions in different blocks.
DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers;
UE = Scalar->use_end(); User != UE; ++User) {
DEBUG(dbgs() << "SLP: Checking user:" << **User << ".\n");
- bool Gathered = MustGather.count(*User);
-
// Skip in-tree scalars that become vectors.
- if (ScalarToTreeEntry.count(*User) && !Gathered) {
+ if (ScalarToTreeEntry.count(*User)) {
DEBUG(dbgs() << "SLP: \tInternal user will be removed:" <<
**User << ".\n");
int Idx = ScalarToTreeEntry[*User]; (void) Idx;
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
newTreeEntry(VL, false);
- DEBUG(dbgs() << "SLP: Non consecutive store.\n");
+ DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
} else {
- ScalarCost = VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(Opcode, ScalarTy);
- VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
+ // Certain instructions can be cheaper to vectorize if they have a
+ // constant second vector operand.
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+
+ // If all operands are exactly the same ConstantInt then set the
+ // operand kind to OK_UniformConstantValue.
+ // If instead not all operands are constants, then set the operand kind
+ // to OK_AnyValue. If all operands are constants but not the same,
+ // then set the operand kind to OK_NonUniformConstantValue.
+ ConstantInt *CInt = NULL;
+ for (unsigned i = 0; i < VL.size(); ++i) {
+ const Instruction *I = cast<Instruction>(VL[i]);
+ if (!isa<ConstantInt>(I->getOperand(1))) {
+ Op2VK = TargetTransformInfo::OK_AnyValue;
+ break;
+ }
+ if (i == 0) {
+ CInt = cast<ConstantInt>(I->getOperand(1));
+ continue;
+ }
+ if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+ CInt != cast<ConstantInt>(I->getOperand(1)))
+ Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+ }
+
+ ScalarCost =
+ VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK);
+ VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK);
}
return VecCost - ScalarCost;
}
// Cost of wide load - cost of scalar loads.
int ScalarLdCost = VecTy->getNumElements() *
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
- int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+ int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
return VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
int ScalarStCost = VecTy->getNumElements() *
TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
- int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
+ int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
return VecStCost - ScalarStCost;
}
default:
Cost += C;
}
+ SmallSet<Value *, 16> ExtractCostCalculated;
int ExtractCost = 0;
for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
I != E; ++I) {
+ // We only add extract cost once for the same scalar.
+ if (!ExtractCostCalculated.insert(I->Scalar))
+ continue;
VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
I->Lane);
}
-
DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
return Cost + ExtractCost;
}
Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
GatherSeq.insert(Insrt);
+ CSEBlocks.insert(Insrt->getParent());
// Add to our 'need-to-extract' list.
if (ScalarToTreeEntry.count(VL[i])) {
BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
E->VectorizedValue = V;
+
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return propagateMetadata(I, E->Scalars);
+
return V;
}
case Instruction::Load: {
LI = Builder.CreateLoad(VecPtr);
LI->setAlignment(Alignment);
E->VectorizedValue = LI;
- return LI;
+ return propagateMetadata(LI, E->Scalars);
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(VL0);
StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
S->setAlignment(Alignment);
E->VectorizedValue = S;
- return S;
+ return propagateMetadata(S, E->Scalars);
}
default:
llvm_unreachable("unknown inst");
if (PHINode *PN = dyn_cast<PHINode>(Vec)) {
Builder.SetInsertPoint(PN->getParent()->getFirstInsertionPt());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(PN->getParent());
User->replaceUsesOfWith(Scalar, Ex);
} else if (isa<Instruction>(Vec)){
if (PHINode *PH = dyn_cast<PHINode>(User)) {
if (PH->getIncomingValue(i) == Scalar) {
Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(PH->getIncomingBlock(i));
PH->setOperand(i, Ex);
}
}
} else {
Builder.SetInsertPoint(cast<Instruction>(User));
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(cast<Instruction>(User)->getParent());
User->replaceUsesOfWith(Scalar, Ex);
}
} else {
Builder.SetInsertPoint(F->getEntryBlock().begin());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(&F->getEntryBlock());
User->replaceUsesOfWith(Scalar, Ex);
}
for (Value::use_iterator User = Scalar->use_begin(),
UE = Scalar->use_end(); User != UE; ++User) {
DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n");
- assert(!MustGather.count(*User) &&
- "Replacing gathered value with undef");
assert((ScalarToTreeEntry.count(*User) ||
// It is legal to replace the reduction users by undef.
return VectorizableTree[0].VectorizedValue;
}
+/// Comparison functor over basic blocks backed by a dominator tree: the first
+/// block is ordered before the second exactly when it properly dominates it.
+/// Used as the comparator for the std::stable_sort of the CSE work list in
+/// optimizeGatherSequence.
+///
+/// NOTE(review): dominance is only a partial order, and two blocks that do not
+/// dominate each other compare as equivalent here. Confirm that this satisfies
+/// the strict-weak-ordering requirement std::stable_sort places on its
+/// comparator.
+class DTCmp {
+  const DominatorTree *DomTree;
+
+public:
+  DTCmp(const DominatorTree *DT) : DomTree(DT) {}
+  bool operator()(const BasicBlock *LHS, const BasicBlock *RHS) const {
+    const bool LHSIsAbove = DomTree->properlyDominates(LHS, RHS);
+    return LHSIsAbove;
+  }
+};
+
void BoUpSLP::optimizeGatherSequence() {
DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
<< " gather sequences instructions.\n");
Insert->moveBefore(PreHeader->getTerminator());
}
+ // Sort blocks by domination. This ensures we visit a block after all blocks
+ // dominating it are visited.
+ SmallVector<BasicBlock *, 8> CSEWorkList(CSEBlocks.begin(), CSEBlocks.end());
+ std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), DTCmp(DT));
+
// Perform O(N^2) search over the gather sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
// instructions into different buckets based on the insert lane.
- SmallPtrSet<Instruction*, 16> Visited;
- SmallVector<Instruction*, 16> ToRemove;
- ReversePostOrderTraversal<Function*> RPOT(F);
- for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(),
- E = RPOT.end(); I != E; ++I) {
+ SmallVector<Instruction *, 16> Visited;
+ for (SmallVectorImpl<BasicBlock *>::iterator I = CSEWorkList.begin(),
+ E = CSEWorkList.end();
+ I != E; ++I) {
+ assert((I == CSEWorkList.begin() || !DT->dominates(*I, *llvm::prior(I))) &&
+ "Worklist not sorted properly!");
BasicBlock *BB = *I;
- // For all instructions in the function:
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- Instruction *In = it;
- if ((!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) ||
- !GatherSeq.count(In))
+ // For all instructions in blocks containing gather sequences:
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
+ Instruction *In = it++;
+ if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
continue;
// Check if we can replace this instruction with any of the
// visited instructions.
- for (SmallPtrSet<Instruction*, 16>::iterator v = Visited.begin(),
- ve = Visited.end(); v != ve; ++v) {
+ for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
+ ve = Visited.end();
+ v != ve; ++v) {
if (In->isIdenticalTo(*v) &&
DT->dominates((*v)->getParent(), In->getParent())) {
In->replaceAllUsesWith(*v);
- ToRemove.push_back(In);
+ In->eraseFromParent();
In = 0;
break;
}
}
- if (In)
- Visited.insert(In);
+ if (In) {
+ assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
+ Visited.push_back(In);
+ }
}
}
-
- // Erase all of the instructions that we RAUWed.
- for (SmallVectorImpl<Instruction *>::iterator v = ToRemove.begin(),
- ve = ToRemove.end(); v != ve; ++v) {
- assert((*v)->getNumUses() == 0 && "Can't remove instructions with uses");
- (*v)->eraseFromParent();
- }
+ CSEBlocks.clear();
+ GatherSeq.clear();
}
/// The SLPVectorizer Pass.
DominatorTree *DT;
virtual bool runOnFunction(Function &F) {
+ if (skipOptnoneFunction(F))
+ return false;
+
SE = &getAnalysis<ScalarEvolution>();
DL = getAnalysisIfAvailable<DataLayout>();
TTI = &getAnalysis<TargetTransformInfo>();
AA = &getAnalysis<AliasAnalysis>();
LI = &getAnalysis<LoopInfo>();
- DT = &getAnalysis<DominatorTree>();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
StoreRefs.clear();
bool Changed = false;
DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
- // Use the bollom up slp vectorizer to construct chains that start with
+ // Use the bottom up slp vectorizer to construct chains that start with
// the store instructions.
BoUpSLP R(&F, SE, DL, TTI, AA, LI, DT);
AU.addRequired<AliasAnalysis>();
AU.addRequired<TargetTransformInfo>();
AU.addRequired<LoopInfo>();
- AU.addRequired<DominatorTree>();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<LoopInfo>();
- AU.addPreserved<DominatorTree>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
AU.setPreservesCFG();
}
StoreListMap StoreRefs;
};
+/// \brief Detect whether any value in a slice of \p VL has been replaced or
+/// deleted since the handles in \p VH were taken.
+///
+/// Vectorization of part of the VL array may cause later values in the VL array
+/// to become invalid. We track when this has happened in the WeakVH array; the
+/// callers in this file build \p VH from the same range as \p VL, so the two
+/// are compared index for index.
+///
+/// \param VL         The values as originally collected (passed by value, as
+///                   ArrayRef is a cheap non-owning view).
+/// \param VH         Weak handles parallel to \p VL.
+/// \param SliceBegin Index of the first element to check.
+/// \param SliceSize  Number of consecutive elements to check.
+/// \returns true if at least one handle in the slice no longer equals the
+/// corresponding entry of \p VL, i.e. the slice is stale and must be skipped;
+/// false if every entry in the slice still matches.
+static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,
+                               SmallVectorImpl<WeakVH> &VH,
+                               unsigned SliceBegin,
+                               unsigned SliceSize) {
+  for (unsigned i = SliceBegin, e = SliceBegin + SliceSize; i < e; ++i)
+    if (VH[i] != VL[i])
+      return true;
+
+  return false;
+}
+
bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
int CostThreshold, BoUpSLP &R) {
unsigned ChainLen = Chain.size();
if (!isPowerOf2_32(Sz) || VF < 2)
return false;
+ // Keep track of values that were deleted by vectorizing in the loop below.
+ SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
+
bool Changed = false;
// Look for profitable vectorizable trees at all offsets, starting at zero.
for (unsigned i = 0, e = ChainLen; i < e; ++i) {
if (i + VF > e)
break;
+
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
+ continue;
+
DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
<< "\n");
ArrayRef<Value *> Operands = Chain.slice(i, VF);
}
}
- return Changed;
+ return Changed;
}
bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
return false;
unsigned Opcode0 = I0->getOpcode();
-
+
Type *Ty0 = I0->getType();
unsigned Sz = DL->getTypeSizeInBits(Ty0);
unsigned VF = MinVecRegSize / Sz;
}
bool Changed = false;
-
+
+ // Keep track of values that were deleted by vectorizing in the loop below.
+ SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
+
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
unsigned OpsWidth = 0;
-
- if (i + VF > e)
+
+ if (i + VF > e)
OpsWidth = e - i;
else
OpsWidth = VF;
if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
break;
- DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " << "\n");
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
+ continue;
+
+ DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ << "\n");
ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
-
+
R.buildTree(Ops);
int Cost = R.getTreeCost();
-
+
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Vectorizing pair at cost:" << Cost << ".\n");
R.vectorizeTree();
-
+
// Move to the next bundle.
i += VF - 1;
Changed = true;
}
}
-
- return Changed;
+
+ return Changed;
}
bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
break;
}
- // Start over at the next instruction of a differnt type (or the end).
+ // Start over at the next instruction of a different type (or the end).
IncIt = SameTypeIt;
}
}