// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
-#define SV_NAME "slp-vectorizer"
-#define DEBUG_TYPE "SLP"
-
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/VectorUtils.h"
#include <algorithm>
#include <map>
using namespace llvm;
+#define SV_NAME "slp-vectorizer"
+#define DEBUG_TYPE "SLP"
+
static cl::opt<int>
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
cl::desc("Only vectorize if you gain more than this "
BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {}
- BlockNumbering() : BB(0), Valid(false) {}
+ BlockNumbering() : BB(nullptr), Valid(false) {}
void numberInstructions() {
unsigned Loc = 0;
static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
- return 0;
+ return nullptr; // VL[0] is not an instruction, so there is no block to share.
BasicBlock *BB = I0->getParent();
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
if (!I)
- return 0;
+ return nullptr; // A non-instruction value has no parent block.
if (BB != I->getParent())
- return 0;
+ return nullptr; // VL[i] lives in a different block than VL[0].
}
return BB;
}
return Opcode;
}
+/// Attaches to \p I the metadata that the bundle \p VL agrees on and
+/// \returns \p I.
+///
+/// The candidate set is whatever getAllMetadataOtherThanDebugLoc reports for
+/// VL[0]. Each candidate is then merged with the same kind on every other
+/// member of \p VL: TBAA and fpmath nodes go through their "most generic"
+/// helpers, and any other kind is cleared as soon as a second member is
+/// examined. The merged node (possibly null) is set on \p I.
+static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
+  SmallVector<std::pair<unsigned, MDNode *>, 4> Candidates;
+  cast<Instruction>(VL[0])->getAllMetadataOtherThanDebugLoc(Candidates);
+
+  const unsigned NumScalars = VL.size();
+  for (const std::pair<unsigned, MDNode *> &KindAndNode : Candidates) {
+    const unsigned Kind = KindAndNode.first;
+    MDNode *Merged = KindAndNode.second;
+
+    // Fold in the node from every remaining bundle member; stop early once
+    // the merge result is null because nothing is left to agree on.
+    for (unsigned Lane = 1; Merged && Lane != NumScalars; ++Lane) {
+      MDNode *Other = cast<Instruction>(VL[Lane])->getMetadata(Kind);
+
+      if (Kind == LLVMContext::MD_tbaa)
+        Merged = MDNode::getMostGenericTBAA(Merged, Other);
+      else if (Kind == LLVMContext::MD_fpmath)
+        Merged = MDNode::getMostGenericFPMath(Merged, Other);
+      else
+        Merged = nullptr; // Remove unknown metadata
+    }
+    I->setMetadata(Kind, Merged);
+  }
+  return I;
+}
+
/// \returns The type that all of the values in \p VL have or null if there
/// are different types.
static Type* getSameType(ArrayRef<Value *> VL) {
Type *Ty = VL[0]->getType();
for (int i = 1, e = VL.size(); i < e; i++)
if (VL[i]->getType() != Ty)
- return 0;
+ return nullptr; // VL[i] disagrees with the type of VL[0].
return Ty;
}
return true;
}
+/// Distributes operand 0 and operand 1 of every instruction in \p VL over
+/// \p Left and \p Right, swapping the two operands of a lane where that is
+/// expected to help vectorizing each side.
+///
+/// Every element of \p VL must be an instruction with at least two operands
+/// (the callers in this file pass commutative binary operators). \p Left and
+/// \p Right must be empty on entry; on return each holds exactly one value
+/// per element of \p VL, in lane order.
+///
+/// Per lane, operands are ordered by opcode unless that would break up a value
+/// repeated from the previous lane (a potential broadcast). Afterwards, if
+/// neither resulting side is a splat and at least one of the original sides
+/// consisted of instructions with one and the same opcode in every lane, the
+/// original operand order is restored.
+static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+                                           SmallVectorImpl<Value *> &Left,
+                                           SmallVectorImpl<Value *> &Right) {
+  // The heuristics below look at Left[i-1]/Right[i-1] for lane i, which only
+  // lines up when the output vectors start out empty.
+  assert(Left.empty() && Right.empty() && "Expected empty output vectors");
+
+  SmallVector<Value *, 16> OrigLeft, OrigRight;
+
+  bool AllSameOpcodeLeft = true;
+  bool AllSameOpcodeRight = true;
+  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+    Instruction *I = cast<Instruction>(VL[i]);
+    Value *V0 = I->getOperand(0);
+    Value *V1 = I->getOperand(1);
+
+    OrigLeft.push_back(V0);
+    OrigRight.push_back(V1);
+
+    Instruction *I0 = dyn_cast<Instruction>(V0);
+    Instruction *I1 = dyn_cast<Instruction>(V1);
+
+    // Check whether all operands on one side have the same opcode. In this case
+    // we want to preserve the original order and not make things worse by
+    // reordering.
+    // The flags accumulate across lanes: once a side has seen a
+    // non-instruction operand or an opcode change it stays false. (Assigning
+    // the flag afresh in every iteration would only compare the last two
+    // lanes.)
+    AllSameOpcodeLeft = AllSameOpcodeLeft && I0;
+    AllSameOpcodeRight = AllSameOpcodeRight && I1;
+
+    if (i && AllSameOpcodeLeft) {
+      if (Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i - 1])) {
+        if (P0->getOpcode() != I0->getOpcode())
+          AllSameOpcodeLeft = false;
+      } else
+        AllSameOpcodeLeft = false;
+    }
+    if (i && AllSameOpcodeRight) {
+      if (Instruction *P1 = dyn_cast<Instruction>(OrigRight[i - 1])) {
+        if (P1->getOpcode() != I1->getOpcode())
+          AllSameOpcodeRight = false;
+      } else
+        AllSameOpcodeRight = false;
+    }
+
+    // Sort two opcodes. In the code below we try to preserve the ability to use
+    // broadcast of values instead of individual inserts.
+    // vl1 = load
+    // vl2 = phi
+    // vr1 = load
+    // vr2 = vr1
+    //     = vl1 x vr1
+    //     = vl2 x vr2
+    // If we just sorted according to opcode we would leave the first line
+    // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
+    //     = vl1 x vr1
+    //     = vr2 x vl2
+    // Because vr2 and vr1 are from the same load we lose the opportunity of a
+    // broadcast for the packed right side in the backend: we have [vr1, vl2]
+    // instead of [vr1, vr2=vr1].
+    if (I0 && I1) {
+      const unsigned Opcode0 = I0->getOpcode();
+      const unsigned Opcode1 = I1->getOpcode();
+
+      bool Swap = false;
+      if (Opcode0 > Opcode1) {
+        // Put the smaller opcode on the left, but try not to destroy a
+        // broadcast on the right for no apparent benefit.
+        Swap = !i || Right[i - 1] != I1;
+      } else if (Opcode0 == Opcode1) {
+        // Equal opcodes: swap only if that continues a value repeated from
+        // the previous lane, i.e. try to preserve broadcasts.
+        Swap = i && (Right[i - 1] == I0 || Left[i - 1] == I1);
+      }
+
+      Left.push_back(Swap ? I1 : I0);
+      Right.push_back(Swap ? I0 : I1);
+      continue;
+    }
+    // One opcode, put the instruction on the right.
+    if (I0) {
+      Left.push_back(V1);
+      Right.push_back(I0);
+      continue;
+    }
+    Left.push_back(V0);
+    Right.push_back(V1);
+  }
+
+  bool LeftBroadcast = isSplat(Left);
+  bool RightBroadcast = isSplat(Right);
+
+  // Don't reorder if the operands were good to begin with.
+  if (!(LeftBroadcast || RightBroadcast) &&
+      (AllSameOpcodeRight || AllSameOpcodeLeft)) {
+    Left = OrigLeft;
+    Right = OrigRight;
+  }
+}
+
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
typedef SmallPtrSet<Value *, 16> ValueSet;
typedef SmallVector<StoreInst *, 8> StoreList;
- BoUpSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl,
+ BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl,
TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li,
DominatorTree *Dt) :
F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), DT(Dt),
/// A negative number means that this is profitable.
int getTreeCost();
- /// Construct a vectorizable tree that starts at \p Roots and is possibly
- /// used by a reduction of \p RdxOps.
- void buildTree(ArrayRef<Value *> Roots, ValueSet *RdxOps = 0);
+ /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
+ /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
+ void buildTree(ArrayRef<Value *> Roots,
+ ArrayRef<Value *> UserIgnoreLst = None);
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
- RdxOps = 0;
VectorizableTree.clear();
ScalarToTreeEntry.clear();
MustGather.clear();
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
+ /// \returns whether the VectorizableTree is fully vectorizable and will
+ /// be beneficial even if the tree height is tiny.
+ bool isFullyVectorizableTinyTree();
+
struct TreeEntry {
- TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0),
+ TreeEntry() : Scalars(), VectorizedValue(nullptr), LastScalarIndex(0),
NeedToGather(0) {}
/// \returns true if the scalars in VL are equal to this entry.
/// Holds all of the instructions that we gathered.
SetVector<Instruction *> GatherSeq;
+ /// A list of blocks that we are going to CSE.
+ SetVector<BasicBlock *> CSEBlocks;
/// Numbers instructions in different blocks.
DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers;
- /// Reduction operators.
- ValueSet *RdxOps;
+ /// List of users to ignore during scheduling and that don't need extracting.
+ ArrayRef<Value *> UserIgnoreList;
// Analysis and block reference.
Function *F;
ScalarEvolution *SE;
- DataLayout *DL;
+ const DataLayout *DL;
TargetTransformInfo *TTI;
AliasAnalysis *AA;
LoopInfo *LI;
IRBuilder<> Builder;
};
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) {
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+ ArrayRef<Value *> UserIgnoreLst) {
deleteTree();
- RdxOps = Rdx;
+ UserIgnoreList = UserIgnoreLst;
if (!getSameType(Roots))
return;
buildTree_rec(Roots, 0);
if (Entry->NeedToGather)
continue;
- for (Value::use_iterator User = Scalar->use_begin(),
- UE = Scalar->use_end(); User != UE; ++User) {
- DEBUG(dbgs() << "SLP: Checking user:" << **User << ".\n");
-
- bool Gathered = MustGather.count(*User);
+ for (User *U : Scalar->users()) {
+ DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
// Skip in-tree scalars that become vectors.
- if (ScalarToTreeEntry.count(*User) && !Gathered) {
+ if (ScalarToTreeEntry.count(U)) {
DEBUG(dbgs() << "SLP: \tInternal user will be removed:" <<
- **User << ".\n");
- int Idx = ScalarToTreeEntry[*User]; (void) Idx;
+ *U << ".\n");
+ int Idx = ScalarToTreeEntry[U]; (void) Idx;
assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
continue;
}
- Instruction *UserInst = dyn_cast<Instruction>(*User);
+ Instruction *UserInst = dyn_cast<Instruction>(U);
if (!UserInst)
continue;
- // Ignore uses that are part of the reduction.
- if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end())
+ // Ignore users in the user ignore list.
+ if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
+ UserIgnoreList.end())
continue;
- DEBUG(dbgs() << "SLP: Need to extract:" << **User << " from lane " <<
+ DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
Lane << " from " << *Scalar << ".\n");
- ExternalUses.push_back(ExternalUser(Scalar, *User, Lane));
+ ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
}
}
}
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
Instruction *Scalar = cast<Instruction>(VL[i]);
DEBUG(dbgs() << "SLP: Checking users of " << *Scalar << ". \n");
- for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end();
- U != UE; ++U) {
- DEBUG(dbgs() << "SLP: \tUser " << **U << ". \n");
- Instruction *User = dyn_cast<Instruction>(*U);
- if (!User) {
+ for (User *U : Scalar->users()) {
+ DEBUG(dbgs() << "SLP: \tUser " << *U << ". \n");
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI) {
DEBUG(dbgs() << "SLP: Gathering due unknown user. \n");
newTreeEntry(VL, false);
return;
}
// We don't care if the user is in a different basic block.
- BasicBlock *UserBlock = User->getParent();
+ BasicBlock *UserBlock = UI->getParent();
if (UserBlock != BB) {
DEBUG(dbgs() << "SLP: User from a different basic block "
- << *User << ". \n");
+ << *UI << ". \n");
continue;
}
// If this is a PHINode within this basic block then we can place the
// extract wherever we want.
- if (isa<PHINode>(*User)) {
- DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *User << ". \n");
+ if (isa<PHINode>(*UI)) {
+ DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *UI << ". \n");
continue;
}
// Check if this is a safe in-tree user.
- if (ScalarToTreeEntry.count(User)) {
- int Idx = ScalarToTreeEntry[User];
+ if (ScalarToTreeEntry.count(UI)) {
+ int Idx = ScalarToTreeEntry[UI];
int VecLocation = VectorizableTree[Idx].LastScalarIndex;
if (VecLocation <= MyLastIndex) {
DEBUG(dbgs() << "SLP: Gathering due to unschedulable vector. \n");
newTreeEntry(VL, false);
return;
}
- DEBUG(dbgs() << "SLP: In-tree user (" << *User << ") at #" <<
+ DEBUG(dbgs() << "SLP: In-tree user (" << *UI << ") at #" <<
VecLocation << " vector value (" << *Scalar << ") at #"
<< MyLastIndex << ".\n");
continue;
}
- // This user is part of the reduction.
- if (RdxOps && RdxOps->count(User))
+ // Ignore users in the user ignore list.
+ if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UI) !=
+ UserIgnoreList.end())
continue;
// Make sure that we can schedule this unknown user.
BlockNumbering &BN = BlocksNumbers[BB];
- int UserIndex = BN.getIndex(User);
+ int UserIndex = BN.getIndex(UI);
if (UserIndex < MyLastIndex) {
DEBUG(dbgs() << "SLP: Can't schedule extractelement for "
- << *User << ". \n");
+ << *UI << ". \n");
newTreeEntry(VL, false);
return;
}
// Check that instructions in this bundle don't reference other instructions.
// The runtime of this check is O(N * N-1 * uses(N)) and a typical N is 4.
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end();
- U != UE; ++U) {
+ for (User *U : VL[i]->users()) {
for (unsigned j = 0; j < e; ++j) {
- if (i != j && *U == VL[j]) {
- DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << **U << ". \n");
+ if (i != j && U == VL[j]) {
+ DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << *U << ". \n");
newTreeEntry(VL, false);
return;
}
// Check for terminator values (e.g. invoke).
for (unsigned j = 0; j < VL.size(); ++j)
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
- TerminatorInst *Term = dyn_cast<TerminatorInst>(cast<PHINode>(VL[j])->getIncomingValue(i));
+ TerminatorInst *Term = dyn_cast<TerminatorInst>(
+ cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
if (Term) {
DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
newTreeEntry(VL, false);
ValueList Operands;
// Prepare the operand vector.
for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i));
+ Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
+ PH->getIncomingBlock(i)));
buildTree_rec(Operands, Depth + 1);
}
}
case Instruction::Load: {
// Check if the loads are consecutive or of we need to swizzle them.
- for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
- if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
+ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
+ LoadInst *L = cast<LoadInst>(VL[i]);
+ if (!L->isSimple() || !isConsecutiveAccess(VL[i], VL[i + 1])) {
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Need to swizzle loads.\n");
return;
}
-
+ }
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of loads.\n");
return;
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
+ // Sort operands of the instructions so that each side is more likely to
+ // have the same opcode.
+ if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
+ ValueList Left, Right;
+ reorderInputsAccordingToOpcode(VL, Left, Right);
+ buildTree_rec(Left, Depth + 1);
+ buildTree_rec(Right, Depth + 1);
+ return;
+ }
+
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
newTreeEntry(VL, false);
- DEBUG(dbgs() << "SLP: Non consecutive store.\n");
+ DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
buildTree_rec(Operands, Depth + 1);
return;
}
+ case Instruction::Call: {
+ // Check if the calls are all to the same vectorizable intrinsic.
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(VL[0]);
+ Intrinsic::ID ID = II ? II->getIntrinsicID() : Intrinsic::not_intrinsic;
+
+ if (!isTriviallyVectorizable(ID)) {
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+ return;
+ }
+
+ Function *Int = II->getCalledFunction();
+
+ for (unsigned i = 1, e = VL.size(); i != e; ++i) {
+ IntrinsicInst *II2 = dyn_cast<IntrinsicInst>(VL[i]);
+ if (!II2 || II2->getCalledFunction() != Int) {
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: mismatched calls:" << *II << "!=" << *VL[i]
+ << "\n");
+ return;
+ }
+ }
+
+ newTreeEntry(VL, true);
+ for (unsigned i = 0, e = II->getNumArgOperands(); i != e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ IntrinsicInst *II2 = dyn_cast<IntrinsicInst>(VL[j]);
+ Operands.push_back(II2->getArgOperand(i));
+ }
+ buildTree_rec(Operands, Depth + 1);
+ }
+ return;
+ }
default:
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return 0;
}
case Instruction::ExtractElement: {
- if (CanReuseExtract(VL))
- return 0;
+ if (CanReuseExtract(VL)) {
+ int DeadCost = 0;
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+ if (E->hasOneUse())
+ // Take credit for instruction that will become dead.
+ DeadCost +=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
+ }
+ return -DeadCost;
+ }
return getGatherCost(VecTy);
}
case Instruction::ZExt:
TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
} else {
- ScalarCost = VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(Opcode, ScalarTy);
- VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
+ // Certain instructions can be cheaper to vectorize if they have a
+ // constant second vector operand.
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+
+ // If all operands are exactly the same ConstantInt then set the
+ // operand kind to OK_UniformConstantValue.
+ // If instead not all operands are constants, then set the operand kind
+ // to OK_AnyValue. If all operands are constants but not the same,
+ // then set the operand kind to OK_NonUniformConstantValue.
+ ConstantInt *CInt = nullptr;
+ for (unsigned i = 0; i < VL.size(); ++i) {
+ const Instruction *I = cast<Instruction>(VL[i]);
+ if (!isa<ConstantInt>(I->getOperand(1))) {
+ Op2VK = TargetTransformInfo::OK_AnyValue;
+ break;
+ }
+ if (i == 0) {
+ CInt = cast<ConstantInt>(I->getOperand(1));
+ continue;
+ }
+ if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+ CInt != cast<ConstantInt>(I->getOperand(1)))
+ Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+ }
+
+ ScalarCost =
+ VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK);
+ VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK);
}
return VecCost - ScalarCost;
}
// Cost of wide load - cost of scalar loads.
int ScalarLdCost = VecTy->getNumElements() *
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
- int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+ int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
return VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
int ScalarStCost = VecTy->getNumElements() *
TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
- int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
+ int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
return VecStCost - ScalarStCost;
}
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(VL0);
+ IntrinsicInst *II = cast<IntrinsicInst>(CI);
+ Intrinsic::ID ID = II->getIntrinsicID();
+
+ // Calculate the cost of the scalar and vector calls.
+ SmallVector<Type*, 4> ScalarTys, VecTys;
+ for (unsigned op = 0, opc = II->getNumArgOperands(); op!= opc; ++op) {
+ ScalarTys.push_back(CI->getArgOperand(op)->getType());
+ VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
+ VecTy->getNumElements()));
+ }
+
+ int ScalarCallCost = VecTy->getNumElements() *
+ TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
+
+ int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
+
+ DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
+ << " (" << VecCallCost << "-" << ScalarCallCost << ")"
+ << " for " << *II << "\n");
+
+ return VecCallCost - ScalarCallCost;
+ }
default:
llvm_unreachable("Unknown instruction");
}
}
+/// Decides whether a tree with exactly two entries can be vectorized without
+/// gathering its root. \returns true if the first entry does not need to be
+/// gathered and the second entry is either a splat or does not need to be
+/// gathered either; false for every other tree, including any tree whose
+/// entry count is not 2.
+bool BoUpSLP::isFullyVectorizableTinyTree() {
+  const size_t NumEntries = VectorizableTree.size();
+  DEBUG(dbgs() << "SLP: Check whether the tree with height " << NumEntries
+               << " is fully vectorizable .\n");
+
+  // Only two-entry trees are handled here.
+  if (NumEntries != 2)
+    return false;
+
+  const TreeEntry &Root = VectorizableTree[0];
+  const TreeEntry &Operand = VectorizableTree[1];
+
+  // Gathering the root would cost too much for a tree this small.
+  if (Root.NeedToGather)
+    return false;
+
+  // With a vectorizable root, accept a splat second entry (the splat-store
+  // case) or a second entry that is itself vectorizable.
+  return isSplat(Operand.Scalars) || !Operand.NeedToGather;
+}
+
int BoUpSLP::getTreeCost() {
int Cost = 0;
DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
VectorizableTree.size() << ".\n");
- // Don't vectorize tiny trees. Small load/store chains or consecutive stores
- // of constants will be vectoried in SelectionDAG in MergeConsecutiveStores.
- // The SelectionDAG vectorizer can only handle pairs (trees of height = 2).
- if (VectorizableTree.size() < 3) {
+ // We only vectorize tiny trees if it is fully vectorizable.
+ if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
if (!VectorizableTree.size()) {
assert(!ExternalUses.size() && "We should not have any external users");
}
Cost += C;
}
+ SmallSet<Value *, 16> ExtractCostCalculated;
int ExtractCost = 0;
for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
I != E; ++I) {
+ // We only add extract cost once for the same scalar.
+ if (!ExtractCostCalculated.insert(I->Scalar))
+ continue;
VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
I->Lane);
}
-
DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
return Cost + ExtractCost;
}
return LI->getPointerOperand();
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return SI->getPointerOperand();
- return 0;
+ return nullptr;
}
unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
if (!A.Ptr || !B.Ptr || AA->alias(A, B))
return I;
}
- return 0;
+ return nullptr;
}
int BoUpSLP::getLastIndex(ArrayRef<Value *> VL) {
Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
GatherSeq.insert(Insrt);
+ CSEBlocks.insert(Insrt->getParent());
// Add to our 'need-to-extract' list.
if (ScalarToTreeEntry.count(VL[i])) {
if (En->isSame(VL) && En->VectorizedValue)
return En->VectorizedValue;
}
- return 0;
+ return nullptr;
}
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
case Instruction::Or:
case Instruction::Xor: {
ValueList LHSVL, RHSVL;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
- }
+ if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
+ reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
+ else
+ for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+ LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+ RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ }
setInsertPointAfterBundle(E->Scalars);
BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
E->VectorizedValue = V;
+
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return propagateMetadata(I, E->Scalars);
+
return V;
}
case Instruction::Load: {
LI = Builder.CreateLoad(VecPtr);
LI->setAlignment(Alignment);
E->VectorizedValue = LI;
- return LI;
+ return propagateMetadata(LI, E->Scalars);
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(VL0);
StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
S->setAlignment(Alignment);
E->VectorizedValue = S;
- return S;
+ return propagateMetadata(S, E->Scalars);
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(VL0);
+
+ setInsertPointAfterBundle(E->Scalars);
+ std::vector<Value *> OpVecs;
+ for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
+ ValueList OpVL;
+ for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+ CallInst *CEI = cast<CallInst>(E->Scalars[i]);
+ OpVL.push_back(CEI->getArgOperand(j));
+ }
+
+ Value *OpVec = vectorizeTree(OpVL);
+ DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
+ OpVecs.push_back(OpVec);
+ }
+
+ Module *M = F->getParent();
+ IntrinsicInst *II = cast<IntrinsicInst>(CI);
+ Intrinsic::ID ID = II->getIntrinsicID();
+ Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
+ Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
+ Value *V = Builder.CreateCall(CF, OpVecs);
+ E->VectorizedValue = V;
+ return V;
}
default:
llvm_unreachable("unknown inst");
}
- return 0;
+ return nullptr;
}
Value *BoUpSLP::vectorizeTree() {
// Skip users that we already RAUW. This happens when one instruction
// has multiple uses of the same value.
- if (std::find(Scalar->use_begin(), Scalar->use_end(), User) ==
- Scalar->use_end())
+ if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
+ Scalar->user_end())
continue;
assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
Value *Lane = Builder.getInt32(it->Lane);
// Generate extracts for out-of-tree users.
// Find the insertion point for the extractelement lane.
- if (PHINode *PN = dyn_cast<PHINode>(Vec)) {
- Builder.SetInsertPoint(PN->getParent()->getFirstInsertionPt());
- Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- User->replaceUsesOfWith(Scalar, Ex);
- } else if (isa<Instruction>(Vec)){
+ if (isa<Instruction>(Vec)){
if (PHINode *PH = dyn_cast<PHINode>(User)) {
for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
if (PH->getIncomingValue(i) == Scalar) {
Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(PH->getIncomingBlock(i));
PH->setOperand(i, Ex);
}
}
} else {
Builder.SetInsertPoint(cast<Instruction>(User));
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(cast<Instruction>(User)->getParent());
User->replaceUsesOfWith(Scalar, Ex);
}
} else {
Builder.SetInsertPoint(F->getEntryBlock().begin());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(&F->getEntryBlock());
User->replaceUsesOfWith(Scalar, Ex);
}
Type *Ty = Scalar->getType();
if (!Ty->isVoidTy()) {
- for (Value::use_iterator User = Scalar->use_begin(),
- UE = Scalar->use_end(); User != UE; ++User) {
- DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n");
- assert(!MustGather.count(*User) &&
- "Replacing gathered value with undef");
-
- assert((ScalarToTreeEntry.count(*User) ||
- // It is legal to replace the reduction users by undef.
- (RdxOps && RdxOps->count(*User))) &&
+#ifndef NDEBUG
+ for (User *U : Scalar->users()) {
+ DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
+
+ assert((ScalarToTreeEntry.count(U) ||
+ // It is legal to replace users in the ignorelist by undef.
+ (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
+ UserIgnoreList.end())) &&
"Replacing out-of-tree value with undef");
}
+#endif
Value *Undef = UndefValue::get(Ty);
Scalar->replaceAllUsesWith(Undef);
}
Insert->moveBefore(PreHeader->getTerminator());
}
+ // Sort blocks by domination. This ensures we visit a block after all blocks
+ // dominating it are visited.
+ SmallVector<BasicBlock *, 8> CSEWorkList(CSEBlocks.begin(), CSEBlocks.end());
+ std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
+ [this](const BasicBlock *A, const BasicBlock *B) {
+ return DT->properlyDominates(A, B);
+ });
+
// Perform O(N^2) search over the gather sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
// instructions into different buckets based on the insert lane.
- SmallPtrSet<Instruction*, 16> Visited;
- SmallVector<Instruction*, 16> ToRemove;
- ReversePostOrderTraversal<Function*> RPOT(F);
- for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(),
- E = RPOT.end(); I != E; ++I) {
+ SmallVector<Instruction *, 16> Visited;
+ for (SmallVectorImpl<BasicBlock *>::iterator I = CSEWorkList.begin(),
+ E = CSEWorkList.end();
+ I != E; ++I) {
+ assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
+ "Worklist not sorted properly!");
BasicBlock *BB = *I;
- // For all instructions in the function:
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- Instruction *In = it;
- if ((!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) ||
- !GatherSeq.count(In))
+ // For all instructions in blocks containing gather sequences:
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
+ Instruction *In = it++;
+ if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
continue;
// Check if we can replace this instruction with any of the
// visited instructions.
- for (SmallPtrSet<Instruction*, 16>::iterator v = Visited.begin(),
- ve = Visited.end(); v != ve; ++v) {
+ for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
+ ve = Visited.end();
+ v != ve; ++v) {
if (In->isIdenticalTo(*v) &&
DT->dominates((*v)->getParent(), In->getParent())) {
In->replaceAllUsesWith(*v);
- ToRemove.push_back(In);
- In = 0;
+ In->eraseFromParent();
+ In = nullptr;
break;
}
}
- if (In)
- Visited.insert(In);
+ if (In) {
+ assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
+ Visited.push_back(In);
+ }
}
}
-
- // Erase all of the instructions that we RAUWed.
- for (SmallVectorImpl<Instruction *>::iterator v = ToRemove.begin(),
- ve = ToRemove.end(); v != ve; ++v) {
- assert((*v)->getNumUses() == 0 && "Can't remove instructions with uses");
- (*v)->eraseFromParent();
- }
+ CSEBlocks.clear();
+ GatherSeq.clear();
}
/// The SLPVectorizer Pass.
}
ScalarEvolution *SE;
- DataLayout *DL;
+ const DataLayout *DL;
TargetTransformInfo *TTI;
AliasAnalysis *AA;
LoopInfo *LI;
DominatorTree *DT;
- virtual bool runOnFunction(Function &F) {
+ bool runOnFunction(Function &F) override {
+ if (skipOptnoneFunction(F))
+ return false;
+
SE = &getAnalysis<ScalarEvolution>();
- DL = getAnalysisIfAvailable<DataLayout>();
+ DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+ DL = DLP ? &DLP->getDataLayout() : nullptr;
TTI = &getAnalysis<TargetTransformInfo>();
AA = &getAnalysis<AliasAnalysis>();
LI = &getAnalysis<LoopInfo>();
- DT = &getAnalysis<DominatorTree>();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
StoreRefs.clear();
bool Changed = false;
DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
- // Use the bollom up slp vectorizer to construct chains that start with
+ // Use the bottom up slp vectorizer to construct chains that start with
// he store instructions.
BoUpSLP R(&F, SE, DL, TTI, AA, LI, DT);
return Changed;
}
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
FunctionPass::getAnalysisUsage(AU);
AU.addRequired<ScalarEvolution>();
AU.addRequired<AliasAnalysis>();
AU.addRequired<TargetTransformInfo>();
AU.addRequired<LoopInfo>();
- AU.addRequired<DominatorTree>();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<LoopInfo>();
- AU.addPreserved<DominatorTree>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
AU.setPreservesCFG();
}
bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
/// \brief Try to vectorize a list of operands.
+ /// \param BuildVector A list of users to ignore for the purpose of
+ /// scheduling and that don't need extracting.
/// \returns true if a value was vectorized.
- bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R);
+ bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+ ArrayRef<Value *> BuildVector = None);
/// \brief Try to vectorize a chain that may start at the operands of \V;
bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
StoreListMap StoreRefs;
};
+/// \brief Check whether any Value in a slice of the VL array no longer matches
+/// the corresponding entry in the WeakVH array.
+///
+/// Vectorization of part of the VL array may cause later values in the VL array
+/// to become invalid. We track when this has happened in the WeakVH array.
+///
+/// \param VL The original list of values. Taken by value: ArrayRef is a
+/// lightweight view, so this also accepts temporaries.
+/// \param VH Handles tracking the same values, index for index with \p VL.
+/// \param SliceBegin Index of the first element of the slice to check.
+/// \param SliceSize Number of elements in the slice.
+/// \returns true if at least one element of the slice differs between \p VL
+/// and \p VH, false if every element in the slice still matches.
+static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,
+                               SmallVectorImpl<WeakVH> &VH,
+                               unsigned SliceBegin,
+                               unsigned SliceSize) {
+  // The slice must lie inside both arrays; indexing past either end would be
+  // an out-of-bounds read.
+  assert(SliceBegin + SliceSize <= VL.size() &&
+         SliceBegin + SliceSize <= VH.size() &&
+         "Slice exceeds the tracked value arrays");
+  for (unsigned i = SliceBegin, e = SliceBegin + SliceSize; i < e; ++i)
+    if (VH[i] != VL[i])
+      return true;
+
+  return false;
+}
+
bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
int CostThreshold, BoUpSLP &R) {
unsigned ChainLen = Chain.size();
if (!isPowerOf2_32(Sz) || VF < 2)
return false;
+ // Keep track of values that were deleted by vectorizing in the loop below.
+ SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
+
bool Changed = false;
// Look for profitable vectorizable trees at all offsets, starting at zero.
for (unsigned i = 0, e = ChainLen; i < e; ++i) {
if (i + VF > e)
break;
+
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
+ continue;
+
DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
<< "\n");
ArrayRef<Value *> Operands = Chain.slice(i, VF);
}
}
- return Changed;
+ return Changed;
}
bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
if (!SI)
continue;
+ // Don't touch volatile stores.
+ if (!SI->isSimple())
+ continue;
+
// Check that the pointer points to scalars.
Type *Ty = SI->getValueOperand()->getType();
if (Ty->isAggregateType() || Ty->isVectorTy())
return tryToVectorizeList(VL, R);
}
-bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
+bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+ ArrayRef<Value *> BuildVector) {
if (VL.size() < 2)
return false;
return false;
unsigned Opcode0 = I0->getOpcode();
-
+
Type *Ty0 = I0->getType();
unsigned Sz = DL->getTypeSizeInBits(Ty0);
unsigned VF = MinVecRegSize / Sz;
}
bool Changed = false;
-
+
+ // Keep track of values that were deleted by vectorizing in the loop below.
+ SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
+
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
unsigned OpsWidth = 0;
-
- if (i + VF > e)
+
+ if (i + VF > e)
OpsWidth = e - i;
else
OpsWidth = VF;
if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
break;
- DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " << "\n");
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
+ continue;
+
+ DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ << "\n");
ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
-
- R.buildTree(Ops);
+
+ ArrayRef<Value *> BuildVectorSlice;
+ if (!BuildVector.empty())
+ BuildVectorSlice = BuildVector.slice(i, OpsWidth);
+
+ R.buildTree(Ops, BuildVectorSlice);
int Cost = R.getTreeCost();
-
+
if (Cost < -SLPCostThreshold) {
- DEBUG(dbgs() << "SLP: Vectorizing pair at cost:" << Cost << ".\n");
- R.vectorizeTree();
-
+ DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ Value *VectorizedRoot = R.vectorizeTree();
+
+ // Reconstruct the build vector by extracting the vectorized root. This
+ // way we handle the case where some elements of the vector are undefined.
+ // (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
+ if (!BuildVectorSlice.empty()) {
+ Instruction *InsertAfter = cast<Instruction>(VectorizedRoot);
+ for (auto &V : BuildVectorSlice) {
+ InsertElementInst *IE = cast<InsertElementInst>(V);
+ IRBuilder<> Builder(++BasicBlock::iterator(InsertAfter));
+ Instruction *Extract = cast<Instruction>(
+ Builder.CreateExtractElement(VectorizedRoot, IE->getOperand(2)));
+ IE->setOperand(1, Extract);
+ IE->removeFromParent();
+ IE->insertAfter(Extract);
+ InsertAfter = IE;
+ }
+ }
// Move to the next bundle.
i += VF - 1;
Changed = true;
}
}
-
- return Changed;
+
+ return Changed;
}
bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
/// *p =
///
class HorizontalReduction {
- SmallPtrSet<Value *, 16> ReductionOps;
+ SmallVector<Value *, 16> ReductionOps;
SmallVector<Value *, 32> ReducedVals;
BinaryOperator *ReductionRoot;
public:
HorizontalReduction()
- : ReductionRoot(0), ReductionPHI(0), ReductionOpcode(0),
+ : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
/// \brief Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
- DataLayout *DL) {
+ const DataLayout *DL) {
assert((!Phi ||
std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
"Thi phi needs to use the binary operator");
// In such a case start looking for a tree rooted in the first '+'.
if (Phi) {
if (B->getOperand(0) == Phi) {
- Phi = 0;
+ Phi = nullptr;
B = dyn_cast<BinaryOperator>(B->getOperand(1));
} else if (B->getOperand(1) == Phi) {
- Phi = 0;
+ Phi = nullptr;
B = dyn_cast<BinaryOperator>(B->getOperand(0));
}
}
// We need to be able to reassociate the adds.
if (!TreeN->isAssociative())
return false;
- ReductionOps.insert(TreeN);
+ ReductionOps.push_back(TreeN);
}
// Retract.
Stack.pop_back();
if (NumReducedVals < ReduxWidth)
return false;
- Value *VectorizedTree = 0;
+ Value *VectorizedTree = nullptr;
IRBuilder<> Builder(ReductionRoot);
FastMathFlags Unsafe;
Unsafe.setUnsafeAlgebra();
for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth);
- V.buildTree(ValsToReduce, &ReductionOps);
+ V.buildTree(ValsToReduce, ReductionOps);
// Estimate cost.
int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
}
// Update users.
if (ReductionPHI) {
- assert(ReductionRoot != NULL && "Need a reduction operation");
+ assert(ReductionRoot && "Need a reduction operation");
ReductionRoot->setOperand(0, VectorizedTree);
ReductionRoot->setOperand(1, ReductionPHI);
} else
ReductionRoot->replaceAllUsesWith(VectorizedTree);
}
- return VectorizedTree != 0;
+ return VectorizedTree != nullptr;
}
private:
///
/// Returns true if it matches
///
-static bool findBuildVector(InsertElementInst *IE,
- SmallVectorImpl<Value *> &Ops) {
- if (!isa<UndefValue>(IE->getOperand(0)))
+static bool findBuildVector(InsertElementInst *FirstInsertElem,
+ SmallVectorImpl<Value *> &BuildVector,
+ SmallVectorImpl<Value *> &BuildVectorOpds) {
+ if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
return false;
+ InsertElementInst *IE = FirstInsertElem;
while (true) {
- Ops.push_back(IE->getOperand(1));
+ BuildVector.push_back(IE);
+ BuildVectorOpds.push_back(IE->getOperand(1));
if (IE->use_empty())
return false;
- InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->use_back());
+ InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
if (!NextUse)
return true;
return false;
}
+/// \brief Comparator used to sort values so that values of the same type end
+/// up next to each other. The order is by the address of each value's Type
+/// object, so it only groups equal types; it carries no other meaning.
+static bool PhiTypeSorterFunc(Value *V, Value *V2) {
+  Type *FirstTy = V->getType();
+  Type *SecondTy = V2->getType();
+  return FirstTy < SecondTy;
+}
+
bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
- SmallSet<Instruction *, 16> VisitedInstrs;
+ SmallSet<Value *, 16> VisitedInstrs;
+
+ bool HaveVectorizedPhiNodes = true;
+ while (HaveVectorizedPhiNodes) {
+ HaveVectorizedPhiNodes = false;
+
+ // Collect the incoming values from the PHIs.
+ Incoming.clear();
+ for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
+ ++instr) {
+ PHINode *P = dyn_cast<PHINode>(instr);
+ if (!P)
+ break;
- // Collect the incoming values from the PHIs.
- for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
- ++instr) {
- PHINode *P = dyn_cast<PHINode>(instr);
+ if (!VisitedInstrs.count(P))
+ Incoming.push_back(P);
+ }
- if (!P)
- break;
+ // Sort by type.
+ std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
- // We may go through BB multiple times so skip the one we have checked.
- if (!VisitedInstrs.insert(instr))
- continue;
+ // Try to vectorize elements based on their type.
+ for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
+ E = Incoming.end();
+ IncIt != E;) {
- // Stop constructing the list when you reach a different type.
- if (Incoming.size() && P->getType() != Incoming[0]->getType()) {
- if (tryToVectorizeList(Incoming, R)) {
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
+ // Look for the next elements with the same type.
+ SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
+ while (SameTypeIt != E &&
+ (*SameTypeIt)->getType() == (*IncIt)->getType()) {
+ VisitedInstrs.insert(*SameTypeIt);
+ ++SameTypeIt;
+ }
+
+ // Try to vectorize them.
+ unsigned NumElts = (SameTypeIt - IncIt);
+ DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
+ if (NumElts > 1 &&
+ tryToVectorizeList(ArrayRef<Value *>(IncIt, NumElts), R)) {
+ // Success: start over because instructions might have been changed.
+ HaveVectorizedPhiNodes = true;
Changed = true;
- instr = BB->begin();
- ie = BB->end();
+ break;
}
- Incoming.clear();
+ // Start over at the next instruction of a different type (or the end).
+ IncIt = SameTypeIt;
}
-
- Incoming.push_back(P);
}
- if (Incoming.size() > 1)
- Changed |= tryToVectorizeList(Incoming, R);
-
VisitedInstrs.clear();
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
Value *Rdx =
(P->getIncomingBlock(0) == BB
? (P->getIncomingValue(0))
- : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : 0));
+ : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
+ : nullptr));
// Check if this is a Binary Operator.
BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
if (!BI)
if (BinaryOperator *BinOp =
dyn_cast<BinaryOperator>(SI->getValueOperand())) {
HorizontalReduction HorRdx;
- if (((HorRdx.matchAssociativeReduction(0, BinOp, DL) &&
+ if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) &&
HorRdx.tryToReduce(R, TTI)) ||
tryToVectorize(BinOp, R))) {
Changed = true;
}
// Try to vectorize trees that start at insertelement instructions.
- if (InsertElementInst *IE = dyn_cast<InsertElementInst>(it)) {
- SmallVector<Value *, 8> Ops;
- if (!findBuildVector(IE, Ops))
+ if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
+ SmallVector<Value *, 16> BuildVector;
+ SmallVector<Value *, 16> BuildVectorOpds;
+ if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
continue;
- if (tryToVectorizeList(Ops, R)) {
+ // Vectorize starting with the build vector operands ignoring the
+ // BuildVector instructions for the purpose of scheduling and user
+ // extraction.
+ if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
Changed = true;
it = BB->begin();
e = BB->end();