//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/VectorUtils.h"
+#include "llvm/Analysis/VectorUtils.h"
#include <algorithm>
#include <map>
#include <memory>
cl::desc(
"Attempt to vectorize horizontal reductions feeding into a store"));
+static cl::opt<int>
+MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
+ cl::desc("Attempt to vectorize for this register size in bits"));
+
namespace {
+// FIXME: Set this via cl::opt to allow overriding.
static const unsigned MinVecRegSize = 128;
static const unsigned RecursionMaxDepth = 12;
+// Limit the number of alias checks. The limit is chosen so that it has no
+// negative effect on the LLVM benchmarks.
+static const unsigned AliasedCheckLimit = 10;
+
+// Another limit for the alias checks: The maximum distance between load/store
+// instructions where alias checks are done.
+// This limit is useful for very large basic blocks.
+static const unsigned MaxMemDepDistance = 160;
+
+/// \brief Predicate for the element types that the SLP vectorizer supports.
+///
+/// The most important thing to filter here is types that are invalid in LLVM
+/// vectors. We also filter target-specific types which have absolutely no
+/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
+/// avoids spending time checking the cost model and realizing that they will
+/// be inevitably scalarized.
+static bool isValidElementType(Type *Ty) {
+ return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
+ !Ty->isPPC_FP128Ty();
+}
+
/// \returns the parent basic block if all of the instructions in \p VL
/// are in the same block or null otherwise.
static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
return Opcode;
}
+/// Get the intersection (logical and) of all of the potential IR flags
+/// of each scalar operation (VL) that will be converted into a vector (I).
+/// Flag set: NSW, NUW, exact, and all of fast-math.
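+/// For example, if only some of the scalar additions carry 'nsw', the
+/// resulting vector addition must not carry 'nsw' either.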
+static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
+ if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
+ if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
+ // Intersection is initialized to the 0th scalar,
+ // so start counting from index '1'.
+ for (int i = 1, e = VL.size(); i < e; ++i) {
+ if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
+ Intersection->andIRFlags(Scalar);
+ }
+ VecOp->copyIRFlags(Intersection);
+ }
+ }
+}
+
/// \returns \p I after propagating metadata from \p VL.
static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
Instruction *I0 = cast<Instruction>(VL[0]);
MD = MDNode::getMostGenericTBAA(MD, IMD);
break;
case LLVMContext::MD_alias_scope:
+ MD = MDNode::getMostGenericAliasScope(MD, IMD);
+ break;
case LLVMContext::MD_noalias:
MD = MDNode::intersect(MD, IMD);
break;
return true;
}
-static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right) {
-
- SmallVector<Value *, 16> OrigLeft, OrigRight;
-
- bool AllSameOpcodeLeft = true;
- bool AllSameOpcodeRight = true;
- for (unsigned i = 0, e = VL.size(); i != e; ++i) {
- Instruction *I = cast<Instruction>(VL[i]);
- Value *V0 = I->getOperand(0);
- Value *V1 = I->getOperand(1);
-
- OrigLeft.push_back(V0);
- OrigRight.push_back(V1);
-
- Instruction *I0 = dyn_cast<Instruction>(V0);
- Instruction *I1 = dyn_cast<Instruction>(V1);
-
- // Check whether all operands on one side have the same opcode. In this case
- // we want to preserve the original order and not make things worse by
- // reordering.
- AllSameOpcodeLeft = I0;
- AllSameOpcodeRight = I1;
-
- if (i && AllSameOpcodeLeft) {
- if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) {
- if(P0->getOpcode() != I0->getOpcode())
- AllSameOpcodeLeft = false;
- } else
- AllSameOpcodeLeft = false;
- }
- if (i && AllSameOpcodeRight) {
- if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) {
- if(P1->getOpcode() != I1->getOpcode())
- AllSameOpcodeRight = false;
- } else
- AllSameOpcodeRight = false;
- }
+/// \returns True if the in-tree use also needs an extract. This refers to a
+/// possible scalar operand in a vectorized instruction.
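+/// For example, the pointer operand of a load stays scalar even after the
+/// load is vectorized, so an in-tree pointer value must still be extracted.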
+static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
+ TargetLibraryInfo *TLI) {
- // Sort two opcodes. In the code below we try to preserve the ability to use
- // broadcast of values instead of individual inserts.
- // vl1 = load
- // vl2 = phi
- // vr1 = load
- // vr2 = vr2
- // = vl1 x vr1
- // = vl2 x vr2
- // If we just sorted according to opcode we would leave the first line in
- // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
- // = vl1 x vr1
- // = vr2 x vl2
- // Because vr2 and vr1 are from the same load we loose the opportunity of a
- // broadcast for the packed right side in the backend: we have [vr1, vl2]
- // instead of [vr1, vr2=vr1].
- if (I0 && I1) {
- if(!i && I0->getOpcode() > I1->getOpcode()) {
- Left.push_back(I1);
- Right.push_back(I0);
- } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) {
- // Try not to destroy a broad cast for no apparent benefit.
- Left.push_back(I1);
- Right.push_back(I0);
- } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] == I0) {
- // Try preserve broadcasts.
- Left.push_back(I1);
- Right.push_back(I0);
- } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) {
- // Try preserve broadcasts.
- Left.push_back(I1);
- Right.push_back(I0);
- } else {
- Left.push_back(I0);
- Right.push_back(I1);
- }
- continue;
- }
- // One opcode, put the instruction on the right.
- if (I0) {
- Left.push_back(V1);
- Right.push_back(I0);
- continue;
+ unsigned Opcode = UserInst->getOpcode();
+ switch (Opcode) {
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(UserInst);
+ return (LI->getPointerOperand() == Scalar);
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(UserInst);
+ return (SI->getPointerOperand() == Scalar);
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(UserInst);
+ Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+ if (hasVectorInstrinsicScalarOpd(ID, 1)) {
+ return (CI->getArgOperand(1) == Scalar);
}
- Left.push_back(V0);
- Right.push_back(V1);
}
+ default:
+ return false;
+ }
+}
- bool LeftBroadcast = isSplat(Left);
- bool RightBroadcast = isSplat(Right);
+/// \returns the AA location that is being accessed by the instruction.
+static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return MemoryLocation::get(SI);
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return MemoryLocation::get(LI);
+ return MemoryLocation();
+}
- // Don't reorder if the operands where good to begin with.
- if (!(LeftBroadcast || RightBroadcast) &&
- (AllSameOpcodeRight || AllSameOpcodeLeft)) {
- Left = OrigLeft;
- Right = OrigRight;
- }
+/// \returns True if the instruction is not a volatile or atomic load/store.
+static bool isSimple(Instruction *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isSimple();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isSimple();
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+ return !MI->isVolatile();
+ return true;
}
/// Bottom Up SLP Vectorizer.
typedef SmallPtrSet<Value *, 16> ValueSet;
typedef SmallVector<StoreInst *, 8> StoreList;
- BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl,
- TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa,
- LoopInfo *Li, DominatorTree *Dt)
- : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0),
- F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
- Builder(Se->getContext()) {}
+ BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
+ TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
+ DominatorTree *Dt, AssumptionCache *AC)
+ : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
+ SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
+ Builder(Se->getContext()) {
+ CodeMetrics::collectEphemeralValues(F, AC, EphValues);
+ }
/// \brief Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
}
/// \returns true if the memory operations A and B are consecutive.
- bool isConsecutiveAccess(Value *A, Value *B);
+ bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL);
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
/// be beneficial even the tree height is tiny.
bool isFullyVectorizableTinyTree();
+ /// \brief Reorder commutative operands in an alternate shuffle if doing so
+ /// results in vectorized code.
+ void reorderAltShuffleOperands(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right);
+ /// \brief Reorder commutative operands to get a better probability of
+ /// generating vectorized code.
+ void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right);
struct TreeEntry {
TreeEntry() : Scalars(), VectorizedValue(nullptr),
NeedToGather(0) {}
/// Create a new VectorizableTree entry.
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
- VectorizableTree.push_back(TreeEntry());
+ VectorizableTree.emplace_back();
int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
ExternalUser (Value *S, llvm::User *U, int L) :
- Scalar(S), User(U), Lane(L){};
+ Scalar(S), User(U), Lane(L){}
// Which scalar in our function.
Value *Scalar;
// Which user that uses the scalar.
};
typedef SmallVector<ExternalUser, 16> UserList;
+ /// Checks if two instructions may access the same memory.
+ ///
+ /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
+ /// is invariant in the calling loop.
+ bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
+ Instruction *Inst2) {
+
+ // First check if the result is already in the cache.
+ AliasCacheKey key = std::make_pair(Inst1, Inst2);
+ Optional<bool> &result = AliasCache[key];
+ if (result.hasValue()) {
+ return result.getValue();
+ }
+ MemoryLocation Loc2 = getLocation(Inst2, AA);
+ bool aliased = true;
+ if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
+ // Do the alias check.
+ aliased = AA->alias(Loc1, Loc2);
+ }
+ // Store the result in the cache.
+ result = aliased;
+ return aliased;
+ }
+
+ typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
+
+ /// Cache for alias results.
+ /// TODO: consider moving this to the AliasAnalysis itself.
+ DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
+
+ /// Removes an instruction from its block and eventually deletes it.
+ /// It's like Instruction::eraseFromParent() except that the actual deletion
+ /// is delayed until BoUpSLP is destructed.
+ /// This is required to ensure that there are no incorrect collisions in the
+ /// AliasCache, which can happen if a new instruction is allocated at the
+ /// same address as a previously deleted instruction.
+ void eraseInstruction(Instruction *I) {
+ I->removeFromParent();
+ I->dropAllReferences();
+ DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
+ }
+
+ /// Temporary store for deleted instructions. Instructions will be deleted
+ /// eventually when the BoUpSLP is destructed.
+ SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
+
/// A list of values that need to extracted out of the tree.
/// This list holds pairs of (Internal Scalar : External User).
UserList ExternalUses;
+ /// Values used only by @llvm.assume calls.
+ SmallPtrSet<const Value *, 32> EphValues;
+
/// Holds all of the instructions that we gathered.
SetVector<Instruction *> GatherSeq;
/// A list of blocks that we are going to CSE.
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
- bool tryScheduleBundle(ArrayRef<Value *> VL, AliasAnalysis *AA);
+ bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
/// Un-bundles a group of instructions.
void cancelScheduling(ArrayRef<Value *> VL);
/// Updates the dependency information of a bundle and of all instructions/
/// bundles which depend on the original bundle.
void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
- AliasAnalysis *AA);
+ BoUpSLP *SLP);
/// Sets all instruction in the scheduling region to un-scheduled.
void resetSchedule();
};
/// Attaches the BlockScheduling structures to basic blocks.
- DenseMap<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
+ MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
/// Performs the "real" scheduling. Done before vectorization is actually
/// performed in a basic block.
// Analysis and block reference.
Function *F;
ScalarEvolution *SE;
- const DataLayout *DL;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
AliasAnalysis *AA;
for (User *U : Scalar->users()) {
DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
- // Skip in-tree scalars that become vectors.
- if (ScalarToTreeEntry.count(U)) {
- DEBUG(dbgs() << "SLP: \tInternal user will be removed:" <<
- *U << ".\n");
- int Idx = ScalarToTreeEntry[U]; (void) Idx;
- assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
- continue;
- }
Instruction *UserInst = dyn_cast<Instruction>(U);
if (!UserInst)
continue;
+ // Skip in-tree scalars that become vectors
+ if (ScalarToTreeEntry.count(U)) {
+ int Idx = ScalarToTreeEntry[U];
+ TreeEntry *UseEntry = &VectorizableTree[Idx];
+ Value *UseScalar = UseEntry->Scalars[0];
+ // Some in-tree scalars will remain as scalar in vectorized
+ // instructions. If that is the case, the one in Lane 0 will
+ // be used.
+ if (UseScalar != U ||
+ !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
+ DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
+ << ".\n");
+ assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
+ continue;
+ }
+ }
+
// Ignore users in the user ignore list.
if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
UserIgnoreList.end())
// We now know that this is a vector of instructions of the same type from
// the same block.
+ // Don't vectorize ephemeral values.
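+ // (Ephemeral values feed only @llvm.assume calls and will be removed before
+ // code generation, so vectorizing them would be wasted work.)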
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ if (EphValues.count(VL[i])) {
+ DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
+ ") is ephemeral.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
// Check if this is a duplicate of another entry.
if (ScalarToTreeEntry.count(VL[0])) {
int Idx = ScalarToTreeEntry[VL[0]];
}
}
- // If any of the scalars appears in the table OR it is marked as a value that
- // needs to stat scalar then we need to gather the scalars.
+ // If any of the scalars is marked as a value that needs to stay scalar then
+ // we need to gather the scalars.
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
- if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) {
- DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n");
+ if (MustGather.count(VL[i])) {
+ DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
newTreeEntry(VL, false);
return;
}
}
BlockScheduling &BS = *BSRef.get();
- if (!BS.tryScheduleBundle(VL, AA)) {
+ if (!BS.tryScheduleBundle(VL, this)) {
DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
- if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
- if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) {
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
+ if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
++NumLoadsWantToChangeOrder;
}
BS.cancelScheduling(VL);
Type *SrcTy = VL0->getOperand(0)->getType();
for (unsigned i = 0; i < VL.size(); ++i) {
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
- if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) {
+ if (Ty != SrcTy || !isValidElementType(Ty)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
case Instruction::ICmp:
case Instruction::FCmp: {
// Check that all of the compares have the same predicate.
- CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
for (unsigned i = 1, e = VL.size(); i < e; ++i) {
CmpInst *Cmp = cast<CmpInst>(VL[i]);
}
return;
}
+ case Instruction::GetElementPtr: {
+ // We don't combine GEPs with complicated (nested) indexing.
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
+ DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ // We can't combine several GEPs into one vector if they operate on
+ // different types.
+ Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
+ if (Ty0 != CurTy) {
+ DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ // We don't combine GEPs with non-constant indexes.
+ for (unsigned j = 0; j < VL.size(); ++j) {
+ auto Op = cast<Instruction>(VL[j])->getOperand(1);
+ if (!isa<ConstantInt>(Op)) {
+ DEBUG(
+ dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+ BS.cancelScheduling(VL);
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+ for (unsigned i = 0, e = 2; i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1);
+ }
+ return;
+ }
case Instruction::Store: {
+ const DataLayout &DL = F->getParent()->getDataLayout();
// Check if the stores are consecutive or if we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
- if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
+ if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
BS.cancelScheduling(VL);
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
}
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+
+ // Reorder operands if reordering would enable vectorization.
+ if (isa<BinaryOperator>(VL0)) {
+ ValueList Left, Right;
+ reorderAltShuffleOperands(VL, Left, Right);
+ buildTree_rec(Left, Depth + 1);
+ buildTree_rec(Right, Depth + 1);
+ return;
+ }
+
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
}
return VecCost - ScalarCost;
}
+ case Instruction::GetElementPtr: {
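+ // Model the GEP cost as the cost of its index arithmetic: one scalar add
+ // with a uniform constant operand per element versus a single vector add.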
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+
+ int ScalarCost =
+ VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
+ int VecCost =
+ TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+
+ return VecCost - ScalarCost;
+ }
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
int ScalarLdCost = VecTy->getNumElements() *
if (VectorizableTree.size() != 2)
return false;
- // Handle splat stores.
- if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars))
+ // Handle splat and all-constants stores.
+ if (!VectorizableTree[0].NeedToGather &&
+ (allConstant(VectorizableTree[1].Scalars) ||
+ isSplat(VectorizableTree[1].Scalars)))
return true;
// Gathering cost would be too much for tiny trees.
// We only vectorize tiny trees if it is fully vectorizable.
if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
- if (!VectorizableTree.size()) {
+ if (VectorizableTree.empty()) {
assert(!ExternalUses.size() && "We should not have any external users");
}
return INT_MAX;
for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
I != E; ++I) {
// We only add extract cost once for the same scalar.
- if (!ExtractCostCalculated.insert(I->Scalar))
+ if (!ExtractCostCalculated.insert(I->Scalar).second)
+ continue;
+
+ // Uses by ephemeral values are free (because the ephemeral value will be
+ // removed prior to code generation, and so the extraction will be
+ // removed as well).
+ if (EphValues.count(I->User))
continue;
VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
return -1;
}
-bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
+bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL) {
Value *PtrA = getPointerOperand(A);
Value *PtrB = getPointerOperand(B);
unsigned ASA = getAddressSpaceOperand(A);
if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
return false;
- unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA);
+ unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
- APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty));
+ APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
- PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA);
- PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB);
+ PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+ PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
APInt OffsetDelta = OffsetB - OffsetA;
return X == PtrSCEVB;
}
+// Reorder commutative operations in an alternate shuffle if the resulting
+// vectors are consecutive loads. This would allow us to vectorize the tree.
+// If we have something like:
+// load a[0] - load b[0]
+// load b[1] + load a[1]
+// load a[2] - load b[2]
+// load a[3] + load b[3]
+// Reordering the second pair (load b[1], load a[1]) would allow us to
+// vectorize this code.
+void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right) {
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ // Push left and right operands of binary operation into Left and Right
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
+ Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
+ }
+
+ // Reorder if we have a commutative operation and consecutive accesses
+ // are on either side of the alternate instructions.
+ for (unsigned j = 0; j < VL.size() - 1; ++j) {
+ if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
+ if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
+ Instruction *VL1 = cast<Instruction>(VL[j]);
+ Instruction *VL2 = cast<Instruction>(VL[j + 1]);
+ if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
+ std::swap(Left[j], Right[j]);
+ continue;
+ } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ // else unchanged
+ }
+ }
+ if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
+ if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
+ Instruction *VL1 = cast<Instruction>(VL[j]);
+ Instruction *VL2 = cast<Instruction>(VL[j + 1]);
+ if (isConsecutiveAccess(L, L1, DL) && VL1->isCommutative()) {
+ std::swap(Left[j], Right[j]);
+ continue;
+ } else if (isConsecutiveAccess(L, L1, DL) && VL2->isCommutative()) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ // else unchanged
+ }
+ }
+ }
+}
+
+void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right) {
+
+ SmallVector<Value *, 16> OrigLeft, OrigRight;
+
+ bool AllSameOpcodeLeft = true;
+ bool AllSameOpcodeRight = true;
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ Instruction *I = cast<Instruction>(VL[i]);
+ Value *VLeft = I->getOperand(0);
+ Value *VRight = I->getOperand(1);
+
+ OrigLeft.push_back(VLeft);
+ OrigRight.push_back(VRight);
+
+ Instruction *ILeft = dyn_cast<Instruction>(VLeft);
+ Instruction *IRight = dyn_cast<Instruction>(VRight);
+
+ // Check whether all operands on one side have the same opcode. In this case
+ // we want to preserve the original order and not make things worse by
+ // reordering.
+ if (i && AllSameOpcodeLeft && ILeft) {
+ if (Instruction *PLeft = dyn_cast<Instruction>(OrigLeft[i - 1])) {
+ if (PLeft->getOpcode() != ILeft->getOpcode())
+ AllSameOpcodeLeft = false;
+ } else
+ AllSameOpcodeLeft = false;
+ }
+ if (i && AllSameOpcodeRight && IRight) {
+ if (Instruction *PRight = dyn_cast<Instruction>(OrigRight[i - 1])) {
+ if (PRight->getOpcode() != IRight->getOpcode())
+ AllSameOpcodeRight = false;
+ } else
+ AllSameOpcodeRight = false;
+ }
+
+ // Sort two opcodes. In the code below we try to preserve the ability to use
+ // a broadcast of values instead of individual inserts.
+ // vl1 = load
+ // vl2 = phi
+ // vr1 = load
+ // vr2 = vr1
+ // = vl1 x vr1
+ // = vl2 x vr2
+ // If we just sorted according to opcode we would leave the first line
+ // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
+ // = vl1 x vr1
+ // = vr2 x vl2
+ // Because vr2 and vr1 are from the same load we lose the opportunity of a
+ // broadcast for the packed right side in the backend: we have [vr1, vl2]
+ // instead of [vr1, vr2=vr1].
+ if (ILeft && IRight) {
+ if (!i && ILeft->getOpcode() > IRight->getOpcode()) {
+ Left.push_back(IRight);
+ Right.push_back(ILeft);
+ } else if (i && ILeft->getOpcode() > IRight->getOpcode() &&
+ Right[i - 1] != IRight) {
+ // Try not to destroy a broadcast for no apparent benefit.
+ Left.push_back(IRight);
+ Right.push_back(ILeft);
+ } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
+ Right[i - 1] == ILeft) {
+ // Try to preserve broadcasts.
+ Left.push_back(IRight);
+ Right.push_back(ILeft);
+ } else if (i && ILeft->getOpcode() == IRight->getOpcode() &&
+ Left[i - 1] == IRight) {
+ // Try to preserve broadcasts.
+ Left.push_back(IRight);
+ Right.push_back(ILeft);
+ } else {
+ Left.push_back(ILeft);
+ Right.push_back(IRight);
+ }
+ continue;
+ }
+ // Only one operand is an instruction; put it on the right.
+ if (ILeft) {
+ Left.push_back(VRight);
+ Right.push_back(ILeft);
+ continue;
+ }
+ Left.push_back(VLeft);
+ Right.push_back(VRight);
+ }
+
+ bool LeftBroadcast = isSplat(Left);
+ bool RightBroadcast = isSplat(Right);
+
+ // If the operands turn out to be broadcasts, keep this operand order.
+ if (LeftBroadcast || RightBroadcast)
+ return;
+
+ // Don't reorder if the operands were good to begin with.
+ if (AllSameOpcodeRight || AllSameOpcodeLeft) {
+ Left = OrigLeft;
+ Right = OrigRight;
+ }
+
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ // Finally, check if we can get a longer vectorizable chain by reordering
+ // without breaking the good operand order detected above.
+ // E.g. if we have something like:
+ // load a[0] load b[0]
+ // load b[1] load a[1]
+ // load a[2] load b[2]
+ // load a[3] load b[3]
+ // Reordering the second pair (load b[1], load a[1]) would allow us to
+ // vectorize this code and we still retain the AllSameOpcode property.
+ // FIXME: This load reordering might break AllSameOpcode in some rare cases
+ // such as:
+ // add a[0],c[0] load b[0]
+ // add a[1],c[2] load b[1]
+ // b[2] load b[2]
+ // add a[3],c[3] load b[3]
+ for (unsigned j = 0; j < VL.size() - 1; ++j) {
+ if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
+ if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
+ if (isConsecutiveAccess(L, L1, DL)) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ }
+ }
+ if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
+ if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
+ if (isConsecutiveAccess(L, L1, DL)) {
+ std::swap(Left[j + 1], Right[j + 1]);
+ continue;
+ }
+ }
+ }
+ // else unchanged
+ }
+}
+
void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
Instruction *VL0 = cast<Instruction>(VL[0]);
BasicBlock::iterator NextInst = VL0;
return Gather(E->Scalars, VecTy);
}
+ const DataLayout &DL = F->getParent()->getDataLayout();
unsigned Opcode = getSameOpcode(E->Scalars);
switch (Opcode) {
ValueList Operands;
BasicBlock *IBB = PH->getIncomingBlock(i);
- if (!VisitedBBs.insert(IBB)) {
+ if (!VisitedBBs.insert(IBB).second) {
NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
continue;
}
// Prepare the operand vector.
- for (unsigned j = 0; j < E->Scalars.size(); ++j)
- Operands.push_back(cast<PHINode>(E->Scalars[j])->
- getIncomingValueForBlock(IBB));
+ for (Value *V : E->Scalars)
+ Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
case Instruction::FPTrunc:
case Instruction::BitCast: {
ValueList INVL;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+ for (Value *V : E->Scalars)
+ INVL.push_back(cast<Instruction>(V)->getOperand(0));
setInsertPointAfterBundle(E->Scalars);
case Instruction::FCmp:
case Instruction::ICmp: {
ValueList LHSV, RHSV;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ for (Value *V : E->Scalars) {
+ LHSV.push_back(cast<Instruction>(V)->getOperand(0));
+ RHSV.push_back(cast<Instruction>(V)->getOperand(1));
}
setInsertPointAfterBundle(E->Scalars);
if (Value *V = alreadyVectorized(E->Scalars))
return V;
- CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V;
if (Opcode == Instruction::FCmp)
V = Builder.CreateFCmp(P0, L, R);
}
case Instruction::Select: {
ValueList TrueVec, FalseVec, CondVec;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
- FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
+ for (Value *V : E->Scalars) {
+ CondVec.push_back(cast<Instruction>(V)->getOperand(0));
+ TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
+ FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
}
setInsertPointAfterBundle(E->Scalars);
if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
else
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ for (Value *V : E->Scalars) {
+ LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
+ RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
}
setInsertPointAfterBundle(E->Scalars);
BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
E->VectorizedValue = V;
+ propagateIRFlags(E->VectorizedValue, E->Scalars);
++NumVectorInstructions;
if (Instruction *I = dyn_cast<Instruction>(V))
Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
VecTy->getPointerTo(AS));
+
+ // The pointer operand uses an in-tree scalar so we add the new BitCast to
+ // ExternalUses list to make sure that an extract will be generated in the
+ // future.
+ if (ScalarToTreeEntry.count(LI->getPointerOperand()))
+ ExternalUses.push_back(
+ ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
+
unsigned Alignment = LI->getAlignment();
LI = Builder.CreateLoad(VecPtr);
- if (!Alignment)
- Alignment = DL->getABITypeAlignment(ScalarLoadTy);
+ if (!Alignment) {
+ Alignment = DL.getABITypeAlignment(ScalarLoadTy);
+ }
LI->setAlignment(Alignment);
E->VectorizedValue = LI;
++NumVectorInstructions;
unsigned AS = SI->getPointerAddressSpace();
ValueList ValueOp;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i)
- ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
+ for (Value *V : E->Scalars)
+ ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
setInsertPointAfterBundle(E->Scalars);
Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
VecTy->getPointerTo(AS));
StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
- if (!Alignment)
- Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
+
+ // The pointer operand uses an in-tree scalar so we add the new BitCast to
+ // ExternalUses list to make sure that an extract will be generated in the
+ // future.
+ if (ScalarToTreeEntry.count(SI->getPointerOperand()))
+ ExternalUses.push_back(
+ ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
+
+ if (!Alignment) {
+ Alignment = DL.getABITypeAlignment(SI->getValueOperand()->getType());
+ }
S->setAlignment(Alignment);
E->VectorizedValue = S;
++NumVectorInstructions;
return propagateMetadata(S, E->Scalars);
}
+ case Instruction::GetElementPtr: {
+ setInsertPointAfterBundle(E->Scalars);
+
+ ValueList Op0VL;
+ for (Value *V : E->Scalars)
+ Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
+
+ Value *Op0 = vectorizeTree(Op0VL);
+
+ std::vector<Value *> OpVecs;
+ for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
+ ++j) {
+ ValueList OpVL;
+ for (Value *V : E->Scalars)
+ OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
+
+ Value *OpVec = vectorizeTree(OpVL);
+ OpVecs.push_back(OpVec);
+ }
+
+ Value *V = Builder.CreateGEP(
+ cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return propagateMetadata(I, E->Scalars);
+
+ return V;
+ }
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
setInsertPointAfterBundle(E->Scalars);
Function *FI;
Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ Value *ScalarArg = nullptr;
if (CI && (FI = CI->getCalledFunction())) {
- IID = (Intrinsic::ID) FI->getIntrinsicID();
+ IID = FI->getIntrinsicID();
}
std::vector<Value *> OpVecs;
for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
// a scalar. This argument should not be vectorized.
if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
CallInst *CEI = cast<CallInst>(E->Scalars[0]);
+ ScalarArg = CEI->getArgOperand(j);
OpVecs.push_back(CEI->getArgOperand(j));
continue;
}
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- CallInst *CEI = cast<CallInst>(E->Scalars[i]);
+ for (Value *V : E->Scalars) {
+ CallInst *CEI = cast<CallInst>(V);
OpVL.push_back(CEI->getArgOperand(j));
}
Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
Value *V = Builder.CreateCall(CF, OpVecs);
+
+ // The scalar argument uses an in-tree scalar so we add the new vectorized
+ // call to ExternalUses list to make sure that an extract will be
+ // generated in the future.
+ if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
+ ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
}
case Instruction::ShuffleVector: {
ValueList LHSVL, RHSVL;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
- }
+ assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
+ reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
setInsertPointAfterBundle(E->Scalars);
Value *LHS = vectorizeTree(LHSVL);
BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
- // Create appropriate shuffle to take alternative operations from
- // the vector.
- std::vector<Constant *> Mask(E->Scalars.size());
+ // Create shuffle to take alternate operations from the vector.
+ // Also, gather up odd and even scalar ops to propagate IR flags to
+ // each vector operation.
+ ValueList OddScalars, EvenScalars;
unsigned e = E->Scalars.size();
+ SmallVector<Constant *, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
- if (i & 1)
+ if (i & 1) {
Mask[i] = Builder.getInt32(e + i);
- else
+ OddScalars.push_back(E->Scalars[i]);
+ } else {
Mask[i] = Builder.getInt32(i);
+ EvenScalars.push_back(E->Scalars[i]);
+ }
}
Value *ShuffleMask = ConstantVector::get(Mask);
+ propagateIRFlags(V0, EvenScalars);
+ propagateIRFlags(V1, OddScalars);
Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
E->VectorizedValue = V;
Scalar->replaceAllUsesWith(Undef);
}
DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
- cast<Instruction>(Scalar)->eraseFromParent();
+ eraseInstruction(cast<Instruction>(Scalar));
}
}
if (In->isIdenticalTo(*v) &&
DT->dominates((*v)->getParent(), In->getParent())) {
In->replaceAllUsesWith(*v);
- In->eraseFromParent();
+ eraseInstruction(In);
In = nullptr;
break;
}
// Groups the instructions into a bundle (which is then a single scheduling entity)
// and schedules instructions until the bundle gets ready.
bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
- AliasAnalysis *AA) {
+ BoUpSLP *SLP) {
if (isa<PHINode>(VL[0]))
return true;
DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
<< BB->getName() << "\n");
- calculateDependencies(Bundle, true, AA);
+ calculateDependencies(Bundle, true, SLP);
// Now try to schedule the new bundle. As soon as the bundle is "ready" it
// means that there are no cyclic dependencies and we can schedule it.
}
}
-/// \returns the AA location that is being access by the instruction.
-static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) {
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return AA->getLocation(SI);
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return AA->getLocation(LI);
- return AliasAnalysis::Location();
-}
-
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
bool InsertInReadyList,
- AliasAnalysis *AA) {
+ BoUpSLP *SLP) {
assert(SD->isSchedulingEntity());
SmallVector<ScheduleData *, 10> WorkList;
// Handle the memory dependencies.
ScheduleData *DepDest = BundleMember->NextLoadStore;
if (DepDest) {
- AliasAnalysis::Location SrcLoc = getLocation(BundleMember->Inst, AA);
+ Instruction *SrcInst = BundleMember->Inst;
+ MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
+ unsigned numAliased = 0;
+ unsigned DistToSrc = 1;
while (DepDest) {
assert(isInSchedulingRegion(DepDest));
- if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) {
- AliasAnalysis::Location DstLoc = getLocation(DepDest->Inst, AA);
- if (!SrcLoc.Ptr || !DstLoc.Ptr || AA->alias(SrcLoc, DstLoc)) {
- DepDest->MemoryDependencies.push_back(BundleMember);
- BundleMember->Dependencies++;
- ScheduleData *DestBundle = DepDest->FirstInBundle;
- if (!DestBundle->IsScheduled) {
- BundleMember->incrementUnscheduledDeps(1);
- }
- if (!DestBundle->hasValidDependencies()) {
- WorkList.push_back(DestBundle);
- }
+
+ // We have two limits to reduce the complexity:
+ // 1) AliasedCheckLimit: It's a small limit to reduce calls to
+ // SLP->isAliased (which is the expensive part in this loop).
+ // 2) MaxMemDepDistance: It's for very large blocks and it aborts
+ // the whole loop (even if the loop is fast, it's quadratic).
+ // It's important for the loop break condition (see below) to
+ // check this limit even between two read-only instructions.
+ if (DistToSrc >= MaxMemDepDistance ||
+ ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
+ (numAliased >= AliasedCheckLimit ||
+ SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
+
+ // We increment the counter only if the locations are aliased
+ // (instead of counting all alias checks). This gives a better
+ // balance between reduced runtime and accurate dependencies.
+ numAliased++;
+
+ DepDest->MemoryDependencies.push_back(BundleMember);
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = DepDest->FirstInBundle;
+ if (!DestBundle->IsScheduled) {
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ if (!DestBundle->hasValidDependencies()) {
+ WorkList.push_back(DestBundle);
}
}
DepDest = DepDest->NextLoadStore;
+
+ // Example, explaining the loop break condition: Let's assume our
+ // starting instruction is i0 and MaxMemDepDistance = 3.
+ //
+ // +--------v--v--v
+ // i0,i1,i2,i3,i4,i5,i6,i7,i8
+ // +--------^--^--^
+ //
+ // MaxMemDepDistance let us stop alias-checking at i3 and we add
+ // dependencies from i0 to i3,i4,.. (even if they are not aliased).
+ // Previously we already added dependencies from i3 to i6,i7,i8
+ // (because of MaxMemDepDistance). As we added a dependency from
+ // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
+ // and we can abort this loop at i6.
+ if (DistToSrc >= 2 * MaxMemDepDistance)
+ break;
+ DistToSrc++;
}
}
}
"scheduler and vectorizer have different opinion on what is a bundle");
SD->FirstInBundle->SchedulingPriority = Idx++;
if (SD->isSchedulingEntity()) {
- BS->calculateDependencies(SD, false, AA);
+ BS->calculateDependencies(SD, false, this);
NumToSchedule++;
}
}
}
ScalarEvolution *SE;
- const DataLayout *DL;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
AliasAnalysis *AA;
LoopInfo *LI;
DominatorTree *DT;
+ AssumptionCache *AC;
bool runOnFunction(Function &F) override {
if (skipOptnoneFunction(F))
return false;
SE = &getAnalysis<ScalarEvolution>();
- DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
- DL = DLP ? &DLP->getDataLayout() : nullptr;
- TTI = &getAnalysis<TargetTransformInfo>();
- TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ TLI = TLIP ? &TLIP->getTLI() : nullptr;
AA = &getAnalysis<AliasAnalysis>();
- LI = &getAnalysis<LoopInfo>();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
StoreRefs.clear();
bool Changed = false;
if (!TTI->getNumberOfRegisters(true))
return false;
- // Must have DataLayout. We can't require it because some tests run w/o
- // triple.
- if (!DL)
- return false;
+ // Use the vector register size specified by the target unless overridden
+ // by a command-line option.
+ // TODO: It would be better to limit the vectorization factor based on
+ // data type rather than just register size. For example, x86 AVX has
+ // 256-bit registers, but it does not support integer operations
+ // at that width (that requires AVX2).
+ if (MaxVectorRegSizeOption.getNumOccurrences())
+ MaxVecRegSize = MaxVectorRegSizeOption;
+ else
+ MaxVecRegSize = TTI->getRegisterBitWidth(true);
// Don't vectorize when the attribute NoImplicitFloat is used.
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
- BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT);
+ BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
+
+ // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
+ // delete instructions.
// Scan the blocks in the function in post order.
- for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
- e = po_end(&F.getEntryBlock()); it != e; ++it) {
- BasicBlock *BB = *it;
+ for (auto BB : post_order(&F.getEntryBlock())) {
// Vectorize trees that end at stores.
if (unsigned count = collectStores(BB, R)) {
(void)count;
void getAnalysisUsage(AnalysisUsage &AU) const override {
FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ScalarEvolution>();
AU.addRequired<AliasAnalysis>();
- AU.addRequired<TargetTransformInfo>();
- AU.addRequired<LoopInfo>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<LoopInfo>();
+ AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.setPreservesCFG();
}
bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
- BoUpSLP &R);
+ BoUpSLP &R, unsigned VecRegSize);
bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
BoUpSLP &R);
private:
StoreListMap StoreRefs;
+ unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
};
/// \brief Check that the values in the slice of the VL array are still present
/// in the WeakVH array.
/// Vectorization of part of the VL array may cause later values in the VL array
/// to become invalid. We track when this has happened in the WeakVH array.
-static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL,
- SmallVectorImpl<WeakVH> &VH,
- unsigned SliceBegin,
- unsigned SliceSize) {
- for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i)
- if (VH[i] != VL[i])
- return true;
-
- return false;
+static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
+ unsigned SliceBegin, unsigned SliceSize) {
+ VL = VL.slice(SliceBegin, SliceSize);
+ VH = VH.slice(SliceBegin, SliceSize);
+ return !std::equal(VL.begin(), VL.end(), VH.begin());
}
bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
- int CostThreshold, BoUpSLP &R) {
+ int CostThreshold, BoUpSLP &R,
+ unsigned VecRegSize) {
unsigned ChainLen = Chain.size();
DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
<< "\n");
Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
- unsigned Sz = DL->getTypeSizeInBits(StoreTy);
- unsigned VF = MinVecRegSize / Sz;
+ auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
+ unsigned Sz = DL.getTypeSizeInBits(StoreTy);
+ unsigned VF = VecRegSize / Sz;
if (!isPowerOf2_32(Sz) || VF < 2)
return false;
bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
int costThreshold, BoUpSLP &R) {
- SetVector<Value *> Heads, Tails;
- SmallDenseMap<Value *, Value *> ConsecutiveChain;
+ SetVector<StoreInst *> Heads, Tails;
+ SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
// We may run into multiple chains that merge into a single chain. We mark the
// stores that we vectorized so that we don't visit the same store twice.
// Do a quadratic search on all of the given stores and find
// all of the pairs of stores that follow each other.
+ SmallVector<unsigned, 16> IndexQueue;
for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
- for (unsigned j = 0; j < e; ++j) {
- if (i == j)
- continue;
-
- if (R.isConsecutiveAccess(Stores[i], Stores[j])) {
- Tails.insert(Stores[j]);
+ const DataLayout &DL = Stores[i]->getModule()->getDataLayout();
+ IndexQueue.clear();
+ // If a store has multiple consecutive store candidates, search the Stores
+ // array according to the sequence: from i+1 to e, then from i-1 to 0.
+ // This is because pairing with the immediately succeeding or preceding
+ // candidate usually creates the best chance to find an SLP vectorization
+ // opportunity.
+ unsigned j = 0;
+ for (j = i + 1; j < e; ++j)
+ IndexQueue.push_back(j);
+ for (j = i; j > 0; --j)
+ IndexQueue.push_back(j - 1);
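+ // For example, with i == 3 and e == 6, IndexQueue becomes {4, 5, 2, 1, 0}.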
+
+ for (auto &k : IndexQueue) {
+ if (R.isConsecutiveAccess(Stores[i], Stores[k], DL)) {
+ Tails.insert(Stores[k]);
Heads.insert(Stores[i]);
- ConsecutiveChain[Stores[i]] = Stores[j];
+ ConsecutiveChain[Stores[i]] = Stores[k];
+ break;
}
}
}
// For stores that start but don't end a link in the chain:
- for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
+ for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
it != e; ++it) {
if (Tails.count(*it))
continue;
// We found a store instr that starts a chain. Now follow the chain and try
// to vectorize it.
BoUpSLP::ValueList Operands;
- Value *I = *it;
+ StoreInst *I = *it;
// Collect the chain into a list.
while (Tails.count(I) || Heads.count(I)) {
if (VectorizedStores.count(I))
I = ConsecutiveChain[I];
}
- bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
-
- // Mark the vectorized stores so that we don't vectorize them again.
- if (Vectorized)
- VectorizedStores.insert(Operands.begin(), Operands.end());
- Changed |= Vectorized;
+ // FIXME: Is division-by-2 the correct step? Should we assert that the
+ // register size is a power-of-2?
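+ // For example, with MaxVecRegSize == 256 and MinVecRegSize == 128, a
+ // 256-bit chain is tried first and a 128-bit chain second.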
+ for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+ if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
+ // Mark the vectorized stores so that we don't vectorize them again.
+ VectorizedStores.insert(Operands.begin(), Operands.end());
+ Changed = true;
+ break;
+ }
+ }
}
return Changed;
unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
unsigned count = 0;
StoreRefs.clear();
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- StoreInst *SI = dyn_cast<StoreInst>(it);
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ for (Instruction &I : *BB) {
+ StoreInst *SI = dyn_cast<StoreInst>(&I);
if (!SI)
continue;
// Check that the pointer points to scalars.
Type *Ty = SI->getValueOperand()->getType();
- if (Ty->isAggregateType() || Ty->isVectorTy())
+ if (!isValidElementType(Ty))
continue;
// Find the base pointer.
return false;
unsigned Opcode0 = I0->getOpcode();
+ const DataLayout &DL = I0->getModule()->getDataLayout();
Type *Ty0 = I0->getType();
- unsigned Sz = DL->getTypeSizeInBits(Ty0);
+ unsigned Sz = DL.getTypeSizeInBits(Ty0);
+ // FIXME: Register size should be a parameter to this function, so we can
+ // try different vectorization factors.
unsigned VF = MinVecRegSize / Sz;
- for (int i = 0, e = VL.size(); i < e; ++i) {
- Type *Ty = VL[i]->getType();
- if (Ty->isAggregateType() || Ty->isVectorTy())
+ for (Value *V : VL) {
+ Type *Ty = V->getType();
+ if (!isValidElementType(Ty))
return false;
- Instruction *Inst = dyn_cast<Instruction>(VL[i]);
+ Instruction *Inst = dyn_cast<Instruction>(V);
if (!Inst || Inst->getOpcode() != Opcode0)
return false;
}
BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
if (tryToVectorizePair(A, B0, R)) {
- B->moveBefore(V);
return true;
}
if (tryToVectorizePair(A, B1, R)) {
- B->moveBefore(V);
return true;
}
}
BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
if (tryToVectorizePair(A0, B, R)) {
- A->moveBefore(V);
return true;
}
if (tryToVectorizePair(A1, B, R)) {
- A->moveBefore(V);
return true;
}
}
ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
/// \brief Try to find a reduction tree.
- bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
- const DataLayout *DL) {
+ bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
assert((!Phi ||
std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
"Thi phi needs to use the binary operator");
return false;
Type *Ty = B->getType();
- if (Ty->isVectorTy())
+ if (!isValidElementType(Ty))
return false;
+ const DataLayout &DL = B->getModule()->getDataLayout();
ReductionOpcode = B->getOpcode();
ReducedValueOpcode = 0;
- ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
+ // FIXME: Register size should be a parameter to this function, so we can
+ // try different vectorization factors.
+ ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
ReductionRoot = B;
ReductionPHI = Phi;
/// \brief Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
assert(VectorizedValue && "Need to have a vectorized tree node");
- Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue);
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
- Value *TmpVec = ValToReduce;
+ Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
if (IsPairwiseReduction) {
Value *LeftMask =
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
// We may go through BB multiple times so skip the one we have checked.
- if (!VisitedInstrs.insert(it))
+ if (!VisitedInstrs.insert(it).second)
continue;
if (isa<DbgInfoIntrinsic>(it))
// Try to match and vectorize a horizontal reduction.
HorizontalReduction HorRdx;
- if (ShouldVectorizeHor &&
- HorRdx.matchAssociativeReduction(P, BI, DL) &&
+ if (ShouldVectorizeHor && HorRdx.matchAssociativeReduction(P, BI) &&
HorRdx.tryToReduce(R, TTI)) {
Changed = true;
it = BB->begin();
if (BinaryOperator *BinOp =
dyn_cast<BinaryOperator>(SI->getValueOperand())) {
HorizontalReduction HorRdx;
- if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) &&
+ if (((HorRdx.matchAssociativeReduction(nullptr, BinOp) &&
HorRdx.tryToReduce(R, TTI)) ||
tryToVectorize(BinOp, R))) {
Changed = true;
}
}
+ // Try to vectorize horizontal reductions feeding into a return.
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
+ if (RI->getNumOperands() != 0)
+ if (BinaryOperator *BinOp =
+ dyn_cast<BinaryOperator>(RI->getOperand(0))) {
+ DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
+ if (tryToVectorizePair(BinOp->getOperand(0),
+ BinOp->getOperand(1), R)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+ }
+
// Try to vectorize trees that start at compare instructions.
if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
// and the iterator may become an invalid value.
it = BB->begin();
e = BB->end();
+ break;
}
}
}
<< it->second.size() << ".\n");
// Process the stores in chunks of 16.
+ // TODO: The limit of 16 inhibits greater vectorization factors.
+ // For example, AVX2 supports v32i8. Increasing this limit, however,
+ // may cause a significant compile-time increase.
for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
unsigned Len = std::min<unsigned>(CE - CI, 16);
Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)