X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTransforms%2FVectorize%2FSLPVectorizer.cpp;h=73e80565f4dd2c55495dd1436e6943d34e7e238b;hb=5d7a73f866c6729b9cb7a1cca9711b68d125a981;hp=5bc3d852e79569eebaa3bf8ecac98935b5e67a5e;hpb=722b0a4d293b16eebaed94ae65d5f11743cbcea5;p=oota-llvm.git diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 5bc3d852e79..73e80565f4d 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -47,30 +47,36 @@ using namespace llvm; static cl::opt SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, - cl::desc("Only vectorize trees if the gain is above this " - "number. (gain = -cost of vectorization)")); + cl::desc("Only vectorize if you gain more than this " + "number ")); namespace { static const unsigned MinVecRegSize = 128; -static const unsigned RecursionMaxDepth = 6; +static const unsigned RecursionMaxDepth = 12; /// RAII pattern to save the insertion point of the IR builder. class BuilderLocGuard { public: - BuilderLocGuard(IRBuilder<> &B) : Builder(B), Loc(B.GetInsertPoint()) {} - ~BuilderLocGuard() { Builder.SetInsertPoint(Loc); } + BuilderLocGuard(IRBuilder<> &B) : Builder(B), Loc(B.GetInsertPoint()), + DbgLoc(B.getCurrentDebugLocation()) {} + ~BuilderLocGuard() { + Builder.SetCurrentDebugLocation(DbgLoc); + if (Loc) + Builder.SetInsertPoint(Loc); + } private: // Prevent copying. BuilderLocGuard(const BuilderLocGuard &); BuilderLocGuard &operator=(const BuilderLocGuard &); IRBuilder<> &Builder; - BasicBlock::iterator Loc; + AssertingVH Loc; + DebugLoc DbgLoc; }; -/// A helper class for numbering instructions in multible blocks. -/// Numbers starts at zero for each basic block. +/// A helper class for numbering instructions in multiple blocks. +/// Numbers start at zero for each basic block. struct BlockNumbering { BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {} @@ -91,6 +97,7 @@ struct BlockNumbering { } int getIndex(Instruction *I) { + assert(I->getParent() == BB && "Invalid instruction"); if (!Valid) numberInstructions(); assert(InstrIdx.count(I) && "Unknown instruction"); @@ -114,80 +121,182 @@ private: /// Maps instructions to numbers and back. SmallDenseMap InstrIdx; /// Maps integers to Instructions. - std::vector InstrVec; + SmallVector InstrVec; }; -class FuncSLP { +/// \returns the parent basic block if all of the instructions in \p VL +/// are in the same block or null otherwise. +static BasicBlock *getSameBlock(ArrayRef VL) { + Instruction *I0 = dyn_cast(VL[0]); + if (!I0) + return 0; + BasicBlock *BB = I0->getParent(); + for (int i = 1, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast(VL[i]); + if (!I) + return 0; + + if (BB != I->getParent()) + return 0; + } + return BB; +} + +/// \returns True if all of the values in \p VL are constants. +static bool allConstant(ArrayRef VL) { + for (unsigned i = 0, e = VL.size(); i < e; ++i) + if (!isa(VL[i])) + return false; + return true; +} + +/// \returns True if all of the values in \p VL are identical. +static bool isSplat(ArrayRef VL) { + for (unsigned i = 1, e = VL.size(); i < e; ++i) + if (VL[i] != VL[0]) + return false; + return true; +} + +/// \returns The opcode if all of the Instructions in \p VL have the same +/// opcode, or zero. 
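// Illustrative examples for getSameOpcode (not from the patch; IR names assumed):
//   VL = { %a = add i32 %x, %y,  %b = add i32 %z, %w }  --> returns Instruction::Add
//   VL = { %a = add i32 %x, %y,  %b = mul i32 %z, %w }  --> returns 0 (mixed opcodes)
//   VL = { i32 7, %a = add i32 %x, %y }                 --> returns 0 (VL[0] is not an Instruction)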
+static unsigned getSameOpcode(ArrayRef VL) { + Instruction *I0 = dyn_cast(VL[0]); + if (!I0) + return 0; + unsigned Opcode = I0->getOpcode(); + for (int i = 1, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast(VL[i]); + if (!I || Opcode != I->getOpcode()) + return 0; + } + return Opcode; +} + +/// \returns The type that all of the values in \p VL have or null if there +/// are different types. +static Type* getSameType(ArrayRef VL) { + Type *Ty = VL[0]->getType(); + for (int i = 1, e = VL.size(); i < e; i++) + if (VL[i]->getType() != Ty) + return 0; + + return Ty; +} + +/// \returns True if the ExtractElement instructions in VL can be vectorized +/// to use the original vector. +static bool CanReuseExtract(ArrayRef VL) { + assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); + // Check if all of the extracts come from the same vector and from the + // correct offset. + Value *VL0 = VL[0]; + ExtractElementInst *E0 = cast(VL0); + Value *Vec = E0->getOperand(0); + + // We have to extract from the same vector type. + unsigned NElts = Vec->getType()->getVectorNumElements(); + + if (NElts != VL.size()) + return false; + + // Check that all of the indices extract from the correct offset. + ConstantInt *CI = dyn_cast(E0->getOperand(1)); + if (!CI || CI->getZExtValue()) + return false; + + for (unsigned i = 1, e = VL.size(); i < e; ++i) { + ExtractElementInst *E = cast(VL[i]); + ConstantInt *CI = dyn_cast(E->getOperand(1)); + + if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) + return false; + } + + return true; +} + +/// Bottom Up SLP Vectorizer. +class BoUpSLP { +public: typedef SmallVector ValueList; typedef SmallVector InstrList; typedef SmallPtrSet ValueSet; typedef SmallVector StoreList; -public: - static const int MAX_COST = INT_MIN; - - FuncSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl, - TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li, + BoUpSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl, + TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li, DominatorTree *Dt) : F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), DT(Dt), Builder(Se->getContext()) { - for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { - BasicBlock *BB = it; - BlocksNumbers[BB] = BlockNumbering(BB); + // Setup the block numbering utility for all of the blocks in the + // function. + for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { + BasicBlock *BB = it; + BlocksNumbers[BB] = BlockNumbering(BB); + } } - } - /// \brief Take the pointer operand from the Load/Store instruction. - /// \returns NULL if this is not a valid Load/Store instruction. - static Value *getPointerOperand(Value *I); + /// \brief Vectorize the tree that starts with the elements in \p VL. + void vectorizeTree(); - /// \brief Take the address space operand from the Load/Store instruction. - /// \returns -1 if this is not a valid Load/Store instruction. - static unsigned getAddressSpaceOperand(Value *I); + /// \returns the vectorization cost of the subtree that starts at \p VL. + /// A negative number means that this is profitable. + int getTreeCost(); + + /// Construct a vectorizable tree that starts at \p Roots. + void buildTree(ArrayRef Roots); + + /// Clear the internal data structures that are created by 'buildTree'. 
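// Sketch of how a caller is expected to drive this public interface (assumed usage,
// mirroring the pass driver rather than quoting it):
//   BoUpSLP R(F, SE, DL, TTI, AA, LI, DT);
//   R.buildTree(Ops);                          // build the vectorizable tree from the seed bundle
//   if (R.getTreeCost() < -SLPCostThreshold)   // negative cost means vectorization is profitable
//     R.vectorizeTree();                       // emit vector code plus any needed extracts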
+ void deleteTree() { + VectorizableTree.clear(); + ScalarToTreeEntry.clear(); + MustGather.clear(); + ExternalUses.clear(); + MemBarrierIgnoreList.clear(); + } /// \returns true if the memory operations A and B are consecutive. bool isConsecutiveAccess(Value *A, Value *B); - /// \brief Vectorize the tree that starts with the elements in \p VL. - /// \returns the vectorized value. - Value *vectorizeTree(ArrayRef VL); - - /// \returns the vectorization cost of the subtree that starts at \p VL. - /// A negative number means that this is profitable. - int getTreeCost(ArrayRef VL); + /// \brief Perform LICM and CSE on the newly generated gather sequences. + void optimizeGatherSequence(); +private: + struct TreeEntry; - /// \returns the scalarization cost for this list of values. Assuming that - /// this subtree gets vectorized, we may need to extract the values from the - /// roots. This method calculates the cost of extracting the values. - int getGatherCost(ArrayRef VL); + /// \returns the cost of the vectorizable entry. + int getEntryCost(TreeEntry *E); - /// \brief Attempts to order and vectorize a sequence of stores. This - /// function does a quadratic scan of the given stores. - /// \returns true if the basic block was modified. - bool vectorizeStores(ArrayRef Stores, int costThreshold); + /// This is the recursive part of buildTree. + void buildTree_rec(ArrayRef Roots, unsigned Depth); - /// \brief Vectorize a group of scalars into a vector tree. - /// \returns the vectorized value. - Value *vectorizeArith(ArrayRef Operands); + /// Vectorize a single entry in the tree. + Value *vectorizeTree(TreeEntry *E); - /// \brief This method contains the recursive part of getTreeCost. - int getTreeCost_rec(ArrayRef VL, unsigned Depth); + /// Vectorize a single entry in the tree, starting in \p VL. + Value *vectorizeTree(ArrayRef VL); - /// \brief This recursive method looks for vectorization hazards such as - /// values that are used by multiple users and checks that values are used - /// by only one vector lane. It updates the variables LaneMap, MultiUserVals. - void getTreeUses_rec(ArrayRef VL, unsigned Depth); + /// \returns the pointer to the vectorized value if \p VL is already + /// vectorized, or NULL. They may happen in cycles. + Value *alreadyVectorized(ArrayRef VL); - /// \brief This method contains the recursive part of vectorizeTree. - Value *vectorizeTree_rec(ArrayRef VL); + /// \brief Take the pointer operand from the Load/Store instruction. + /// \returns NULL if this is not a valid Load/Store instruction. + static Value *getPointerOperand(Value *I); - /// \brief Vectorize a sorted sequence of stores. - bool vectorizeStoreChain(ArrayRef Chain, int CostThreshold); + /// \brief Take the address space operand from the Load/Store instruction. + /// \returns -1 if this is not a valid Load/Store instruction. + static unsigned getAddressSpaceOperand(Value *I); /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. int getGatherCost(Type *Ty); + /// \returns the scalarization cost for this list of values. Assuming that + /// this subtree gets vectorized, we may need to extract the values from the + /// roots. This method calculates the cost of extracting the values. + int getGatherCost(ArrayRef VL); + /// \returns the AA location that is being access by the instruction. 
AliasAnalysis::Location getLocation(Instruction *I); @@ -199,55 +308,94 @@ public: /// \returns the index of the last instrucion in the BB from \p VL. int getLastIndex(ArrayRef VL); - /// \returns the Instrucion in the bundle \p VL. + /// \returns the Instruction in the bundle \p VL. Instruction *getLastInstruction(ArrayRef VL); - /// \returns the Instruction at index \p Index which is in Block \p BB. - Instruction *getInstructionForIndex(unsigned Index, BasicBlock *BB); - - /// \returns the index of the first User of \p VL. - int getFirstUserIndex(ArrayRef VL); - /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef VL, VectorType *Ty); - /// \brief Perform LICM and CSE on the newly generated gather sequences. - void optimizeGatherSequence(); + struct TreeEntry { + TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0), + NeedToGather(0) {} - bool needToGatherAny(ArrayRef VL) { - for (int i = 0, e = VL.size(); i < e; ++i) - if (MustGather.count(VL[i])) - return true; - return false; + /// \returns true if the scalars in VL are equal to this entry. + bool isSame(ArrayRef VL) { + assert(VL.size() == Scalars.size() && "Invalid size"); + for (int i = 0, e = VL.size(); i != e; ++i) + if (VL[i] != Scalars[i]) + return false; + return true; + } + + /// A vector of scalars. + ValueList Scalars; + + /// The Scalars are vectorized into this value. It is initialized to Null. + Value *VectorizedValue; + + /// The index in the basic block of the last scalar. + int LastScalarIndex; + + /// Do we need to gather this sequence ? + bool NeedToGather; + }; + + /// Create a new VectorizableTree entry. + TreeEntry *newTreeEntry(ArrayRef VL, bool Vectorized) { + VectorizableTree.push_back(TreeEntry()); + int idx = VectorizableTree.size() - 1; + TreeEntry *Last = &VectorizableTree[idx]; + Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); + Last->NeedToGather = !Vectorized; + if (Vectorized) { + Last->LastScalarIndex = getLastIndex(VL); + for (int i = 0, e = VL.size(); i != e; ++i) { + assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); + ScalarToTreeEntry[VL[i]] = idx; + } + } else { + Last->LastScalarIndex = 0; + MustGather.insert(VL.begin(), VL.end()); + } + return Last; } /// -- Vectorization State -- + /// Holds all of the tree entries. + std::vector VectorizableTree; - /// Maps values in the tree to the vector lanes that uses them. This map must - /// be reset between runs of getCost. - std::map LaneMap; - /// A list of instructions to ignore while sinking - /// memory instructions. This map must be reset between runs of getCost. - ValueSet MemBarrierIgnoreList; - - /// Maps between the first scalar to the vector. This map must be reset - /// between runs. - DenseMap VectorizedValues; + /// Maps a specific scalar to its tree entry. + SmallDenseMap ScalarToTreeEntry; - /// Contains values that must be gathered because they are used - /// by multiple lanes, or by users outside the tree. - /// NOTICE: The vectorization methods also use this set. + /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; - /// Contains a list of values that are used outside the current tree. This - /// set must be reset between runs. - SetVector MultiUserVals; + /// This POD struct describes one external user in the vectorized tree. + struct ExternalUser { + ExternalUser (Value *S, llvm::User *U, int L) : + Scalar(S), User(U), Lane(L){}; + // Which scalar in our function. 
+ Value *Scalar; + // Which user that uses the scalar. + llvm::User *User; + // Which lane does the scalar belong to. + int Lane; + }; + typedef SmallVector UserList; + + /// A list of values that need to extracted out of the tree. + /// This list holds pairs of (Internal Scalar : External User). + UserList ExternalUses; + + /// A list of instructions to ignore while sinking + /// memory instructions. This map must be reset between runs of getCost. + ValueSet MemBarrierIgnoreList; /// Holds all of the instructions that we gathered. SetVector GatherSeq; /// Numbers instructions in different blocks. - std::map BlocksNumbers; + DenseMap BlocksNumbers; // Analysis and block reference. Function *F; @@ -261,369 +409,215 @@ public: IRBuilder<> Builder; }; -int FuncSLP::getGatherCost(Type *Ty) { - int Cost = 0; - for (unsigned i = 0, e = cast(Ty)->getNumElements(); i < e; ++i) - Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); - return Cost; -} - -int FuncSLP::getGatherCost(ArrayRef VL) { - // Find the type of the operands in VL. - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - // Find the cost of inserting/extracting values from the vector. - return getGatherCost(VecTy); -} - -AliasAnalysis::Location FuncSLP::getLocation(Instruction *I) { - if (StoreInst *SI = dyn_cast(I)) - return AA->getLocation(SI); - if (LoadInst *LI = dyn_cast(I)) - return AA->getLocation(LI); - return AliasAnalysis::Location(); -} - -Value *FuncSLP::getPointerOperand(Value *I) { - if (LoadInst *LI = dyn_cast(I)) - return LI->getPointerOperand(); - if (StoreInst *SI = dyn_cast(I)) - return SI->getPointerOperand(); - return 0; -} - -unsigned FuncSLP::getAddressSpaceOperand(Value *I) { - if (LoadInst *L = dyn_cast(I)) - return L->getPointerAddressSpace(); - if (StoreInst *S = dyn_cast(I)) - return S->getPointerAddressSpace(); - return -1; -} - -bool FuncSLP::isConsecutiveAccess(Value *A, Value *B) { - Value *PtrA = getPointerOperand(A); - Value *PtrB = getPointerOperand(B); - unsigned ASA = getAddressSpaceOperand(A); - unsigned ASB = getAddressSpaceOperand(B); - - // Check that the address spaces match and that the pointers are valid. - if (!PtrA || !PtrB || (ASA != ASB)) - return false; - - // Check that A and B are of the same type. - if (PtrA->getType() != PtrB->getType()) - return false; - - // Calculate the distance. - const SCEV *PtrSCEVA = SE->getSCEV(PtrA); - const SCEV *PtrSCEVB = SE->getSCEV(PtrB); - const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB); - const SCEVConstant *ConstOffSCEV = dyn_cast(OffsetSCEV); +void BoUpSLP::buildTree(ArrayRef Roots) { + deleteTree(); + if (!getSameType(Roots)) + return; + buildTree_rec(Roots, 0); - // Non constant distance. - if (!ConstOffSCEV) - return false; + // Collect the values that we need to extract from the tree. + for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { + TreeEntry *Entry = &VectorizableTree[EIdx]; - int64_t Offset = ConstOffSCEV->getValue()->getSExtValue(); - Type *Ty = cast(PtrA->getType())->getElementType(); - // The Instructions are connsecutive if the size of the first load/store is - // the same as the offset. 
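// Worked example for the check below (addresses assumed): two i32 stores,
//   A writes to byte address X, B writes to byte address X + 4,
//   Offset = SCEV(PtrA) - SCEV(PtrB) = -4, store size Sz = 4,
// so (-Offset) == Sz and the accesses are consecutive: B starts exactly where A ends.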
- int64_t Sz = DL->getTypeStoreSize(Ty); - return ((-Offset) == Sz); -} + // For each lane: + for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { + Value *Scalar = Entry->Scalars[Lane]; -Value *FuncSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { - assert(Src->getParent() == Dst->getParent() && "Not the same BB"); - BasicBlock::iterator I = Src, E = Dst; - /// Scan all of the instruction from SRC to DST and check if - /// the source may alias. - for (++I; I != E; ++I) { - // Ignore store instructions that are marked as 'ignore'. - if (MemBarrierIgnoreList.count(I)) - continue; - if (Src->mayWriteToMemory()) /* Write */ { - if (!I->mayReadOrWriteMemory()) + // No need to handle users of gathered values. + if (Entry->NeedToGather) continue; - } else /* Read */ { - if (!I->mayWriteToMemory()) - continue; - } - AliasAnalysis::Location A = getLocation(&*I); - AliasAnalysis::Location B = getLocation(Src); - if (!A.Ptr || !B.Ptr || AA->alias(A, B)) - return I; - } - return 0; -} + for (Value::use_iterator User = Scalar->use_begin(), + UE = Scalar->use_end(); User != UE; ++User) { + DEBUG(dbgs() << "SLP: Checking user:" << **User << ".\n"); -static BasicBlock *getSameBlock(ArrayRef VL) { - BasicBlock *BB = 0; - for (int i = 0, e = VL.size(); i < e; i++) { - Instruction *I = dyn_cast(VL[i]); - if (!I) - return 0; - - if (!BB) { - BB = I->getParent(); - continue; - } - - if (BB != I->getParent()) - return 0; - } - return BB; -} + bool Gathered = MustGather.count(*User); -static bool allConstant(ArrayRef VL) { - for (unsigned i = 0, e = VL.size(); i < e; ++i) - if (!isa(VL[i])) - return false; - return true; -} + // Skip in-tree scalars that become vectors. + if (ScalarToTreeEntry.count(*User) && !Gathered) { + DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << + **User << ".\n"); + int Idx = ScalarToTreeEntry[*User]; (void) Idx; + assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); + continue; + } -static bool isSplat(ArrayRef VL) { - for (unsigned i = 1, e = VL.size(); i < e; ++i) - if (VL[i] != VL[0]) - return false; - return true; -} + if (!isa(*User)) + continue; -static unsigned getSameOpcode(ArrayRef VL) { - unsigned Opcode = 0; - for (int i = 0, e = VL.size(); i < e; i++) { - if (Instruction *I = dyn_cast(VL[i])) { - if (!Opcode) { - Opcode = I->getOpcode(); - continue; + DEBUG(dbgs() << "SLP: Need to extract:" << **User << " from lane " << + Lane << " from " << *Scalar << ".\n"); + ExternalUses.push_back(ExternalUser(Scalar, *User, Lane)); } - if (Opcode != I->getOpcode()) - return 0; } } - return Opcode; } -static bool CanReuseExtract(ArrayRef VL, unsigned VF, - VectorType *VecTy) { - assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); - // Check if all of the extracts come from the same vector and from the - // correct offset. - Value *VL0 = VL[0]; - ExtractElementInst *E0 = cast(VL0); - Value *Vec = E0->getOperand(0); - - // We have to extract from the same vector type. - if (Vec->getType() != VecTy) - return false; - - // Check that all of the indices extract from the correct offset. 
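// Illustrative example (assumed IR): the bundle can reuse %vec directly only when
// lane i extracts index i from one and the same source vector:
//   VL[0] = extractelement <4 x float> %vec, i32 0
//   VL[1] = extractelement <4 x float> %vec, i32 1
//   VL[2] = extractelement <4 x float> %vec, i32 2
//   VL[3] = extractelement <4 x float> %vec, i32 3
// Out-of-order indices or a different source vector fail the index checks below.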
- ConstantInt *CI = dyn_cast(E0->getOperand(1)); - if (!CI || CI->getZExtValue()) - return false; - for (unsigned i = 1, e = VF; i < e; ++i) { - ExtractElementInst *E = cast(VL[i]); - ConstantInt *CI = dyn_cast(E->getOperand(1)); +void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { + bool SameTy = getSameType(VL); (void)SameTy; + assert(SameTy && "Invalid types!"); - if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) - return false; + if (Depth == RecursionMaxDepth) { + DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); + newTreeEntry(VL, false); + return; } - return true; -} - -void FuncSLP::getTreeUses_rec(ArrayRef VL, unsigned Depth) { - if (Depth == RecursionMaxDepth) - return MustGather.insert(VL.begin(), VL.end()); - // Don't handle vectors. - if (VL[0]->getType()->isVectorTy()) + if (VL[0]->getType()->isVectorTy()) { + DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); + newTreeEntry(VL, false); return; + } if (StoreInst *SI = dyn_cast(VL[0])) - if (SI->getValueOperand()->getType()->isVectorTy()) + if (SI->getValueOperand()->getType()->isVectorTy()) { + DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); + newTreeEntry(VL, false); return; + } // If all of the operands are identical or constant we have a simple solution. - if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL)) - return MustGather.insert(VL.begin(), VL.end()); - - // Stop the scan at unknown IR. - Instruction *VL0 = dyn_cast(VL[0]); - assert(VL0 && "Invalid instruction"); - - // Mark instructions with multiple users. - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *I = dyn_cast(VL[i]); - // Remember to check if all of the users of this instruction are vectorized - // within our tree. At depth zero we have no local users, only external - // users that we don't care about. - if (Depth && I && I->getNumUses() > 1) { - DEBUG(dbgs() << "SLP: Adding to MultiUserVals " - "because it has multiple users:" << *I << " \n"); - MultiUserVals.insert(I); - } + if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || + !getSameOpcode(VL)) { + DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); + newTreeEntry(VL, false); + return; } - // Check that the instruction is only used within one lane. - for (int i = 0, e = VL.size(); i < e; ++i) { - if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) { - DEBUG(dbgs() << "SLP: Value used by multiple lanes:" << *VL[i] << "\n"); - return MustGather.insert(VL.begin(), VL.end()); + // We now know that this is a vector of instructions of the same type from + // the same block. + + // Check if this is a duplicate of another entry. + if (ScalarToTreeEntry.count(VL[0])) { + int Idx = ScalarToTreeEntry[VL[0]]; + TreeEntry *E = &VectorizableTree[Idx]; + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); + if (E->Scalars[i] != VL[i]) { + DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); + newTreeEntry(VL, false); + return; + } } - // Make this instruction as 'seen' and remember the lane. - LaneMap[VL[i]] = i; + DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n"); + return; } - unsigned Opcode = getSameOpcode(VL); - if (!Opcode) - return MustGather.insert(VL.begin(), VL.end()); - - switch (Opcode) { - case Instruction::ExtractElement: { - VectorType *VecTy = VectorType::get(VL[0]->getType(), VL.size()); - // No need to follow ExtractElements that are going to be optimized away. 
- if (CanReuseExtract(VL, VL.size(), VecTy)) + // Check that none of the instructions in the bundle are already in the tree. + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + if (ScalarToTreeEntry.count(VL[i])) { + DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << + ") is already in tree.\n"); + newTreeEntry(VL, false); return; - // Fall through. + } } - case Instruction::Load: - return; - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: - case Instruction::Select: - case Instruction::ICmp: - case Instruction::FCmp: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { - ValueList Operands; - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast(VL[j])->getOperand(i)); - getTreeUses_rec(Operands, Depth + 1); + // If any of the scalars appears in the table OR it is marked as a value that + // needs to stat scalar then we need to gather the scalars. + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) { + DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n"); + newTreeEntry(VL, false); + return; } - return; - } - case Instruction::Store: { - ValueList Operands; - for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast(VL[j])->getOperand(0)); - getTreeUses_rec(Operands, Depth + 1); - return; - } - default: - return MustGather.insert(VL.begin(), VL.end()); } -} -int FuncSLP::getLastIndex(ArrayRef VL) { - BasicBlock *BB = cast(VL[0])->getParent(); - assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); - BlockNumbering &BN = BlocksNumbers[BB]; + // Check that all of the users of the scalars that we want to vectorize are + // schedulable. + Instruction *VL0 = cast(VL[0]); + int MyLastIndex = getLastIndex(VL); + BasicBlock *BB = cast(VL0)->getParent(); - int MaxIdx = BN.getIndex(BB->getFirstNonPHI()); - for (unsigned i = 0, e = VL.size(); i < e; ++i) - MaxIdx = std::max(MaxIdx, BN.getIndex(cast(VL[i]))); - return MaxIdx; -} + for (unsigned i = 0, e = VL.size(); i != e; ++i) { + Instruction *Scalar = cast(VL[i]); + DEBUG(dbgs() << "SLP: Checking users of " << *Scalar << ". \n"); + for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end(); + U != UE; ++U) { + DEBUG(dbgs() << "SLP: \tUser " << **U << ". \n"); + Instruction *User = dyn_cast(*U); + if (!User) { + DEBUG(dbgs() << "SLP: Gathering due unknown user. \n"); + newTreeEntry(VL, false); + return; + } -Instruction *FuncSLP::getLastInstruction(ArrayRef VL) { - BasicBlock *BB = cast(VL[0])->getParent(); - assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); - BlockNumbering &BN = BlocksNumbers[BB]; + // We don't care if the user is in a different basic block. 
+ BasicBlock *UserBlock = User->getParent(); + if (UserBlock != BB) { + DEBUG(dbgs() << "SLP: User from a different basic block " + << *User << ". \n"); + continue; + } - int MaxIdx = BN.getIndex(cast(VL[0])); - for (unsigned i = 1, e = VL.size(); i < e; ++i) - MaxIdx = std::max(MaxIdx, BN.getIndex(cast(VL[i]))); - return BN.getInstruction(MaxIdx); -} + // If this is a PHINode within this basic block then we can place the + // extract wherever we want. + if (isa(*User)) { + DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *User << ". \n"); + continue; + } -Instruction *FuncSLP::getInstructionForIndex(unsigned Index, BasicBlock *BB) { - BlockNumbering &BN = BlocksNumbers[BB]; - return BN.getInstruction(Index); -} + // Check if this is a safe in-tree user. + if (ScalarToTreeEntry.count(User)) { + int Idx = ScalarToTreeEntry[User]; + int VecLocation = VectorizableTree[Idx].LastScalarIndex; + if (VecLocation <= MyLastIndex) { + DEBUG(dbgs() << "SLP: Gathering due to unschedulable vector. \n"); + newTreeEntry(VL, false); + return; + } + DEBUG(dbgs() << "SLP: In-tree user (" << *User << ") at #" << + VecLocation << " vector value (" << *Scalar << ") at #" + << MyLastIndex << ".\n"); + continue; + } -int FuncSLP::getFirstUserIndex(ArrayRef VL) { - BasicBlock *BB = getSameBlock(VL); - assert(BB && "All instructions must come from the same block"); - BlockNumbering &BN = BlocksNumbers[BB]; + // Make sure that we can schedule this unknown user. + BlockNumbering &BN = BlocksNumbers[BB]; + int UserIndex = BN.getIndex(User); + if (UserIndex < MyLastIndex) { - // Find the first user of the values. - int FirstUser = BN.getIndex(BB->getTerminator()); + DEBUG(dbgs() << "SLP: Can't schedule extractelement for " + << *User << ". \n"); + newTreeEntry(VL, false); + return; + } + } + } + + // Check that every instructions appears once in this bundle. + for (unsigned i = 0, e = VL.size(); i < e; ++i) + for (unsigned j = i+1; j < e; ++j) + if (VL[i] == VL[j]) { + DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + newTreeEntry(VL, false); + return; + } + + // Check that instructions in this bundle don't reference other instructions. + // The runtime of this check is O(N * N-1 * uses(N)) and a typical N is 4. for (unsigned i = 0, e = VL.size(); i < e; ++i) { for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); U != UE; ++U) { - Instruction *Instr = dyn_cast(*U); - - if (!Instr || Instr->getParent() != BB) - continue; - - FirstUser = std::min(FirstUser, BN.getIndex(Instr)); + for (unsigned j = 0; j < e; ++j) { + if (i != j && *U == VL[j]) { + DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << **U << ". \n"); + newTreeEntry(VL, false); + return; + } + } } } - return FirstUser; -} - -int FuncSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { - Type *ScalarTy = VL[0]->getType(); - - if (StoreInst *SI = dyn_cast(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - /// Don't mess with vectors. - if (ScalarTy->isVectorTy()) - return FuncSLP::MAX_COST; + DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - - if (allConstant(VL)) - return 0; - - if (isSplat(VL)) - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); - - if (Depth == RecursionMaxDepth || needToGatherAny(VL)) - return getGatherCost(VecTy); - - BasicBlock *BB = getSameBlock(VL); unsigned Opcode = getSameOpcode(VL); - assert(Opcode && BB && "Invalid Instruction Value"); // Check if it is safe to sink the loads or the stores. 
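// Illustrative scenario (assumed): the bundle's loads are conceptually sunk to the
// last load in the block, so any intervening may-aliasing write acts as a barrier:
//   %a = load i32* %p        ; bundle member
//   store i32 0, i32* %q     ; if %q may alias %p, this is a sink barrier
//   %b = load i32* %p1       ; last bundle member; %a cannot be sunk past the store
// When a barrier is found the bundle is gathered instead of vectorized.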
if (Opcode == Instruction::Load || Opcode == Instruction::Store) { - int MaxIdx = getLastIndex(VL); - Instruction *Last = getInstructionForIndex(MaxIdx, BB); + Instruction *Last = getLastInstruction(VL); for (unsigned i = 0, e = VL.size(); i < e; ++i) { if (VL[i] == Last) @@ -631,524 +625,864 @@ int FuncSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { Value *Barrier = getSinkBarrier(cast(VL[i]), Last); if (Barrier) { DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last - << "\n because of " << *Barrier << "\n"); - return MAX_COST; + << "\n because of " << *Barrier << ". Gathering.\n"); + newTreeEntry(VL, false); + return; } } } - Instruction *VL0 = cast(VL[0]); switch (Opcode) { - case Instruction::ExtractElement: { - if (CanReuseExtract(VL, VL.size(), VecTy)) - return 0; - return getGatherCost(VecTy); - } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - ValueList Operands; - Type *SrcTy = VL0->getOperand(0)->getType(); - // Prepare the operand vector. - for (unsigned j = 0; j < VL.size(); ++j) { - Operands.push_back(cast(VL[j])->getOperand(0)); - // Check that the casted type is the same for all users. - if (cast(VL[j])->getOperand(0)->getType() != SrcTy) - return getGatherCost(VecTy); + case Instruction::PHI: { + PHINode *PH = dyn_cast(VL0); + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); + + for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getIncomingValue(i)); + + buildTree_rec(Operands, Depth + 1); + } + return; } + case Instruction::ExtractElement: { + bool Reuse = CanReuseExtract(VL); + if (Reuse) { + DEBUG(dbgs() << "SLP: Reusing extract sequence.\n"); + } + newTreeEntry(VL, Reuse); + return; + } + case Instruction::Load: { + // Check if the loads are consecutive or of we need to swizzle them. + for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) + if (!isConsecutiveAccess(VL[i], VL[i + 1])) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Need to swizzle loads.\n"); + return; + } - int Cost = getTreeCost_rec(Operands, Depth + 1); - if (Cost == FuncSLP::MAX_COST) - return Cost; + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of loads.\n"); + return; + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcTy = VL0->getOperand(0)->getType(); + for (unsigned i = 0; i < VL.size(); ++i) { + Type *Ty = cast(VL[i])->getOperand(0)->getType(); + if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); + return; + } + } + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of casts.\n"); - // Calculate the cost of this instruction. 
- int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), - VL0->getType(), SrcTy); + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(i)); - VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); - int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); - Cost += (VecCost - ScalarCost); - return Cost; - } - case Instruction::FCmp: - case Instruction::ICmp: { - // Check that all of the compares have the same predicate. - CmpInst::Predicate P0 = dyn_cast(VL0)->getPredicate(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast(VL[i]); - if (Cmp->getPredicate() != P0) - return getGatherCost(VecTy); + buildTree_rec(Operands, Depth+1); + } + return; } - // Fall through. - } - case Instruction::Select: - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - int TotalCost = 0; - // Calculate the cost of all of the operands. - for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + case Instruction::ICmp: + case Instruction::FCmp: { + // Check that all of the compares have the same predicate. + CmpInst::Predicate P0 = dyn_cast(VL0)->getPredicate(); + Type *ComparedTy = cast(VL[0])->getOperand(0)->getType(); + for (unsigned i = 1, e = VL.size(); i < e; ++i) { + CmpInst *Cmp = cast(VL[i]); + if (Cmp->getPredicate() != P0 || + Cmp->getOperand(0)->getType() != ComparedTy) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); + return; + } + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of compares.\n"); + + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth+1); + } + return; + } + case Instruction::Select: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); + + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth+1); + } + return; + } + case Instruction::Store: { + // Check if the stores are consecutive or of we need to swizzle them. 
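// Illustrative example (assumed): a store bundle is only vectorized when element
// i+1 writes the slot immediately after element i, e.g.
//   VL = { store %v0 -> A[0], store %v1 -> A[1], store %v2 -> A[2], store %v3 -> A[3] }
// A bundle such as { A[0], A[2], A[1], A[3] } fails isConsecutiveAccess and is
// gathered; this revision does not try to reorder ("swizzle") the stores.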
+ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) + if (!isConsecutiveAccess(VL[i], VL[i + 1])) { + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Non consecutive store.\n"); + return; + } + + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + ValueList Operands; - // Prepare the operand vector. for (unsigned j = 0; j < VL.size(); ++j) - Operands.push_back(cast(VL[j])->getOperand(i)); + Operands.push_back(cast(VL[j])->getOperand(0)); - int Cost = getTreeCost_rec(Operands, Depth + 1); - if (Cost == MAX_COST) - return MAX_COST; - TotalCost += TotalCost; + // We can ignore these values because we are sinking them down. + MemBarrierIgnoreList.insert(VL.begin(), VL.end()); + buildTree_rec(Operands, Depth + 1); + return; } + default: + newTreeEntry(VL, false); + DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); + return; + } +} - // Calculate the cost of this instruction. - int ScalarCost = 0; - int VecCost = 0; - if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp || - Opcode == Instruction::Select) { - VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); - ScalarCost = - VecTy->getNumElements() * - TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); - VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); - } else { - ScalarCost = VecTy->getNumElements() * - TTI->getArithmeticInstrCost(Opcode, ScalarTy); - VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); +int BoUpSLP::getEntryCost(TreeEntry *E) { + ArrayRef VL = E->Scalars; + + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + + if (E->NeedToGather) { + if (allConstant(VL)) + return 0; + if (isSplat(VL)) { + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - TotalCost += (VecCost - ScalarCost); - return TotalCost; + return getGatherCost(E->Scalars); } - case Instruction::Load: { - // If we are scalarize the loads, add the cost of forming the vector. - for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1])) - return getGatherCost(VecTy); - - // Cost of wide load - cost of scalar loads. - int ScalarLdCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); - int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); - return VecLdCost - ScalarLdCost; + + assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) && + "Invalid VL"); + Instruction *VL0 = cast(VL[0]); + unsigned Opcode = VL0->getOpcode(); + switch (Opcode) { + case Instruction::PHI: { + return 0; + } + case Instruction::ExtractElement: { + if (CanReuseExtract(VL)) + return 0; + return getGatherCost(VecTy); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcTy = VL0->getOperand(0)->getType(); + + // Calculate the cost of this instruction. 
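// Worked example (cost numbers assumed, they come from TTI): four sext i32 -> i64,
//   ScalarCost = 4 * cost(sext i32 to i64)          e.g. 4 * 1 = 4
//   VecCost    = cost(sext <4 x i32> to <4 x i64>)  e.g. 1
//   returned   = VecCost - ScalarCost               e.g. 1 - 4 = -3  (profitable)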
+ int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), + VL0->getType(), SrcTy); + + VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); + int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); + return VecCost - ScalarCost; + } + case Instruction::FCmp: + case Instruction::ICmp: + case Instruction::Select: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Calculate the cost of this instruction. + int ScalarCost = 0; + int VecCost = 0; + if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp || + Opcode == Instruction::Select) { + VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); + ScalarCost = VecTy->getNumElements() * + TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); + VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); + } else { + ScalarCost = VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Opcode, ScalarTy); + VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); + } + return VecCost - ScalarCost; + } + case Instruction::Load: { + // Cost of wide load - cost of scalar loads. + int ScalarLdCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + return VecLdCost - ScalarLdCost; + } + case Instruction::Store: { + // We know that we can merge the stores. Calculate the cost. + int ScalarStCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); + int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); + return VecStCost - ScalarStCost; + } + default: + llvm_unreachable("Unknown instruction"); } - case Instruction::Store: { - // We know that we can merge the stores. Calculate the cost. - int ScalarStCost = VecTy->getNumElements() * - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); - int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); - int StoreCost = VecStCost - ScalarStCost; - - ValueList Operands; - for (unsigned j = 0; j < VL.size(); ++j) { - Operands.push_back(cast(VL[j])->getOperand(0)); - MemBarrierIgnoreList.insert(VL[j]); +} + +int BoUpSLP::getTreeCost() { + int Cost = 0; + DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << + VectorizableTree.size() << ".\n"); + + // Don't vectorize tiny trees. Small load/store chains or consecutive stores + // of constants will be vectoried in SelectionDAG in MergeConsecutiveStores. + // The SelectionDAG vectorizer can only handle pairs (trees of height = 2). 
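// Sketch of the total-cost computation performed below (entry costs assumed):
//   tree entries: {4 stores: -3} {4 adds: -3} {4 loads: -3}  -> Cost = -9
//   one scalar is also used outside the tree                 -> ExtractCost = +1
//   getTreeCost() = Cost + ExtractCost = -8; a negative total means vectorizing wins.
// Trees with fewer than three entries are left to the SelectionDAG store merger.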
+ if (VectorizableTree.size() < 3) { + if (!VectorizableTree.size()) { + assert(!ExternalUses.size() && "We should not have any external users"); } + return 0; + } - int Cost = getTreeCost_rec(Operands, Depth + 1); - if (Cost == MAX_COST) - return MAX_COST; + unsigned BundleWidth = VectorizableTree[0].Scalars.size(); - int TotalCost = StoreCost + Cost; - return TotalCost; + for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) { + int C = getEntryCost(&VectorizableTree[i]); + DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " + << *VectorizableTree[i].Scalars[0] << " .\n"); + Cost += C; } - default: - // Unable to vectorize unknown instructions. - return getGatherCost(VecTy); + + int ExtractCost = 0; + for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end(); + I != E; ++I) { + + VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth); + ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, + I->Lane); } + + + DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n"); + return Cost + ExtractCost; } -int FuncSLP::getTreeCost(ArrayRef VL) { - // Get rid of the list of stores that were removed, and from the - // lists of instructions with multiple users. - MemBarrierIgnoreList.clear(); - LaneMap.clear(); - MultiUserVals.clear(); - MustGather.clear(); - - if (!getSameBlock(VL)) - return MAX_COST; - - // Find the location of the last root. - int LastRootIndex = getLastIndex(VL); - int FirstUserIndex = getFirstUserIndex(VL); - - // Don't vectorize if there are users of the tree roots inside the tree - // itself. - if (LastRootIndex > FirstUserIndex) - return MAX_COST; - - // Scan the tree and find which value is used by which lane, and which values - // must be scalarized. - getTreeUses_rec(VL, 0); - - // Check that instructions with multiple users can be vectorized. Mark unsafe - // instructions. - for (SetVector::iterator it = MultiUserVals.begin(), - e = MultiUserVals.end(); - it != e; ++it) { - // Check that all of the users of this instr are within the tree. - for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end(); - I != E; ++I) { - if (LaneMap.find(*I) == LaneMap.end()) { - DEBUG(dbgs() << "SLP: Adding to MustExtract " - "because of an out of tree usage.\n"); - MustGather.insert(*it); - continue; - } - } - } +int BoUpSLP::getGatherCost(Type *Ty) { + int Cost = 0; + for (unsigned i = 0, e = cast(Ty)->getNumElements(); i < e; ++i) + Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + return Cost; +} - // Now calculate the cost of vectorizing the tree. - return getTreeCost_rec(VL, 0); +int BoUpSLP::getGatherCost(ArrayRef VL) { + // Find the type of the operands in VL. + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + // Find the cost of inserting/extracting values from the vector. 
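// Illustrative expansion (assumed costs): gathering 4 scalars into <4 x i32> is
// modelled as one insertelement per lane, so
//   getGatherCost(<4 x i32>) = sum over lanes of cost(insertelement, lane i), e.g. 4 * 1 = 4.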
+ return getGatherCost(VecTy); } -bool FuncSLP::vectorizeStoreChain(ArrayRef Chain, int CostThreshold) { - unsigned ChainLen = Chain.size(); - DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen - << "\n"); - Type *StoreTy = cast(Chain[0])->getValueOperand()->getType(); - unsigned Sz = DL->getTypeSizeInBits(StoreTy); - unsigned VF = MinVecRegSize / Sz; - if (!isPowerOf2_32(Sz) || VF < 2) +AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) { + if (StoreInst *SI = dyn_cast(I)) + return AA->getLocation(SI); + if (LoadInst *LI = dyn_cast(I)) + return AA->getLocation(LI); + return AliasAnalysis::Location(); +} + +Value *BoUpSLP::getPointerOperand(Value *I) { + if (LoadInst *LI = dyn_cast(I)) + return LI->getPointerOperand(); + if (StoreInst *SI = dyn_cast(I)) + return SI->getPointerOperand(); + return 0; +} + +unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { + if (LoadInst *L = dyn_cast(I)) + return L->getPointerAddressSpace(); + if (StoreInst *S = dyn_cast(I)) + return S->getPointerAddressSpace(); + return -1; +} + +bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { + Value *PtrA = getPointerOperand(A); + Value *PtrB = getPointerOperand(B); + unsigned ASA = getAddressSpaceOperand(A); + unsigned ASB = getAddressSpaceOperand(B); + + // Check that the address spaces match and that the pointers are valid. + if (!PtrA || !PtrB || (ASA != ASB)) return false; - bool Changed = false; - // Look for profitable vectorizable trees at all offsets, starting at zero. - for (unsigned i = 0, e = ChainLen; i < e; ++i) { - if (i + VF > e) - break; - DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i - << "\n"); - ArrayRef Operands = Chain.slice(i, VF); + // Make sure that A and B are different pointers of the same type. + if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) + return false; - int Cost = getTreeCost(Operands); - if (Cost == FuncSLP::MAX_COST) - continue; - DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); - if (Cost < CostThreshold) { - DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); - vectorizeTree(Operands); + // Calculate a constant offset from the base pointer without using SCEV + // in the supported cases. + // TODO: Add support for the case where one of the pointers is a GEP that + // uses the other pointer. + GetElementPtrInst *GepA = dyn_cast(PtrA); + GetElementPtrInst *GepB = dyn_cast(PtrB); - // Remove the scalar stores. - for (int i = 0, e = VF; i < e; ++i) - cast(Operands[i])->eraseFromParent(); + unsigned BW = DL->getPointerSizeInBits(ASA); + Type *Ty = cast(PtrA->getType())->getElementType(); + int64_t Sz = DL->getTypeStoreSize(Ty); - // Move to the next bundle. - i += VF - 1; - Changed = true; - } + // Check if PtrA is the base and PtrB is a constant offset. + if (GepB && GepB->getPointerOperand() == PtrA) { + APInt Offset(BW, 0); + if (GepB->accumulateConstantOffset(*DL, Offset)) + return Offset.getSExtValue() == Sz; + return false; } - if (Changed || ChainLen > VF) - return Changed; - - // Handle short chains. This helps us catch types such as <3 x float> that - // are smaller than vector size. - int Cost = getTreeCost(Chain); - if (Cost == FuncSLP::MAX_COST) + // Check if PtrB is the base and PtrA is a constant offset. 
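// Example of the case handled here (pointers assumed):
//   PtrA = getelementptr i32* %base, i64 -1   ; GepA, constant byte offset -4
//   PtrB = %base                              ; i32* base, Sz = 4 bytes
// accumulateConstantOffset(PtrA) == -4 == -Sz, so the access at PtrA sits
// immediately before the access at PtrB and isConsecutiveAccess(A, B) holds.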
+ if (GepA && GepA->getPointerOperand() == PtrB) { + APInt Offset(BW, 0); + if (GepA->accumulateConstantOffset(*DL, Offset)) + return Offset.getSExtValue() == -Sz; return false; - if (Cost < CostThreshold) { - DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost - << " for size = " << ChainLen << "\n"); - vectorizeTree(Chain); + } - // Remove all of the scalar stores. - for (int i = 0, e = Chain.size(); i < e; ++i) - cast(Chain[i])->eraseFromParent(); + // If both pointers are GEPs: + if (GepA && GepB) { + // Check that they have the same base pointer and number of indices. + if (GepA->getPointerOperand() != GepB->getPointerOperand() || + GepA->getNumIndices() != GepB->getNumIndices()) + return false; - return true; - } + // Try to strip the geps. This makes SCEV faster. + // Make sure that all of the indices except for the last are identical. + int LastIdx = GepA->getNumIndices(); + for (int i = 0; i < LastIdx - 1; i++) { + if (GepA->getOperand(i+1) != GepB->getOperand(i+1)) + return false; + } - return false; -} + PtrA = GepA->getOperand(LastIdx); + PtrB = GepB->getOperand(LastIdx); + Sz = 1; + } -bool FuncSLP::vectorizeStores(ArrayRef Stores, int costThreshold) { - SetVector Heads, Tails; - SmallDenseMap ConsecutiveChain; + ConstantInt *CA = dyn_cast(PtrA); + ConstantInt *CB = dyn_cast(PtrB); + if (CA && CB) { + return (CA->getSExtValue() + Sz == CB->getSExtValue()); + } - // We may run into multiple chains that merge into a single chain. We mark the - // stores that we vectorized so that we don't visit the same store twice. - ValueSet VectorizedStores; - bool Changed = false; + // Calculate the distance. + const SCEV *PtrSCEVA = SE->getSCEV(PtrA); + const SCEV *PtrSCEVB = SE->getSCEV(PtrB); + const SCEV *C = SE->getConstant(PtrSCEVA->getType(), Sz); + const SCEV *X = SE->getAddExpr(PtrSCEVA, C); + return X == PtrSCEVB; +} - // Do a quadratic search on all of the given stores and find - // all of the pairs of loads that follow each other. - for (unsigned i = 0, e = Stores.size(); i < e; ++i) - for (unsigned j = 0; j < e; ++j) { - if (i == j) +Value *BoUpSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) { + assert(Src->getParent() == Dst->getParent() && "Not the same BB"); + BasicBlock::iterator I = Src, E = Dst; + /// Scan all of the instruction from SRC to DST and check if + /// the source may alias. + for (++I; I != E; ++I) { + // Ignore store instructions that are marked as 'ignore'. + if (MemBarrierIgnoreList.count(I)) + continue; + if (Src->mayWriteToMemory()) /* Write */ { + if (!I->mayReadOrWriteMemory()) + continue; + } else /* Read */ { + if (!I->mayWriteToMemory()) continue; - - if (isConsecutiveAccess(Stores[i], Stores[j])) { - Tails.insert(Stores[j]); - Heads.insert(Stores[i]); - ConsecutiveChain[Stores[i]] = Stores[j]; - } } + AliasAnalysis::Location A = getLocation(&*I); + AliasAnalysis::Location B = getLocation(Src); - // For stores that start but don't end a link in the chain: - for (SetVector::iterator it = Heads.begin(), e = Heads.end(); - it != e; ++it) { - if (Tails.count(*it)) - continue; + if (!A.Ptr || !B.Ptr || AA->alias(A, B)) + return I; + } + return 0; +} - // We found a store instr that starts a chain. Now follow the chain and try - // to vectorize it. - ValueList Operands; - Value *I = *it; - // Collect the chain into a list. - while (Tails.count(I) || Heads.count(I)) { - if (VectorizedStores.count(I)) - break; - Operands.push_back(I); - // Move to the next value in the chain. 
- I = ConsecutiveChain[I]; - } +int BoUpSLP::getLastIndex(ArrayRef VL) { + BasicBlock *BB = cast(VL[0])->getParent(); + assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); + BlockNumbering &BN = BlocksNumbers[BB]; - bool Vectorized = vectorizeStoreChain(Operands, costThreshold); + int MaxIdx = BN.getIndex(BB->getFirstNonPHI()); + for (unsigned i = 0, e = VL.size(); i < e; ++i) + MaxIdx = std::max(MaxIdx, BN.getIndex(cast(VL[i]))); + return MaxIdx; +} - // Mark the vectorized stores so that we don't vectorize them again. - if (Vectorized) - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed |= Vectorized; - } +Instruction *BoUpSLP::getLastInstruction(ArrayRef VL) { + BasicBlock *BB = cast(VL[0])->getParent(); + assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block"); + BlockNumbering &BN = BlocksNumbers[BB]; - return Changed; + int MaxIdx = BN.getIndex(cast(VL[0])); + for (unsigned i = 1, e = VL.size(); i < e; ++i) + MaxIdx = std::max(MaxIdx, BN.getIndex(cast(VL[i]))); + Instruction *I = BN.getInstruction(MaxIdx); + assert(I && "bad location"); + return I; } -Value *FuncSLP::Gather(ArrayRef VL, VectorType *Ty) { +Value *BoUpSLP::Gather(ArrayRef VL, VectorType *Ty) { Value *Vec = UndefValue::get(Ty); // Generate the 'InsertElement' instruction. for (unsigned i = 0; i < Ty->getNumElements(); ++i) { Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); - if (Instruction *I = dyn_cast(Vec)) - GatherSeq.insert(I); + if (Instruction *Insrt = dyn_cast(Vec)) { + GatherSeq.insert(Insrt); + + // Add to our 'need-to-extract' list. + if (ScalarToTreeEntry.count(VL[i])) { + int Idx = ScalarToTreeEntry[VL[i]]; + TreeEntry *E = &VectorizableTree[Idx]; + // Find which lane we need to extract. + int FoundLane = -1; + for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { + // Is this the lane of the scalar that we are looking for ? 
+ if (E->Scalars[Lane] == VL[i]) { + FoundLane = Lane; + break; + } + } + assert(FoundLane >= 0 && "Could not find the correct lane"); + ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); + } + } } return Vec; } -Value *FuncSLP::vectorizeTree_rec(ArrayRef VL) { - BuilderLocGuard Guard(Builder); +Value *BoUpSLP::alreadyVectorized(ArrayRef VL) { + if (ScalarToTreeEntry.count(VL[0])) { + int Idx = ScalarToTreeEntry[VL[0]]; + TreeEntry *En = &VectorizableTree[Idx]; + if (En->isSame(VL) && En->VectorizedValue) + return En->VectorizedValue; + } + return 0; +} + +Value *BoUpSLP::vectorizeTree(ArrayRef VL) { + if (ScalarToTreeEntry.count(VL[0])) { + int Idx = ScalarToTreeEntry[VL[0]]; + TreeEntry *E = &VectorizableTree[Idx]; + if (E->isSame(VL)) + return vectorizeTree(E); + } Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast(VL[0])) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - if (needToGatherAny(VL)) - return Gather(VL, VecTy); + return Gather(VL, VecTy); +} + +Value *BoUpSLP::vectorizeTree(TreeEntry *E) { + BuilderLocGuard Guard(Builder); - if (VectorizedValues.count(VL[0])) { - DEBUG(dbgs() << "SLP: Diamond merged at depth.\n"); - return VectorizedValues[VL[0]]; + if (E->VectorizedValue) { + DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + return E->VectorizedValue; } - Instruction *VL0 = cast(VL[0]); + Type *ScalarTy = E->Scalars[0]->getType(); + if (StoreInst *SI = dyn_cast(E->Scalars[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); + + if (E->NeedToGather) { + return Gather(E->Scalars, VecTy); + } + + Instruction *VL0 = cast(E->Scalars[0]); unsigned Opcode = VL0->getOpcode(); - assert(Opcode == getSameOpcode(VL) && "Invalid opcode"); + assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode"); switch (Opcode) { - case Instruction::ExtractElement: { - if (CanReuseExtract(VL, VL.size(), VecTy)) - return VL0->getOperand(0); - return Gather(VL, VecTy); - } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - ValueList INVL; - for (int i = 0, e = VL.size(); i < e; ++i) - INVL.push_back(cast(VL[i])->getOperand(0)); - - Builder.SetInsertPoint(getLastInstruction(VL)); - Value *InVec = vectorizeTree_rec(INVL); - CastInst *CI = dyn_cast(VL0); - Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - VectorizedValues[VL0] = V; - return V; - } - case Instruction::FCmp: - case Instruction::ICmp: { - // Check that all of the compares have the same predicate. - CmpInst::Predicate P0 = dyn_cast(VL0)->getPredicate(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast(VL[i]); - if (Cmp->getPredicate() != P0) - return Gather(VL, VecTy); + case Instruction::PHI: { + PHINode *PH = dyn_cast(VL0); + Builder.SetInsertPoint(PH->getParent()->getFirstInsertionPt()); + Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); + E->VectorizedValue = NewPhi; + + // PHINodes may have multiple entries from the same block. We want to + // visit every block once. 
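// Illustrative case (assumed CFG): a switch with two cases branching to the same
// successor gives the PHI two incoming entries for that one predecessor block:
//   %phi = phi i32 [ %a, %pred ], [ %a, %pred ], [ %b, %other ]
// The VisitedBBs set below ensures %pred is vectorized only once and the already
// computed incoming vector is reused for the duplicate entry.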
+ SmallSet VisitedBBs; + + for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + ValueList Operands; + BasicBlock *IBB = PH->getIncomingBlock(i); + + if (VisitedBBs.count(IBB)) { + NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); + continue; + } + + VisitedBBs.insert(IBB); + + // Prepare the operand vector. + for (unsigned j = 0; j < E->Scalars.size(); ++j) + Operands.push_back(cast(E->Scalars[j])-> + getIncomingValueForBlock(IBB)); + + Builder.SetInsertPoint(IBB->getTerminator()); + Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + Value *Vec = vectorizeTree(Operands); + NewPhi->addIncoming(Vec, IBB); + } + + assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && + "Invalid number of incoming values"); + return NewPhi; } - ValueList LHSV, RHSV; - for (int i = 0, e = VL.size(); i < e; ++i) { - LHSV.push_back(cast(VL[i])->getOperand(0)); - RHSV.push_back(cast(VL[i])->getOperand(1)); + case Instruction::ExtractElement: { + if (CanReuseExtract(E->Scalars)) { + Value *V = VL0->getOperand(0); + E->VectorizedValue = V; + return V; + } + return Gather(E->Scalars, VecTy); } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + ValueList INVL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) + INVL.push_back(cast(E->Scalars[i])->getOperand(0)); + + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + + Value *InVec = vectorizeTree(INVL); + + if (Value *V = alreadyVectorized(E->Scalars)) + return V; + + CastInst *CI = dyn_cast(VL0); + Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); + E->VectorizedValue = V; + return V; + } + case Instruction::FCmp: + case Instruction::ICmp: { + ValueList LHSV, RHSV; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + LHSV.push_back(cast(E->Scalars[i])->getOperand(0)); + RHSV.push_back(cast(E->Scalars[i])->getOperand(1)); + } - Builder.SetInsertPoint(getLastInstruction(VL)); - Value *L = vectorizeTree_rec(LHSV); - Value *R = vectorizeTree_rec(RHSV); - Value *V; + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); - if (Opcode == Instruction::FCmp) - V = Builder.CreateFCmp(P0, L, R); - else - V = Builder.CreateICmp(P0, L, R); + Value *L = vectorizeTree(LHSV); + Value *R = vectorizeTree(RHSV); - VectorizedValues[VL0] = V; - return V; - } - case Instruction::Select: { - ValueList TrueVec, FalseVec, CondVec; - for (int i = 0, e = VL.size(); i < e; ++i) { - CondVec.push_back(cast(VL[i])->getOperand(0)); - TrueVec.push_back(cast(VL[i])->getOperand(1)); - FalseVec.push_back(cast(VL[i])->getOperand(2)); - } + if (Value *V = alreadyVectorized(E->Scalars)) + return V; - Builder.SetInsertPoint(getLastInstruction(VL)); - Value *True = vectorizeTree_rec(TrueVec); - Value *False = vectorizeTree_rec(FalseVec); - Value *Cond = vectorizeTree_rec(CondVec); - Value *V = Builder.CreateSelect(Cond, True, False); - VectorizedValues[VL0] = V; - return V; - } - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case 
Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - ValueList LHSVL, RHSVL; - for (int i = 0, e = VL.size(); i < e; ++i) { - LHSVL.push_back(cast(VL[i])->getOperand(0)); - RHSVL.push_back(cast(VL[i])->getOperand(1)); + CmpInst::Predicate P0 = dyn_cast(VL0)->getPredicate(); + Value *V; + if (Opcode == Instruction::FCmp) + V = Builder.CreateFCmp(P0, L, R); + else + V = Builder.CreateICmp(P0, L, R); + + E->VectorizedValue = V; + return V; } + case Instruction::Select: { + ValueList TrueVec, FalseVec, CondVec; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + CondVec.push_back(cast(E->Scalars[i])->getOperand(0)); + TrueVec.push_back(cast(E->Scalars[i])->getOperand(1)); + FalseVec.push_back(cast(E->Scalars[i])->getOperand(2)); + } + + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); - Builder.SetInsertPoint(getLastInstruction(VL)); - Value *LHS = vectorizeTree_rec(LHSVL); - Value *RHS = vectorizeTree_rec(RHSVL); + Value *Cond = vectorizeTree(CondVec); + Value *True = vectorizeTree(TrueVec); + Value *False = vectorizeTree(FalseVec); - if (LHS == RHS) { - assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); + if (Value *V = alreadyVectorized(E->Scalars)) + return V; + + Value *V = Builder.CreateSelect(Cond, True, False); + E->VectorizedValue = V; + return V; } + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + ValueList LHSVL, RHSVL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + LHSVL.push_back(cast(E->Scalars[i])->getOperand(0)); + RHSVL.push_back(cast(E->Scalars[i])->getOperand(1)); + } - BinaryOperator *BinOp = cast(VL0); - Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); - VectorizedValues[VL0] = V; - return V; - } - case Instruction::Load: { - // Check if all of the loads are consecutive. - for (unsigned i = 1, e = VL.size(); i < e; ++i) - if (!isConsecutiveAccess(VL[i - 1], VL[i])) - return Gather(VL, VecTy); - - // Loads are inserted at the head of the tree because we don't want to - // sink them all the way down past store instructions. 
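// Editorial note, not part of the patch: both the old and the new Load case
// widen a bundle of consecutive scalar loads by bitcasting the first pointer
// to a vector pointer and issuing one wide load that keeps the scalar
// alignment. Roughly, for a hypothetical pair of adjacent loads
//   %a = load float* %p, align 4
//   %b = load float* %q, align 4      ; %q addresses the element after %p
// the emitted code is
//   %vp = bitcast float* %p to <2 x float>*
//   %v  = load <2 x float>* %vp, align 4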
- Builder.SetInsertPoint(getLastInstruction(VL)); - LoadInst *LI = cast(VL0); - Value *VecPtr = - Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo()); - unsigned Alignment = LI->getAlignment(); - LI = Builder.CreateLoad(VecPtr); - LI->setAlignment(Alignment); - - VectorizedValues[VL0] = LI; - return LI; - } - case Instruction::Store: { - StoreInst *SI = cast(VL0); - unsigned Alignment = SI->getAlignment(); + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + + Value *LHS = vectorizeTree(LHSVL); + Value *RHS = vectorizeTree(RHSVL); - ValueList ValueOp; - for (int i = 0, e = VL.size(); i < e; ++i) - ValueOp.push_back(cast(VL[i])->getValueOperand()); + if (LHS == RHS && isa(LHS)) { + assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); + } - Value *VecValue = vectorizeTree_rec(ValueOp); + if (Value *V = alreadyVectorized(E->Scalars)) + return V; - Builder.SetInsertPoint(getLastInstruction(VL)); - Value *VecPtr = - Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo()); - Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment); - return 0; - } - default: - return Gather(VL, VecTy); + BinaryOperator *BinOp = cast(VL0); + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); + E->VectorizedValue = V; + return V; + } + case Instruction::Load: { + // Loads are inserted at the head of the tree because we don't want to + // sink them all the way down past store instructions. + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + + LoadInst *LI = cast(VL0); + Value *VecPtr = + Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo()); + unsigned Alignment = LI->getAlignment(); + LI = Builder.CreateLoad(VecPtr); + LI->setAlignment(Alignment); + E->VectorizedValue = LI; + return LI; + } + case Instruction::Store: { + StoreInst *SI = cast(VL0); + unsigned Alignment = SI->getAlignment(); + + ValueList ValueOp; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) + ValueOp.push_back(cast(E->Scalars[i])->getValueOperand()); + + Builder.SetInsertPoint(getLastInstruction(E->Scalars)); + Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); + + Value *VecValue = vectorizeTree(ValueOp); + Value *VecPtr = + Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo()); + StoreInst *S = Builder.CreateStore(VecValue, VecPtr); + S->setAlignment(Alignment); + E->VectorizedValue = S; + return S; + } + default: + llvm_unreachable("unknown inst"); } + return 0; } -Value *FuncSLP::vectorizeTree(ArrayRef VL) { - Builder.SetInsertPoint(getLastInstruction(VL)); - Value *V = vectorizeTree_rec(VL); +void BoUpSLP::vectorizeTree() { + Builder.SetInsertPoint(F->getEntryBlock().begin()); + vectorizeTree(&VectorizableTree[0]); - // We moved some instructions around. We have to number them again - // before we can do any analysis. - for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) - BlocksNumbers[it].forget(); - // Clear the state. - MustGather.clear(); - VectorizedValues.clear(); - MemBarrierIgnoreList.clear(); - return V; -} + DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); + + // Extract all of the elements with the external uses. + for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end(); + it != e; ++it) { + Value *Scalar = it->Scalar; + llvm::User *User = it->User; + + // Skip users that we already RAUW. 
This happens when one instruction + // has multiple uses of the same value. + if (std::find(Scalar->use_begin(), Scalar->use_end(), User) == + Scalar->use_end()) + continue; + assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); + + int Idx = ScalarToTreeEntry[Scalar]; + TreeEntry *E = &VectorizableTree[Idx]; + assert(!E->NeedToGather && "Extracting from a gather list"); + + Value *Vec = E->VectorizedValue; + assert(Vec && "Can't find vectorizable value"); + + Value *Lane = Builder.getInt32(it->Lane); + // Generate extracts for out-of-tree users. + // Find the insertion point for the extractelement lane. + if (PHINode *PN = dyn_cast(Vec)) { + Builder.SetInsertPoint(PN->getParent()->getFirstInsertionPt()); + Value *Ex = Builder.CreateExtractElement(Vec, Lane); + User->replaceUsesOfWith(Scalar, Ex); + } else if (isa(Vec)){ + if (PHINode *PH = dyn_cast(User)) { + for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { + if (PH->getIncomingValue(i) == Scalar) { + Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); + Value *Ex = Builder.CreateExtractElement(Vec, Lane); + PH->setOperand(i, Ex); + } + } + } else { + Builder.SetInsertPoint(cast(User)); + Value *Ex = Builder.CreateExtractElement(Vec, Lane); + User->replaceUsesOfWith(Scalar, Ex); + } + } else { + Builder.SetInsertPoint(F->getEntryBlock().begin()); + Value *Ex = Builder.CreateExtractElement(Vec, Lane); + User->replaceUsesOfWith(Scalar, Ex); + } -Value *FuncSLP::vectorizeArith(ArrayRef Operands) { - Value *Vec = vectorizeTree(Operands); - // After vectorizing the operands we need to generate extractelement - // instructions and replace all of the uses of the scalar values with - // the values that we extracted from the vectorized tree. - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { - Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i)); - Operands[i]->replaceAllUsesWith(S); + DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); } - return Vec; + // For each vectorized value: + for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { + TreeEntry *Entry = &VectorizableTree[EIdx]; + + // For each lane: + for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { + Value *Scalar = Entry->Scalars[Lane]; + + // No need to handle users of gathered values. + if (Entry->NeedToGather) + continue; + + assert(Entry->VectorizedValue && "Can't find vectorizable value"); + + Type *Ty = Scalar->getType(); + if (!Ty->isVoidTy()) { + for (Value::use_iterator User = Scalar->use_begin(), + UE = Scalar->use_end(); User != UE; ++User) { + DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n"); + assert(!MustGather.count(*User) && + "Replacing gathered value with undef"); + assert(ScalarToTreeEntry.count(*User) && + "Replacing out-of-tree value with undef"); + } + Value *Undef = UndefValue::get(Ty); + Scalar->replaceAllUsesWith(Undef); + } + DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); + cast(Scalar)->eraseFromParent(); + } + } + + for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) { + BlocksNumbers[it].forget(); + } + Builder.ClearInsertionPoint(); } -void FuncSLP::optimizeGatherSequence() { +void BoUpSLP::optimizeGatherSequence() { + DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() + << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. 
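// Editorial note, not part of the patch: two clean-ups run over the
// insertelement chains produced by Gather(). First, a chain that sits inside a
// loop is hoisted into the loop preheader when neither the vector being built
// nor the inserted scalar is defined in the same block, so a gather such as
//   %g0 = insertelement <2 x float> undef, float %x, i32 0
//   %g1 = insertelement <2 x float> %g0,   float %y, i32 1
// is built once before the loop instead of on every iteration. Second, the
// reverse-post-order walk further down erases a gather instruction whenever an
// identical one in a dominating block can be reused in its place.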
for (SetVector::iterator it = GatherSeq.begin(), e = GatherSeq.end(); it != e; ++it) { @@ -1165,7 +1499,7 @@ void FuncSLP::optimizeGatherSequence() { // Check if it has a preheader. BasicBlock *PreHeader = L->getLoopPreheader(); if (!PreHeader) - return; + continue; // If the vector or the element that we insert into it are // instructions that are defined in this basic block then we can't @@ -1185,29 +1519,41 @@ void FuncSLP::optimizeGatherSequence() { // instructions. TODO: We can further optimize this scan if we split the // instructions into different buckets based on the insert lane. SmallPtrSet Visited; + SmallVector ToRemove; ReversePostOrderTraversal RPOT(F); for (ReversePostOrderTraversal::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I) { BasicBlock *BB = *I; // For all instructions in the function: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - InsertElementInst *Insert = dyn_cast(it); - if (!Insert || !GatherSeq.count(Insert)) + Instruction *In = it; + if ((!isa(In) && !isa(In)) || + !GatherSeq.count(In)) continue; - // Check if we can replace this instruction with any of the - // visited instructions. + // Check if we can replace this instruction with any of the + // visited instructions. for (SmallPtrSet::iterator v = Visited.begin(), ve = Visited.end(); v != ve; ++v) { - if (Insert->isIdenticalTo(*v) && - DT->dominates((*v)->getParent(), Insert->getParent())) { - Insert->replaceAllUsesWith(*v); + if (In->isIdenticalTo(*v) && + DT->dominates((*v)->getParent(), In->getParent())) { + In->replaceAllUsesWith(*v); + ToRemove.push_back(In); + In = 0; break; } } - Visited.insert(Insert); + if (In) + Visited.insert(In); } } + + // Erase all of the instructions that we RAUWed. + for (SmallVectorImpl::iterator v = ToRemove.begin(), + ve = ToRemove.end(); v != ve; ++v) { + assert((*v)->getNumUses() == 0 && "Can't remove instructions with uses"); + (*v)->eraseFromParent(); + } } /// The SLPVectorizer Pass. @@ -1245,17 +1591,21 @@ struct SLPVectorizer : public FunctionPass { if (!DL) return false; + // Don't vectorize when the attribute NoImplicitFloat is used. + if (F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::NoImplicitFloat)) + return false; + DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); // Use the bollom up slp vectorizer to construct chains that start with // he store instructions. - FuncSLP R(&F, SE, DL, TTI, AA, LI, DT); + BoUpSLP R(&F, SE, DL, TTI, AA, LI, DT); - for (Function::iterator it = F.begin(), e = F.end(); it != e; ++it) { - BasicBlock *BB = it; - - // Vectorize trees that end at reductions. - Changed |= vectorizeChainsInBlock(BB, R); + // Scan the blocks in the function in post order. + for (po_iterator it = po_begin(&F.getEntryBlock()), + e = po_end(&F.getEntryBlock()); it != e; ++it) { + BasicBlock *BB = *it; // Vectorize trees that end at stores. if (unsigned count = collectStores(BB, R)) { @@ -1263,6 +1613,9 @@ struct SLPVectorizer : public FunctionPass { DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n"); Changed |= vectorizeStoreChains(R); } + + // Vectorize trees that end at reductions. + Changed |= vectorizeChainsInBlock(BB, R); } if (Changed) { @@ -1280,6 +1633,9 @@ struct SLPVectorizer : public FunctionPass { AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.setPreservesCFG(); } private: @@ -1288,31 +1644,130 @@ private: /// object. 
We sort the stores to their base objects to reduce the cost of the /// quadratic search on the stores. TODO: We can further reduce this cost /// if we flush the chain creation every time we run into a memory barrier. - unsigned collectStores(BasicBlock *BB, FuncSLP &R); + unsigned collectStores(BasicBlock *BB, BoUpSLP &R); /// \brief Try to vectorize a chain that starts at two arithmetic instrs. - bool tryToVectorizePair(Value *A, Value *B, FuncSLP &R); + bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); - /// \brief Try to vectorize a list of operands. If \p NeedExtracts is true - /// then we calculate the cost of extracting the scalars from the vector. + /// \brief Try to vectorize a list of operands. /// \returns true if a value was vectorized. - bool tryToVectorizeList(ArrayRef VL, FuncSLP &R, bool NeedExtracts); + bool tryToVectorizeList(ArrayRef VL, BoUpSLP &R); /// \brief Try to vectorize a chain that may start at the operands of \V; - bool tryToVectorize(BinaryOperator *V, FuncSLP &R); + bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); /// \brief Vectorize the stores that were collected in StoreRefs. - bool vectorizeStoreChains(FuncSLP &R); + bool vectorizeStoreChains(BoUpSLP &R); /// \brief Scan the basic block and look for patterns that are likely to start /// a vectorization chain. - bool vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R); + bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); + bool vectorizeStoreChain(ArrayRef Chain, int CostThreshold, + BoUpSLP &R); + + bool vectorizeStores(ArrayRef Stores, int costThreshold, + BoUpSLP &R); private: StoreListMap StoreRefs; }; -unsigned SLPVectorizer::collectStores(BasicBlock *BB, FuncSLP &R) { +bool SLPVectorizer::vectorizeStoreChain(ArrayRef Chain, + int CostThreshold, BoUpSLP &R) { + unsigned ChainLen = Chain.size(); + DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen + << "\n"); + Type *StoreTy = cast(Chain[0])->getValueOperand()->getType(); + unsigned Sz = DL->getTypeSizeInBits(StoreTy); + unsigned VF = MinVecRegSize / Sz; + + if (!isPowerOf2_32(Sz) || VF < 2) + return false; + + bool Changed = false; + // Look for profitable vectorizable trees at all offsets, starting at zero. + for (unsigned i = 0, e = ChainLen; i < e; ++i) { + if (i + VF > e) + break; + DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i + << "\n"); + ArrayRef Operands = Chain.slice(i, VF); + + R.buildTree(Operands); + + int Cost = R.getTreeCost(); + + DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); + if (Cost < CostThreshold) { + DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + R.vectorizeTree(); + + // Move to the next bundle. + i += VF - 1; + Changed = true; + } + } + + return Changed; +} + +bool SLPVectorizer::vectorizeStores(ArrayRef Stores, + int costThreshold, BoUpSLP &R) { + SetVector Heads, Tails; + SmallDenseMap ConsecutiveChain; + + // We may run into multiple chains that merge into a single chain. We mark the + // stores that we vectorized so that we don't visit the same store twice. + BoUpSLP::ValueSet VectorizedStores; + bool Changed = false; + + // Do a quadratic search on all of the given stores and find + // all of the pairs of stores that follow each other. 
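// Editorial example, not part of the patch: for three stores S0, S1, S2 that
// write A[0], A[1] and A[2], the search records the pairs (S0,S1) and (S1,S2),
// giving Heads = {S0, S1}, Tails = {S1, S2} and ConsecutiveChain =
// {S0->S1, S1->S2}. S0 is the only head that is not also a tail, so the loop
// below rebuilds the whole chain [S0, S1, S2] from it and hands it to
// vectorizeStoreChain(). The pairing is quadratic in the number of stores,
// which vectorizeStoreChains() bounds by passing the stores in chunks of 16.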
+ for (unsigned i = 0, e = Stores.size(); i < e; ++i) { + for (unsigned j = 0; j < e; ++j) { + if (i == j) + continue; + + if (R.isConsecutiveAccess(Stores[i], Stores[j])) { + Tails.insert(Stores[j]); + Heads.insert(Stores[i]); + ConsecutiveChain[Stores[i]] = Stores[j]; + } + } + } + + // For stores that start but don't end a link in the chain: + for (SetVector::iterator it = Heads.begin(), e = Heads.end(); + it != e; ++it) { + if (Tails.count(*it)) + continue; + + // We found a store instr that starts a chain. Now follow the chain and try + // to vectorize it. + BoUpSLP::ValueList Operands; + Value *I = *it; + // Collect the chain into a list. + while (Tails.count(I) || Heads.count(I)) { + if (VectorizedStores.count(I)) + break; + Operands.push_back(I); + // Move to the next value in the chain. + I = ConsecutiveChain[I]; + } + + bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R); + + // Mark the vectorized stores so that we don't vectorize them again. + if (Vectorized) + VectorizedStores.insert(Operands.begin(), Operands.end()); + Changed |= Vectorized; + } + + return Changed; +} + + +unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { unsigned count = 0; StoreRefs.clear(); for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { @@ -1337,15 +1792,14 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, FuncSLP &R) { return count; } -bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, FuncSLP &R) { +bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { if (!A || !B) return false; Value *VL[] = { A, B }; - return tryToVectorizeList(VL, R, true); + return tryToVectorizeList(VL, R); } -bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, FuncSLP &R, - bool NeedExtracts) { +bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R) { if (VL.size() < 2) return false; @@ -1367,21 +1821,18 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, FuncSLP &R, return 0; } - int Cost = R.getTreeCost(VL); - if (Cost == FuncSLP::MAX_COST) - return false; + R.buildTree(VL); + int Cost = R.getTreeCost(); - int ExtrCost = NeedExtracts ? R.getGatherCost(VL) : 0; - DEBUG(dbgs() << "SLP: Cost of pair:" << Cost - << " Cost of extract:" << ExtrCost << ".\n"); - if ((Cost + ExtrCost) >= -SLPCostThreshold) + if (Cost >= -SLPCostThreshold) return false; - DEBUG(dbgs() << "SLP: Vectorizing pair.\n"); - R.vectorizeArith(VL); + + DEBUG(dbgs() << "SLP: Vectorizing pair at cost:" << Cost << ".\n"); + R.vectorizeTree(); return true; } -bool SLPVectorizer::tryToVectorize(BinaryOperator *V, FuncSLP &R) { +bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { if (!V) return false; @@ -1421,9 +1872,52 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, FuncSLP &R) { return 0; } -bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R) { +bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { bool Changed = false; - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + SmallVector Incoming; + SmallSet VisitedInstrs; + + // Collect the incoming values from the PHIs. + for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie; + ++instr) { + PHINode *P = dyn_cast(instr); + + if (!P) + break; + + // We may go through BB multiple times so skip the one we have checked. + if (VisitedInstrs.count(instr)) + continue; + VisitedInstrs.insert(instr); + + // Stop constructing the list when you reach a different type. 
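// Editorial example, not part of the patch: PHIs are batched by type and the
// current batch is flushed when the type changes. For a hypothetical block
// that starts with
//   %i = phi i32 ...,  %j = phi i32 ...,  %f = phi float ...
// the list {%i, %j} is handed to tryToVectorizeList() when %f is reached and a
// fresh list is started with %f; a trailing list is only tried after the scan
// if it holds more than one PHI.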
+ if (Incoming.size() && P->getType() != Incoming[0]->getType()) { + if (tryToVectorizeList(Incoming, R)) { + // We would like to start over since some instructions are deleted + // and the iterator may become invalid value. + Changed = true; + instr = BB->begin(); + ie = BB->end(); + } + + Incoming.clear(); + } + + Incoming.push_back(P); + } + + if (Incoming.size() > 1) + Changed |= tryToVectorizeList(Incoming, R); + + VisitedInstrs.clear(); + + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) { + + // We may go through BB multiple times so skip the one we have checked. + if (VisitedInstrs.count(it)) + continue; + VisitedInstrs.insert(it); + if (isa(it)) continue; @@ -1445,51 +1939,46 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R) { if (Inst == P) Inst = BI->getOperand(1); - Changed |= tryToVectorize(dyn_cast(Inst), R); + if (tryToVectorize(dyn_cast(Inst), R)) { + // We would like to start over since some instructions are deleted + // and the iterator may become invalid value. + Changed = true; + it = BB->begin(); + e = BB->end(); + } continue; } // Try to vectorize trees that start at compare instructions. if (CmpInst *CI = dyn_cast(it)) { if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) { - Changed |= true; + Changed = true; + // We would like to start over since some instructions are deleted + // and the iterator may become invalid value. + it = BB->begin(); + e = BB->end(); continue; } - for (int i = 0; i < 2; ++i) - if (BinaryOperator *BI = dyn_cast(CI->getOperand(i))) - Changed |= - tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R); - continue; - } - } - // Scan the PHINodes in our successors in search for pairing hints. - for (succ_iterator it = succ_begin(BB), e = succ_end(BB); it != e; ++it) { - BasicBlock *Succ = *it; - SmallVector Incoming; - - // Collect the incoming values from the PHIs. - for (BasicBlock::iterator instr = Succ->begin(), ie = Succ->end(); - instr != ie; ++instr) { - PHINode *P = dyn_cast(instr); - - if (!P) - break; - - Value *V = P->getIncomingValueForBlock(BB); - if (Instruction *I = dyn_cast(V)) - if (I->getParent() == BB) - Incoming.push_back(I); + for (int i = 0; i < 2; ++i) { + if (BinaryOperator *BI = dyn_cast(CI->getOperand(i))) { + if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) { + Changed = true; + // We would like to start over since some instructions are deleted + // and the iterator may become invalid value. + it = BB->begin(); + e = BB->end(); + } + } + } + continue; } - - if (Incoming.size() > 1) - Changed |= tryToVectorizeList(Incoming, R, true); } return Changed; } -bool SLPVectorizer::vectorizeStoreChains(FuncSLP &R) { +bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { bool Changed = false; // Attempt to sort and vectorize each of the store-groups. for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end(); @@ -1498,9 +1987,14 @@ bool SLPVectorizer::vectorizeStoreChains(FuncSLP &R) { continue; DEBUG(dbgs() << "SLP: Analyzing a store chain of length " - << it->second.size() << ".\n"); + << it->second.size() << ".\n"); - Changed |= R.vectorizeStores(it->second, -SLPCostThreshold); + // Process the stores in chunks of 16. + for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) { + unsigned Len = std::min(CE - CI, 16); + ArrayRef Chunk(&it->second[CI], Len); + Changed |= vectorizeStores(Chunk, -SLPCostThreshold, R); + } } return Changed; }
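(Editorial note, not part of the patch.) A quick way to exercise this code in
isolation, assuming an opt binary built from this tree, is to run the pass by
name and use the slp-threshold option declared at the top of the file to relax
or tighten the cost cut-off, for example:

  opt -basicaa -slp-vectorizer -slp-threshold=0 -S test.ll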