1 //===- VecUtils.cpp --- Vectorization Utilities ---------------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
9 #define DEBUG_TYPE "SLP"
12 #include "llvm/ADT/DenseMap.h"
13 #include "llvm/ADT/SmallPtrSet.h"
14 #include "llvm/ADT/SmallSet.h"
15 #include "llvm/ADT/SmallVector.h"
16 #include "llvm/Analysis/AliasAnalysis.h"
17 #include "llvm/Analysis/ScalarEvolution.h"
18 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
19 #include "llvm/Analysis/TargetTransformInfo.h"
20 #include "llvm/Analysis/Verifier.h"
21 #include "llvm/Analysis/LoopInfo.h"
22 #include "llvm/IR/Constants.h"
23 #include "llvm/IR/DataLayout.h"
24 #include "llvm/IR/Function.h"
25 #include "llvm/IR/Instructions.h"
26 #include "llvm/IR/Module.h"
27 #include "llvm/IR/Type.h"
28 #include "llvm/IR/Value.h"
29 #include "llvm/Pass.h"
30 #include "llvm/Support/CommandLine.h"
31 #include "llvm/Support/Debug.h"
32 #include "llvm/Support/raw_ostream.h"
33 #include "llvm/Target/TargetLibraryInfo.h"
34 #include "llvm/Transforms/Scalar.h"
35 #include "llvm/Transforms/Utils/Local.h"
41 static const unsigned MinVecRegSize = 128;
43 static const unsigned RecursionMaxDepth = 6;
47 BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl,
48 TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp) :
49 Builder(S->getContext()), BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp) {
53 void BoUpSLP::numberInstructions() {
57 // Number the instructions in the block.
58 for (BasicBlock::iterator it=BB->begin(), e=BB->end(); it != e; ++it) {
60 InstrVec.push_back(it);
61 assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
65 Value *BoUpSLP::getPointerOperand(Value *I) {
66 if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
67 if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
71 unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
72 if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
73 if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
77 bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
78 Value *PtrA = getPointerOperand(A);
79 Value *PtrB = getPointerOperand(B);
80 unsigned ASA = getAddressSpaceOperand(A);
81 unsigned ASB = getAddressSpaceOperand(B);
83 // Check that the address spaces match and that the pointers are valid.
84 if (!PtrA || !PtrB || (ASA != ASB)) return false;
86 // Check that A and B are of the same type.
87 if (PtrA->getType() != PtrB->getType()) return false;
89 // Calculate the distance.
90 const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
91 const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
92 const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB);
93 const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV);
95 // Non constant distance.
96 if (!ConstOffSCEV) return false;
98 int64_t Offset = ConstOffSCEV->getValue()->getSExtValue();
99 Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
100 // The Instructions are connsecutive if the size of the first load/store is
101 // the same as the offset.
102 int64_t Sz = DL->getTypeStoreSize(Ty);
103 return ((-Offset) == Sz);
106 bool BoUpSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) {
107 unsigned ChainLen = Chain.size();
108 DEBUG(dbgs()<<"SLP: Analyzing a store chain of length " <<ChainLen<< "\n");
109 Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
110 unsigned Sz = DL->getTypeSizeInBits(StoreTy);
111 unsigned VF = MinVecRegSize / Sz;
113 if (!isPowerOf2_32(Sz) || VF < 2) return false;
115 bool Changed = false;
116 // Look for profitable vectorizable trees at all offsets, starting at zero.
117 for (unsigned i = 0, e = ChainLen; i < e; ++i) {
118 if (i + VF > e) break;
119 DEBUG(dbgs()<<"SLP: Analyzing " << VF << " stores at offset "<< i << "\n");
120 ArrayRef<Value *> Operands = Chain.slice(i, VF);
122 int Cost = getTreeCost(Operands);
123 DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
124 if (Cost < CostThreshold) {
125 DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
126 Builder.SetInsertPoint(getInsertionPoint(getLastIndex(Operands,VF)));
127 vectorizeTree(Operands, VF);
136 int Cost = getTreeCost(Chain);
137 if (Cost < CostThreshold) {
138 DEBUG(dbgs() << "SLP: Found store chain cost = "<< Cost <<" for size = " <<
140 Builder.SetInsertPoint(getInsertionPoint(getLastIndex(Chain, ChainLen)));
141 vectorizeTree(Chain, ChainLen);
148 bool BoUpSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) {
149 SetVector<Value*> Heads, Tails;
150 SmallDenseMap<Value*, Value*> ConsecutiveChain;
152 // We may run into multiple chains that merge into a single chain. We mark the
153 // stores that we vectorized so that we don't visit the same store twice.
154 ValueSet VectorizedStores;
155 bool Changed = false;
157 // Do a quadratic search on all of the given stores and find
158 // all of the pairs of loads that follow each other.
159 for (unsigned i = 0, e = Stores.size(); i < e; ++i)
160 for (unsigned j = 0; j < e; ++j) {
161 if (i == j) continue;
162 if (isConsecutiveAccess(Stores[i], Stores[j])) {
163 Tails.insert(Stores[j]);
164 Heads.insert(Stores[i]);
165 ConsecutiveChain[Stores[i]] = Stores[j];
169 // For stores that start but don't end a link in the chain:
170 for (SetVector<Value*>::iterator it = Heads.begin(), e = Heads.end();
172 if (Tails.count(*it)) continue;
174 // We found a store instr that starts a chain. Now follow the chain and try
178 // Collect the chain into a list.
179 while (Tails.count(I) || Heads.count(I)) {
180 if (VectorizedStores.count(I)) break;
181 Operands.push_back(I);
182 // Move to the next value in the chain.
183 I = ConsecutiveChain[I];
186 bool Vectorized = vectorizeStoreChain(Operands, costThreshold);
188 // Mark the vectorized stores so that we don't vectorize them again.
190 VectorizedStores.insert(Operands.begin(), Operands.end());
191 Changed |= Vectorized;
197 int BoUpSLP::getScalarizationCost(ArrayRef<Value *> VL) {
198 // Find the type of the operands in VL.
199 Type *ScalarTy = VL[0]->getType();
200 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
201 ScalarTy = SI->getValueOperand()->getType();
202 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
203 // Find the cost of inserting/extracting values from the vector.
204 return getScalarizationCost(VecTy);
207 int BoUpSLP::getScalarizationCost(Type *Ty) {
209 for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
210 Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
214 AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
215 if (StoreInst *SI = dyn_cast<StoreInst>(I)) return AA->getLocation(SI);
216 if (LoadInst *LI = dyn_cast<LoadInst>(I)) return AA->getLocation(LI);
217 return AliasAnalysis::Location();
220 Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) {
221 assert(Src->getParent() == Dst->getParent() && "Not the same BB");
222 BasicBlock::iterator I = Src, E = Dst;
223 /// Scan all of the instruction from SRC to DST and check if
224 /// the source may alias.
225 for (++I; I != E; ++I) {
226 // Ignore store instructions that are marked as 'ignore'.
227 if (MemBarrierIgnoreList.count(I)) continue;
228 if (Src->mayWriteToMemory()) /* Write */ {
229 if (!I->mayReadOrWriteMemory()) continue;
231 if (!I->mayWriteToMemory()) continue;
233 AliasAnalysis::Location A = getLocation(&*I);
234 AliasAnalysis::Location B = getLocation(Src);
236 if (!A.Ptr || !B.Ptr || AA->alias(A, B))
242 Value *BoUpSLP::vectorizeArith(ArrayRef<Value *> Operands) {
243 int LastIdx = getLastIndex(Operands, Operands.size());
244 Instruction *Loc = getInsertionPoint(LastIdx);
245 Builder.SetInsertPoint(Loc);
247 assert(getFirstUserIndex(Operands, Operands.size()) > LastIdx &&
248 "Vectorizing with in-tree users");
250 Value *Vec = vectorizeTree(Operands, Operands.size());
251 // After vectorizing the operands we need to generate extractelement
252 // instructions and replace all of the uses of the scalar values with
253 // the values that we extracted from the vectorized tree.
254 for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
255 Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i));
256 Operands[i]->replaceAllUsesWith(S);
262 int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) {
263 // Get rid of the list of stores that were removed, and from the
264 // lists of instructions with multiple users.
265 MemBarrierIgnoreList.clear();
267 MultiUserVals.clear();
268 MustScalarize.clear();
271 // Find the location of the last root.
272 int LastRootIndex = getLastIndex(VL, VL.size());
273 int FirstUserIndex = getFirstUserIndex(VL, VL.size());
275 // Don't vectorize if there are users of the tree roots inside the tree
277 if (LastRootIndex > FirstUserIndex)
280 // Scan the tree and find which value is used by which lane, and which values
281 // must be scalarized.
282 getTreeUses_rec(VL, 0);
284 // Check that instructions with multiple users can be vectorized. Mark unsafe
286 for (SetVector<Value*>::iterator it = MultiUserVals.begin(),
287 e = MultiUserVals.end(); it != e; ++it) {
288 // Check that all of the users of this instr are within the tree
289 // and that they are all from the same lane.
291 for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end();
293 if (LaneMap.find(*I) == LaneMap.end()) {
294 DEBUG(dbgs()<<"SLP: Instr " << **it << " has multiple users.\n");
296 // We don't have an ordering problem if the user is not in this basic
298 Instruction *Inst = cast<Instruction>(*I);
299 if (Inst->getParent() != BB) {
300 MustExtract.insert(*it);
304 // We don't have an ordering problem if the user is after the last root.
305 int Idx = InstrIdx[Inst];
306 if (Idx < LastRootIndex) {
307 MustScalarize.insert(*it);
308 DEBUG(dbgs()<<"SLP: Adding to MustScalarize "
309 "because of an unsafe out of tree usage.\n");
314 DEBUG(dbgs()<<"SLP: Adding to MustExtract "
315 "because of a safe out of tree usage.\n");
316 MustExtract.insert(*it);
319 if (Lane == -1) Lane = LaneMap[*I];
320 if (Lane != LaneMap[*I]) {
321 MustScalarize.insert(*it);
322 DEBUG(dbgs()<<"SLP: Adding " << **it <<
323 " to MustScalarize because multiple lane use it: "
324 << Lane << " and " << LaneMap[*I] << ".\n");
330 // Now calculate the cost of vectorizing the tree.
331 return getTreeCost_rec(VL, 0);
334 static bool CanReuseExtract(ArrayRef<Value *> VL, unsigned VF,
336 // Check if all of the extracts come from the same vector and from the
339 ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
340 Value *Vec = E0->getOperand(0);
342 // We have to extract from the same vector type.
343 if (Vec->getType() != VecTy)
346 // Check that all of the indices extract from the correct offset.
347 ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
348 if (!CI || CI->getZExtValue())
351 for (unsigned i = 1, e = VF; i < e; ++i) {
352 ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
353 ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
355 if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
362 void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) {
363 if (Depth == RecursionMaxDepth) return;
365 // Don't handle vectors.
366 if (VL[0]->getType()->isVectorTy()) return;
367 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
368 if (SI->getValueOperand()->getType()->isVectorTy()) return;
370 // Check if all of the operands are constants.
371 bool AllConst = true;
372 bool AllSameScalar = true;
373 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
374 AllConst &= isa<Constant>(VL[i]);
375 AllSameScalar &= (VL[0] == VL[i]);
376 Instruction *I = dyn_cast<Instruction>(VL[i]);
377 // If one of the instructions is out of this BB, we need to scalarize all.
378 if (I && I->getParent() != BB) return;
381 // If all of the operands are identical or constant we have a simple solution.
382 if (AllConst || AllSameScalar) return;
384 // Scalarize unknown structures.
385 Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
388 unsigned Opcode = VL0->getOpcode();
389 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
390 Instruction *I = dyn_cast<Instruction>(VL[i]);
391 // If not all of the instructions are identical then we have to scalarize.
392 if (!I || Opcode != I->getOpcode()) return;
395 for (int i = 0, e = VL.size(); i < e; ++i) {
396 // Check that the instruction is only used within
398 if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) return;
399 // Make this instruction as 'seen' and remember the lane.
403 // Mark instructions with multiple users.
404 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
405 Instruction *I = dyn_cast<Instruction>(VL[i]);
406 // Remember to check if all of the users of this instr are vectorized
407 // within our tree. At depth zero we have no local users, only external
408 // users that we don't care about.
409 if (Depth && I && I->getNumUses() > 1) {
410 DEBUG(dbgs()<<"SLP: Adding to MultiUserVals "
411 "because it has multiple users:" << *I << " \n");
412 MultiUserVals.insert(I);
417 case Instruction::ExtractElement: {
418 VectorType *VecTy = VectorType::get(VL[0]->getType(), VL.size());
419 // No need to follow ExtractElements that are going to be optimized away.
420 if (CanReuseExtract(VL, VL.size(), VecTy)) return;
423 case Instruction::ZExt:
424 case Instruction::SExt:
425 case Instruction::FPToUI:
426 case Instruction::FPToSI:
427 case Instruction::FPExt:
428 case Instruction::PtrToInt:
429 case Instruction::IntToPtr:
430 case Instruction::SIToFP:
431 case Instruction::UIToFP:
432 case Instruction::Trunc:
433 case Instruction::FPTrunc:
434 case Instruction::BitCast:
435 case Instruction::Select:
436 case Instruction::ICmp:
437 case Instruction::FCmp:
438 case Instruction::Add:
439 case Instruction::FAdd:
440 case Instruction::Sub:
441 case Instruction::FSub:
442 case Instruction::Mul:
443 case Instruction::FMul:
444 case Instruction::UDiv:
445 case Instruction::SDiv:
446 case Instruction::FDiv:
447 case Instruction::URem:
448 case Instruction::SRem:
449 case Instruction::FRem:
450 case Instruction::Shl:
451 case Instruction::LShr:
452 case Instruction::AShr:
453 case Instruction::And:
454 case Instruction::Or:
455 case Instruction::Xor: {
456 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
458 // Prepare the operand vector.
459 for (unsigned j = 0; j < VL.size(); ++j)
460 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
462 getTreeUses_rec(Operands, Depth+1);
466 case Instruction::Store: {
468 for (unsigned j = 0; j < VL.size(); ++j)
469 Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
470 getTreeUses_rec(Operands, Depth+1);
478 int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
479 Type *ScalarTy = VL[0]->getType();
481 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
482 ScalarTy = SI->getValueOperand()->getType();
484 /// Don't mess with vectors.
485 if (ScalarTy->isVectorTy()) return max_cost;
486 VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
488 if (Depth == RecursionMaxDepth) return getScalarizationCost(VecTy);
490 // Check if all of the operands are constants.
491 bool AllConst = true;
492 bool AllSameScalar = true;
493 bool MustScalarizeFlag = false;
494 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
495 AllConst &= isa<Constant>(VL[i]);
496 AllSameScalar &= (VL[0] == VL[i]);
497 // Must have a single use.
498 Instruction *I = dyn_cast<Instruction>(VL[i]);
499 MustScalarizeFlag |= MustScalarize.count(VL[i]);
500 // This instruction is outside the basic block.
501 if (I && I->getParent() != BB)
502 return getScalarizationCost(VecTy);
505 // Is this a simple vector constant.
506 if (AllConst) return 0;
508 // If all of the operands are identical we can broadcast them.
509 Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
511 // If we are in a loop, and this is not an instruction (e.g. constant or
512 // argument) or the instruction is defined outside the loop then assume
513 // that the cost is zero.
514 if (L && (!VL0 || !L->contains(VL0)))
517 // We need to broadcast the scalar.
518 return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
521 // If this is not a constant, or a scalar from outside the loop then we
522 // need to scalarize it.
523 if (MustScalarizeFlag)
524 return getScalarizationCost(VecTy);
526 if (!VL0) return getScalarizationCost(VecTy);
527 assert(VL0->getParent() == BB && "Wrong BB");
529 unsigned Opcode = VL0->getOpcode();
530 for (unsigned i = 0, e = VL.size(); i < e; ++i) {
531 Instruction *I = dyn_cast<Instruction>(VL[i]);
532 // If not all of the instructions are identical then we have to scalarize.
533 if (!I || Opcode != I->getOpcode()) return getScalarizationCost(VecTy);
536 // Check if it is safe to sink the loads or the stores.
537 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
538 int MaxIdx = getLastIndex(VL, VL.size());
539 Instruction *Last = InstrVec[MaxIdx];
541 for (unsigned i = 0, e = VL.size(); i < e; ++i ) {
542 if (VL[i] == Last) continue;
543 Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last);
545 DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " <<
546 *Last << "\n because of " << *Barrier << "\n");
552 // Calculate the extract cost.
553 unsigned ExternalUserExtractCost = 0;
554 for (unsigned i = 0, e = VL.size(); i < e; ++i)
555 if (MustExtract.count(VL[i]))
556 ExternalUserExtractCost +=
557 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
560 case Instruction::ExtractElement: {
561 if (CanReuseExtract(VL, VL.size(), VecTy))
563 return getScalarizationCost(VecTy);
565 case Instruction::ZExt:
566 case Instruction::SExt:
567 case Instruction::FPToUI:
568 case Instruction::FPToSI:
569 case Instruction::FPExt:
570 case Instruction::PtrToInt:
571 case Instruction::IntToPtr:
572 case Instruction::SIToFP:
573 case Instruction::UIToFP:
574 case Instruction::Trunc:
575 case Instruction::FPTrunc:
576 case Instruction::BitCast: {
577 int Cost = ExternalUserExtractCost;
579 Type *SrcTy = VL0->getOperand(0)->getType();
580 // Prepare the operand vector.
581 for (unsigned j = 0; j < VL.size(); ++j) {
582 Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
583 // Check that the casted type is the same for all users.
584 if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy)
585 return getScalarizationCost(VecTy);
588 Cost += getTreeCost_rec(Operands, Depth+1);
589 if (Cost >= max_cost) return max_cost;
591 // Calculate the cost of this instruction.
592 int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
593 VL0->getType(), SrcTy);
595 VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
596 int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
597 Cost += (VecCost - ScalarCost);
600 case Instruction::FCmp:
601 case Instruction::ICmp: {
602 // Check that all of the compares have the same predicate.
603 CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
604 for (unsigned i = 1, e = VL.size(); i < e; ++i) {
605 CmpInst *Cmp = cast<CmpInst>(VL[i]);
606 if (Cmp->getPredicate() != P0)
607 return getScalarizationCost(VecTy);
611 case Instruction::Select:
612 case Instruction::Add:
613 case Instruction::FAdd:
614 case Instruction::Sub:
615 case Instruction::FSub:
616 case Instruction::Mul:
617 case Instruction::FMul:
618 case Instruction::UDiv:
619 case Instruction::SDiv:
620 case Instruction::FDiv:
621 case Instruction::URem:
622 case Instruction::SRem:
623 case Instruction::FRem:
624 case Instruction::Shl:
625 case Instruction::LShr:
626 case Instruction::AShr:
627 case Instruction::And:
628 case Instruction::Or:
629 case Instruction::Xor: {
630 int Cost = ExternalUserExtractCost;
631 // Calculate the cost of all of the operands.
632 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
634 // Prepare the operand vector.
635 for (unsigned j = 0; j < VL.size(); ++j)
636 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
638 Cost += getTreeCost_rec(Operands, Depth+1);
639 if (Cost >= max_cost) return max_cost;
642 // Calculate the cost of this instruction.
645 if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
646 Opcode == Instruction::Select) {
647 VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
648 ScalarCost = VecTy->getNumElements() *
649 TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
650 VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
652 ScalarCost = VecTy->getNumElements() *
653 TTI->getArithmeticInstrCost(Opcode, ScalarTy);
654 VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
656 Cost += (VecCost - ScalarCost);
659 case Instruction::Load: {
660 // If we are scalarize the loads, add the cost of forming the vector.
661 for (unsigned i = 0, e = VL.size()-1; i < e; ++i)
662 if (!isConsecutiveAccess(VL[i], VL[i+1]))
663 return getScalarizationCost(VecTy);
665 // Cost of wide load - cost of scalar loads.
666 int ScalarLdCost = VecTy->getNumElements() *
667 TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
668 int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
669 return VecLdCost - ScalarLdCost + ExternalUserExtractCost;
671 case Instruction::Store: {
672 // We know that we can merge the stores. Calculate the cost.
673 int ScalarStCost = VecTy->getNumElements() *
674 TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
675 int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1,0);
676 int StoreCost = VecStCost - ScalarStCost;
679 for (unsigned j = 0; j < VL.size(); ++j) {
680 Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
681 MemBarrierIgnoreList.insert(VL[j]);
684 int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1);
685 return TotalCost + ExternalUserExtractCost;
688 // Unable to vectorize unknown instructions.
689 return getScalarizationCost(VecTy);
693 int BoUpSLP::getLastIndex(ArrayRef<Value *> VL, unsigned VF) {
694 int MaxIdx = InstrIdx[BB->getFirstNonPHI()];
695 for (unsigned i = 0; i < VF; ++i )
696 MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
700 int BoUpSLP::getFirstUserIndex(ArrayRef<Value *> VL, unsigned VF) {
701 // Find the first user of the values.
702 int FirstUser = InstrVec.size();
703 for (unsigned i = 0; i < VF; ++i) {
704 for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end();
706 Instruction *Instr = dyn_cast<Instruction>(*U);
707 if (!Instr || Instr->getParent() != BB)
710 FirstUser = std::min(FirstUser, InstrIdx[Instr]);
716 int BoUpSLP::getLastIndex(Instruction *I, Instruction *J) {
717 assert(I->getParent() == BB && "Invalid parent for instruction I");
718 assert(J->getParent() == BB && "Invalid parent for instruction J");
719 return std::max(InstrIdx[I],InstrIdx[J]);
722 Instruction *BoUpSLP::getInsertionPoint(unsigned Index) {
723 return InstrVec[Index + 1];
726 Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) {
727 Value *Vec = UndefValue::get(Ty);
728 for (unsigned i=0; i < Ty->getNumElements(); ++i) {
729 // Generate the 'InsertElement' instruction.
730 Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
731 // Remember that this instruction is used as part of a 'gather' sequence.
732 // The caller of the bottom-up slp vectorizer can try to hoist the sequence
733 // if the users are outside of the basic block.
734 GatherInstructions.push_back(Vec);
737 for (unsigned i = 0; i < Ty->getNumElements(); ++i)
738 VectorizedValues[VL[i]] = Vec;
743 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) {
744 Value *V = vectorizeTree_rec(VL, VF);
746 int LastInstrIdx = getLastIndex(VL, VL.size());
747 for (SetVector<Value*>::iterator it = MustExtract.begin(),
748 e = MustExtract.end(); it != e; ++it) {
749 Instruction *I = cast<Instruction>(*it);
751 // This is a scalarized value, so we can use the original value.
752 // No need to extract from the vector.
753 if (!LaneMap.count(I))
756 Value *Vec = VectorizedValues[I];
757 // We decided not to vectorize I because one of its users was not
758 // vectorizerd. This is okay.
762 Value *Idx = Builder.getInt32(LaneMap[I]);
763 Value *Extract = Builder.CreateExtractElement(Vec, Idx);
764 bool Replaced = false;
765 for (Value::use_iterator U = I->use_begin(), UE = I->use_end(); U != UE;
767 Instruction *UI = cast<Instruction>(*U);
768 if (UI->getParent() != I->getParent() || InstrIdx[UI] > LastInstrIdx)
769 UI->replaceUsesOfWith(I ,Extract);
772 assert(Replaced && "Must replace at least one outside user");
776 // We moved some instructions around. We have to number them again
777 // before we can do any analysis.
778 numberInstructions();
779 MustScalarize.clear();
781 VectorizedValues.clear();
785 Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) {
786 Type *ScalarTy = VL[0]->getType();
787 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
788 ScalarTy = SI->getValueOperand()->getType();
789 VectorType *VecTy = VectorType::get(ScalarTy, VF);
791 // Check if all of the operands are constants or identical.
792 bool AllConst = true;
793 bool AllSameScalar = true;
794 for (unsigned i = 0, e = VF; i < e; ++i) {
795 AllConst &= isa<Constant>(VL[i]);
796 AllSameScalar &= (VL[0] == VL[i]);
797 // The instruction must be in the same BB, and it must be vectorizable.
798 Instruction *I = dyn_cast<Instruction>(VL[i]);
799 if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB))
800 return Scalarize(VL, VecTy);
803 // Check that this is a simple vector constant.
804 if (AllConst || AllSameScalar)
805 return Scalarize(VL, VecTy);
807 // Scalarize unknown structures.
808 Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
810 return Scalarize(VL, VecTy);
812 if (VectorizedValues.count(VL0)) {
813 Value * Vec = VectorizedValues[VL0];
814 for (int i = 0; i < VF; ++i)
815 VectorizedValues[VL[i]] = Vec;
819 unsigned Opcode = VL0->getOpcode();
820 for (unsigned i = 0, e = VF; i < e; ++i) {
821 Instruction *I = dyn_cast<Instruction>(VL[i]);
822 // If not all of the instructions are identical then we have to scalarize.
823 if (!I || Opcode != I->getOpcode())
824 return Scalarize(VL, VecTy);
828 case Instruction::ExtractElement: {
829 if (CanReuseExtract(VL, VL.size(), VecTy))
830 return VL0->getOperand(0);
831 return Scalarize(VL, VecTy);
833 case Instruction::ZExt:
834 case Instruction::SExt:
835 case Instruction::FPToUI:
836 case Instruction::FPToSI:
837 case Instruction::FPExt:
838 case Instruction::PtrToInt:
839 case Instruction::IntToPtr:
840 case Instruction::SIToFP:
841 case Instruction::UIToFP:
842 case Instruction::Trunc:
843 case Instruction::FPTrunc:
844 case Instruction::BitCast: {
846 for (int i = 0; i < VF; ++i)
847 INVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
848 Value *InVec = vectorizeTree_rec(INVL, VF);
849 CastInst *CI = dyn_cast<CastInst>(VL0);
850 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
852 for (int i = 0; i < VF; ++i)
853 VectorizedValues[VL[i]] = V;
857 case Instruction::FCmp:
858 case Instruction::ICmp: {
859 // Check that all of the compares have the same predicate.
860 CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
861 for (unsigned i = 1, e = VF; i < e; ++i) {
862 CmpInst *Cmp = cast<CmpInst>(VL[i]);
863 if (Cmp->getPredicate() != P0)
864 return Scalarize(VL, VecTy);
867 ValueList LHSV, RHSV;
868 for (int i = 0; i < VF; ++i) {
869 LHSV.push_back(cast<Instruction>(VL[i])->getOperand(0));
870 RHSV.push_back(cast<Instruction>(VL[i])->getOperand(1));
873 Value *L = vectorizeTree_rec(LHSV, VF);
874 Value *R = vectorizeTree_rec(RHSV, VF);
876 if (VL0->getOpcode() == Instruction::FCmp)
877 V = Builder.CreateFCmp(P0, L, R);
879 V = Builder.CreateICmp(P0, L, R);
881 for (int i = 0; i < VF; ++i)
882 VectorizedValues[VL[i]] = V;
887 case Instruction::Select: {
888 ValueList TrueVec, FalseVec, CondVec;
889 for (int i = 0; i < VF; ++i) {
890 CondVec.push_back(cast<Instruction>(VL[i])->getOperand(0));
891 TrueVec.push_back(cast<Instruction>(VL[i])->getOperand(1));
892 FalseVec.push_back(cast<Instruction>(VL[i])->getOperand(2));
895 Value *True = vectorizeTree_rec(TrueVec, VF);
896 Value *False = vectorizeTree_rec(FalseVec, VF);
897 Value *Cond = vectorizeTree_rec(CondVec, VF);
898 Value *V = Builder.CreateSelect(Cond, True, False);
900 for (int i = 0; i < VF; ++i)
901 VectorizedValues[VL[i]] = V;
905 case Instruction::Add:
906 case Instruction::FAdd:
907 case Instruction::Sub:
908 case Instruction::FSub:
909 case Instruction::Mul:
910 case Instruction::FMul:
911 case Instruction::UDiv:
912 case Instruction::SDiv:
913 case Instruction::FDiv:
914 case Instruction::URem:
915 case Instruction::SRem:
916 case Instruction::FRem:
917 case Instruction::Shl:
918 case Instruction::LShr:
919 case Instruction::AShr:
920 case Instruction::And:
921 case Instruction::Or:
922 case Instruction::Xor: {
923 ValueList LHSVL, RHSVL;
924 for (int i = 0; i < VF; ++i) {
925 LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
926 RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
929 Value *LHS = vectorizeTree_rec(LHSVL, VF);
930 Value *RHS = vectorizeTree_rec(RHSVL, VF);
931 BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
932 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS,RHS);
934 for (int i = 0; i < VF; ++i)
935 VectorizedValues[VL[i]] = V;
939 case Instruction::Load: {
940 LoadInst *LI = cast<LoadInst>(VL0);
941 unsigned Alignment = LI->getAlignment();
943 // Check if all of the loads are consecutive.
944 for (unsigned i = 1, e = VF; i < e; ++i)
945 if (!isConsecutiveAccess(VL[i-1], VL[i]))
946 return Scalarize(VL, VecTy);
948 // Loads are inserted at the head of the tree because we don't want to sink
949 // them all the way down past store instructions.
950 Instruction *Loc = getInsertionPoint(getLastIndex(VL, VL.size()));
951 IRBuilder<> LoadBuilder(Loc);
952 Value *VecPtr = LoadBuilder.CreateBitCast(LI->getPointerOperand(),
953 VecTy->getPointerTo());
954 LI = LoadBuilder.CreateLoad(VecPtr);
955 LI->setAlignment(Alignment);
957 for (int i = 0; i < VF; ++i)
958 VectorizedValues[VL[i]] = LI;
962 case Instruction::Store: {
963 StoreInst *SI = cast<StoreInst>(VL0);
964 unsigned Alignment = SI->getAlignment();
967 for (int i = 0; i < VF; ++i)
968 ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
970 Value *VecValue = vectorizeTree_rec(ValueOp, VF);
971 Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
972 VecTy->getPointerTo());
973 Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
975 for (int i = 0; i < VF; ++i)
976 cast<Instruction>(VL[i])->eraseFromParent();
980 return Scalarize(VL, VecTy);
984 } // end of namespace