lib/Transforms/Scalar/LoopDistribute.cpp

   1 //===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file implements the Loop Distribution Pass.  Its main focus is to
  11 // distribute loops that cannot be vectorized due to dependence cycles.  It
  12 // tries to isolate the offending dependences into a new loop allowing
  13 // vectorization of the remaining parts.
  14 //
  15 // For dependence analysis, the pass uses the LoopVectorizer's
  16 // LoopAccessAnalysis.  Because this analysis presumes no change in the order of
  17 // memory operations, special care is taken to preserve the lexical order of
  18 // these operations.
  19 //
  20 // Similarly to the Vectorizer, the pass also supports loop versioning to
  21 // run-time disambiguate potentially overlapping arrays.
  22 //
  23 //===----------------------------------------------------------------------===//
  24
  25 #include "llvm/ADT/DepthFirstIterator.h"
  26 #include "llvm/ADT/EquivalenceClasses.h"
  27 #include "llvm/ADT/STLExtras.h"
  28 #include "llvm/ADT/Statistic.h"
  29 #include "llvm/Analysis/LoopAccessAnalysis.h"
  30 #include "llvm/Analysis/LoopInfo.h"
  31 #include "llvm/IR/Dominators.h"
  32 #include "llvm/Pass.h"
  33 #include "llvm/Support/CommandLine.h"
  34 #include "llvm/Support/Debug.h"
  35 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  36 #include "llvm/Transforms/Utils/Cloning.h"
  37 #include <list>
  38
  39 #define LDIST_NAME "loop-distribute"
  40 #define DEBUG_TYPE LDIST_NAME
  41
  42 using namespace llvm;
   43
      // Command-line switch "loop-distribute-verify".  Declared cl::Hidden and
      // off by default (cl::init(false)).  Per its description it turns on
      // DominatorTree and LoopInfo verification after distribution; the code
      // that reads it is not visible in this part of the file.
   44 static cl::opt<bool>
   45     LDistVerify("loop-distribute-verify", cl::Hidden,
   46                 cl::desc("Turn on DominatorTree and LoopInfo verification "
   47                          "after Loop Distribution"),
   48                 cl::init(false));
   49
      // Command-line switch "loop-distribute-non-if-convertible".  Declared
      // cl::Hidden and off by default.  While it is false,
      // InstPartitionContainer::mergeBeforePopulating() additionally runs
      // mergeNonIfConvertible().
   50 static cl::opt<bool> DistributeNonIfConvertible(
   51     "loop-distribute-non-if-convertible", cl::Hidden,
   52     cl::desc("Whether to distribute into a loop that may not be "
   53              "if-convertible by the loop vectorizer"),
   54     cl::init(false));
   55
      // Statistic counter described as "Number of loops distributed".
      // NOTE(review): the increment is not visible in this part of the file.
   56 STATISTIC(NumLoopsDistributed, "Number of loops distributed");
  57
  58 /// \brief Remaps instructions in a loop including the preheader.
  59 static void remapInstructionsInLoop(const SmallVectorImpl<BasicBlock *> &Blocks,
  60                                     ValueToValueMapTy &VMap) {
  61   // Rewrite the code to refer to itself.
  62   for (auto *BB : Blocks)
  63     for (auto &Inst : *BB)
  64       RemapInstruction(&Inst, VMap,
  65                        RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
  66 }
  67
  68 /// \brief Clones a loop \p OrigLoop.  Returns the loop and the blocks in \p
  69 /// Blocks.
  70 ///
  71 /// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
  72 /// \p LoopDomBB.  Insert the new blocks before block specified in \p Before.
  73 static Loop *cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
  74                                     Loop *OrigLoop, ValueToValueMapTy &VMap,
  75                                     const Twine &NameSuffix, LoopInfo *LI,
  76                                     DominatorTree *DT,
  77                                     SmallVectorImpl<BasicBlock *> &Blocks) {
  78   Function *F = OrigLoop->getHeader()->getParent();
  79   Loop *ParentLoop = OrigLoop->getParentLoop();
  80
  81   Loop *NewLoop = new Loop();
  82   if (ParentLoop)
  83     ParentLoop->addChildLoop(NewLoop);
  84   else
  85     LI->addTopLevelLoop(NewLoop);
  86
  87   BasicBlock *OrigPH = OrigLoop->getLoopPreheader();
  88   BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F);
  89   // To rename the loop PHIs.
  90   VMap[OrigPH] = NewPH;
  91   Blocks.push_back(NewPH);
  92
  93   // Update LoopInfo.
  94   if (ParentLoop)
  95     ParentLoop->addBasicBlockToLoop(NewPH, *LI);
  96
  97   // Update DominatorTree.
  98   DT->addNewBlock(NewPH, LoopDomBB);
  99
 100   for (BasicBlock *BB : OrigLoop->getBlocks()) {
 101     BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F);
 102     VMap[BB] = NewBB;
 103
 104     // Update LoopInfo.
 105     NewLoop->addBasicBlockToLoop(NewBB, *LI);
 106
 107     // Update DominatorTree.
 108     BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock();
 109     DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
 110
 111     Blocks.push_back(NewBB);
 112   }
 113
 114   // Move them physically from the end of the block list.
 115   F->getBasicBlockList().splice(Before, F->getBasicBlockList(), NewPH);
 116   F->getBasicBlockList().splice(Before, F->getBasicBlockList(),
 117                                 NewLoop->getHeader(), F->end());
 118
 119   return NewLoop;
 120 }
 121
 122 namespace {
 123 /// \brief Maintains the set of instructions of the loop for a partition before
 124 /// cloning.  After cloning, it hosts the new loop.
 125 class InstPartition {
 126   typedef SmallPtrSet<Instruction *, 8> InstructionSet;
 127
 128 public:
 129   InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
 130       : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) {
 131     Set.insert(I);
 132   }
 133
 134   /// \brief Returns whether this partition contains a dependence cycle.
 135   bool hasDepCycle() const { return DepCycle; }
 136
 137   /// \brief Adds an instruction to this partition.
 138   void add(Instruction *I) { Set.insert(I); }
 139
 140   /// \brief Collection accessors.
 141   InstructionSet::iterator begin() { return Set.begin(); }
 142   InstructionSet::iterator end() { return Set.end(); }
 143   InstructionSet::const_iterator begin() const { return Set.begin(); }
 144   InstructionSet::const_iterator end() const { return Set.end(); }
 145   bool empty() const { return Set.empty(); }
 146
 147   /// \brief Moves this partition into \p Other.  This partition becomes empty
 148   /// after this.
 149   void moveTo(InstPartition &Other) {
 150     Other.Set.insert(Set.begin(), Set.end());
 151     Set.clear();
 152     Other.DepCycle |= DepCycle;
 153   }
 154
 155   /// \brief Populates the partition with a transitive closure of all the
 156   /// instructions that the seeded instructions dependent on.
 157   void populateUsedSet() {
 158     // FIXME: We currently don't use control-dependence but simply include all
 159     // blocks (possibly empty at the end) and let simplifycfg mostly clean this
 160     // up.
 161     for (auto *B : OrigLoop->getBlocks())
 162       Set.insert(B->getTerminator());
 163
 164     // Follow the use-def chains to form a transitive closure of all the
 165     // instructions that the originally seeded instructions depend on.
 166     SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
 167     while (!Worklist.empty()) {
 168       Instruction *I = Worklist.pop_back_val();
 169       // Insert instructions from the loop that we depend on.
 170       for (Value *V : I->operand_values()) {
 171         auto *I = dyn_cast<Instruction>(V);
 172         if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
 173           Worklist.push_back(I);
 174       }
 175     }
 176   }
 177
 178   /// \brief Clones the original loop.
 179   ///
 180   /// Updates LoopInfo and DominatorTree using the information that block \p
 181   /// LoopDomBB dominates the loop.
 182   Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
 183                                unsigned Index, LoopInfo *LI,
 184                                DominatorTree *DT) {
 185     ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
 186                                           VMap, Twine(".ldist") + Twine(Index),
 187                                           LI, DT, ClonedLoopBlocks);
 188     return ClonedLoop;
 189   }
 190
 191   /// \brief The cloned loop.  If this partition is mapped to the original loop,
 192   /// this is null.
 193   const Loop *getClonedLoop() const { return ClonedLoop; }
 194
 195   /// \brief Returns the loop where this partition ends up after distribution.
 196   /// If this partition is mapped to the original loop then use the block from
 197   /// the loop.
 198   const Loop *getDistributedLoop() const {
 199     return ClonedLoop ? ClonedLoop : OrigLoop;
 200   }
 201
 202   /// \brief The VMap that is populated by cloning and then used in
 203   /// remapinstruction to remap the cloned instructions.
 204   ValueToValueMapTy &getVMap() { return VMap; }
 205
 206   /// \brief Remaps the cloned instructions using VMap.
 207   void remapInstructions() { remapInstructionsInLoop(ClonedLoopBlocks, VMap); }
 208
 209   /// \brief Based on the set of instructions selected for this partition,
 210   /// removes the unnecessary ones.
 211   void removeUnusedInsts() {
 212     SmallVector<Instruction *, 8> Unused;
 213
 214     for (auto *Block : OrigLoop->getBlocks())
 215       for (auto &Inst : *Block)
 216         if (!Set.count(&Inst)) {
 217           Instruction *NewInst = &Inst;
 218           if (!VMap.empty())
 219             NewInst = cast<Instruction>(VMap[NewInst]);
 220
 221           assert(!isa<BranchInst>(NewInst) &&
 222                  "Branches are marked used early on");
 223           Unused.push_back(NewInst);
 224         }
 225
 226     // Delete the instructions backwards, as it has a reduced likelihood of
 227     // having to update as many def-use and use-def chains.
 228     for (auto I = Unused.rbegin(), E = Unused.rend(); I != E; ++I) {
 229       auto *Inst = *I;
 230
 231       if (!Inst->use_empty())
 232         Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
 233       Inst->eraseFromParent();
 234     }
 235   }
 236
 237   void print() const {
 238     if (DepCycle)
 239       dbgs() << "  (cycle)\n";
 240     for (auto *I : Set)
 241       // Prefix with the block name.
 242       dbgs() << "  " << I->getParent()->getName() << ":" << *I << "\n";
 243   }
 244
 245   void printBlocks() const {
 246     for (auto *BB : getDistributedLoop()->getBlocks())
 247       dbgs() << *BB;
 248   }
 249
 250 private:
 251   /// \brief Instructions from OrigLoop selected for this partition.
 252   InstructionSet Set;
 253
 254   /// \brief Whether this partition contains a dependence cycle.
 255   bool DepCycle;
 256
 257   /// \brief The original loop.
 258   Loop *OrigLoop;
 259
 260   /// \brief The cloned loop.  If this partition is mapped to the original loop,
 261   /// this is null.
 262   Loop *ClonedLoop;
 263
 264   /// \brief The blocks of ClonedLoop including the preheader.  If this
 265   /// partition is mapped to the original loop, this is empty.
 266   SmallVector<BasicBlock *, 8> ClonedLoopBlocks;
 267
 268   /// \brief These gets populated once the set of instructions have been
 269   /// finalized. If this partition is mapped to the original loop, these are not
 270   /// set.
 271   ValueToValueMapTy VMap;
 272 };
 273
  274 /// \brief Holds the set of Partitions.  It populates them, merges them and then
  275 /// clones the loops.
  276 class InstPartitionContainer {
  277   typedef DenseMap<Instruction *, int> InstToPartitionIdT;
  278
  279 public:
        /// \brief Creates an empty container for partitions of loop \p L.  \p LI
        /// and \p DT are stored for the later merging and cloning steps.
  280   InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
  281       : L(L), LI(LI), DT(DT) {}
  282
  283   /// \brief Returns the number of partitions.
  284   unsigned getSize() const { return PartitionContainer.size(); }
  285
  286   /// \brief Adds \p Inst into the current partition if that is marked to
  287   /// contain cycles.  Otherwise start a new partition for it.
  288   void addToCyclicPartition(Instruction *Inst) {
  289     // If the current partition is non-cyclic.  Start a new one.
  290     if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
  291       PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
  292     else
  293       PartitionContainer.back().add(Inst);
  294   }
  295
  296   /// \brief Adds \p Inst into a partition that is not marked to contain
  297   /// dependence cycles.
  298   ///
  299   //  Initially we isolate memory instructions into as many partitions as
  300   //  possible, then later we may merge them back together.
  301   void addToNewNonCyclicPartition(Instruction *Inst) {
  302     PartitionContainer.emplace_back(Inst, L);
  303   }
  304
  305   /// \brief Merges adjacent non-cyclic partitions.
  306   ///
  307   /// The idea is that we currently only want to isolate the non-vectorizable
  308   /// partition.  We could later allow more distribution among these partition
  309   /// too.
  310   void mergeAdjacentNonCyclic() {
  311     mergeAdjacentPartitionsIf(
  312         [](const InstPartition *P) { return !P->hasDepCycle(); });
  313   }
  314
  315   /// \brief If a partition contains only conditional stores, we won't vectorize
  316   /// it.  Try to merge it with a previous cyclic partition.
  317   void mergeNonIfConvertible() {
  318     mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
            // Cyclic partitions always match, so a matching partition that
            // directly follows one is folded into it.
  319       if (Partition->hasDepCycle())
  320         return true;
  321
  322       // Now, check if all stores are conditional in this partition.
  323       bool seenStore = false;
  324
  325       for (auto *Inst : *Partition)
  326         if (isa<StoreInst>(Inst)) {
  327           seenStore = true;
  328           if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
  329             return false;
  330         }
            // Matches only when at least one store was seen and every store sits
            // in a block that needs predication.
  331       return seenStore;
  332     });
  333   }
  334
  335   /// \brief Merges the partitions according to various heuristics.
        ///
        /// mergeNonIfConvertible() is skipped when the
        /// -loop-distribute-non-if-convertible option is set.
  336   void mergeBeforePopulating() {
  337     mergeAdjacentNonCyclic();
  338     if (!DistributeNonIfConvertible)
  339       mergeNonIfConvertible();
  340   }
  341
  342   /// \brief Merges partitions in order to ensure that no loads are duplicated.
  343   ///
  344   /// We can't duplicate loads because that could potentially reorder them.
  345   /// LoopAccessAnalysis provides dependency information with the context that
  346   /// the order of memory operation is preserved.
  347   ///
  348   /// Return if any partitions were merged.
  349   bool mergeToAvoidDuplicatedLoads() {
  350     typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT;
  351     typedef EquivalenceClasses<InstPartition *> ToBeMergedT;
  352
          // Maps each load to the first partition (in list order) containing it.
  353     LoadToPartitionT LoadToPartition;
  354     ToBeMergedT ToBeMerged;
  355
  356     // Step through the partitions and create equivalence between partitions
  357     // that contain the same load.  Also put partitions in between them in the
  358     // same equivalence class to avoid reordering of memory operations.
  359     for (PartitionContainerT::iterator I = PartitionContainer.begin(),
  360                                        E = PartitionContainer.end();
  361          I != E; ++I) {
  362       auto *PartI = &*I;
  363
  364       // If a load occurs in two partitions PartI and PartJ, merge all
  365       // partitions (PartI, PartJ] into PartI.
  366       for (Instruction *Inst : *PartI)
  367         if (isa<LoadInst>(Inst)) {
  368           bool NewElt;
  369           LoadToPartitionT::iterator LoadToPart;
  370
  371           std::tie(LoadToPart, NewElt) =
  372               LoadToPartition.insert(std::make_pair(Inst, PartI));
  373           if (!NewElt) {
  374             DEBUG(dbgs() << "Merging partitions due to this load in multiple "
  375                          << "partitions: " << PartI << ", "
  376                          << LoadToPart->second << "\n" << *Inst << "\n");
  377
                  // Walk backwards from the current partition to the one that
                  // first recorded this load and union every partition on the
                  // way with the current one.
  378             auto PartJ = I;
  379             do {
  380               --PartJ;
  381               ToBeMerged.unionSets(PartI, &*PartJ);
  382             } while (&*PartJ != LoadToPart->second);
  383           }
  384         }
  385     }
  386     if (ToBeMerged.empty())
  387       return false;
  388
  389     // Merge the member of an equivalence class into its class leader.  This
  390     // makes the members empty.
  391     for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
  392          I != E; ++I) {
  393       if (!I->isLeader())
  394         continue;
  395
  396       auto PartI = I->getData();
            // std::next skips the first member of the class, so only the
            // remaining members are moved into PartI.
  397       for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
  398                                    ToBeMerged.member_end())) {
  399         PartJ->moveTo(*PartI);
  400       }
  401     }
  402
  403     // Remove the empty partitions.
  404     PartitionContainer.remove_if(
  405         [](const InstPartition &P) { return P.empty(); });
  406
  407     return true;
  408   }
  409
  410   /// \brief Sets up the mapping between instructions to partitions.  If the
  411   /// instruction is duplicated across multiple partitions, set the entry to -1.
        ///
        /// Partition ids are the positions of the partitions in the list, starting
        /// at 0.
  412   void setupPartitionIdOnInstructions() {
  413     int PartitionID = 0;
  414     for (const auto &Partition : PartitionContainer) {
  415       for (Instruction *Inst : Partition) {
  416         bool NewElt;
  417         InstToPartitionIdT::iterator Iter;
  418
  419         std::tie(Iter, NewElt) =
  420             InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
  421         if (!NewElt)
  422           Iter->second = -1;
  423       }
  424       ++PartitionID;
  425     }
  426   }
  427
  428   /// \brief Populates the partition with everything that the seeding
  429   /// instructions require.
  430   void populateUsedSet() {
  431     for (auto &P : PartitionContainer)
  432       P.populateUsedSet();
  433   }
  434
  435   /// \brief This performs the main chunk of the work of cloning the loops for
  436   /// the partitions.
        ///
        /// The last partition keeps the original loop; every other partition gets
        /// a clone placed in front of it.  \p P is not used by this function.
  437   void cloneLoops(Pass *P) {
  438     BasicBlock *OrigPH = L->getLoopPreheader();
  439     // At this point the predecessor of the preheader is either the memcheck
  440     // block or the top part of the original preheader.
  441     BasicBlock *Pred = OrigPH->getSinglePredecessor();
  442     assert(Pred && "Preheader does not have a single predecessor");
  443     BasicBlock *ExitBlock = L->getExitBlock();
  444     assert(ExitBlock && "No single exit block");
  445     Loop *NewLoop;
  446
          // NOTE(review): the message speaks of two partitions, but the condition
          // only rejects an empty container.
  447     assert(!PartitionContainer.empty() && "at least two partitions expected");
  448     // We're cloning the preheader along with the loop so we already made sure
  449     // it was empty.
  450     assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
  451            "preheader not empty");
  452
  453     // Create a loop for each partition except the last.  Clone the original
  454     // loop before PH along with adding a preheader for the cloned loop.  Then
  455     // update PH to point to the newly added preheader.
  456     BasicBlock *TopPH = OrigPH;
  457     unsigned Index = getSize() - 1;
  458     for (auto I = std::next(PartitionContainer.rbegin()),
  459               E = PartitionContainer.rend();
  460          I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
  461       auto *Part = &*I;
  462
  463       NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);
  464
            // Map the exit block to the preheader of the loop that follows, then
            // remap the clone.
  465       Part->getVMap()[ExitBlock] = TopPH;
  466       Part->remapInstructions();
  467     }
          // Enter the chain of loops through the preheader of the top-most loop.
  468     Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);
  469
  470     // Now go in forward order and update the immediate dominator for the
  471     // preheaders with the exiting block of the previous loop.  Dominance
  472     // within the loop is updated in cloneLoopWithPreheader.
  473     for (auto Curr = PartitionContainer.cbegin(),
  474               Next = std::next(PartitionContainer.cbegin()),
  475               E = PartitionContainer.cend();
  476          Next != E; ++Curr, ++Next)
  477       DT->changeImmediateDominator(
  478           Next->getDistributedLoop()->getLoopPreheader(),
  479           Curr->getDistributedLoop()->getExitingBlock());
  480   }
  481
  482   /// \brief Removes the dead instructions from the cloned loops.
  483   void removeUnusedInsts() {
  484     for (auto &Partition : PartitionContainer)
  485       Partition.removeUnusedInsts();
  486   }
  487
  488   /// \brief For each memory pointer, it computes the partitionId the pointer is
  489   /// used in.
  490   ///
  491   /// This returns an array of int where the I-th entry corresponds to I-th
  492   /// entry in LAI.getRuntimePointerCheck().  If the pointer is used in multiple
  493   /// partitions its entry is set to -1.
  494   SmallVector<int, 8>
  495   computePartitionSetForPointers(const LoopAccessInfo &LAI) {
  496     const LoopAccessInfo::RuntimePointerCheck *RtPtrCheck =
  497         LAI.getRuntimePointerCheck();
  498
  499     unsigned N = RtPtrCheck->Pointers.size();
  500     SmallVector<int, 8> PtrToPartitions(N);
  501     for (unsigned I = 0; I < N; ++I) {
  502       Value *Ptr = RtPtrCheck->Pointers[I];
  503       auto Instructions =
  504           LAI.getInstructionsForAccess(Ptr, RtPtrCheck->IsWritePtr[I]);
  505
  506       int &Partition = PtrToPartitions[I];
  507       // First set it to uninitialized.
  508       Partition = -2;
  509       for (Instruction *Inst : Instructions) {
  510         // Note that this could be -1 if Inst is duplicated across multiple
  511         // partitions.
              // NOTE(review): this looks the id up with operator[]; presumably an
              // instruction that was never registered would silently get id 0 --
              // confirm setupPartitionIdOnInstructions() always runs first.
  512         int ThisPartition = this->InstToPartitionId[Inst];
  513         if (Partition == -2)
  514           Partition = ThisPartition;
  515         // -1 means belonging to multiple partitions.
  516         else if (Partition == -1)
  517           break;
  518         else if (Partition != (int)ThisPartition)
  519           Partition = -1;
  520       }
  521       assert(Partition != -2 && "Pointer not belonging to any partition");
  522     }
  523
  524     return PtrToPartitions;
  525   }
  526
        /// \brief Prints a header line for each partition to \p OS followed by the
        /// partition's instructions.
        ///
        /// NOTE(review): InstPartition::print() takes no stream and writes to
        /// dbgs(), so only the header lines go to \p OS.
  527   void print(raw_ostream &OS) const {
  528     unsigned Index = 0;
  529     for (const auto &P : PartitionContainer) {
  530       OS << "Partition " << Index++ << " (" << &P << "):\n";
  531       P.print();
  532     }
  533   }
  534
        /// \brief Prints all partitions to dbgs().
  535   void dump() const { print(dbgs()); }
  536
  537 #ifndef NDEBUG
        /// \brief Stream operator forwarding to print().  Only available when
        /// NDEBUG is not defined.
  538   friend raw_ostream &operator<<(raw_ostream &OS,
  539                                  const InstPartitionContainer &Partitions) {
  540     Partitions.print(OS);
  541     return OS;
  542   }
  543 #endif
  544
        /// \brief Prints, for each partition, the blocks of the loop it is mapped
        /// to.  Output goes to dbgs().
  545   void printBlocks() const {
  546     unsigned Index = 0;
  547     for (const auto &P : PartitionContainer) {
  548       dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
  549       P.printBlocks();
  550     }
  551   }
  552
  553 private:
  554   typedef std::list<InstPartition> PartitionContainerT;
  555
  556   /// \brief List of partitions.
  557   PartitionContainerT PartitionContainer;
  558
  559   /// \brief Mapping from Instruction to partition Id.  If the instruction
  560   /// belongs to multiple partitions the entry contains -1.
  561   InstToPartitionIdT InstToPartitionId;
  562
        /// \brief The loop being partitioned, and the analyses handed to the
        /// constructor.
  563   Loop *L;
  564   LoopInfo *LI;
  565   DominatorTree *DT;
  566
  567   /// \brief The control structure to merge adjacent partitions if both satisfy
  568   /// the \p Predicate.
        ///
        /// A run of consecutive matching partitions is folded into the first
        /// partition of the run; the others are erased from the list.
  569   template <class UnaryPredicate>
  570   void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
  571     InstPartition *PrevMatch = nullptr;
  572     for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
  573       auto DoesMatch = Predicate(&*I);
  574       if (PrevMatch == nullptr && DoesMatch) {
              // First partition of a new run of matches.
  575         PrevMatch = &*I;
  576         ++I;
  577       } else if (PrevMatch != nullptr && DoesMatch) {
              // Continue the run: fold into its first partition.  erase() yields
              // the iterator to the next partition.
  578         I->moveTo(*PrevMatch);
  579         I = PartitionContainer.erase(I);
  580       } else {
              // No match: the run, if any, ends here.
  581         PrevMatch = nullptr;
  582         ++I;
  583       }
  584     }
  585   }
  586 };
 587
 588 /// \brief For each memory instruction, this class maintains difference of the
 589 /// number of unsafe dependences that start out from this instruction minus
 590 /// those that end here.
 591 ///
 592 /// By traversing the memory instructions in program order and accumulating this
 593 /// number, we know whether any unsafe dependence crosses over a program point.
 594 class MemoryInstructionDependences {
 595   typedef MemoryDepChecker::Dependence Dependence;
 596
 597 public:
 598   struct Entry {
 599     Instruction *Inst;
 600     unsigned NumUnsafeDependencesStartOrEnd;
 601
 602     Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {}
 603   };
 604
 605   typedef SmallVector<Entry, 8> AccessesType;
 606
 607   AccessesType::const_iterator begin() const { return Accesses.begin(); }
 608   AccessesType::const_iterator end() const { return Accesses.end(); }
 609
 610   MemoryInstructionDependences(
 611       const SmallVectorImpl<Instruction *> &Instructions,
 612       const SmallVectorImpl<Dependence> &InterestingDependences) {
 613     Accesses.append(Instructions.begin(), Instructions.end());
 614
 615     DEBUG(dbgs() << "Backward dependences:\n");
 616     for (auto &Dep : InterestingDependences)
 617       if (Dep.isPossiblyBackward()) {
 618         // Note that the designations source and destination follow the program
 619         // order, i.e. source is always first.  (The direction is given by the
 620         // DepType.)
 621         ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
 622         --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
 623
 624         DEBUG(Dep.print(dbgs(), 2, Instructions));
 625       }
 626   }
 627
 628 private:
 629   AccessesType Accesses;
 630 };
 631
 632 /// \brief Handles the loop versioning based on memchecks.
 633 class RuntimeCheckEmitter {
 634 public:
 635   RuntimeCheckEmitter(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
 636                       DominatorTree *DT,
 637                       const SmallVector<int, 8> *PtrToPartition = nullptr)
 638       : OrigLoop(L), NonDistributedLoop(nullptr),
 639         PtrToPartition(PtrToPartition), LAI(LAI), LI(LI), DT(DT) {}
 640
 641   /// \brief Returns true if we need memchecks to distribute the loop.
 642   bool needsRuntimeChecks() const {
 643     return LAI.getRuntimePointerCheck()->needsAnyChecking(PtrToPartition);
 644   }
 645
 646   /// \brief Performs the CFG manipulation part of versioning the loop including
 647   /// the DominatorTree and LoopInfo updates.
 648   void versionLoop(Pass *P) {
 649     Instruction *FirstCheckInst;
 650     Instruction *MemRuntimeCheck;
 651     // Add the memcheck in the original preheader (this is empty initially).
 652     BasicBlock *MemCheckBB = OrigLoop->getLoopPreheader();
 653     std::tie(FirstCheckInst, MemRuntimeCheck) =
 654         LAI.addRuntimeCheck(MemCheckBB->getTerminator(), PtrToPartition);
 655     assert(MemRuntimeCheck && "called even though needsAnyChecking = false");
 656
 657     // Rename the block to make the IR more readable.
 658     MemCheckBB->setName(OrigLoop->getHeader()->getName() + ".ldist.memcheck");
 659
 660     // Create empty preheader for the loop (and after cloning for the
 661     // original/nondist loop).
 662     BasicBlock *PH =
 663         SplitBlock(MemCheckBB, MemCheckBB->getTerminator(), DT, LI);
 664     PH->setName(OrigLoop->getHeader()->getName() + ".ph");
 665
 666     // Clone the loop including the preheader.
 667     //
 668     // FIXME: This does not currently preserve SimplifyLoop because the exit
 669     // block is a join between the two loops.
 670     SmallVector<BasicBlock *, 8> NonDistributedLoopBlocks;
 671     NonDistributedLoop =
 672         cloneLoopWithPreheader(PH, MemCheckBB, OrigLoop, VMap, ".ldist.nondist",
 673                                LI, DT, NonDistributedLoopBlocks);
 674     remapInstructionsInLoop(NonDistributedLoopBlocks, VMap);
 675
 676     // Insert the conditional branch based on the result of the memchecks.
 677     Instruction *OrigTerm = MemCheckBB->getTerminator();
 678     BranchInst::Create(NonDistributedLoop->getLoopPreheader(),
 679                        OrigLoop->getLoopPreheader(), MemRuntimeCheck, OrigTerm);
 680     OrigTerm->eraseFromParent();
 681
 682     // The loops merge in the original exit block.  This is now dominated by the
 683     // memchecking block.
 684     DT->changeImmediateDominator(OrigLoop->getExitBlock(), MemCheckBB);
 685   }
 686
 687   /// \brief Adds the necessary PHI nodes for the versioned loops based on the
 688   /// loop-defined values used outside of the loop.
 689   void addPHINodes(const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
 690     BasicBlock *PHIBlock = OrigLoop->getExitBlock();
 691     assert(PHIBlock && "No single successor to loop exit block");
 692
 693     for (auto *Inst : DefsUsedOutside) {
 694       auto *NonDistInst = cast<Instruction>(VMap[Inst]);
 695       PHINode *PN;
 696
 697       // First see if we have a single-operand PHI with the value defined by the
 698       // original loop.
 699       for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
 700         assert(PN->getNumOperands() == 1 &&
 701                "Exit block should only have on predecessor");
 702         if (PN->getIncomingValue(0) == Inst)
 703           break;
 704       }
 705       // If not create it.
 706       if (!PN) {
 707         PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".ldist",
 708                              PHIBlock->begin());
 709         for (auto *User : Inst->users())
 710           if (!OrigLoop->contains(cast<Instruction>(User)->getParent()))
 711             User->replaceUsesOfWith(Inst, PN);
 712         PN->addIncoming(Inst, OrigLoop->getExitingBlock());
 713       }
 714       // Add the new incoming value from the non-distributed loop.
 715       PN->addIncoming(NonDistInst, NonDistributedLoop->getExitingBlock());
 716     }
 717   }
 718
 719 private:
 720   /// \brief The original loop.  This becomes the "versioned" one, i.e. control
 721   /// goes if the memchecks all pass.
 722   Loop *OrigLoop;
 723   /// \brief The fall-back loop, i.e. if any of the memchecks fail.
 724   Loop *NonDistributedLoop;
 725
 726   /// \brief For each memory pointer it contains the partitionId it is used in.
 727   /// If nullptr, no partitioning is used.
 728   ///
 729   /// The I-th entry corresponds to I-th entry in LAI.getRuntimePointerCheck().
 730   /// If the pointer is used in multiple partitions the entry is set to -1.
 731   const SmallVector<int, 8> *PtrToPartition;
 732
 733   /// \brief This maps the instructions from OrigLoop to their counterpart in
 734   /// NonDistributedLoop.
 735   ValueToValueMapTy VMap;
 736
 737   /// \brief Analyses used.
 738   const LoopAccessInfo &LAI;
 739   LoopInfo *LI;
 740   DominatorTree *DT;
 741 };
 742
 743 /// \brief Returns the instructions that use values defined in the loop.
 744 static SmallVector<Instruction *, 8> findDefsUsedOutsideOfLoop(Loop *L) {
 745   SmallVector<Instruction *, 8> UsedOutside;
 746
 747   for (auto *Block : L->getBlocks())
 748     // FIXME: I believe that this could use copy_if if the Inst reference could
 749     // be adapted into a pointer.
 750     for (auto &Inst : *Block) {
 751       auto Users = Inst.users();
 752       if (std::any_of(Users.begin(), Users.end(), [&](User *U) {
 753             auto *Use = cast<Instruction>(U);
 754             return !L->contains(Use->getParent());
 755           }))
 756         UsedOutside.push_back(&Inst);
 757     }
 758
 759   return UsedOutside;
 760 }
 761
 762 /// \brief The pass class.
 763 class LoopDistribute : public FunctionPass {
 764 public:
 765   LoopDistribute() : FunctionPass(ID) {
 766     initializeLoopDistributePass(*PassRegistry::getPassRegistry());
 767   }
 768
 769   bool runOnFunction(Function &F) override {
 770     LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 771     LAA = &getAnalysis<LoopAccessAnalysis>();
 772     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 773
 774     // Build up a worklist of inner-loops to vectorize. This is necessary as the
 775     // act of distributing a loop creates new loops and can invalidate iterators
 776     // across the loops.
 777     SmallVector<Loop *, 8> Worklist;
 778
 779     for (Loop *TopLevelLoop : *LI)
 780       for (Loop *L : depth_first(TopLevelLoop))
 781         // We only handle inner-most loops.
 782         if (L->empty())
 783           Worklist.push_back(L);
 784
 785     // Now walk the identified inner loops.
 786     bool Changed = false;
 787     for (Loop *L : Worklist)
 788       Changed |= processLoop(L);
 789
 790     // Process each loop nest in the function.
 791     return Changed;
 792   }
 793
 794   void getAnalysisUsage(AnalysisUsage &AU) const override {
 795     AU.addRequired<LoopInfoWrapperPass>();
 796     AU.addPreserved<LoopInfoWrapperPass>();
 797     AU.addRequired<LoopAccessAnalysis>();
 798     AU.addRequired<DominatorTreeWrapperPass>();
 799     AU.addPreserved<DominatorTreeWrapperPass>();
 800   }
 801
 802   static char ID;
 803
 804 private:
 805   /// \brief Try to distribute an inner-most loop.
 806   bool processLoop(Loop *L) {
 807     assert(L->empty() && "Only process inner loops.");
 808
 809     DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
 810                  << "\" checking " << *L << "\n");
 811
 812     BasicBlock *PH = L->getLoopPreheader();
 813     if (!PH) {
 814       DEBUG(dbgs() << "Skipping; no preheader");
 815       return false;
 816     }
 817     if (!L->getExitBlock()) {
 818       DEBUG(dbgs() << "Skipping; multiple exit blocks");
 819       return false;
 820     }
 821     // LAA will check that we only have a single exiting block.
 822
 823     const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
 824
 825     // Currently, we only distribute to isolate the part of the loop with
 826     // dependence cycles to enable partial vectorization.
 827     if (LAI.canVectorizeMemory()) {
 828       DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization");
 829       return false;
 830     }
 831     auto *InterestingDependences =
 832         LAI.getDepChecker().getInterestingDependences();
 833     if (!InterestingDependences || InterestingDependences->empty()) {
 834       DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate");
 835       return false;
 836     }
 837
 838     InstPartitionContainer Partitions(L, LI, DT);
 839
 840     // First, go through each memory operation and assign them to consecutive
 841     // partitions (the order of partitions follows program order).  Put those
 842     // with unsafe dependences into "cyclic" partition otherwise put each store
 843     // in its own "non-cyclic" partition (we'll merge these later).
 844     //
 845     // Note that a memory operation (e.g. Load2 below) at a program point that
 846     // has an unsafe dependence (Store3->Load1) spanning over it must be
 847     // included in the same cyclic partition as the dependent operations.  This
 848     // is to preserve the original program order after distribution.  E.g.:
 849     //
 850     //                NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
 851     //  Load1   -.                     1                       0->1
 852     //  Load2    | /Unsafe/            0                       1
 853     //  Store3  -'                    -1                       1->0
 854     //  Load4                          0                       0
 855     //
 856     // NumUnsafeDependencesActive > 0 indicates this situation and in this case
 857     // we just keep assigning to the same cyclic partition until
 858     // NumUnsafeDependencesActive reaches 0.
 859     const MemoryDepChecker &DepChecker = LAI.getDepChecker();
 860     MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
 861                                      *InterestingDependences);
 862
 863     int NumUnsafeDependencesActive = 0;
 864     for (auto &InstDep : MID) {
 865       Instruction *I = InstDep.Inst;
 866       // We update NumUnsafeDependencesActive post-instruction, catch the
 867       // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
 868       if (NumUnsafeDependencesActive ||
 869           InstDep.NumUnsafeDependencesStartOrEnd > 0)
 870         Partitions.addToCyclicPartition(I);
 871       else
 872         Partitions.addToNewNonCyclicPartition(I);
 873       NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
 874       assert(NumUnsafeDependencesActive >= 0 &&
 875              "Negative number of dependences active");
 876     }
 877
 878     // Add partitions for values used outside.  These partitions can be out of
 879     // order from the original program order.  This is OK because if the
 880     // partition uses a load we will merge this partition with the original
 881     // partition of the load that we set up in the previous loop (see
 882     // mergeToAvoidDuplicatedLoads).
 883     auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
 884     for (auto *Inst : DefsUsedOutside)
 885       Partitions.addToNewNonCyclicPartition(Inst);
 886
 887     DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
 888     if (Partitions.getSize() < 2)
 889       return false;
 890
 891     // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
 892     // should be able to vectorize these together.
 893     Partitions.mergeBeforePopulating();
 894     DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
 895     if (Partitions.getSize() < 2)
 896       return false;
 897
 898     // Now, populate the partitions with non-memory operations.
 899     Partitions.populateUsedSet();
 900     DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
 901
 902     // In order to preserve original lexical order for loads, keep them in the
 903     // partition that we set up in the MemoryInstructionDependences loop.
 904     if (Partitions.mergeToAvoidDuplicatedLoads()) {
 905       DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
 906                    << Partitions);
 907       if (Partitions.getSize() < 2)
 908         return false;
 909     }
 910
 911     DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
 912     // We're done forming the partitions set up the reverse mapping from
 913     // instructions to partitions.
 914     Partitions.setupPartitionIdOnInstructions();
 915
 916     // To keep things simple have an empty preheader before we version or clone
 917     // the loop.  (Also split if this has no predecessor, i.e. entry, because we
 918     // rely on PH having a predecessor.)
 919     if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
 920       SplitBlock(PH, PH->getTerminator(), DT, LI);
 921
 922     // If we need run-time checks to disambiguate pointers are run-time, version
 923     // the loop now.
 924     auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI);
 925     RuntimeCheckEmitter RtCheckEmitter(LAI, L, LI, DT, &PtrToPartition);
 926     if (RtCheckEmitter.needsRuntimeChecks()) {
 927       DEBUG(dbgs() << "\nPointers:\n");
 928       DEBUG(LAI.getRuntimePointerCheck()->print(dbgs(), 0, &PtrToPartition));
 929       RtCheckEmitter.versionLoop(this);
 930       RtCheckEmitter.addPHINodes(DefsUsedOutside);
 931     }
 932
 933     // Create identical copies of the original loop for each partition and hook
 934     // them up sequentially.
 935     Partitions.cloneLoops(this);
 936
 937     // Now, we remove the instruction from each loop that don't belong to that
 938     // partition.
 939     Partitions.removeUnusedInsts();
 940     DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
 941     DEBUG(Partitions.printBlocks());
 942
 943     if (LDistVerify) {
 944       LI->verify();
 945       DT->verifyDomTree();
 946     }
 947
 948     ++NumLoopsDistributed;
 949     return true;
 950   }
 951
 952   // Analyses used.
 953   LoopInfo *LI;
 954   LoopAccessAnalysis *LAA;
 955   DominatorTree *DT;
 956 };
 957 } // anonymous namespace
 958
 959 char LoopDistribute::ID;
 960 static const char ldist_name[] = "Loop Distribition";
 961
 962 INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)
 963 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 964 INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
 965 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 966 INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)
 967
 968 namespace llvm {
 969 FunctionPass *createLoopDistributePass() { return new LoopDistribute(); }
 970 }