lib/Transforms/Scalar/LoopRerollPass.cpp

   1 //===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This pass implements a simple loop reroller.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "llvm/Transforms/Scalar.h"
  15 #include "llvm/ADT/MapVector.h"
  16 #include "llvm/ADT/STLExtras.h"
  17 #include "llvm/ADT/SmallBitVector.h"
  18 #include "llvm/ADT/SmallSet.h"
  19 #include "llvm/ADT/Statistic.h"
  20 #include "llvm/Analysis/AliasAnalysis.h"
  21 #include "llvm/Analysis/AliasSetTracker.h"
  22 #include "llvm/Analysis/LoopPass.h"
  23 #include "llvm/Analysis/ScalarEvolution.h"
  24 #include "llvm/Analysis/ScalarEvolutionExpander.h"
  25 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  26 #include "llvm/Analysis/TargetLibraryInfo.h"
  27 #include "llvm/Analysis/ValueTracking.h"
  28 #include "llvm/IR/DataLayout.h"
  29 #include "llvm/IR/Dominators.h"
  30 #include "llvm/IR/IntrinsicInst.h"
  31 #include "llvm/Support/CommandLine.h"
  32 #include "llvm/Support/Debug.h"
  33 #include "llvm/Support/raw_ostream.h"
  34 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  35 #include "llvm/Transforms/Utils/Local.h"
  36 #include "llvm/Transforms/Utils/LoopUtils.h"
  37
  38 using namespace llvm;
  39
  40 #define DEBUG_TYPE "loop-reroll"
  41
  42 STATISTIC(NumRerolledLoops, "Number of rerolled loops");
  43
  44 static cl::opt<unsigned>
  45 MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden,
  46   cl::desc("The maximum increment for loop rerolling"));
  47
  48 static cl::opt<unsigned>
  49 NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
  50                           cl::Hidden,
  51                           cl::desc("The maximum number of failures to tolerate"
  52                                    " during fuzzy matching. (default: 400)"));
  53
  54 // This loop re-rolling transformation aims to transform loops like this:
  55 //
  56 // int foo(int a);
  57 // void bar(int *x) {
  58 //   for (int i = 0; i < 500; i += 3) {
  59 //     foo(i);
  60 //     foo(i+1);
  61 //     foo(i+2);
  62 //   }
  63 // }
  64 //
  65 // into a loop like this:
  66 //
  67 // void bar(int *x) {
  68 //   for (int i = 0; i < 500; ++i)
  69 //     foo(i);
  70 // }
  71 //
  72 // It does this by looking for loops that, besides the latch code, are composed
  73 // of isomorphic DAGs of instructions, with each DAG rooted at some increment
  74 // to the induction variable, and where each DAG is isomorphic to the DAG
  75 // rooted at the induction variable (excepting the sub-DAGs which root the
  76 // other induction-variable increments). In other words, we're looking for loop
  77 // bodies of the form:
  78 //
  79 // %iv = phi [ (preheader, ...), (body, %iv.next) ]
  80 // f(%iv)
  81 // %iv.1 = add %iv, 1                <-- a root increment
  82 // f(%iv.1)
  83 // %iv.2 = add %iv, 2                <-- a root increment
  84 // f(%iv.2)
  85 // %iv.scale_m_1 = add %iv, scale-1  <-- a root increment
  86 // f(%iv.scale_m_1)
  87 // ...
  88 // %iv.next = add %iv, scale
  89 // %cmp = icmp(%iv, ...)
  90 // br %cmp, header, exit
  91 //
  92 // where each f(i) is a set of instructions that, collectively, are a function
  93 // only of i (and other loop-invariant values).
  94 //
  95 // As a special case, we can also reroll loops like this:
  96 //
  97 // int foo(int);
  98 // void bar(int *x) {
  99 //   for (int i = 0; i < 500; ++i) {
 100 //     x[3*i] = foo(0);
 101 //     x[3*i+1] = foo(0);
 102 //     x[3*i+2] = foo(0);
 103 //   }
 104 // }
 105 //
 106 // into this:
 107 //
 108 // void bar(int *x) {
 109 //   for (int i = 0; i < 1500; ++i)
 110 //     x[i] = foo(0);
 111 // }
 112 //
 113 // in which case, we're looking for inputs like this:
 114 //
 115 // %iv = phi [ (preheader, ...), (body, %iv.next) ]
 116 // %scaled.iv = mul %iv, scale
 117 // f(%scaled.iv)
 118 // %scaled.iv.1 = add %scaled.iv, 1
 119 // f(%scaled.iv.1)
 120 // %scaled.iv.2 = add %scaled.iv, 2
 121 // f(%scaled.iv.2)
 122 // %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
 123 // f(%scaled.iv.scale_m_1)
 124 // ...
 125 // %iv.next = add %iv, 1
 126 // %cmp = icmp(%iv, ...)
 127 // br %cmp, header, exit
 128
 129 namespace {
 130   enum IterationLimits {
 131     /// The maximum number of iterations that we'll try and reroll. This
 132     /// has to be less than 25 in order to fit into a SmallBitVector.
 133     IL_MaxRerollIterations = 16,
 134     /// The bitvector index used by loop induction variables and other
 135     /// instructions that belong to all iterations.
 136     IL_All,
 137     IL_End
 138   };
 139
 140   class LoopReroll : public LoopPass {
 141   public:
 142     static char ID; // Pass ID, replacement for typeid
 143     LoopReroll() : LoopPass(ID) {
 144       initializeLoopRerollPass(*PassRegistry::getPassRegistry());
 145     }
 146
 147     bool runOnLoop(Loop *L, LPPassManager &LPM) override;
 148
 149     void getAnalysisUsage(AnalysisUsage &AU) const override {
 150       AU.addRequired<AAResultsWrapperPass>();
 151       AU.addRequired<LoopInfoWrapperPass>();
 152       AU.addPreserved<LoopInfoWrapperPass>();
 153       AU.addRequired<DominatorTreeWrapperPass>();
 154       AU.addPreserved<DominatorTreeWrapperPass>();
 155       AU.addRequired<ScalarEvolutionWrapperPass>();
 156       AU.addRequired<TargetLibraryInfoWrapperPass>();
 157     }
 158
 159   protected:
 160     AliasAnalysis *AA;
 161     LoopInfo *LI;
 162     ScalarEvolution *SE;
 163     TargetLibraryInfo *TLI;
 164     DominatorTree *DT;
 165
 166     typedef SmallVector<Instruction *, 16> SmallInstructionVector;
 167     typedef SmallSet<Instruction *, 16>   SmallInstructionSet;
 168
 169     // Map between induction variable and its increment
 170     DenseMap<Instruction *, int64_t> IVToIncMap;
 171
 172     // A chain of isomorphic instructions, identified by a single-use PHI
 173     // representing a reduction. Only the last value may be used outside the
 174     // loop.
 175     struct SimpleLoopReduction {
 176       SimpleLoopReduction(Instruction *P, Loop *L)
 177         : Valid(false), Instructions(1, P) {
 178         assert(isa<PHINode>(P) && "First reduction instruction must be a PHI");
 179         add(L);
 180       }
 181
 182       bool valid() const {
 183         return Valid;
 184       }
 185
 186       Instruction *getPHI() const {
 187         assert(Valid && "Using invalid reduction");
 188         return Instructions.front();
 189       }
 190
 191       Instruction *getReducedValue() const {
 192         assert(Valid && "Using invalid reduction");
 193         return Instructions.back();
 194       }
 195
 196       Instruction *get(size_t i) const {
 197         assert(Valid && "Using invalid reduction");
 198         return Instructions[i+1];
 199       }
 200
 201       Instruction *operator [] (size_t i) const { return get(i); }
 202
 203       // The size, ignoring the initial PHI.
 204       size_t size() const {
 205         assert(Valid && "Using invalid reduction");
 206         return Instructions.size()-1;
 207       }
 208
 209       typedef SmallInstructionVector::iterator iterator;
 210       typedef SmallInstructionVector::const_iterator const_iterator;
 211
 212       iterator begin() {
 213         assert(Valid && "Using invalid reduction");
 214         return std::next(Instructions.begin());
 215       }
 216
 217       const_iterator begin() const {
 218         assert(Valid && "Using invalid reduction");
 219         return std::next(Instructions.begin());
 220       }
 221
 222       iterator end() { return Instructions.end(); }
 223       const_iterator end() const { return Instructions.end(); }
 224
 225     protected:
 226       bool Valid;
 227       SmallInstructionVector Instructions;
 228
 229       void add(Loop *L);
 230     };
 231
 232     // The set of all reductions, and state tracking of possible reductions
 233     // during loop instruction processing.
 234     struct ReductionTracker {
 235       typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector;
 236
 237       // Add a new possible reduction.
 238       void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); }
 239
 240       // Setup to track possible reductions corresponding to the provided
 241       // rerolling scale. Only reductions with a number of non-PHI instructions
 242       // that is divisible by the scale are considered. Three instructions sets
 243       // are filled in:
 244       //   - A set of all possible instructions in eligible reductions.
 245       //   - A set of all PHIs in eligible reductions
 246       //   - A set of all reduced values (last instructions) in eligible
 247       //     reductions.
 248       void restrictToScale(uint64_t Scale,
 249                            SmallInstructionSet &PossibleRedSet,
 250                            SmallInstructionSet &PossibleRedPHISet,
 251                            SmallInstructionSet &PossibleRedLastSet) {
 252         PossibleRedIdx.clear();
 253         PossibleRedIter.clear();
 254         Reds.clear();
 255
 256         for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i)
 257           if (PossibleReds[i].size() % Scale == 0) {
 258             PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
 259             PossibleRedPHISet.insert(PossibleReds[i].getPHI());
 260
 261             PossibleRedSet.insert(PossibleReds[i].getPHI());
 262             PossibleRedIdx[PossibleReds[i].getPHI()] = i;
 263             for (Instruction *J : PossibleReds[i]) {
 264               PossibleRedSet.insert(J);
 265               PossibleRedIdx[J] = i;
 266             }
 267           }
 268       }
 269
 270       // The functions below are used while processing the loop instructions.
 271
 272       // Are the two instructions both from reductions, and furthermore, from
 273       // the same reduction?
 274       bool isPairInSame(Instruction *J1, Instruction *J2) {
 275         DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1);
 276         if (J1I != PossibleRedIdx.end()) {
 277           DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2);
 278           if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second)
 279             return true;
 280         }
 281
 282         return false;
 283       }
 284
 285       // The two provided instructions, the first from the base iteration, and
 286       // the second from iteration i, form a matched pair. If these are part of
 287       // a reduction, record that fact.
 288       void recordPair(Instruction *J1, Instruction *J2, unsigned i) {
 289         if (PossibleRedIdx.count(J1)) {
 290           assert(PossibleRedIdx.count(J2) &&
 291                  "Recording reduction vs. non-reduction instruction?");
 292
 293           PossibleRedIter[J1] = 0;
 294           PossibleRedIter[J2] = i;
 295
 296           int Idx = PossibleRedIdx[J1];
 297           assert(Idx == PossibleRedIdx[J2] &&
 298                  "Recording pair from different reductions?");
 299           Reds.insert(Idx);
 300         }
 301       }
 302
 303       // The functions below can be called after we've finished processing all
 304       // instructions in the loop, and we know which reductions were selected.
 305
 306       bool validateSelected();
 307       void replaceSelected();
 308
 309     protected:
 310       // The vector of all possible reductions (for any scale).
 311       SmallReductionVector PossibleReds;
 312
 313       DenseMap<Instruction *, int> PossibleRedIdx;
 314       DenseMap<Instruction *, int> PossibleRedIter;
 315       DenseSet<int> Reds;
 316     };
 317
 318     // A DAGRootSet models an induction variable being used in a rerollable
 319     // loop. For example,
 320     //
 321     //   x[i*3+0] = y1
 322     //   x[i*3+1] = y2
 323     //   x[i*3+2] = y3
 324     //
 325     //   Base instruction -> i*3
 326     //                    +---+----+
 327     //                   /    |     \
 328     //               ST[y1]  +1     +2  <-- Roots
 329     //                        |      |
 330     //                      ST[y2] ST[y3]
 331     //
 332     // There may be multiple DAGRoots, for example:
 333     //
 334     //   x[i*2+0] = ...   (1)
 335     //   x[i*2+1] = ...   (1)
 336     //   x[i*2+4] = ...   (2)
 337     //   x[i*2+5] = ...   (2)
 338     //   x[(i+1234)*2+5678] = ... (3)
 339     //   x[(i+1234)*2+5679] = ... (3)
 340     //
 341     // The loop will be rerolled by adding a new loop induction variable,
 342     // one for the Base instruction in each DAGRootSet.
 343     //
 344     struct DAGRootSet {
 345       Instruction *BaseInst;
 346       SmallInstructionVector Roots;
 347       // The instructions between IV and BaseInst (but not including BaseInst).
 348       SmallInstructionSet SubsumedInsts;
 349     };
 350
 351     // The set of all DAG roots, and state tracking of all roots
 352     // for a particular induction variable.
 353     struct DAGRootTracker {
 354       DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV,
 355                      ScalarEvolution *SE, AliasAnalysis *AA,
 356                      TargetLibraryInfo *TLI,
 357                      DenseMap<Instruction *, int64_t> &IncrMap)
 358           : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), IV(IV),
 359             IVToIncMap(IncrMap) {}
 360
 361       /// Stage 1: Find all the DAG roots for the induction variable.
 362       bool findRoots();
 363       /// Stage 2: Validate if the found roots are valid.
 364       bool validate(ReductionTracker &Reductions);
 365       /// Stage 3: Assuming validate() returned true, perform the
 366       /// replacement.
 367       /// @param IterCount The maximum iteration count of L.
 368       void replace(const SCEV *IterCount);
 369
 370     protected:
 371       typedef MapVector<Instruction*, SmallBitVector> UsesTy;
 372
 373       bool findRootsRecursive(Instruction *IVU,
 374                               SmallInstructionSet SubsumedInsts);
 375       bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts);
 376       bool collectPossibleRoots(Instruction *Base,
 377                                 std::map<int64_t,Instruction*> &Roots);
 378
 379       bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet);
 380       void collectInLoopUserSet(const SmallInstructionVector &Roots,
 381                                 const SmallInstructionSet &Exclude,
 382                                 const SmallInstructionSet &Final,
 383                                 DenseSet<Instruction *> &Users);
 384       void collectInLoopUserSet(Instruction *Root,
 385                                 const SmallInstructionSet &Exclude,
 386                                 const SmallInstructionSet &Final,
 387                                 DenseSet<Instruction *> &Users);
 388
 389       UsesTy::iterator nextInstr(int Val, UsesTy &In,
 390                                  const SmallInstructionSet &Exclude,
 391                                  UsesTy::iterator *StartI=nullptr);
 392       bool isBaseInst(Instruction *I);
 393       bool isRootInst(Instruction *I);
 394       bool instrDependsOn(Instruction *I,
 395                           UsesTy::iterator Start,
 396                           UsesTy::iterator End);
 397
 398       LoopReroll *Parent;
 399
 400       // Members of Parent, replicated here for brevity.
 401       Loop *L;
 402       ScalarEvolution *SE;
 403       AliasAnalysis *AA;
 404       TargetLibraryInfo *TLI;
 405
 406       // The loop induction variable.
 407       Instruction *IV;
 408       // Loop step amount.
 409       int64_t Inc;
 410       // Loop reroll count; if Inc == 1, this records the scaling applied
 411       // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ;
 412       // If Inc is not 1, Scale = Inc.
 413       uint64_t Scale;
 414       // The roots themselves.
 415       SmallVector<DAGRootSet,16> RootSets;
 416       // All increment instructions for IV.
 417       SmallInstructionVector LoopIncs;
 418       // Map of all instructions in the loop (in order) to the iterations
 419       // they are used in (or specially, IL_All for instructions
 420       // used in the loop increment mechanism).
 421       UsesTy Uses;
 422       // Map between induction variable and its increment
 423       DenseMap<Instruction *, int64_t> &IVToIncMap;
 424     };
 425
 426     void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
 427     void collectPossibleReductions(Loop *L,
 428            ReductionTracker &Reductions);
 429     bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount,
 430                 ReductionTracker &Reductions);
 431   };
 432 }
 433
 434 char LoopReroll::ID = 0;
 435 INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false)
 436 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 437 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 438 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 439 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
 440 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 441 INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false)
 442
 443 Pass *llvm::createLoopRerollPass() {
 444   return new LoopReroll;
 445 }
 446
 447 // Returns true if the provided instruction is used outside the given loop.
 448 // This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
 449 // non-loop blocks to be outside the loop.
 450 static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
 451   for (User *U : I->users()) {
 452     if (!L->contains(cast<Instruction>(U)))
 453       return true;
 454   }
 455   return false;
 456 }
 457
 458 // Collect the list of loop induction variables with respect to which it might
 459 // be possible to reroll the loop.
 460 void LoopReroll::collectPossibleIVs(Loop *L,
 461                                     SmallInstructionVector &PossibleIVs) {
 462   BasicBlock *Header = L->getHeader();
 463   for (BasicBlock::iterator I = Header->begin(),
 464        IE = Header->getFirstInsertionPt(); I != IE; ++I) {
 465     if (!isa<PHINode>(I))
 466       continue;
 467     if (!I->getType()->isIntegerTy())
 468       continue;
 469
 470     if (const SCEVAddRecExpr *PHISCEV =
 471             dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&*I))) {
 472       if (PHISCEV->getLoop() != L)
 473         continue;
 474       if (!PHISCEV->isAffine())
 475         continue;
 476       if (const SCEVConstant *IncSCEV =
 477           dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) {
 478         const APInt &AInt = IncSCEV->getValue()->getValue().abs();
 479         if (IncSCEV->getValue()->isZero() || AInt.uge(MaxInc))
 480           continue;
 481         IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue();
 482         DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV
 483                      << "\n");
 484         PossibleIVs.push_back(&*I);
 485       }
 486     }
 487   }
 488 }
 489
 490 // Add the remainder of the reduction-variable chain to the instruction vector
 491 // (the initial PHINode has already been added). If successful, the object is
 492 // marked as valid.
 493 void LoopReroll::SimpleLoopReduction::add(Loop *L) {
 494   assert(!Valid && "Cannot add to an already-valid chain");
 495
 496   // The reduction variable must be a chain of single-use instructions
 497   // (including the PHI), except for the last value (which is used by the PHI
 498   // and also outside the loop).
 499   Instruction *C = Instructions.front();
 500   if (C->user_empty())
 501     return;
 502
 503   do {
 504     C = cast<Instruction>(*C->user_begin());
 505     if (C->hasOneUse()) {
 506       if (!C->isBinaryOp())
 507         return;
 508
 509       if (!(isa<PHINode>(Instructions.back()) ||
 510             C->isSameOperationAs(Instructions.back())))
 511         return;
 512
 513       Instructions.push_back(C);
 514     }
 515   } while (C->hasOneUse());
 516
 517   if (Instructions.size() < 2 ||
 518       !C->isSameOperationAs(Instructions.back()) ||
 519       C->use_empty())
 520     return;
 521
 522   // C is now the (potential) last instruction in the reduction chain.
 523   for (User *U : C->users()) {
 524     // The only in-loop user can be the initial PHI.
 525     if (L->contains(cast<Instruction>(U)))
 526       if (cast<Instruction>(U) != Instructions.front())
 527         return;
 528   }
 529
 530   Instructions.push_back(C);
 531   Valid = true;
 532 }
 533
 534 // Collect the vector of possible reduction variables.
 535 void LoopReroll::collectPossibleReductions(Loop *L,
 536   ReductionTracker &Reductions) {
 537   BasicBlock *Header = L->getHeader();
 538   for (BasicBlock::iterator I = Header->begin(),
 539        IE = Header->getFirstInsertionPt(); I != IE; ++I) {
 540     if (!isa<PHINode>(I))
 541       continue;
 542     if (!I->getType()->isSingleValueType())
 543       continue;
 544
 545     SimpleLoopReduction SLR(&*I, L);
 546     if (!SLR.valid())
 547       continue;
 548
 549     DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " <<
 550           SLR.size() << " chained instructions)\n");
 551     Reductions.addSLR(SLR);
 552   }
 553 }
 554
 555 // Collect the set of all users of the provided root instruction. This set of
 556 // users contains not only the direct users of the root instruction, but also
 557 // all users of those users, and so on. There are two exceptions:
 558 //
 559 //   1. Instructions in the set of excluded instructions are never added to the
 560 //   use set (even if they are users). This is used, for example, to exclude
 561 //   including root increments in the use set of the primary IV.
 562 //
 563 //   2. Instructions in the set of final instructions are added to the use set
 564 //   if they are users, but their users are not added. This is used, for
 565 //   example, to prevent a reduction update from forcing all later reduction
 566 //   updates into the use set.
 567 void LoopReroll::DAGRootTracker::collectInLoopUserSet(
 568   Instruction *Root, const SmallInstructionSet &Exclude,
 569   const SmallInstructionSet &Final,
 570   DenseSet<Instruction *> &Users) {
 571   SmallInstructionVector Queue(1, Root);
 572   while (!Queue.empty()) {
 573     Instruction *I = Queue.pop_back_val();
 574     if (!Users.insert(I).second)
 575       continue;
 576
 577     if (!Final.count(I))
 578       for (Use &U : I->uses()) {
 579         Instruction *User = cast<Instruction>(U.getUser());
 580         if (PHINode *PN = dyn_cast<PHINode>(User)) {
 581           // Ignore "wrap-around" uses to PHIs of this loop's header.
 582           if (PN->getIncomingBlock(U) == L->getHeader())
 583             continue;
 584         }
 585
 586         if (L->contains(User) && !Exclude.count(User)) {
 587           Queue.push_back(User);
 588         }
 589       }
 590
 591     // We also want to collect single-user "feeder" values.
 592     for (User::op_iterator OI = I->op_begin(),
 593          OIE = I->op_end(); OI != OIE; ++OI) {
 594       if (Instruction *Op = dyn_cast<Instruction>(*OI))
 595         if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) &&
 596             !Final.count(Op))
 597           Queue.push_back(Op);
 598     }
 599   }
 600 }
 601
 602 // Collect all of the users of all of the provided root instructions (combined
 603 // into a single set).
 604 void LoopReroll::DAGRootTracker::collectInLoopUserSet(
 605   const SmallInstructionVector &Roots,
 606   const SmallInstructionSet &Exclude,
 607   const SmallInstructionSet &Final,
 608   DenseSet<Instruction *> &Users) {
 609   for (SmallInstructionVector::const_iterator I = Roots.begin(),
 610        IE = Roots.end(); I != IE; ++I)
 611     collectInLoopUserSet(*I, Exclude, Final, Users);
 612 }
 613
 614 static bool isSimpleLoadStore(Instruction *I) {
 615   if (LoadInst *LI = dyn_cast<LoadInst>(I))
 616     return LI->isSimple();
 617   if (StoreInst *SI = dyn_cast<StoreInst>(I))
 618     return SI->isSimple();
 619   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
 620     return !MI->isVolatile();
 621   return false;
 622 }
 623
 624 /// Return true if IVU is a "simple" arithmetic operation.
 625 /// This is used for narrowing the search space for DAGRoots; only arithmetic
 626 /// and GEPs can be part of a DAGRoot.
 627 static bool isSimpleArithmeticOp(User *IVU) {
 628   if (Instruction *I = dyn_cast<Instruction>(IVU)) {
 629     switch (I->getOpcode()) {
 630     default: return false;
 631     case Instruction::Add:
 632     case Instruction::Sub:
 633     case Instruction::Mul:
 634     case Instruction::Shl:
 635     case Instruction::AShr:
 636     case Instruction::LShr:
 637     case Instruction::GetElementPtr:
 638     case Instruction::Trunc:
 639     case Instruction::ZExt:
 640     case Instruction::SExt:
 641       return true;
 642     }
 643   }
 644   return false;
 645 }
 646
 647 static bool isLoopIncrement(User *U, Instruction *IV) {
 648   BinaryOperator *BO = dyn_cast<BinaryOperator>(U);
 649   if (!BO || BO->getOpcode() != Instruction::Add)
 650     return false;
 651
 652   for (auto *UU : BO->users()) {
 653     PHINode *PN = dyn_cast<PHINode>(UU);
 654     if (PN && PN == IV)
 655       return true;
 656   }
 657   return false;
 658 }
 659
 660 bool LoopReroll::DAGRootTracker::
 661 collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
 662   SmallInstructionVector BaseUsers;
 663
 664   for (auto *I : Base->users()) {
 665     ConstantInt *CI = nullptr;
 666
 667     if (isLoopIncrement(I, IV)) {
 668       LoopIncs.push_back(cast<Instruction>(I));
 669       continue;
 670     }
 671
 672     // The root nodes must be either GEPs, ORs or ADDs.
 673     if (auto *BO = dyn_cast<BinaryOperator>(I)) {
 674       if (BO->getOpcode() == Instruction::Add ||
 675           BO->getOpcode() == Instruction::Or)
 676         CI = dyn_cast<ConstantInt>(BO->getOperand(1));
 677     } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
 678       Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1);
 679       CI = dyn_cast<ConstantInt>(LastOperand);
 680     }
 681
 682     if (!CI) {
 683       if (Instruction *II = dyn_cast<Instruction>(I)) {
 684         BaseUsers.push_back(II);
 685         continue;
 686       } else {
 687         DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I << "\n");
 688         return false;
 689       }
 690     }
 691
 692     int64_t V = std::abs(CI->getValue().getSExtValue());
 693     if (Roots.find(V) != Roots.end())
 694       // No duplicates, please.
 695       return false;
 696
 697     Roots[V] = cast<Instruction>(I);
 698   }
 699
 700   if (Roots.empty())
 701     return false;
 702
 703   // If we found non-loop-inc, non-root users of Base, assume they are
 704   // for the zeroth root index. This is because "add %a, 0" gets optimized
 705   // away.
 706   if (BaseUsers.size()) {
 707     if (Roots.find(0) != Roots.end()) {
 708       DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
 709       return false;
 710     }
 711     Roots[0] = Base;
 712   }
 713
 714   // Calculate the number of users of the base, or lowest indexed, iteration.
 715   unsigned NumBaseUses = BaseUsers.size();
 716   if (NumBaseUses == 0)
 717     NumBaseUses = Roots.begin()->second->getNumUses();
 718
 719   // Check that every node has the same number of users.
 720   for (auto &KV : Roots) {
 721     if (KV.first == 0)
 722       continue;
 723     if (KV.second->getNumUses() != NumBaseUses) {
 724       DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
 725             << "#Base=" << NumBaseUses << ", #Root=" <<
 726             KV.second->getNumUses() << "\n");
 727       return false;
 728     }
 729   }
 730
 731   return true;
 732 }
 733
 734 bool LoopReroll::DAGRootTracker::
 735 findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
 736   // Does the user look like it could be part of a root set?
 737   // All its users must be simple arithmetic ops.
 738   if (I->getNumUses() > IL_MaxRerollIterations)
 739     return false;
 740
 741   if ((I->getOpcode() == Instruction::Mul ||
 742        I->getOpcode() == Instruction::PHI) &&
 743       I != IV &&
 744       findRootsBase(I, SubsumedInsts))
 745     return true;
 746
 747   SubsumedInsts.insert(I);
 748
 749   for (User *V : I->users()) {
 750     Instruction *I = dyn_cast<Instruction>(V);
 751     if (std::find(LoopIncs.begin(), LoopIncs.end(), I) != LoopIncs.end())
 752       continue;
 753
 754     if (!I || !isSimpleArithmeticOp(I) ||
 755         !findRootsRecursive(I, SubsumedInsts))
 756       return false;
 757   }
 758   return true;
 759 }
 760
 761 bool LoopReroll::DAGRootTracker::
 762 findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
 763
 764   // The base instruction needs to be a multiply so
 765   // that we can erase it.
 766   if (IVU->getOpcode() != Instruction::Mul &&
 767       IVU->getOpcode() != Instruction::PHI)
 768     return false;
 769
 770   std::map<int64_t, Instruction*> V;
 771   if (!collectPossibleRoots(IVU, V))
 772     return false;
 773
 774   // If we didn't get a root for index zero, then IVU must be
 775   // subsumed.
 776   if (V.find(0) == V.end())
 777     SubsumedInsts.insert(IVU);
 778
 779   // Partition the vector into monotonically increasing indexes.
 780   DAGRootSet DRS;
 781   DRS.BaseInst = nullptr;
 782
 783   for (auto &KV : V) {
 784     if (!DRS.BaseInst) {
 785       DRS.BaseInst = KV.second;
 786       DRS.SubsumedInsts = SubsumedInsts;
 787     } else if (DRS.Roots.empty()) {
 788       DRS.Roots.push_back(KV.second);
 789     } else if (V.find(KV.first - 1) != V.end()) {
 790       DRS.Roots.push_back(KV.second);
 791     } else {
 792       // Linear sequence terminated.
 793       RootSets.push_back(DRS);
 794       DRS.BaseInst = KV.second;
 795       DRS.SubsumedInsts = SubsumedInsts;
 796       DRS.Roots.clear();
 797     }
 798   }
 799   RootSets.push_back(DRS);
 800
 801   return true;
 802 }
 803
 804 bool LoopReroll::DAGRootTracker::findRoots() {
 805   Inc = IVToIncMap[IV];
 806
 807   assert(RootSets.empty() && "Unclean state!");
 808   if (std::abs(Inc) == 1) {
 809     for (auto *IVU : IV->users()) {
 810       if (isLoopIncrement(IVU, IV))
 811         LoopIncs.push_back(cast<Instruction>(IVU));
 812     }
 813     if (!findRootsRecursive(IV, SmallInstructionSet()))
 814       return false;
 815     LoopIncs.push_back(IV);
 816   } else {
 817     if (!findRootsBase(IV, SmallInstructionSet()))
 818       return false;
 819   }
 820
 821   // Ensure all sets have the same size.
 822   if (RootSets.empty()) {
 823     DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
 824     return false;
 825   }
 826   for (auto &V : RootSets) {
 827     if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) {
 828       DEBUG(dbgs()
 829             << "LRR: Aborting because not all root sets have the same size\n");
 830       return false;
 831     }
 832   }
 833
 834   // And ensure all loop iterations are consecutive. We rely on std::map
 835   // providing ordered traversal.
 836   for (auto &V : RootSets) {
 837     const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(V.BaseInst));
 838     if (!ADR)
 839       return false;
 840
 841     // Consider a DAGRootSet with N-1 roots (so N different values including
 842     //   BaseInst).
 843     // Define d = Roots[0] - BaseInst, which should be the same as
 844     //   Roots[I] - Roots[I-1] for all I in [1..N).
 845     // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
 846     //   loop iteration J.
 847     //
 848     // Now, For the loop iterations to be consecutive:
 849     //   D = d * N
 850
 851     unsigned N = V.Roots.size() + 1;
 852     const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(V.Roots[0]), ADR);
 853     const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
 854     if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV)) {
 855       DEBUG(dbgs() << "LRR: Aborting because iterations are not consecutive\n");
 856       return false;
 857     }
 858   }
 859   Scale = RootSets[0].Roots.size() + 1;
 860
 861   if (Scale > IL_MaxRerollIterations) {
 862     DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
 863           << "#Found=" << Scale << ", #Max=" << IL_MaxRerollIterations
 864           << "\n");
 865     return false;
 866   }
 867
 868   DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale << "\n");
 869
 870   return true;
 871 }
 872
 873 bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) {
 874   // Populate the MapVector with all instructions in the block, in order first,
 875   // so we can iterate over the contents later in perfect order.
 876   for (auto &I : *L->getHeader()) {
 877     Uses[&I].resize(IL_End);
 878   }
 879
 880   SmallInstructionSet Exclude;
 881   for (auto &DRS : RootSets) {
 882     Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
 883     Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
 884     Exclude.insert(DRS.BaseInst);
 885   }
 886   Exclude.insert(LoopIncs.begin(), LoopIncs.end());
 887
 888   for (auto &DRS : RootSets) {
 889     DenseSet<Instruction*> VBase;
 890     collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase);
 891     for (auto *I : VBase) {
 892       Uses[I].set(0);
 893     }
 894
 895     unsigned Idx = 1;
 896     for (auto *Root : DRS.Roots) {
 897       DenseSet<Instruction*> V;
 898       collectInLoopUserSet(Root, Exclude, PossibleRedSet, V);
 899
 900       // While we're here, check the use sets are the same size.
 901       if (V.size() != VBase.size()) {
 902         DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
 903         return false;
 904       }
 905
 906       for (auto *I : V) {
 907         Uses[I].set(Idx);
 908       }
 909       ++Idx;
 910     }
 911
 912     // Make sure our subsumed instructions are remembered too.
 913     for (auto *I : DRS.SubsumedInsts) {
 914       Uses[I].set(IL_All);
 915     }
 916   }
 917
 918   // Make sure the loop increments are also accounted for.
 919
 920   Exclude.clear();
 921   for (auto &DRS : RootSets) {
 922     Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
 923     Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
 924     Exclude.insert(DRS.BaseInst);
 925   }
 926
 927   DenseSet<Instruction*> V;
 928   collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V);
 929   for (auto *I : V) {
 930     Uses[I].set(IL_All);
 931   }
 932
 933   return true;
 934
 935 }
 936
 937 /// Get the next instruction in "In" that is a member of set Val.
 938 /// Start searching from StartI, and do not return anything in Exclude.
 939 /// If StartI is not given, start from In.begin().
 940 LoopReroll::DAGRootTracker::UsesTy::iterator
 941 LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In,
 942                                       const SmallInstructionSet &Exclude,
 943                                       UsesTy::iterator *StartI) {
 944   UsesTy::iterator I = StartI ? *StartI : In.begin();
 945   while (I != In.end() && (I->second.test(Val) == 0 ||
 946                            Exclude.count(I->first) != 0))
 947     ++I;
 948   return I;
 949 }
 950
 951 bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) {
 952   for (auto &DRS : RootSets) {
 953     if (DRS.BaseInst == I)
 954       return true;
 955   }
 956   return false;
 957 }
 958
 959 bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) {
 960   for (auto &DRS : RootSets) {
 961     if (std::find(DRS.Roots.begin(), DRS.Roots.end(), I) != DRS.Roots.end())
 962       return true;
 963   }
 964   return false;
 965 }
 966
 967 /// Return true if instruction I depends on any instruction between
 968 /// Start and End.
 969 bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I,
 970                                                 UsesTy::iterator Start,
 971                                                 UsesTy::iterator End) {
 972   for (auto *U : I->users()) {
 973     for (auto It = Start; It != End; ++It)
 974       if (U == It->first)
 975         return true;
 976   }
 977   return false;
 978 }
 979
 980 static bool isIgnorableInst(const Instruction *I) {
 981   if (isa<DbgInfoIntrinsic>(I))
 982     return true;
 983   const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
 984   if (!II)
 985     return false;
 986   switch (II->getIntrinsicID()) {
 987     default:
 988       return false;
 989     case llvm::Intrinsic::annotation:
 990     case Intrinsic::ptr_annotation:
 991     case Intrinsic::var_annotation:
 992     // TODO: the following intrinsics may also be whitelisted:
 993     //   lifetime_start, lifetime_end, invariant_start, invariant_end
 994       return true;
 995   }
 996   return false;
 997 }
 998
 999 bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
1000   // We now need to check for equivalence of the use graph of each root with
1001   // that of the primary induction variable (excluding the roots). Our goal
1002   // here is not to solve the full graph isomorphism problem, but rather to
1003   // catch common cases without a lot of work. As a result, we will assume
1004   // that the relative order of the instructions in each unrolled iteration
1005   // is the same (although we will not make an assumption about how the
1006   // different iterations are intermixed). Note that while the order must be
1007   // the same, the instructions may not be in the same basic block.
1008
1009   // An array of just the possible reductions for this scale factor. When we
1010   // collect the set of all users of some root instructions, these reduction
1011   // instructions are treated as 'final' (their uses are not considered).
1012   // This is important because we don't want the root use set to search down
1013   // the reduction chain.
1014   SmallInstructionSet PossibleRedSet;
1015   SmallInstructionSet PossibleRedLastSet;
1016   SmallInstructionSet PossibleRedPHISet;
1017   Reductions.restrictToScale(Scale, PossibleRedSet,
1018                              PossibleRedPHISet, PossibleRedLastSet);
1019
1020   // Populate "Uses" with where each instruction is used.
1021   if (!collectUsedInstructions(PossibleRedSet))
1022     return false;
1023
1024   // Make sure we mark the reduction PHIs as used in all iterations.
1025   for (auto *I : PossibleRedPHISet) {
1026     Uses[I].set(IL_All);
1027   }
1028
1029   // Make sure all instructions in the loop are in one and only one
1030   // set.
1031   for (auto &KV : Uses) {
1032     if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) {
1033       DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
1034             << *KV.first << " (#uses=" << KV.second.count() << ")\n");
1035       return false;
1036     }
1037   }
1038
1039   DEBUG(
1040     for (auto &KV : Uses) {
1041       dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
1042     }
1043     );
1044
1045   for (unsigned Iter = 1; Iter < Scale; ++Iter) {
1046     // In addition to regular aliasing information, we need to look for
1047     // instructions from later (future) iterations that have side effects
1048     // preventing us from reordering them past other instructions with side
1049     // effects.
1050     bool FutureSideEffects = false;
1051     AliasSetTracker AST(*AA);
1052     // The map between instructions in f(%iv.(i+1)) and f(%iv).
1053     DenseMap<Value *, Value *> BaseMap;
1054
1055     // Compare iteration Iter to the base.
1056     SmallInstructionSet Visited;
1057     auto BaseIt = nextInstr(0, Uses, Visited);
1058     auto RootIt = nextInstr(Iter, Uses, Visited);
1059     auto LastRootIt = Uses.begin();
1060
1061     while (BaseIt != Uses.end() && RootIt != Uses.end()) {
1062       Instruction *BaseInst = BaseIt->first;
1063       Instruction *RootInst = RootIt->first;
1064
1065       // Skip over the IV or root instructions; only match their users.
1066       bool Continue = false;
1067       if (isBaseInst(BaseInst)) {
1068         Visited.insert(BaseInst);
1069         BaseIt = nextInstr(0, Uses, Visited);
1070         Continue = true;
1071       }
1072       if (isRootInst(RootInst)) {
1073         LastRootIt = RootIt;
1074         Visited.insert(RootInst);
1075         RootIt = nextInstr(Iter, Uses, Visited);
1076         Continue = true;
1077       }
1078       if (Continue) continue;
1079
1080       if (!BaseInst->isSameOperationAs(RootInst)) {
1081         // Last chance saloon. We don't try and solve the full isomorphism
1082         // problem, but try and at least catch the case where two instructions
1083         // *of different types* are round the wrong way. We won't be able to
1084         // efficiently tell, given two ADD instructions, which way around we
1085         // should match them, but given an ADD and a SUB, we can at least infer
1086         // which one is which.
1087         //
1088         // This should allow us to deal with a greater subset of the isomorphism
1089         // problem. It does however change a linear algorithm into a quadratic
1090         // one, so limit the number of probes we do.
1091         auto TryIt = RootIt;
1092         unsigned N = NumToleratedFailedMatches;
1093         while (TryIt != Uses.end() &&
1094                !BaseInst->isSameOperationAs(TryIt->first) &&
1095                N--) {
1096           ++TryIt;
1097           TryIt = nextInstr(Iter, Uses, Visited, &TryIt);
1098         }
1099
1100         if (TryIt == Uses.end() || TryIt == RootIt ||
1101             instrDependsOn(TryIt->first, RootIt, TryIt)) {
1102           DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
1103                 " vs. " << *RootInst << "\n");
1104           return false;
1105         }
1106
1107         RootIt = TryIt;
1108         RootInst = TryIt->first;
1109       }
1110
1111       // All instructions between the last root and this root
1112       // may belong to some other iteration. If they belong to a
1113       // future iteration, then they're dangerous to alias with.
1114       //
1115       // Note that because we allow a limited amount of flexibility in the order
1116       // that we visit nodes, LastRootIt might be *before* RootIt, in which
1117       // case we've already checked this set of instructions so we shouldn't
1118       // do anything.
1119       for (; LastRootIt < RootIt; ++LastRootIt) {
1120         Instruction *I = LastRootIt->first;
1121         if (LastRootIt->second.find_first() < (int)Iter)
1122           continue;
1123         if (I->mayWriteToMemory())
1124           AST.add(I);
1125         // Note: This is specifically guarded by a check on isa<PHINode>,
1126         // which while a valid (somewhat arbitrary) micro-optimization, is
1127         // needed because otherwise isSafeToSpeculativelyExecute returns
1128         // false on PHI nodes.
1129         if (!isa<PHINode>(I) && !isSimpleLoadStore(I) &&
1130             !isSafeToSpeculativelyExecute(I))
1131           // Intervening instructions cause side effects.
1132           FutureSideEffects = true;
1133       }
1134
1135       // Make sure that this instruction, which is in the use set of this
1136       // root instruction, does not also belong to the base set or the set of
1137       // some other root instruction.
1138       if (RootIt->second.count() > 1) {
1139         DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
1140                         " vs. " << *RootInst << " (prev. case overlap)\n");
1141         return false;
1142       }
1143
1144       // Make sure that we don't alias with any instruction in the alias set
1145       // tracker. If we do, then we depend on a future iteration, and we
1146       // can't reroll.
1147       if (RootInst->mayReadFromMemory())
1148         for (auto &K : AST) {
1149           if (K.aliasesUnknownInst(RootInst, *AA)) {
1150             DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
1151                             " vs. " << *RootInst << " (depends on future store)\n");
1152             return false;
1153           }
1154         }
1155
1156       // If we've past an instruction from a future iteration that may have
1157       // side effects, and this instruction might also, then we can't reorder
1158       // them, and this matching fails. As an exception, we allow the alias
1159       // set tracker to handle regular (simple) load/store dependencies.
1160       if (FutureSideEffects && ((!isSimpleLoadStore(BaseInst) &&
1161                                  !isSafeToSpeculativelyExecute(BaseInst)) ||
1162                                 (!isSimpleLoadStore(RootInst) &&
1163                                  !isSafeToSpeculativelyExecute(RootInst)))) {
1164         DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
1165                         " vs. " << *RootInst <<
1166                         " (side effects prevent reordering)\n");
1167         return false;
1168       }
1169
1170       // For instructions that are part of a reduction, if the operation is
1171       // associative, then don't bother matching the operands (because we
1172       // already know that the instructions are isomorphic, and the order
1173       // within the iteration does not matter). For non-associative reductions,
1174       // we do need to match the operands, because we need to reject
1175       // out-of-order instructions within an iteration!
1176       // For example (assume floating-point addition), we need to reject this:
1177       //   x += a[i]; x += b[i];
1178       //   x += a[i+1]; x += b[i+1];
1179       //   x += b[i+2]; x += a[i+2];
1180       bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
1181
1182       if (!(InReduction && BaseInst->isAssociative())) {
1183         bool Swapped = false, SomeOpMatched = false;
1184         for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
1185           Value *Op2 = RootInst->getOperand(j);
1186
1187           // If this is part of a reduction (and the operation is not
1188           // associatve), then we match all operands, but not those that are
1189           // part of the reduction.
1190           if (InReduction)
1191             if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
1192               if (Reductions.isPairInSame(RootInst, Op2I))
1193                 continue;
1194
1195           DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
1196           if (BMI != BaseMap.end()) {
1197             Op2 = BMI->second;
1198           } else {
1199             for (auto &DRS : RootSets) {
1200               if (DRS.Roots[Iter-1] == (Instruction*) Op2) {
1201                 Op2 = DRS.BaseInst;
1202                 break;
1203               }
1204             }
1205           }
1206
1207           if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
1208             // If we've not already decided to swap the matched operands, and
1209             // we've not already matched our first operand (note that we could
1210             // have skipped matching the first operand because it is part of a
1211             // reduction above), and the instruction is commutative, then try
1212             // the swapped match.
1213             if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched &&
1214                 BaseInst->getOperand(!j) == Op2) {
1215               Swapped = true;
1216             } else {
1217               DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
1218                     << " vs. " << *RootInst << " (operand " << j << ")\n");
1219               return false;
1220             }
1221           }
1222
1223           SomeOpMatched = true;
1224         }
1225       }
1226
1227       if ((!PossibleRedLastSet.count(BaseInst) &&
1228            hasUsesOutsideLoop(BaseInst, L)) ||
1229           (!PossibleRedLastSet.count(RootInst) &&
1230            hasUsesOutsideLoop(RootInst, L))) {
1231         DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst <<
1232                         " vs. " << *RootInst << " (uses outside loop)\n");
1233         return false;
1234       }
1235
1236       Reductions.recordPair(BaseInst, RootInst, Iter);
1237       BaseMap.insert(std::make_pair(RootInst, BaseInst));
1238
1239       LastRootIt = RootIt;
1240       Visited.insert(BaseInst);
1241       Visited.insert(RootInst);
1242       BaseIt = nextInstr(0, Uses, Visited);
1243       RootIt = nextInstr(Iter, Uses, Visited);
1244     }
1245     assert (BaseIt == Uses.end() && RootIt == Uses.end() &&
1246             "Mismatched set sizes!");
1247   }
1248
1249   DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
1250                   *IV << "\n");
1251
1252   return true;
1253 }
1254
1255 void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) {
1256   BasicBlock *Header = L->getHeader();
1257   // Remove instructions associated with non-base iterations.
1258   for (BasicBlock::reverse_iterator J = Header->rbegin();
1259        J != Header->rend();) {
1260     unsigned I = Uses[&*J].find_first();
1261     if (I > 0 && I < IL_All) {
1262       Instruction *D = &*J;
1263       DEBUG(dbgs() << "LRR: removing: " << *D << "\n");
1264       D->eraseFromParent();
1265       continue;
1266     }
1267
1268     ++J;
1269   }
1270   bool Negative = IVToIncMap[IV] < 0;
1271   const DataLayout &DL = Header->getModule()->getDataLayout();
1272
1273   // We need to create a new induction variable for each different BaseInst.
1274   for (auto &DRS : RootSets) {
1275     // Insert the new induction variable.
1276     const SCEVAddRecExpr *RealIVSCEV =
1277       cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
1278     const SCEV *Start = RealIVSCEV->getStart();
1279     const SCEVAddRecExpr *H = cast<SCEVAddRecExpr>(SE->getAddRecExpr(
1280         Start, SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1), L,
1281         SCEV::FlagAnyWrap));
1282     { // Limit the lifetime of SCEVExpander.
1283       SCEVExpander Expander(*SE, DL, "reroll");
1284       Value *NewIV = Expander.expandCodeFor(H, IV->getType(), &Header->front());
1285
1286       for (auto &KV : Uses) {
1287         if (KV.second.find_first() == 0)
1288           KV.first->replaceUsesOfWith(DRS.BaseInst, NewIV);
1289       }
1290
1291       if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
1292         // FIXME: Why do we need this check?
1293         if (Uses[BI].find_first() == IL_All) {
1294           const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
1295
1296           // Iteration count SCEV minus 1
1297           const SCEV *ICMinus1SCEV = SE->getMinusSCEV(
1298               ICSCEV, SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1));
1299
1300           Value *ICMinus1; // Iteration count minus 1
1301           if (isa<SCEVConstant>(ICMinus1SCEV)) {
1302             ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI);
1303           } else {
1304             BasicBlock *Preheader = L->getLoopPreheader();
1305             if (!Preheader)
1306               Preheader = InsertPreheaderForLoop(L, Parent);
1307
1308             ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(),
1309                                               Preheader->getTerminator());
1310           }
1311
1312           Value *Cond =
1313             new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1, "exitcond");
1314           BI->setCondition(Cond);
1315
1316           if (BI->getSuccessor(1) != Header)
1317             BI->swapSuccessors();
1318         }
1319       }
1320     }
1321   }
1322
1323   SimplifyInstructionsInBlock(Header, TLI);
1324   DeleteDeadPHIs(Header, TLI);
1325 }
1326
1327 // Validate the selected reductions. All iterations must have an isomorphic
1328 // part of the reduction chain and, for non-associative reductions, the chain
1329 // entries must appear in order.
1330 bool LoopReroll::ReductionTracker::validateSelected() {
1331   // For a non-associative reduction, the chain entries must appear in order.
1332   for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
1333        RI != RIE; ++RI) {
1334     int i = *RI;
1335     int PrevIter = 0, BaseCount = 0, Count = 0;
1336     for (Instruction *J : PossibleReds[i]) {
1337       // Note that all instructions in the chain must have been found because
1338       // all instructions in the function must have been assigned to some
1339       // iteration.
1340       int Iter = PossibleRedIter[J];
1341       if (Iter != PrevIter && Iter != PrevIter + 1 &&
1342           !PossibleReds[i].getReducedValue()->isAssociative()) {
1343         DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
1344                         J << "\n");
1345         return false;
1346       }
1347
1348       if (Iter != PrevIter) {
1349         if (Count != BaseCount) {
1350           DEBUG(dbgs() << "LRR: Iteration " << PrevIter <<
1351                 " reduction use count " << Count <<
1352                 " is not equal to the base use count " <<
1353                 BaseCount << "\n");
1354           return false;
1355         }
1356
1357         Count = 0;
1358       }
1359
1360       ++Count;
1361       if (Iter == 0)
1362         ++BaseCount;
1363
1364       PrevIter = Iter;
1365     }
1366   }
1367
1368   return true;
1369 }
1370
1371 // For all selected reductions, remove all parts except those in the first
1372 // iteration (and the PHI). Replace outside uses of the reduced value with uses
1373 // of the first-iteration reduced value (in other words, reroll the selected
1374 // reductions).
1375 void LoopReroll::ReductionTracker::replaceSelected() {
1376   // Fixup reductions to refer to the last instruction associated with the
1377   // first iteration (not the last).
1378   for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
1379        RI != RIE; ++RI) {
1380     int i = *RI;
1381     int j = 0;
1382     for (int e = PossibleReds[i].size(); j != e; ++j)
1383       if (PossibleRedIter[PossibleReds[i][j]] != 0) {
1384         --j;
1385         break;
1386       }
1387
1388     // Replace users with the new end-of-chain value.
1389     SmallInstructionVector Users;
1390     for (User *U : PossibleReds[i].getReducedValue()->users()) {
1391       Users.push_back(cast<Instruction>(U));
1392     }
1393
1394     for (SmallInstructionVector::iterator J = Users.begin(),
1395          JE = Users.end(); J != JE; ++J)
1396       (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
1397                               PossibleReds[i][j]);
1398   }
1399 }
1400
1401 // Reroll the provided loop with respect to the provided induction variable.
1402 // Generally, we're looking for a loop like this:
1403 //
1404 // %iv = phi [ (preheader, ...), (body, %iv.next) ]
1405 // f(%iv)
1406 // %iv.1 = add %iv, 1                <-- a root increment
1407 // f(%iv.1)
1408 // %iv.2 = add %iv, 2                <-- a root increment
1409 // f(%iv.2)
1410 // %iv.scale_m_1 = add %iv, scale-1  <-- a root increment
1411 // f(%iv.scale_m_1)
1412 // ...
1413 // %iv.next = add %iv, scale
1414 // %cmp = icmp(%iv, ...)
1415 // br %cmp, header, exit
1416 //
1417 // Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
1418 // instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
1419 // be intermixed with eachother. The restriction imposed by this algorithm is
1420 // that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
1421 // etc. be the same.
1422 //
1423 // First, we collect the use set of %iv, excluding the other increment roots.
1424 // This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
1425 // times, having collected the use set of f(%iv.(i+1)), during which we:
1426 //   - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
1427 //     the next unmatched instruction in f(%iv.(i+1)).
1428 //   - Ensure that both matched instructions don't have any external users
1429 //     (with the exception of last-in-chain reduction instructions).
1430 //   - Track the (aliasing) write set, and other side effects, of all
1431 //     instructions that belong to future iterations that come before the matched
1432 //     instructions. If the matched instructions read from that write set, then
1433 //     f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
1434 //     f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
1435 //     if any of these future instructions had side effects (could not be
1436 //     speculatively executed), and so do the matched instructions, when we
1437 //     cannot reorder those side-effect-producing instructions, and rerolling
1438 //     fails.
1439 //
1440 // Finally, we make sure that all loop instructions are either loop increment
1441 // roots, belong to simple latch code, parts of validated reductions, part of
1442 // f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
1443 // have been validated), then we reroll the loop.
1444 bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
1445                         const SCEV *IterCount,
1446                         ReductionTracker &Reductions) {
1447   DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, IVToIncMap);
1448
1449   if (!DAGRoots.findRoots())
1450     return false;
1451   DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
1452                   *IV << "\n");
1453
1454   if (!DAGRoots.validate(Reductions))
1455     return false;
1456   if (!Reductions.validateSelected())
1457     return false;
1458   // At this point, we've validated the rerolling, and we're committed to
1459   // making changes!
1460
1461   Reductions.replaceSelected();
1462   DAGRoots.replace(IterCount);
1463
1464   ++NumRerolledLoops;
1465   return true;
1466 }
1467
1468 bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
1469   if (skipOptnoneFunction(L))
1470     return false;
1471
1472   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1473   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1474   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1475   TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
1476   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1477
1478   BasicBlock *Header = L->getHeader();
1479   DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() <<
1480         "] Loop %" << Header->getName() << " (" <<
1481         L->getNumBlocks() << " block(s))\n");
1482
1483   bool Changed = false;
1484
1485   // For now, we'll handle only single BB loops.
1486   if (L->getNumBlocks() > 1)
1487     return Changed;
1488
1489   if (!SE->hasLoopInvariantBackedgeTakenCount(L))
1490     return Changed;
1491
1492   const SCEV *LIBETC = SE->getBackedgeTakenCount(L);
1493   const SCEV *IterCount = SE->getAddExpr(LIBETC, SE->getOne(LIBETC->getType()));
1494   DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n");
1495
1496   // First, we need to find the induction variable with respect to which we can
1497   // reroll (there may be several possible options).
1498   SmallInstructionVector PossibleIVs;
1499   IVToIncMap.clear();
1500   collectPossibleIVs(L, PossibleIVs);
1501
1502   if (PossibleIVs.empty()) {
1503     DEBUG(dbgs() << "LRR: No possible IVs found\n");
1504     return Changed;
1505   }
1506
1507   ReductionTracker Reductions;
1508   collectPossibleReductions(L, Reductions);
1509
1510   // For each possible IV, collect the associated possible set of 'root' nodes
1511   // (i+1, i+2, etc.).
1512   for (SmallInstructionVector::iterator I = PossibleIVs.begin(),
1513        IE = PossibleIVs.end(); I != IE; ++I)
1514     if (reroll(*I, L, Header, IterCount, Reductions)) {
1515       Changed = true;
1516       break;
1517     }
1518
1519   return Changed;
1520 }