Re-sort all of the includes with ./utils/sort_includes.py so that

[oota-llvm.git] / lib / Target / R600 / AMDILCFGStructurizer.cpp
diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp

index 85ac72542a47247de20e4ce6c5600910983b69b1..d23df833fc2557a8a4abed1bab4801f3d216dcc7 100644 (file)
--- a/lib/Target/R600/AMDILCFGStructurizer.cpp
+++ b/lib/Target/R600/AMDILCFGStructurizer.cpp
@@ -13,12 +13,10 @@
  #include "AMDGPU.h"
  #include "AMDGPUInstrInfo.h"
  #include "R600InstrInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DepthFirstIterator.h"
  #include "llvm/ADT/SCCIterator.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/DepthFirstIterator.h"
  #include "llvm/Analysis/DominatorInternals.h"
  #include "llvm/Analysis/Dominators.h"
  #include "llvm/CodeGen/MachineDominators.h"
@@ -30,6 +28,8 @@
  #include "llvm/CodeGen/MachineLoopInfo.h"
  #include "llvm/CodeGen/MachinePostDominators.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
  #include "llvm/Target/TargetInstrInfo.h"
  #include "llvm/Target/TargetMachine.h"
  
@@ -54,6 +54,10 @@ STATISTIC(numLoopcontPatternMatch,  "CFGStructurizer number of loop-continue "
  STATISTIC(numClonedBlock,           "CFGStructurizer cloned blocks");
  STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");
  
+namespace llvm {
+  void initializeAMDGPUCFGStructurizerPass(PassRegistry&);
+}
+
  //===----------------------------------------------------------------------===//
  //
  // Miscellaneous utility for CFGStructurizer.
@@ -131,13 +135,13 @@ public:
  
    static char ID;
  
-  AMDGPUCFGStructurizer(TargetMachine &tm) :
-      MachineFunctionPass(ID), TM(tm),
-      TII(static_cast<const R600InstrInfo *>(tm.getInstrInfo())),
-      TRI(&TII->getRegisterInfo()) { }
+  AMDGPUCFGStructurizer() :
+      MachineFunctionPass(ID), TII(NULL), TRI(NULL) {
+    initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
+  }
  
     const char *getPassName() const {
-    return "AMD IL Control Flow Graph structurizer Pass";
+    return "AMDGPU Control Flow Graph structurizer Pass";
    }
  
    void getAnalysisUsage(AnalysisUsage &AU) const {
@@ -157,6 +161,8 @@ public:
    bool prepare();
  
    bool runOnMachineFunction(MachineFunction &MF) {
+    TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+    TRI = &TII->getRegisterInfo();
      DEBUG(MF.dump(););
      OrderedBlks.clear();
      FuncRep = &MF;
@@ -173,7 +179,6 @@ public:
    }
  
  protected:
-  TargetMachine &TM;
    MachineDominatorTree *MDT;
    MachinePostDominatorTree *PDT;
    MachineLoopInfo *MLI;
@@ -251,7 +256,6 @@ protected:
    MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
    static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
    static MachineInstr *getContinueInstr(MachineBasicBlock *MBB);
-  static MachineInstr *getLoopBreakInstr(MachineBasicBlock *MBB);
    static bool isReturnBlock(MachineBasicBlock *MBB);
    static void cloneSuccessorList(MachineBasicBlock *DstMBB,
        MachineBasicBlock *SrcMBB) ;
@@ -668,16 +672,6 @@ MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) {
    return NULL;
  }
  
-MachineInstr *AMDGPUCFGStructurizer::getLoopBreakInstr(MachineBasicBlock *MBB) {
-  for (MachineBasicBlock::iterator It = MBB->begin(); (It != MBB->end());
-      ++It) {
-    MachineInstr *MI = &(*It);
-    if (MI->getOpcode() == AMDGPU::PREDICATED_BREAK)
-      return MI;
-  }
-  return NULL;
-}
-
  bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
    MachineInstr *MI = getReturnInstr(MBB);
    bool IsReturn = (MBB->succ_size() == 0);
@@ -1016,13 +1010,14 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
      return 0;
  
    assert(isCondBranch(BranchMI));
+  int NumMatch = 0;
  
    MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI);
-  serialPatternMatch(TrueMBB);
-  ifPatternMatch(TrueMBB);
+  NumMatch += serialPatternMatch(TrueMBB);
+  NumMatch += ifPatternMatch(TrueMBB);
    MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI);
-  serialPatternMatch(FalseMBB);
-  ifPatternMatch(FalseMBB);
+  NumMatch += serialPatternMatch(FalseMBB);
+  NumMatch += ifPatternMatch(FalseMBB);
    MachineBasicBlock *LandBlk;
    int Cloned = 0;
  
@@ -1039,8 +1034,11 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
    } else if (FalseMBB->succ_size() == 1
               && *FalseMBB->succ_begin() == TrueMBB) {
      // Triangle pattern, true is empty
-    LandBlk = TrueMBB;
-    TrueMBB = NULL;
+    // We reverse the predicate to get a triangle, empty-false pattern.
+    std::swap(TrueMBB, FalseMBB);
+    reversePredicateSetter(MBB->end());
+    LandBlk = FalseMBB;
+    FalseMBB = NULL;
    } else if (FalseMBB->succ_size() == 1
               && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) {
      LandBlk = *FalseMBB->succ_begin();
@@ -1048,7 +1046,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
      && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) {
      LandBlk = *TrueMBB->succ_begin();
    } else {
-    return handleJumpintoIf(MBB, TrueMBB, FalseMBB);
+    return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB);
    }
  
    // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the
@@ -1076,7 +1074,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
  
    numClonedBlock += Cloned;
  
-  return 1 + Cloned;
+  return 1 + Cloned + NumMatch;
  }
  
  int AMDGPUCFGStructurizer::loopendPatternMatch() {
@@ -1241,7 +1239,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
  
        numClonedBlock += Num;
        Num += serialPatternMatch(*HeadMBB->succ_begin());
-      Num += serialPatternMatch(*(++HeadMBB->succ_begin()));
+      Num += serialPatternMatch(*llvm::next(HeadMBB->succ_begin()));
        Num += ifPatternMatch(HeadMBB);
        assert(Num > 0);
  
@@ -1343,32 +1341,77 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
    // add initReg = initVal to headBlk
  
    const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-  unsigned InitReg =
-    HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
-  if (!MigrateTrue || !MigrateFalse)
-    llvm_unreachable("Extra register needed to handle CFG");
+  if (!MigrateTrue || !MigrateFalse) {
+    // XXX: We have an opportunity to optimize the "branch into if" case
+    // here.  Branch into if looks like this:
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //             /      \           |
+    // diamond_false        diamond_true
+    //             \      /
+    //               done
+    //
+    // The diamond_head block begins the "if" and the diamond_true block
+    // is the block being "branched into".
+    //
+    // If MigrateTrue is true, then TrueBB is the block being "branched into"
+    // and if MigrateFalse is true, then FalseBB is the block being
+    // "branched into"
+    // 
+    // Here is the pseudo code for how I think the optimization should work:
+    // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head.
+    // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
+    // 3. Move the branch instruction from diamond_head into its own basic
+    //    block (new_block).
+    // 4. Add an unconditional branch from diamond_head to new_block
+    // 5. Replace the branch instruction in branch_from with an unconditional
+    //    branch to new_block.  If branch_from has multiple predecessors, then
+    //    we need to replace the True/False block in the branch
+    //    instruction instead of replacing the instruction itself.
+    // 6. Change the condition of the branch instruction in new_block from
+    //    COND to (COND || GPR0)
+    //
+    // In order to insert these MOV instructions, we will need to use the
+    // RegisterScavenger.  Usually liveness stops being tracked during
+    // the late machine optimization passes, however if we implement
+    // bool TargetRegisterInfo::requiresRegisterScavenging(
+    //                                                const MachineFunction &MF)
+    // and have it return true, liveness will be tracked correctly 
+    // by generic optimization passes.  We will also need to make sure that
+    // all of our target-specific passes that run after regalloc and before
+    // the CFGStructurizer track liveness and we will need to modify this pass
+    // to correctly track liveness.
+    //
+    // After the above changes, the new CFG should look like this:
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //                       \     /
+    //                      new_block
+    //                      /      |
+    //         diamond_false        diamond_true
+    //                      \      /
+    //                        done
+    //
+    // Without this optimization, we are forced to duplicate the diamond_true
+    // block and we will end up with a CFG like this:
+    //
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //             /      \                   |
+    // diamond_false        diamond_true      diamond_true (duplicate)
+    //             \      /                   |
+    //               done --------------------|
+    //
+    // Duplicating diamond_true can be very costly especially if it has a
+    // lot of instructions.
+    return 0;
+  }
  
    int NumNewBlk = 0;
  
-  if (!LandBlk) {
-    LandBlk = HeadMBB->getParent()->CreateMachineBasicBlock();
-    HeadMBB->getParent()->push_back(LandBlk);  //insert to function
-
-    if (TrueMBB) {
-      TrueMBB->addSuccessor(LandBlk);
-    } else {
-      HeadMBB->addSuccessor(LandBlk);
-    }
-
-    if (FalseMBB) {
-      FalseMBB->addSuccessor(LandBlk);
-    } else {
-      HeadMBB->addSuccessor(LandBlk);
-    }
-
-    NumNewBlk ++;
-  }
-
    bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
  
    //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
@@ -1383,6 +1426,10 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
          CmpResReg, DebugLoc());
    }
  
+  // XXX: We are running this after RA, so creating virtual registers will
+  // cause an assertion failure in the PostRA scheduling pass.
+  unsigned InitReg =
+    HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
    insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
        DebugLoc());
  
@@ -1456,6 +1503,7 @@ void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
  void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
      MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
      MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
+  assert (TrueMBB);
    DEBUG(
      dbgs() << "ifPattern BB" << MBB->getNumber();
      dbgs() << "{  ";
@@ -1525,26 +1573,8 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
    DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
                 << " land = BB" << LandMBB->getNumber() << "\n";);
  
-  /* we last inserterd the DebugLoc in the
-   * BREAK_LOGICALZ_i32 or AMDGPU::BREAK_LOGICALNZ statement in the current
-   * dstBlk.
-   * search for the DebugLoc in the that statement.
-   * if not found, we have to insert the empty/default DebugLoc */
-  MachineInstr *LoopBreakInstr = getLoopBreakInstr(DstBlk);
-  DebugLoc DLBreak = (LoopBreakInstr) ? LoopBreakInstr->getDebugLoc() :
-      DebugLoc();
-
-  insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DLBreak);
-
-  /* we last inserterd the DebugLoc in the continue statement in the current
-   * dstBlk.
-   * search for the DebugLoc in the continue statement.
-   * if not found, we have to insert the empty/default DebugLoc */
-  MachineInstr *ContinueInstr = getContinueInstr(DstBlk);
-  DebugLoc DLContinue = (ContinueInstr) ? ContinueInstr->getDebugLoc() :
-      DebugLoc();
-
-  insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DLContinue);
+  insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
+  insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
    DstBlk->addSuccessor(LandMBB);
    DstBlk->removeSuccessor(DstBlk);
  }
@@ -1561,7 +1591,9 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
    MachineBasicBlock::iterator I = BranchMI;
    if (TrueBranch != LandMBB)
      reversePredicateSetter(I);
-  insertCondBranchBefore(I, AMDGPU::PREDICATED_BREAK, DL);
+  insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL);
+  insertInstrBefore(I, AMDGPU::BREAK);
+  insertInstrBefore(I, AMDGPU::ENDIF);
    //now branchInst can be erase safely
    BranchMI->eraseFromParent();
    //now take care of successors, retire blocks
@@ -1736,7 +1768,7 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
    if (MBB->succ_size() != 2)
      return;
    MachineBasicBlock *MBB1 = *MBB->succ_begin();
-  MachineBasicBlock *MBB2 = *(++MBB->succ_begin());
+  MachineBasicBlock *MBB2 = *llvm::next(MBB->succ_begin());
    if (MBB1 != MBB2)
      return;
  
@@ -1872,6 +1904,14 @@ char AMDGPUCFGStructurizer::ID = 0;
  } // end anonymous namespace
  
  
-FunctionPass *llvm::createAMDGPUCFGStructurizerPass(TargetMachine &tm) {
-  return new AMDGPUCFGStructurizer(tm);
+INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer",
+                      "AMDGPU CFG Structurizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer",
+                      "AMDGPU CFG Structurizer", false, false)
+
+FunctionPass *llvm::createAMDGPUCFGStructurizerPass() {
+  return new AMDGPUCFGStructurizer();
  }