Re-sort all of the includes with ./utils/sort_includes.py so that
[oota-llvm.git] / lib / Target / R600 / AMDILCFGStructurizer.cpp
index 4910e5dc9ce0e456dfd472153e1e301e7c103ec3..d23df833fc2557a8a4abed1bab4801f3d216dcc7 100644 (file)
@@ -8,11 +8,12 @@
 /// \file
 //==-----------------------------------------------------------------------===//
 
-#define DEBUGME 0
 #define DEBUG_TYPE "structcfg"
 
 #include "AMDGPU.h"
 #include "AMDGPUInstrInfo.h"
+#include "R600InstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SCCIterator.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -27,6 +28,8 @@
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 
@@ -46,15 +49,15 @@ STATISTIC(numSerialPatternMatch,    "CFGStructurizer number of serial pattern "
     "matched");
 STATISTIC(numIfPatternMatch,        "CFGStructurizer number of if pattern "
     "matched");
-STATISTIC(numLoopbreakPatternMatch, "CFGStructurizer number of loop-break "
-    "pattern matched");
 STATISTIC(numLoopcontPatternMatch,  "CFGStructurizer number of loop-continue "
     "pattern matched");
-STATISTIC(numLoopPatternMatch,      "CFGStructurizer number of loop pattern "
-    "matched");
 STATISTIC(numClonedBlock,           "CFGStructurizer cloned blocks");
 STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");
 
+namespace llvm {
+  void initializeAMDGPUCFGStructurizerPass(PassRegistry&);
+}
+
 //===----------------------------------------------------------------------===//
 //
 // Miscellaneous utility for CFGStructurizer.
@@ -62,37 +65,27 @@ STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");
 //===----------------------------------------------------------------------===//
 namespace {
 #define SHOWNEWINSTR(i) \
-  if (DEBUGME) errs() << "New instr: " << *i << "\n"
+  DEBUG(dbgs() << "New instr: " << *i << "\n");
 
 #define SHOWNEWBLK(b, msg) \
-if (DEBUGME) { \
-  errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
-  errs() << "\n"; \
-}
+DEBUG( \
+  dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+  dbgs() << "\n"; \
+);
 
 #define SHOWBLK_DETAIL(b, msg) \
-if (DEBUGME) { \
+DEBUG( \
   if (b) { \
-  errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
-  b->print(errs()); \
-  errs() << "\n"; \
+  dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+  b->print(dbgs()); \
+  dbgs() << "\n"; \
   } \
-}
+);
 
 #define INVALIDSCCNUM -1
-#define INVALIDREGNUM 0
-
-template<class LoopinfoT>
-void PrintLoopinfo(const LoopinfoT &LoopInfo, llvm::raw_ostream &OS) {
-  for (typename LoopinfoT::iterator iter = LoopInfo.begin(),
-       iterEnd = LoopInfo.end();
-       iter != iterEnd; ++iter) {
-    (*iter)->print(OS, 0);
-  }
-}
 
 template<class NodeT>
-void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) {
+void ReverseVector(SmallVectorImpl<NodeT *> &Src) {
   size_t sz = Src.size();
   for (size_t i = 0; i < sz/2; ++i) {
     NodeT *t = Src[i];
@@ -109,40 +102,14 @@ void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) {
 //
 //===----------------------------------------------------------------------===//
 
+
 namespace {
-template<class PassT>
-struct CFGStructTraits {
-};
 
-template <class InstrT>
 class BlockInformation {
 public:
-  bool isRetired;
-  int  sccNum;
-  //SmallVector<InstrT*, DEFAULT_VEC_SLOTS> succInstr;
-  //Instructions defining the corresponding successor.
-  BlockInformation() : isRetired(false), sccNum(INVALIDSCCNUM) {}
-};
-
-template <class BlockT, class InstrT, class RegiT>
-class LandInformation {
-public:
-  BlockT *landBlk;
-  std::set<RegiT> breakInitRegs;  //Registers that need to "reg = 0", before
-                                  //WHILELOOP(thisloop) init before entering
-                                  //thisloop.
-  std::set<RegiT> contInitRegs;   //Registers that need to "reg = 0", after
-                                  //WHILELOOP(thisloop) init after entering
-                                  //thisloop.
-  std::set<RegiT> endbranchInitRegs; //Init before entering this loop, at loop
-                                     //land block, branch cond on this reg.
-  std::set<RegiT> breakOnRegs;       //registers that need to "if (reg) break
-                                     //endif" after ENDLOOP(thisloop) break
-                                     //outerLoopOf(thisLoop).
-  std::set<RegiT> contOnRegs;       //registers that need to "if (reg) continue
-                                    //endif" after ENDLOOP(thisloop) continue on
-                                    //outerLoopOf(thisLoop).
-  LandInformation() : landBlk(NULL) {}
+  bool IsRetired;
+  int  SccNum;
+  BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {}
 };
 
 } // end anonymous namespace
@@ -154,1025 +121,1213 @@ public:
 //===----------------------------------------------------------------------===//
 
 namespace {
-// bixia TODO: port it to BasicBlock, not just MachineBasicBlock.
-template<class PassT>
-class  CFGStructurizer {
+class AMDGPUCFGStructurizer : public MachineFunctionPass {
 public:
-  typedef enum {
+  typedef SmallVector<MachineBasicBlock *, 32> MBBVector;
+  typedef std::map<MachineBasicBlock *, BlockInformation *> MBBInfoMap;
+  typedef std::map<MachineLoop *, MachineBasicBlock *> LoopLandInfoMap;
+
+  enum PathToKind {
     Not_SinglePath = 0,
     SinglePath_InPath = 1,
     SinglePath_NotInPath = 2
-  } PathToKind;
+  };
 
-public:
-  typedef typename PassT::InstructionType         InstrT;
-  typedef typename PassT::FunctionType            FuncT;
-  typedef typename PassT::DominatortreeType       DomTreeT;
-  typedef typename PassT::PostDominatortreeType   PostDomTreeT;
-  typedef typename PassT::DomTreeNodeType         DomTreeNodeT;
-  typedef typename PassT::LoopinfoType            LoopInfoT;
-
-  typedef GraphTraits<FuncT *>                    FuncGTraits;
-  //typedef FuncGTraits::nodes_iterator BlockIterator;
-  typedef typename FuncT::iterator                BlockIterator;
-
-  typedef typename FuncGTraits::NodeType          BlockT;
-  typedef GraphTraits<BlockT *>                   BlockGTraits;
-  typedef GraphTraits<Inverse<BlockT *> >         InvBlockGTraits;
-  //typedef BlockGTraits::succ_iterator InstructionIterator;
-  typedef typename BlockT::iterator               InstrIterator;
-
-  typedef CFGStructTraits<PassT>                  CFGTraits;
-  typedef BlockInformation<InstrT>                BlockInfo;
-  typedef std::map<BlockT *, BlockInfo *>         BlockInfoMap;
-
-  typedef int                                     RegiT;
-  typedef typename PassT::LoopType                LoopT;
-  typedef LandInformation<BlockT, InstrT, RegiT>  LoopLandInfo;
-        typedef std::map<LoopT *, LoopLandInfo *> LoopLandInfoMap;
-        //landing info for loop break
-  typedef SmallVector<BlockT *, 32>               BlockTSmallerVector;
+  static char ID;
 
-public:
-  CFGStructurizer();
-  ~CFGStructurizer();
+  AMDGPUCFGStructurizer() :
+      MachineFunctionPass(ID), TII(NULL), TRI(NULL) {
+    initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
+  }
+
+   const char *getPassName() const {
+    return "AMDGPU Control Flow Graph structurizer Pass";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addPreserved<MachineFunctionAnalysis>();
+    AU.addRequired<MachineFunctionAnalysis>();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachinePostDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+  }
 
   /// Perform the CFG structurization
-  bool run(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri);
+  bool run();
 
   /// Perform the CFG preparation
-  bool prepare(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri);
+  /// This step will remove every unconditionnal/dead jump instructions and make
+  /// sure all loops have an exit block
+  bool prepare();
+
+  bool runOnMachineFunction(MachineFunction &MF) {
+    TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+    TRI = &TII->getRegisterInfo();
+    DEBUG(MF.dump(););
+    OrderedBlks.clear();
+    FuncRep = &MF;
+    MLI = &getAnalysis<MachineLoopInfo>();
+    DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
+    MDT = &getAnalysis<MachineDominatorTree>();
+    DEBUG(MDT->print(dbgs(), (const llvm::Module*)0););
+    PDT = &getAnalysis<MachinePostDominatorTree>();
+    DEBUG(PDT->print(dbgs()););
+    prepare();
+    run();
+    DEBUG(MF.dump(););
+    return true;
+  }
 
-private:
-  void reversePredicateSetter(typename BlockT::iterator);
-  void   orderBlocks();
-  void   printOrderedBlocks(llvm::raw_ostream &OS);
-  int patternMatch(BlockT *CurBlock);
-  int patternMatchGroup(BlockT *CurBlock);
-
-  int serialPatternMatch(BlockT *CurBlock);
-  int ifPatternMatch(BlockT *CurBlock);
-  int switchPatternMatch(BlockT *CurBlock);
-  int loopendPatternMatch(BlockT *CurBlock);
-  int loopPatternMatch(BlockT *CurBlock);
-
-  int loopbreakPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
-  int loopcontPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
-  //int loopWithoutBreak(BlockT *);
-
-  void handleLoopbreak (BlockT *ExitingBlock, LoopT *ExitingLoop,
-                        BlockT *ExitBlock, LoopT *exitLoop, BlockT *landBlock);
-  void handleLoopcontBlock(BlockT *ContingBlock, LoopT *contingLoop,
-                           BlockT *ContBlock, LoopT *contLoop);
-  bool isSameloopDetachedContbreak(BlockT *Src1Block, BlockT *Src2Block);
-  int handleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
-                       BlockT *FalseBlock);
-  int handleJumpintoIfImp(BlockT *HeadBlock, BlockT *TrueBlock,
-                          BlockT *FalseBlock);
-  int improveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
-                              BlockT *FalseBlock, BlockT **LandBlockPtr);
-  void showImproveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
-                                   BlockT *FalseBlock, BlockT *LandBlock,
-                                   bool Detail = false);
-  PathToKind singlePathTo(BlockT *SrcBlock, BlockT *DstBlock,
-                          bool AllowSideEntry = true);
-  BlockT *singlePathEnd(BlockT *srcBlock, BlockT *DstBlock,
-                        bool AllowSideEntry = true);
-  int cloneOnSideEntryTo(BlockT *PreBlock, BlockT *SrcBlock, BlockT *DstBlock);
-  void mergeSerialBlock(BlockT *DstBlock, BlockT *srcBlock);
-
-  void mergeIfthenelseBlock(InstrT *BranchInstr, BlockT *CurBlock,
-                            BlockT *TrueBlock, BlockT *FalseBlock,
-                            BlockT *LandBlock);
-  void mergeLooplandBlock(BlockT *DstBlock, LoopLandInfo *LoopLand);
-  void mergeLoopbreakBlock(BlockT *ExitingBlock, BlockT *ExitBlock,
-                           BlockT *ExitLandBlock, RegiT SetReg);
-  void settleLoopcontBlock(BlockT *ContingBlock, BlockT *ContBlock,
-                           RegiT SetReg);
-  BlockT *relocateLoopcontBlock(LoopT *ParentLoopRep, LoopT *LoopRep,
-                                std::set<BlockT*> &ExitBlockSet,
-                                BlockT *ExitLandBlk);
-  BlockT *addLoopEndbranchBlock(LoopT *LoopRep,
-                                BlockTSmallerVector &ExitingBlocks,
-                                BlockTSmallerVector &ExitBlocks);
-  BlockT *normalizeInfiniteLoopExit(LoopT *LoopRep);
-  void removeUnconditionalBranch(BlockT *SrcBlock);
-  void removeRedundantConditionalBranch(BlockT *SrcBlock);
-  void addDummyExitBlock(SmallVector<BlockT *, DEFAULT_VEC_SLOTS> &RetBlocks);
-
-  void removeSuccessor(BlockT *SrcBlock);
-  BlockT *cloneBlockForPredecessor(BlockT *CurBlock, BlockT *PredBlock);
-  BlockT *exitingBlock2ExitBlock (LoopT *LoopRep, BlockT *exitingBlock);
-
-  void migrateInstruction(BlockT *SrcBlock, BlockT *DstBlock,
-                          InstrIterator InsertPos);
-
-  void recordSccnum(BlockT *SrcBlock, int SCCNum);
-  int getSCCNum(BlockT *srcBlk);
-
-  void retireBlock(BlockT *DstBlock, BlockT *SrcBlock);
-  bool isRetiredBlock(BlockT *SrcBlock);
-  bool isActiveLoophead(BlockT *CurBlock);
-  bool needMigrateBlock(BlockT *Block);
-
-  BlockT *recordLoopLandBlock(LoopT *LoopRep, BlockT *LandBlock,
-                              BlockTSmallerVector &exitBlocks,
-                              std::set<BlockT*> &ExitBlockSet);
-  void setLoopLandBlock(LoopT *LoopRep, BlockT *Block = NULL);
-  BlockT *getLoopLandBlock(LoopT *LoopRep);
-  LoopLandInfo *getLoopLandInfo(LoopT *LoopRep);
-
-  void addLoopBreakOnReg(LoopT *LoopRep, RegiT RegNum);
-  void addLoopContOnReg(LoopT *LoopRep, RegiT RegNum);
-  void addLoopBreakInitReg(LoopT *LoopRep, RegiT RegNum);
-  void addLoopContInitReg(LoopT *LoopRep, RegiT RegNum);
-  void addLoopEndbranchInitReg(LoopT *LoopRep, RegiT RegNum);
-
-  bool hasBackEdge(BlockT *curBlock);
-  unsigned getLoopDepth  (LoopT *LoopRep);
-  int countActiveBlock(
-    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterStart,
-    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterEnd);
-    BlockT *findNearestCommonPostDom(std::set<BlockT *>&);
-  BlockT *findNearestCommonPostDom(BlockT *Block1, BlockT *Block2);
+protected:
+  MachineDominatorTree *MDT;
+  MachinePostDominatorTree *PDT;
+  MachineLoopInfo *MLI;
+  const R600InstrInfo *TII;
+  const AMDGPURegisterInfo *TRI;
+
+  // PRINT FUNCTIONS
+  /// Print the ordered Blocks.
+  void printOrderedBlocks() const {
+    size_t i = 0;
+    for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(),
+        iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) {
+      dbgs() << "BB" << (*iterBlk)->getNumber();
+      dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
+      if (i != 0 && i % 10 == 0) {
+        dbgs() << "\n";
+      } else {
+        dbgs() << " ";
+      }
+    }
+  }
+  static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) {
+    for (MachineLoop::iterator iter = LoopInfo.begin(),
+         iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) {
+      (*iter)->print(dbgs(), 0);
+    }
+  }
+
+  // UTILITY FUNCTIONS
+  int getSCCNum(MachineBasicBlock *MBB) const;
+  MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const;
+  bool hasBackEdge(MachineBasicBlock *MBB) const;
+  static unsigned getLoopDepth(MachineLoop *LoopRep);
+  bool isRetiredBlock(MachineBasicBlock *MBB) const;
+  bool isActiveLoophead(MachineBasicBlock *MBB) const;
+  PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
+      bool AllowSideEntry = true) const;
+  int countActiveBlock(MBBVector::const_iterator It,
+      MBBVector::const_iterator E) const;
+  bool needMigrateBlock(MachineBasicBlock *MBB) const;
+
+  // Utility Functions
+  void reversePredicateSetter(MachineBasicBlock::iterator I);
+  /// Compute the reversed DFS post order of Blocks
+  void orderBlocks(MachineFunction *MF);
+
+  // Function originaly from CFGStructTraits
+  void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode,
+      DebugLoc DL = DebugLoc());
+  MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode,
+    DebugLoc DL = DebugLoc());
+  MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode);
+  void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode,
+      DebugLoc DL);
+  void insertCondBranchBefore(MachineBasicBlock *MBB,
+      MachineBasicBlock::iterator I, int NewOpcode, int RegNum,
+      DebugLoc DL);
+  void insertCondBranchEnd(MachineBasicBlock *MBB, int NewOpcode, int RegNum);
+  static int getBranchNzeroOpcode(int OldOpcode);
+  static int getBranchZeroOpcode(int OldOpcode);
+  static int getContinueNzeroOpcode(int OldOpcode);
+  static int getContinueZeroOpcode(int OldOpcode);
+  static MachineBasicBlock *getTrueBranch(MachineInstr *MI);
+  static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB);
+  static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB,
+      MachineInstr *MI);
+  static bool isCondBranch(MachineInstr *MI);
+  static bool isUncondBranch(MachineInstr *MI);
+  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB);
+  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB);
+  /// The correct naming for this is getPossibleLoopendBlockBranchInstr.
+  ///
+  /// BB with backward-edge could have move instructions after the branch
+  /// instruction.  Such move instruction "belong to" the loop backward-edge.
+  MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
+  static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
+  static MachineInstr *getContinueInstr(MachineBasicBlock *MBB);
+  static bool isReturnBlock(MachineBasicBlock *MBB);
+  static void cloneSuccessorList(MachineBasicBlock *DstMBB,
+      MachineBasicBlock *SrcMBB) ;
+  static MachineBasicBlock *clone(MachineBasicBlock *MBB);
+  /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose
+  /// because the AMDGPU instruction is not recognized as terminator fix this
+  /// and retire this routine
+  void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB,
+      MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk);
+  static void wrapup(MachineBasicBlock *MBB);
+
+
+  int patternMatch(MachineBasicBlock *MBB);
+  int patternMatchGroup(MachineBasicBlock *MBB);
+  int serialPatternMatch(MachineBasicBlock *MBB);
+  int ifPatternMatch(MachineBasicBlock *MBB);
+  int loopendPatternMatch();
+  int mergeLoop(MachineLoop *LoopRep);
+  int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock *LoopHeader);
+
+  void handleLoopcontBlock(MachineBasicBlock *ContingMBB,
+      MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
+      MachineLoop *ContLoop);
+  /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in
+  /// the same loop with LoopLandInfo without explicitly keeping track of
+  /// loopContBlks and loopBreakBlks, this is a method to get the information.
+  bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB,
+      MachineBasicBlock *Src2MBB);
+  int handleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
+  int handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
+  int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+      MachineBasicBlock **LandMBBPtr);
+  void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+      MachineBasicBlock *LandMBB, bool Detail = false);
+  int cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
+      MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB);
+  void mergeSerialBlock(MachineBasicBlock *DstMBB,
+      MachineBasicBlock *SrcMBB);
+
+  void mergeIfthenelseBlock(MachineInstr *BranchMI,
+      MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
+      MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB);
+  void mergeLooplandBlock(MachineBasicBlock *DstMBB,
+      MachineBasicBlock *LandMBB);
+  void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
+      MachineBasicBlock *LandMBB);
+  void settleLoopcontBlock(MachineBasicBlock *ContingMBB,
+      MachineBasicBlock *ContMBB);
+  /// normalizeInfiniteLoopExit change
+  ///   B1:
+  ///        uncond_br LoopHeader
+  ///
+  /// to
+  ///   B1:
+  ///        cond_br 1 LoopHeader dummyExit
+  /// and return the newly added dummy exit block
+  MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep);
+  void removeUnconditionalBranch(MachineBasicBlock *MBB);
+  /// Remove duplicate branches instructions in a block.
+  /// For instance
+  /// B0:
+  ///    cond_br X B1 B2
+  ///    cond_br X B1 B2
+  /// is transformed to
+  /// B0:
+  ///    cond_br X B1 B2
+  void removeRedundantConditionalBranch(MachineBasicBlock *MBB);
+  void addDummyExitBlock(SmallVectorImpl<MachineBasicBlock *> &RetMBB);
+  void removeSuccessor(MachineBasicBlock *MBB);
+  MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB,
+      MachineBasicBlock *PredMBB);
+  void migrateInstruction(MachineBasicBlock *SrcMBB,
+      MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
+  void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
+  void retireBlock(MachineBasicBlock *MBB);
+  void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = NULL);
+
+  MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&);
+  /// This is work around solution for findNearestCommonDominator not avaiable
+  /// to post dom a proper fix should go to Dominators.h.
+  MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1,
+      MachineBasicBlock *MBB2);
 
 private:
-  DomTreeT *domTree;
-  PostDomTreeT *postDomTree;
-  LoopInfoT *loopInfo;
-  PassT *passRep;
-  FuncT *funcRep;
-
-  BlockInfoMap blockInfoMap;
-  LoopLandInfoMap loopLandInfoMap;
-  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> orderedBlks;
-  const AMDGPURegisterInfo *TRI;
+  MBBInfoMap BlockInfoMap;
+  LoopLandInfoMap LLInfoMap;
+  std::map<MachineLoop *, bool> Visited;
+  MachineFunction *FuncRep;
+  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks;
+};
+
+int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
+  MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
+  if (It == BlockInfoMap.end())
+    return INVALIDSCCNUM;
+  return (*It).second->SccNum;
+}
+
+MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
+    const {
+  LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep);
+  if (It == LLInfoMap.end())
+    return NULL;
+  return (*It).second;
+}
 
-};  //template class CFGStructurizer
+bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
+  MachineLoop *LoopRep = MLI->getLoopFor(MBB);
+  if (!LoopRep)
+    return false;
+  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+  return MBB->isSuccessor(LoopHeader);
+}
+
+unsigned AMDGPUCFGStructurizer::getLoopDepth(MachineLoop *LoopRep) {
+  return LoopRep ? LoopRep->getLoopDepth() : 0;
+}
 
-template<class PassT> CFGStructurizer<PassT>::CFGStructurizer()
-  : domTree(NULL), postDomTree(NULL), loopInfo(NULL) {
+bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const {
+  MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
+  if (It == BlockInfoMap.end())
+    return false;
+  return (*It).second->IsRetired;
+}
+
+bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const {
+  MachineLoop *LoopRep = MLI->getLoopFor(MBB);
+  while (LoopRep && LoopRep->getHeader() == MBB) {
+    MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep);
+    if(!LoopLand)
+      return true;
+    if (!isRetiredBlock(LoopLand))
+      return true;
+    LoopRep = LoopRep->getParentLoop();
+  }
+  return false;
+}
+AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
+    bool AllowSideEntry) const {
+  assert(DstMBB);
+  if (SrcMBB == DstMBB)
+    return SinglePath_InPath;
+  while (SrcMBB && SrcMBB->succ_size() == 1) {
+    SrcMBB = *SrcMBB->succ_begin();
+    if (SrcMBB == DstMBB)
+      return SinglePath_InPath;
+    if (!AllowSideEntry && SrcMBB->pred_size() > 1)
+      return Not_SinglePath;
+  }
+  if (SrcMBB && SrcMBB->succ_size()==0)
+    return SinglePath_NotInPath;
+  return Not_SinglePath;
 }
 
-template<class PassT> CFGStructurizer<PassT>::~CFGStructurizer() {
-  for (typename BlockInfoMap::iterator I = blockInfoMap.begin(),
-       E = blockInfoMap.end(); I != E; ++I) {
-    delete I->second;
+int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It,
+    MBBVector::const_iterator E) const {
+  int Count = 0;
+  while (It != E) {
+    if (!isRetiredBlock(*It))
+      ++Count;
+    ++It;
   }
+  return Count;
 }
 
-template<class PassT>
-bool CFGStructurizer<PassT>::prepare(FuncT &func, PassT &pass,
-                                     const AMDGPURegisterInfo * tri) {
-  passRep = &pass;
-  funcRep = &func;
-  TRI = tri;
+bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
+  unsigned BlockSizeThreshold = 30;
+  unsigned CloneInstrThreshold = 100;
+  bool MultiplePreds = MBB && (MBB->pred_size() > 1);
 
-  bool changed = false;
+  if(!MultiplePreds)
+    return false;
+  unsigned BlkSize = MBB->size();
+  return ((BlkSize > BlockSizeThreshold) &&
+      (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold));
+}
 
-  //FIXME: if not reducible flow graph, make it so ???
+void AMDGPUCFGStructurizer::reversePredicateSetter(
+    MachineBasicBlock::iterator I) {
+  while (I--) {
+    if (I->getOpcode() == AMDGPU::PRED_X) {
+      switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) {
+      case OPCODE_IS_ZERO_INT:
+        static_cast<MachineInstr *>(I)->getOperand(2)
+            .setImm(OPCODE_IS_NOT_ZERO_INT);
+        return;
+      case OPCODE_IS_NOT_ZERO_INT:
+        static_cast<MachineInstr *>(I)->getOperand(2)
+            .setImm(OPCODE_IS_ZERO_INT);
+        return;
+      case OPCODE_IS_ZERO:
+        static_cast<MachineInstr *>(I)->getOperand(2)
+            .setImm(OPCODE_IS_NOT_ZERO);
+        return;
+      case OPCODE_IS_NOT_ZERO:
+        static_cast<MachineInstr *>(I)->getOperand(2)
+            .setImm(OPCODE_IS_ZERO);
+        return;
+      default:
+        llvm_unreachable("PRED_X Opcode invalid!");
+      }
+    }
+  }
+}
+
+void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB,
+    int NewOpcode, DebugLoc DL) {
+ MachineInstr *MI = MBB->getParent()
+    ->CreateMachineInstr(TII->get(NewOpcode), DL);
+  MBB->push_back(MI);
+  //assume the instruction doesn't take any reg operand ...
+  SHOWNEWINSTR(MI);
+}
+
+MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB,
+    int NewOpcode, DebugLoc DL) {
+  MachineInstr *MI =
+      MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
+  if (MBB->begin() != MBB->end())
+    MBB->insert(MBB->begin(), MI);
+  else
+    MBB->push_back(MI);
+  SHOWNEWINSTR(MI);
+  return MI;
+}
+
+MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(
+    MachineBasicBlock::iterator I, int NewOpcode) {
+  MachineInstr *OldMI = &(*I);
+  MachineBasicBlock *MBB = OldMI->getParent();
+  MachineInstr *NewMBB =
+      MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
+  MBB->insert(I, NewMBB);
+  //assume the instruction doesn't take any reg operand ...
+  SHOWNEWINSTR(NewMBB);
+  return NewMBB;
+}
+
+void AMDGPUCFGStructurizer::insertCondBranchBefore(
+    MachineBasicBlock::iterator I, int NewOpcode, DebugLoc DL) {
+  MachineInstr *OldMI = &(*I);
+  MachineBasicBlock *MBB = OldMI->getParent();
+  MachineFunction *MF = MBB->getParent();
+  MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
+  MBB->insert(I, NewMI);
+  MachineInstrBuilder MIB(*MF, NewMI);
+  MIB.addReg(OldMI->getOperand(1).getReg(), false);
+  SHOWNEWINSTR(NewMI);
+  //erase later oldInstr->eraseFromParent();
+}
+
+void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk,
+    MachineBasicBlock::iterator I, int NewOpcode, int RegNum,
+    DebugLoc DL) {
+  MachineFunction *MF = blk->getParent();
+  MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
+  //insert before
+  blk->insert(I, NewInstr);
+  MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
+  SHOWNEWINSTR(NewInstr);
+}
+
+void AMDGPUCFGStructurizer::insertCondBranchEnd(MachineBasicBlock *MBB,
+    int NewOpcode, int RegNum) {
+  MachineFunction *MF = MBB->getParent();
+  MachineInstr *NewInstr =
+    MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
+  MBB->push_back(NewInstr);
+  MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
+  SHOWNEWINSTR(NewInstr);
+}
 
-  if (DEBUGME) {
-        errs() << "AMDGPUCFGStructurizer::prepare\n";
+int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
+  case AMDGPU::BRANCH_COND_i32:
+  case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
+  default: llvm_unreachable("internal error");
   }
+  return -1;
+}
 
-  loopInfo = CFGTraits::getLoopInfo(pass);
-  if (DEBUGME) {
-    errs() << "LoopInfo:\n";
-    PrintLoopinfo(*loopInfo, errs());
+int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
+  case AMDGPU::BRANCH_COND_i32:
+  case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
+  default: llvm_unreachable("internal error");
   }
+  return -1;
+}
+
+int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
+  default: llvm_unreachable("internal error");
+  };
+  return -1;
+}
 
-  orderBlocks();
-  if (DEBUGME) {
-    errs() << "Ordered blocks:\n";
-    printOrderedBlocks(errs());
+int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
+  default: llvm_unreachable("internal error");
   }
+  return -1;
+}
 
-  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> retBlks;
-
-  for (typename LoopInfoT::iterator iter = loopInfo->begin(),
-       iterEnd = loopInfo->end();
-       iter != iterEnd; ++iter) {
-    LoopT* loopRep = (*iter);
-    BlockTSmallerVector exitingBlks;
-    loopRep->getExitingBlocks(exitingBlks);
-    
-    if (exitingBlks.size() == 0) {
-      BlockT* dummyExitBlk = normalizeInfiniteLoopExit(loopRep);
-      if (dummyExitBlk != NULL)
-        retBlks.push_back(dummyExitBlk);
-    }
+MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) {
+  return MI->getOperand(0).getMBB();
+}
+
+void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI,
+    MachineBasicBlock *MBB) {
+  MI->getOperand(0).setMBB(MBB);
+}
+
+MachineBasicBlock *
+AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB,
+    MachineInstr *MI) {
+  assert(MBB->succ_size() == 2);
+  MachineBasicBlock *TrueBranch = getTrueBranch(MI);
+  MachineBasicBlock::succ_iterator It = MBB->succ_begin();
+  MachineBasicBlock::succ_iterator Next = It;
+  ++Next;
+  return (*It == TrueBranch) ? *Next : *It;
+}
+
+bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+    case AMDGPU::JUMP_COND:
+    case AMDGPU::BRANCH_COND_i32:
+    case AMDGPU::BRANCH_COND_f32: return true;
+  default:
+    return false;
   }
+  return false;
+}
 
-  // Remove unconditional branch instr.
-  // Add dummy exit block iff there are multiple returns.
+bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case AMDGPU::JUMP:
+  case AMDGPU::BRANCH:
+    return true;
+  default:
+    return false;
+  }
+  return false;
+}
+
+DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) {
+  //get DebugLoc from the first MachineBasicBlock instruction with debug info
+  DebugLoc DL;
+  for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end();
+      ++It) {
+    MachineInstr *instr = &(*It);
+    if (instr->getDebugLoc().isUnknown() == false)
+      DL = instr->getDebugLoc();
+  }
+  return DL;
+}
+
+MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr(
+    MachineBasicBlock *MBB) {
+  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
+  MachineInstr *MI = &*It;
+  if (MI && (isCondBranch(MI) || isUncondBranch(MI)))
+    return MI;
+  return NULL;
+}
 
-  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
-       iterBlk = orderedBlks.begin(), iterEndBlk = orderedBlks.end();
-       iterBlk != iterEndBlk;
-       ++iterBlk) {
-    BlockT *curBlk = *iterBlk;
-    removeUnconditionalBranch(curBlk);
-    removeRedundantConditionalBranch(curBlk);
-    if (CFGTraits::isReturnBlock(curBlk)) {
-      retBlks.push_back(curBlk);
+MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
+    MachineBasicBlock *MBB) {
+  for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend();
+      It != E; ++It) {
+    // FIXME: Simplify
+    MachineInstr *MI = &*It;
+    if (MI) {
+      if (isCondBranch(MI) || isUncondBranch(MI))
+        return MI;
+      else if (!TII->isMov(MI->getOpcode()))
+        break;
     }
-    assert(curBlk->succ_size() <= 2);
-  } //for
+  }
+  return NULL;
+}
+
+MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
+  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
+  if (It != MBB->rend()) {
+    MachineInstr *instr = &(*It);
+    if (instr->getOpcode() == AMDGPU::RETURN)
+      return instr;
+  }
+  return NULL;
+}
 
-  if (retBlks.size() >= 2) {
-    addDummyExitBlock(retBlks);
-    changed = true;
+MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) {
+  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
+  if (It != MBB->rend()) {
+    MachineInstr *MI = &(*It);
+    if (MI->getOpcode() == AMDGPU::CONTINUE)
+      return MI;
   }
+  return NULL;
+}
 
-  return changed;
-} //CFGStructurizer::prepare
+bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
+  MachineInstr *MI = getReturnInstr(MBB);
+  bool IsReturn = (MBB->succ_size() == 0);
+  if (MI)
+    assert(IsReturn);
+  else if (IsReturn)
+    DEBUG(
+      dbgs() << "BB" << MBB->getNumber()
+             <<" is return block without RETURN instr\n";);
+  return  IsReturn;
+}
 
-template<class PassT>
-bool CFGStructurizer<PassT>::run(FuncT &func, PassT &pass,
-    const AMDGPURegisterInfo * tri) {
-  passRep = &pass;
-  funcRep = &func;
-  TRI = tri;
+void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB,
+    MachineBasicBlock *SrcMBB) {
+  for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(),
+       iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It)
+    DstMBB->addSuccessor(*It);  // *iter's predecessor is also taken care of
+}
 
-  //Assume reducible CFG...
-  if (DEBUGME) {
-    errs() << "AMDGPUCFGStructurizer::run\n";
-    func.viewCFG();
+MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) {
+  MachineFunction *Func = MBB->getParent();
+  MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock();
+  Func->push_back(NewMBB);  //insert to function
+  for (MachineBasicBlock::iterator It = MBB->begin(), E = MBB->end();
+      It != E; ++It) {
+    MachineInstr *MI = Func->CloneMachineInstr(It);
+    NewMBB->push_back(MI);
   }
+  return NewMBB;
+}
+
+void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith(
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB,
+    MachineBasicBlock *NewBlk) {
+  MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB);
+  if (BranchMI && isCondBranch(BranchMI) &&
+      getTrueBranch(BranchMI) == OldMBB)
+    setTrueBranch(BranchMI, NewBlk);
+}
+
+void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
+  assert((!MBB->getParent()->getJumpTableInfo()
+          || MBB->getParent()->getJumpTableInfo()->isEmpty())
+         && "found a jump table");
+
+   //collect continue right before endloop
+   SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> ContInstr;
+   MachineBasicBlock::iterator Pre = MBB->begin();
+   MachineBasicBlock::iterator E = MBB->end();
+   MachineBasicBlock::iterator It = Pre;
+   while (It != E) {
+     if (Pre->getOpcode() == AMDGPU::CONTINUE
+         && It->getOpcode() == AMDGPU::ENDLOOP)
+       ContInstr.push_back(Pre);
+     Pre = It;
+     ++It;
+   }
+
+   //delete continue right before endloop
+   for (unsigned i = 0; i < ContInstr.size(); ++i)
+      ContInstr[i]->eraseFromParent();
+
+   // TODO to fix up jump table so later phase won't be confused.  if
+   // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
+   // there isn't such an interface yet.  alternatively, replace all the other
+   // blocks in the jump table with the entryBlk //}
+
+}
+
+
+bool AMDGPUCFGStructurizer::prepare() {
+  bool Changed = false;
+
+  //FIXME: if not reducible flow graph, make it so ???
+
+  DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);
 
-  domTree = CFGTraits::getDominatorTree(pass);
-  if (DEBUGME) {
-    domTree->print(errs(), (const llvm::Module*)0);
+  orderBlocks(FuncRep);
+
+  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks;
+
+  // Add an ExitBlk to loop that don't have one
+  for (MachineLoopInfo::iterator It = MLI->begin(),
+       E = MLI->end(); It != E; ++It) {
+    MachineLoop *LoopRep = (*It);
+    MBBVector ExitingMBBs;
+    LoopRep->getExitingBlocks(ExitingMBBs);
+
+    if (ExitingMBBs.size() == 0) {
+      MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep);
+      if (DummyExitBlk)
+        RetBlks.push_back(DummyExitBlk);
+    }
   }
 
-  postDomTree = CFGTraits::getPostDominatorTree(pass);
-  if (DEBUGME) {
-    postDomTree->print(errs());
+  // Remove unconditional branch instr.
+  // Add dummy exit block iff there are multiple returns.
+  for (SmallVectorImpl<MachineBasicBlock *>::const_iterator
+       It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) {
+    MachineBasicBlock *MBB = *It;
+    removeUnconditionalBranch(MBB);
+    removeRedundantConditionalBranch(MBB);
+    if (isReturnBlock(MBB)) {
+      RetBlks.push_back(MBB);
+    }
+    assert(MBB->succ_size() <= 2);
   }
 
-  loopInfo = CFGTraits::getLoopInfo(pass);
-  if (DEBUGME) {
-    errs() << "LoopInfo:\n";
-    PrintLoopinfo(*loopInfo, errs());
+  if (RetBlks.size() >= 2) {
+    addDummyExitBlock(RetBlks);
+    Changed = true;
   }
 
-  orderBlocks();
+  return Changed;
+}
+
+bool AMDGPUCFGStructurizer::run() {
+
+  //Assume reducible CFG...
+  DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n";FuncRep->viewCFG(););
+
 #ifdef STRESSTEST
   //Use the worse block ordering to test the algorithm.
   ReverseVector(orderedBlks);
 #endif
 
-  if (DEBUGME) {
-    errs() << "Ordered blocks:\n";
-    printOrderedBlocks(errs());
-  }
-  int numIter = 0;
-  bool finish = false;
-  BlockT *curBlk;
-  bool makeProgress = false;
-  int numRemainedBlk = countActiveBlock(orderedBlks.begin(),
-                                        orderedBlks.end());
+  DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
+  int NumIter = 0;
+  bool Finish = false;
+  MachineBasicBlock *MBB;
+  bool MakeProgress = false;
+  int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(),
+                                        OrderedBlks.end());
 
   do {
-    ++numIter;
-    if (DEBUGME) {
-      errs() << "numIter = " << numIter
-             << ", numRemaintedBlk = " << numRemainedBlk << "\n";
-    }
-
-    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
-      iterBlk = orderedBlks.begin();
-    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
-      iterBlkEnd = orderedBlks.end();
-
-    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
-      sccBeginIter = iterBlk;
-    BlockT *sccBeginBlk = NULL;
-    int sccNumBlk = 0;  // The number of active blocks, init to a
+    ++NumIter;
+    DEBUG(
+      dbgs() << "numIter = " << NumIter
+             << ", numRemaintedBlk = " << NumRemainedBlk << "\n";
+    );
+
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator It =
+        OrderedBlks.begin();
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator E =
+        OrderedBlks.end();
+
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter =
+        It;
+    MachineBasicBlock *SccBeginMBB = NULL;
+    int SccNumBlk = 0;  // The number of active blocks, init to a
                         // maximum possible number.
-    int sccNumIter;     // Number of iteration in this SCC.
-
-    while (iterBlk != iterBlkEnd) {
-      curBlk = *iterBlk;
-
-      if (sccBeginBlk == NULL) {
-        sccBeginIter = iterBlk;
-        sccBeginBlk = curBlk;
-        sccNumIter = 0;
-        sccNumBlk = numRemainedBlk; // Init to maximum possible number.
-        if (DEBUGME) {
-              errs() << "start processing SCC" << getSCCNum(sccBeginBlk);
-              errs() << "\n";
-        }
+    int SccNumIter;     // Number of iteration in this SCC.
+
+    while (It != E) {
+      MBB = *It;
+
+      if (!SccBeginMBB) {
+        SccBeginIter = It;
+        SccBeginMBB = MBB;
+        SccNumIter = 0;
+        SccNumBlk = NumRemainedBlk; // Init to maximum possible number.
+        DEBUG(
+              dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
+              dbgs() << "\n";
+        );
       }
 
-      if (!isRetiredBlock(curBlk)) {
-        patternMatch(curBlk);
-      }
+      if (!isRetiredBlock(MBB))
+        patternMatch(MBB);
 
-      ++iterBlk;
+      ++It;
 
-      bool contNextScc = true;
-      if (iterBlk == iterBlkEnd
-          || getSCCNum(sccBeginBlk) != getSCCNum(*iterBlk)) {
+      bool ContNextScc = true;
+      if (It == E
+          || getSCCNum(SccBeginMBB) != getSCCNum(*It)) {
         // Just finish one scc.
-        ++sccNumIter;
-        int sccRemainedNumBlk = countActiveBlock(sccBeginIter, iterBlk);
-        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= sccNumBlk) {
-          if (DEBUGME) {
-            errs() << "Can't reduce SCC " << getSCCNum(curBlk)
-                   << ", sccNumIter = " << sccNumIter;
-            errs() << "doesn't make any progress\n";
-          }
-          contNextScc = true;
-        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < sccNumBlk) {
-          sccNumBlk = sccRemainedNumBlk;
-          iterBlk = sccBeginIter;
-          contNextScc = false;
-          if (DEBUGME) {
-            errs() << "repeat processing SCC" << getSCCNum(curBlk)
-                   << "sccNumIter = " << sccNumIter << "\n";
-            func.viewCFG();
-          }
+        ++SccNumIter;
+        int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It);
+        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) {
+          DEBUG(
+            dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
+                   << ", sccNumIter = " << SccNumIter;
+            dbgs() << "doesn't make any progress\n";
+          );
+          ContNextScc = true;
+        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) {
+          SccNumBlk = sccRemainedNumBlk;
+          It = SccBeginIter;
+          ContNextScc = false;
+          DEBUG(
+            dbgs() << "repeat processing SCC" << getSCCNum(MBB)
+                   << "sccNumIter = " << SccNumIter << "\n";
+            FuncRep->viewCFG();
+          );
         } else {
           // Finish the current scc.
-          contNextScc = true;
+          ContNextScc = true;
         }
       } else {
         // Continue on next component in the current scc.
-        contNextScc = false;
+        ContNextScc = false;
       }
 
-      if (contNextScc) {
-        sccBeginBlk = NULL;
-      }
+      if (ContNextScc)
+        SccBeginMBB = NULL;
     } //while, "one iteration" over the function.
 
-    BlockT *entryBlk = FuncGTraits::nodes_begin(&func);
-    if (entryBlk->succ_size() == 0) {
-      finish = true;
-      if (DEBUGME) {
-        errs() << "Reduce to one block\n";
-      }
+    MachineBasicBlock *EntryMBB =
+        GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
+    if (EntryMBB->succ_size() == 0) {
+      Finish = true;
+      DEBUG(
+        dbgs() << "Reduce to one block\n";
+      );
     } else {
-      int newnumRemainedBlk
-        = countActiveBlock(orderedBlks.begin(), orderedBlks.end());
+      int NewnumRemainedBlk
+        = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end());
       // consider cloned blocks ??
-      if (newnumRemainedBlk == 1 || newnumRemainedBlk < numRemainedBlk) {
-        makeProgress = true;
-        numRemainedBlk = newnumRemainedBlk;
+      if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) {
+        MakeProgress = true;
+        NumRemainedBlk = NewnumRemainedBlk;
       } else {
-        makeProgress = false;
-        if (DEBUGME) {
-          errs() << "No progress\n";
-        }
+        MakeProgress = false;
+        DEBUG(
+          dbgs() << "No progress\n";
+        );
       }
     }
-  } while (!finish && makeProgress);
+  } while (!Finish && MakeProgress);
 
   // Misc wrap up to maintain the consistency of the Function representation.
-  CFGTraits::wrapup(FuncGTraits::nodes_begin(&func));
+  wrapup(GraphTraits<MachineFunction *>::nodes_begin(FuncRep));
 
   // Detach retired Block, release memory.
-  for (typename BlockInfoMap::iterator iterMap = blockInfoMap.begin(),
-       iterEndMap = blockInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
-    if ((*iterMap).second && (*iterMap).second->isRetired) {
-      assert(((*iterMap).first)->getNumber() != -1);
-      if (DEBUGME) {
-        errs() << "Erase BB" << ((*iterMap).first)->getNumber() << "\n";
-      }
-      (*iterMap).first->eraseFromParent();  //Remove from the parent Function.
+  for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end();
+      It != E; ++It) {
+    if ((*It).second && (*It).second->IsRetired) {
+      assert(((*It).first)->getNumber() != -1);
+      DEBUG(
+        dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";
+      );
+      (*It).first->eraseFromParent();  //Remove from the parent Function.
     }
-    delete (*iterMap).second;
-  }
-  blockInfoMap.clear();
-
-  // clear loopLandInfoMap
-  for (typename LoopLandInfoMap::iterator iterMap = loopLandInfoMap.begin(),
-       iterEndMap = loopLandInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
-    delete (*iterMap).second;
+    delete (*It).second;
   }
-  loopLandInfoMap.clear();
+  BlockInfoMap.clear();
+  LLInfoMap.clear();
 
-  if (DEBUGME) {
-    func.viewCFG();
-  }
+  DEBUG(
+    FuncRep->viewCFG();
+  );
 
-  if (!finish) {
-    assert(!"IRREDUCIBL_CF");
-  }
+  if (!Finish)
+    llvm_unreachable("IRREDUCIBL_CF");
 
   return true;
-} //CFGStructurizer::run
-
-/// Print the ordered Blocks.
-///
-template<class PassT>
-void CFGStructurizer<PassT>::printOrderedBlocks(llvm::raw_ostream &os) {
-  size_t i = 0;
-  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
-      iterBlk = orderedBlks.begin(), iterBlkEnd = orderedBlks.end();
-       iterBlk != iterBlkEnd;
-       ++iterBlk, ++i) {
-    os << "BB" << (*iterBlk)->getNumber();
-    os << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
-    if (i != 0 && i % 10 == 0) {
-      os << "\n";
-    } else {
-      os << " ";
-    }
-  }
-} //printOrderedBlocks
-
-/// Compute the reversed DFS post order of Blocks
-///
-template<class PassT> void CFGStructurizer<PassT>::orderBlocks() {
-  int sccNum = 0;
-  BlockT *bb;
-  for (scc_iterator<FuncT *> sccIter = scc_begin(funcRep),
-       sccEnd = scc_end(funcRep); sccIter != sccEnd; ++sccIter, ++sccNum) {
-    std::vector<BlockT *> &sccNext = *sccIter;
-    for (typename std::vector<BlockT *>::const_iterator
-         blockIter = sccNext.begin(), blockEnd = sccNext.end();
+}
+
+
+
+void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
+  int SccNum = 0;
+  MachineBasicBlock *MBB;
+  for (scc_iterator<MachineFunction *> It = scc_begin(MF), E = scc_end(MF);
+      It != E; ++It, ++SccNum) {
+    std::vector<MachineBasicBlock *> &SccNext = *It;
+    for (std::vector<MachineBasicBlock *>::const_iterator
+         blockIter = SccNext.begin(), blockEnd = SccNext.end();
          blockIter != blockEnd; ++blockIter) {
-      bb = *blockIter;
-      orderedBlks.push_back(bb);
-      recordSccnum(bb, sccNum);
+      MBB = *blockIter;
+      OrderedBlks.push_back(MBB);
+      recordSccnum(MBB, SccNum);
     }
   }
 
   //walk through all the block in func to check for unreachable
-  for (BlockIterator blockIter1 = FuncGTraits::nodes_begin(funcRep),
-       blockEnd1 = FuncGTraits::nodes_end(funcRep);
-       blockIter1 != blockEnd1; ++blockIter1) {
-    BlockT *bb = &(*blockIter1);
-    sccNum = getSCCNum(bb);
-    if (sccNum == INVALIDSCCNUM) {
-      errs() << "unreachable block BB" << bb->getNumber() << "\n";
-    }
+  typedef GraphTraits<MachineFunction *> GTM;
+  MachineFunction::iterator It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF);
+  for (; It != E; ++It) {
+    MachineBasicBlock *MBB = &(*It);
+    SccNum = getSCCNum(MBB);
+    if (SccNum == INVALIDSCCNUM)
+      dbgs() << "unreachable block BB" << MBB->getNumber() << "\n";
   }
-} //orderBlocks
+}
 
-template<class PassT> int CFGStructurizer<PassT>::patternMatch(BlockT *curBlk) {
-  int numMatch = 0;
-  int curMatch;
+int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {
+  int NumMatch = 0;
+  int CurMatch;
 
-  if (DEBUGME) {
-        errs() << "Begin patternMatch BB" << curBlk->getNumber() << "\n";
-  }
+  DEBUG(
+        dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";
+  );
 
-  while ((curMatch = patternMatchGroup(curBlk)) > 0) {
-    numMatch += curMatch;
-  }
+  while ((CurMatch = patternMatchGroup(MBB)) > 0)
+    NumMatch += CurMatch;
 
-  if (DEBUGME) {
-        errs() << "End patternMatch BB" << curBlk->getNumber()
-      << ", numMatch = " << numMatch << "\n";
-  }
+  DEBUG(
+        dbgs() << "End patternMatch BB" << MBB->getNumber()
+      << ", numMatch = " << NumMatch << "\n";
+  );
+
+  return NumMatch;
+}
 
-  return numMatch;
-} //patternMatch
-
-template<class PassT>
-int CFGStructurizer<PassT>::patternMatchGroup(BlockT *curBlk) {
-  int numMatch = 0;
-  numMatch += serialPatternMatch(curBlk);
-  numMatch += ifPatternMatch(curBlk);
-  numMatch += loopendPatternMatch(curBlk);
-  numMatch += loopPatternMatch(curBlk);
-  return numMatch;
-}//patternMatchGroup
-
-template<class PassT>
-int CFGStructurizer<PassT>::serialPatternMatch(BlockT *curBlk) {
-  if (curBlk->succ_size() != 1) {
+int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) {
+  int NumMatch = 0;
+  NumMatch += loopendPatternMatch();
+  NumMatch += serialPatternMatch(MBB);
+  NumMatch += ifPatternMatch(MBB);
+  return NumMatch;
+}
+
+
+int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) {
+  if (MBB->succ_size() != 1)
     return 0;
-  }
 
-  BlockT *childBlk = *curBlk->succ_begin();
-  if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) {
+  MachineBasicBlock *childBlk = *MBB->succ_begin();
+  if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk))
     return 0;
-  }
 
-  mergeSerialBlock(curBlk, childBlk);
+  mergeSerialBlock(MBB, childBlk);
   ++numSerialPatternMatch;
   return 1;
-} //serialPatternMatch
+}
 
-template<class PassT>
-int CFGStructurizer<PassT>::ifPatternMatch(BlockT *curBlk) {
+int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
   //two edges
-  if (curBlk->succ_size() != 2) {
+  if (MBB->succ_size() != 2)
     return 0;
-  }
-
-  if (hasBackEdge(curBlk)) {
+  if (hasBackEdge(MBB))
     return 0;
-  }
-
-  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(curBlk);
-  if (branchInstr == NULL) {
+  MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
+  if (!BranchMI)
     return 0;
-  }
 
-  assert(CFGTraits::isCondBranch(branchInstr));
+  assert(isCondBranch(BranchMI));
+  int NumMatch = 0;
 
-  BlockT *trueBlk = CFGTraits::getTrueBranch(branchInstr);
-  BlockT *falseBlk = CFGTraits::getFalseBranch(curBlk, branchInstr);
-  BlockT *landBlk;
-  int cloned = 0;
+  MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI);
+  NumMatch += serialPatternMatch(TrueMBB);
+  NumMatch += ifPatternMatch(TrueMBB);
+  MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI);
+  NumMatch += serialPatternMatch(FalseMBB);
+  NumMatch += ifPatternMatch(FalseMBB);
+  MachineBasicBlock *LandBlk;
+  int Cloned = 0;
 
+  assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty());
   // TODO: Simplify
-  if (trueBlk->succ_size() == 1 && falseBlk->succ_size() == 1
-    && *trueBlk->succ_begin() == *falseBlk->succ_begin()) {
-    landBlk = *trueBlk->succ_begin();
-  } else if (trueBlk->succ_size() == 0 && falseBlk->succ_size() == 0) {
-    landBlk = NULL;
-  } else if (trueBlk->succ_size() == 1 && *trueBlk->succ_begin() == falseBlk) {
-    landBlk = falseBlk;
-    falseBlk = NULL;
-  } else if (falseBlk->succ_size() == 1
-             && *falseBlk->succ_begin() == trueBlk) {
-    landBlk = trueBlk;
-    trueBlk = NULL;
-  } else if (falseBlk->succ_size() == 1
-             && isSameloopDetachedContbreak(trueBlk, falseBlk)) {
-    landBlk = *falseBlk->succ_begin();
-  } else if (trueBlk->succ_size() == 1
-    && isSameloopDetachedContbreak(falseBlk, trueBlk)) {
-    landBlk = *trueBlk->succ_begin();
+  if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1
+    && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) {
+    // Diamond pattern
+    LandBlk = *TrueMBB->succ_begin();
+  } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) {
+    // Triangle pattern, false is empty
+    LandBlk = FalseMBB;
+    FalseMBB = NULL;
+  } else if (FalseMBB->succ_size() == 1
+             && *FalseMBB->succ_begin() == TrueMBB) {
+    // Triangle pattern, true is empty
+    // We reverse the predicate to make a triangle, empty false pattern;
+    std::swap(TrueMBB, FalseMBB);
+    reversePredicateSetter(MBB->end());
+    LandBlk = FalseMBB;
+    FalseMBB = NULL;
+  } else if (FalseMBB->succ_size() == 1
+             && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) {
+    LandBlk = *FalseMBB->succ_begin();
+  } else if (TrueMBB->succ_size() == 1
+    && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) {
+    LandBlk = *TrueMBB->succ_begin();
   } else {
-    return handleJumpintoIf(curBlk, trueBlk, falseBlk);
+    return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB);
   }
 
   // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the
   // new BB created for landBlk==NULL may introduce new challenge to the
   // reduction process.
-  if (landBlk != NULL &&
-      ((trueBlk && trueBlk->pred_size() > 1)
-      || (falseBlk && falseBlk->pred_size() > 1))) {
-     cloned += improveSimpleJumpintoIf(curBlk, trueBlk, falseBlk, &landBlk);
+  if (LandBlk &&
+      ((TrueMBB && TrueMBB->pred_size() > 1)
+      || (FalseMBB && FalseMBB->pred_size() > 1))) {
+     Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk);
   }
 
-  if (trueBlk && trueBlk->pred_size() > 1) {
-    trueBlk = cloneBlockForPredecessor(trueBlk, curBlk);
-    ++cloned;
+  if (TrueMBB && TrueMBB->pred_size() > 1) {
+    TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB);
+    ++Cloned;
   }
 
-  if (falseBlk && falseBlk->pred_size() > 1) {
-    falseBlk = cloneBlockForPredecessor(falseBlk, curBlk);
-    ++cloned;
+  if (FalseMBB && FalseMBB->pred_size() > 1) {
+    FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB);
+    ++Cloned;
   }
 
-  mergeIfthenelseBlock(branchInstr, curBlk, trueBlk, falseBlk, landBlk);
+  mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk);
 
   ++numIfPatternMatch;
 
-  numClonedBlock += cloned;
-
-  return 1 + cloned;
-} //ifPatternMatch
+  numClonedBlock += Cloned;
 
-template<class PassT>
-int CFGStructurizer<PassT>::switchPatternMatch(BlockT *curBlk) {
-  return 0;
-} //switchPatternMatch
+  return 1 + Cloned + NumMatch;
+}
 
-template<class PassT>
-int CFGStructurizer<PassT>::loopendPatternMatch(BlockT *curBlk) {
-  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
-  typename std::vector<LoopT *> nestedLoops;
-  while (loopRep) {
-    nestedLoops.push_back(loopRep);
-    loopRep = loopRep->getParentLoop();
+int AMDGPUCFGStructurizer::loopendPatternMatch() {
+  std::vector<MachineLoop *> NestedLoops;
+  for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end();
+      It != E; ++It) {
+    df_iterator<MachineLoop *> LpIt = df_begin(*It),
+        LpE = df_end(*It);
+    for (; LpIt != LpE; ++LpIt)
+      NestedLoops.push_back(*LpIt);
   }
-
-  if (nestedLoops.size() == 0) {
+  if (NestedLoops.size() == 0)
     return 0;
-  }
 
   // Process nested loop outside->inside, so "continue" to a outside loop won't
   // be mistaken as "break" of the current loop.
-  int num = 0;
-  for (typename std::vector<LoopT *>::reverse_iterator
-       iter = nestedLoops.rbegin(), iterEnd = nestedLoops.rend();
-       iter != iterEnd; ++iter) {
-    loopRep = *iter;
-
-    if (getLoopLandBlock(loopRep) != NULL) {
+  int Num = 0;
+  for (std::vector<MachineLoop *>::reverse_iterator It = NestedLoops.rbegin(),
+      E = NestedLoops.rend(); It != E; ++It) {
+    MachineLoop *ExaminedLoop = *It;
+    if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop])
       continue;
-    }
-
-    BlockT *loopHeader = loopRep->getHeader();
+    DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump(););
+    int NumBreak = mergeLoop(ExaminedLoop);
+    if (NumBreak == -1)
+      break;
+    Num += NumBreak;
+  }
+  return Num;
+}
 
-    int numBreak = loopbreakPatternMatch(loopRep, loopHeader);
+int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
+  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+  MBBVector ExitingMBBs;
+  LoopRep->getExitingBlocks(ExitingMBBs);
+  assert(!ExitingMBBs.empty() && "Infinite Loop not supported");
+  DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";);
+  // We assume a single ExitBlk
+  MBBVector ExitBlks;
+  LoopRep->getExitBlocks(ExitBlks);
+  SmallPtrSet<MachineBasicBlock *, 2> ExitBlkSet;
+  for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i)
+    ExitBlkSet.insert(ExitBlks[i]);
+  assert(ExitBlkSet.size() == 1);
+  MachineBasicBlock *ExitBlk = *ExitBlks.begin();
+  assert(ExitBlk && "Loop has several exit block");
+  MBBVector LatchBlks;
+  typedef GraphTraits<Inverse<MachineBasicBlock*> > InvMBBTraits;
+  InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader),
+      PE = InvMBBTraits::child_end(LoopHeader);
+  for (; PI != PE; PI++) {
+    if (LoopRep->contains(*PI))
+      LatchBlks.push_back(*PI);
+  }
+
+  for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i)
+    mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk);
+  for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i)
+    settleLoopcontBlock(LatchBlks[i], LoopHeader);
+  int Match = 0;
+  do {
+    Match = 0;
+    Match += serialPatternMatch(LoopHeader);
+    Match += ifPatternMatch(LoopHeader);
+  } while (Match > 0);
+  mergeLooplandBlock(LoopHeader, ExitBlk);
+  MachineLoop *ParentLoop = LoopRep->getParentLoop();
+  if (ParentLoop)
+    MLI->changeLoopFor(LoopHeader, ParentLoop);
+  else
+    MLI->removeBlock(LoopHeader);
+  Visited[LoopRep] = true;
+  return 1;
+}
 
-    if (numBreak == -1) {
-      break;
+int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep,
+    MachineBasicBlock *LoopHeader) {
+  int NumCont = 0;
+  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> ContMBB;
+  typedef GraphTraits<Inverse<MachineBasicBlock *> > GTIM;
+  GTIM::ChildIteratorType It = GTIM::child_begin(LoopHeader),
+      E = GTIM::child_end(LoopHeader);
+  for (; It != E; ++It) {
+    MachineBasicBlock *MBB = *It;
+    if (LoopRep->contains(MBB)) {
+      handleLoopcontBlock(MBB, MLI->getLoopFor(MBB),
+                          LoopHeader, LoopRep);
+      ContMBB.push_back(MBB);
+      ++NumCont;
     }
+  }
 
-    int numCont = loopcontPatternMatch(loopRep, loopHeader);
-    num += numBreak + numCont;
+  for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(),
+      E = ContMBB.end(); It != E; ++It) {
+    (*It)->removeSuccessor(LoopHeader);
   }
 
-  return num;
-} //loopendPatternMatch
+  numLoopcontPatternMatch += NumCont;
+
+  return NumCont;
+}
 
-template<class PassT>
-int CFGStructurizer<PassT>::loopPatternMatch(BlockT *curBlk) {
-  if (curBlk->succ_size() != 0) {
-    return 0;
-  }
 
-  int numLoop = 0;
-  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
-  while (loopRep && loopRep->getHeader() == curBlk) {
-    LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
-    if (loopLand) {
-      BlockT *landBlk = loopLand->landBlk;
-      assert(landBlk);
-      if (!isRetiredBlock(landBlk)) {
-        mergeLooplandBlock(curBlk, loopLand);
-        ++numLoop;
+bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
+    MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
+  if (Src1MBB->succ_size() == 0) {
+    MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB);
+    if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) {
+      MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
+      if (TheEntry) {
+        DEBUG(
+          dbgs() << "isLoopContBreakBlock yes src1 = BB"
+                 << Src1MBB->getNumber()
+                 << " src2 = BB" << Src2MBB->getNumber() << "\n";
+        );
+        return true;
       }
     }
-    loopRep = loopRep->getParentLoop();
   }
+  return false;
+}
 
-  numLoopPatternMatch += numLoop;
+int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
+    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
+  int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
+  if (Num == 0) {
+    DEBUG(
+      dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
+    );
+    Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB);
+  }
+  return Num;
+}
 
-  return numLoop;
-} //loopPatternMatch
+int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
+    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
+  int Num = 0;
+  MachineBasicBlock *DownBlk;
 
-template<class PassT>
-int CFGStructurizer<PassT>::loopbreakPatternMatch(LoopT *loopRep,
-                                                  BlockT *loopHeader) {
-  BlockTSmallerVector exitingBlks;
-  loopRep->getExitingBlocks(exitingBlks);
+  //trueBlk could be the common post dominator
+  DownBlk = TrueMBB;
+
+  DEBUG(
+    dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber()
+           << " true = BB" << TrueMBB->getNumber()
+           << ", numSucc=" << TrueMBB->succ_size()
+           << " false = BB" << FalseMBB->getNumber() << "\n";
+  );
+
+  while (DownBlk) {
+    DEBUG(
+      dbgs() << "check down = BB" << DownBlk->getNumber();
+    );
+
+    if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) {
+      DEBUG(
+        dbgs() << " working\n";
+      );
+
+      Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk);
+      Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk);
+
+      numClonedBlock += Num;
+      Num += serialPatternMatch(*HeadMBB->succ_begin());
+      Num += serialPatternMatch(*llvm::next(HeadMBB->succ_begin()));
+      Num += ifPatternMatch(HeadMBB);
+      assert(Num > 0);
 
-  if (DEBUGME) {
-    errs() << "Loop has " << exitingBlks.size() << " exiting blocks\n";
-  }
+      break;
+    }
+    DEBUG(
+      dbgs() << " not working\n";
+    );
+    DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : NULL;
+  } // walk down the postDomTree
 
-  if (exitingBlks.size() == 0) {
-    setLoopLandBlock(loopRep);
-    return 0;
-  }
+  return Num;
+}
 
-  // Compute the corresponding exitBlks and exit block set.
-  BlockTSmallerVector exitBlks;
-  std::set<BlockT *> exitBlkSet;
-  for (typename BlockTSmallerVector::const_iterator iter = exitingBlks.begin(),
-       iterEnd = exitingBlks.end(); iter != iterEnd; ++iter) {
-    BlockT *exitingBlk = *iter;
-    BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
-    exitBlks.push_back(exitBlk);
-    exitBlkSet.insert(exitBlk);  //non-duplicate insert
+void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
+    MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB,
+    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) {
+  dbgs() << "head = BB" << HeadMBB->getNumber()
+         << " size = " << HeadMBB->size();
+  if (Detail) {
+    dbgs() << "\n";
+    HeadMBB->print(dbgs());
+    dbgs() << "\n";
   }
 
-  assert(exitBlkSet.size() > 0);
-  assert(exitBlks.size() == exitingBlks.size());
-
-  if (DEBUGME) {
-    errs() << "Loop has " << exitBlkSet.size() << " exit blocks\n";
+  if (TrueMBB) {
+    dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = "
+           << TrueMBB->size() << " numPred = " << TrueMBB->pred_size();
+    if (Detail) {
+      dbgs() << "\n";
+      TrueMBB->print(dbgs());
+      dbgs() << "\n";
+    }
+  }
+  if (FalseMBB) {
+    dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = "
+           << FalseMBB->size() << " numPred = " << FalseMBB->pred_size();
+    if (Detail) {
+      dbgs() << "\n";
+      FalseMBB->print(dbgs());
+      dbgs() << "\n";
+    }
+  }
+  if (LandMBB) {
+    dbgs() << ", land = BB" << LandMBB->getNumber() << " size = "
+           << LandMBB->size() << " numPred = " << LandMBB->pred_size();
+    if (Detail) {
+      dbgs() << "\n";
+      LandMBB->print(dbgs());
+      dbgs() << "\n";
+    }
   }
 
-  // Find exitLandBlk.
-  BlockT *exitLandBlk = NULL;
-  int numCloned = 0;
-  int numSerial = 0;
-
-  if (exitBlkSet.size() == 1) {
-    exitLandBlk = *exitBlkSet.begin();
-  } else {
-    exitLandBlk = findNearestCommonPostDom(exitBlkSet);
+    dbgs() << "\n";
+}
 
-    if (exitLandBlk == NULL) {
-      return -1;
-    }
+int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+    MachineBasicBlock **LandMBBPtr) {
+  bool MigrateTrue = false;
+  bool MigrateFalse = false;
 
-    bool allInPath = true;
-    bool allNotInPath = true;
-    for (typename std::set<BlockT*>::const_iterator
-         iter = exitBlkSet.begin(),
-         iterEnd = exitBlkSet.end();
-         iter != iterEnd; ++iter) {
-      BlockT *exitBlk = *iter;
-
-      PathToKind pathKind = singlePathTo(exitBlk, exitLandBlk, true);
-      if (DEBUGME) {
-        errs() << "BB" << exitBlk->getNumber()
-               << " to BB" << exitLandBlk->getNumber() << " PathToKind="
-               << pathKind << "\n";
-      }
+  MachineBasicBlock *LandBlk = *LandMBBPtr;
 
-      allInPath = allInPath && (pathKind == SinglePath_InPath);
-      allNotInPath = allNotInPath && (pathKind == SinglePath_NotInPath);
+  assert((!TrueMBB || TrueMBB->succ_size() <= 1)
+         && (!FalseMBB || FalseMBB->succ_size() <= 1));
 
-      if (!allInPath && !allNotInPath) {
-        if (DEBUGME) {
-              errs() << "singlePath check fail\n";
-        }
-        return -1;
-      }
-    } // check all exit blocks
-
-    if (allNotInPath) {
-
-      // TODO: Simplify, maybe separate function?
-      LoopT *parentLoopRep = loopRep->getParentLoop();
-      BlockT *parentLoopHeader = NULL;
-      if (parentLoopRep)
-        parentLoopHeader = parentLoopRep->getHeader();
-
-      if (exitLandBlk == parentLoopHeader &&
-          (exitLandBlk = relocateLoopcontBlock(parentLoopRep,
-                                               loopRep,
-                                               exitBlkSet,
-                                               exitLandBlk)) != NULL) {
-        if (DEBUGME) {
-          errs() << "relocateLoopcontBlock success\n";
-        }
-      } else if ((exitLandBlk = addLoopEndbranchBlock(loopRep,
-                                                      exitingBlks,
-                                                      exitBlks)) != NULL) {
-        if (DEBUGME) {
-          errs() << "insertEndbranchBlock success\n";
-        }
-      } else {
-        if (DEBUGME) {
-          errs() << "loop exit fail\n";
-        }
-        return -1;
-      }
-    }
+  if (TrueMBB == FalseMBB)
+    return 0;
 
-    // Handle side entry to exit path.
-    exitBlks.clear();
-    exitBlkSet.clear();
-    for (typename BlockTSmallerVector::iterator iterExiting =
-           exitingBlks.begin(),
-         iterExitingEnd = exitingBlks.end();
-         iterExiting != iterExitingEnd; ++iterExiting) {
-      BlockT *exitingBlk = *iterExiting;
-      BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
-      BlockT *newExitBlk = exitBlk;
-
-      if (exitBlk != exitLandBlk && exitBlk->pred_size() > 1) {
-        newExitBlk = cloneBlockForPredecessor(exitBlk, exitingBlk);
-        ++numCloned;
-      }
+  MigrateTrue = needMigrateBlock(TrueMBB);
+  MigrateFalse = needMigrateBlock(FalseMBB);
 
-      numCloned += cloneOnSideEntryTo(exitingBlk, newExitBlk, exitLandBlk);
+  if (!MigrateTrue && !MigrateFalse)
+    return 0;
 
-      exitBlks.push_back(newExitBlk);
-      exitBlkSet.insert(newExitBlk);
-    }
+  // If we need to migrate either trueBlk and falseBlk, migrate the rest that
+  // have more than one predecessors.  without doing this, its predecessor
+  // rather than headBlk will have undefined value in initReg.
+  if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1)
+    MigrateTrue = true;
+  if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1)
+    MigrateFalse = true;
 
-    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
-         iterExitEnd = exitBlks.end();
-         iterExit != iterExitEnd; ++iterExit) {
-      BlockT *exitBlk = *iterExit;
-      numSerial += serialPatternMatch(exitBlk);
-    }
-
-    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
-         iterExitEnd = exitBlks.end();
-         iterExit != iterExitEnd; ++iterExit) {
-      BlockT *exitBlk = *iterExit;
-      if (exitBlk->pred_size() > 1) {
-        if (exitBlk != exitLandBlk) {
-          return -1;
-        }
-      } else {
-        if (exitBlk != exitLandBlk &&
-            (exitBlk->succ_size() != 1 ||
-            *exitBlk->succ_begin() != exitLandBlk)) {
-          return -1;
-        }
-      }
-    }
-  } // else
-
-  exitLandBlk = recordLoopLandBlock(loopRep, exitLandBlk, exitBlks, exitBlkSet);
-
-  // Fold break into the breaking block. Leverage across level breaks.
-  assert(exitingBlks.size() == exitBlks.size());
-  for (typename BlockTSmallerVector::const_iterator iterExit = exitBlks.begin(),
-       iterExiting = exitingBlks.begin(), iterExitEnd = exitBlks.end();
-       iterExit != iterExitEnd; ++iterExit, ++iterExiting) {
-    BlockT *exitBlk = *iterExit;
-    BlockT *exitingBlk = *iterExiting;
-    assert(exitBlk->pred_size() == 1 || exitBlk == exitLandBlk);
-    LoopT *exitingLoop = loopInfo->getLoopFor(exitingBlk);
-    handleLoopbreak(exitingBlk, exitingLoop, exitBlk, loopRep, exitLandBlk);
-  }
-
-  int numBreak = static_cast<int>(exitingBlks.size());
-  numLoopbreakPatternMatch += numBreak;
-  numClonedBlock += numCloned;
-  return numBreak + numSerial + numCloned;
-} //loopbreakPatternMatch
-
-template<class PassT>
-int CFGStructurizer<PassT>::loopcontPatternMatch(LoopT *loopRep,
-                                                 BlockT *loopHeader) {
-  int numCont = 0;
-  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> contBlk;
-  for (typename InvBlockGTraits::ChildIteratorType iter =
-       InvBlockGTraits::child_begin(loopHeader),
-       iterEnd = InvBlockGTraits::child_end(loopHeader);
-       iter != iterEnd; ++iter) {
-    BlockT *curBlk = *iter;
-    if (loopRep->contains(curBlk)) {
-      handleLoopcontBlock(curBlk, loopInfo->getLoopFor(curBlk),
-                          loopHeader, loopRep);
-      contBlk.push_back(curBlk);
-      ++numCont;
-    }
-  }
-
-  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator
-       iter = contBlk.begin(), iterEnd = contBlk.end();
-       iter != iterEnd; ++iter) {
-    (*iter)->removeSuccessor(loopHeader);
-  }
-
-  numLoopcontPatternMatch += numCont;
-
-  return numCont;
-} //loopcontPatternMatch
-
-
-template<class PassT>
-bool CFGStructurizer<PassT>::isSameloopDetachedContbreak(BlockT *src1Blk,
-                                                         BlockT *src2Blk) {
-  // return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in the
-  // same loop with LoopLandInfo without explicitly keeping track of
-  // loopContBlks and loopBreakBlks, this is a method to get the information.
-  //
-  if (src1Blk->succ_size() == 0) {
-    LoopT *loopRep = loopInfo->getLoopFor(src1Blk);
-    if (loopRep != NULL && loopRep == loopInfo->getLoopFor(src2Blk)) {
-      LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-      if (theEntry != NULL) {
-        if (DEBUGME) {
-          errs() << "isLoopContBreakBlock yes src1 = BB"
-                 << src1Blk->getNumber()
-                 << " src2 = BB" << src2Blk->getNumber() << "\n";
-        }
-        return true;
-      }
-    }
-  }
-  return false;
-}  //isSameloopDetachedContbreak
-
-template<class PassT>
-int CFGStructurizer<PassT>::handleJumpintoIf(BlockT *headBlk,
-                                             BlockT *trueBlk,
-                                             BlockT *falseBlk) {
-  int num = handleJumpintoIfImp(headBlk, trueBlk, falseBlk);
-  if (num == 0) {
-    if (DEBUGME) {
-      errs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
-    }
-    num = handleJumpintoIfImp(headBlk, falseBlk, trueBlk);
-  }
-  return num;
-}
-
-template<class PassT>
-int CFGStructurizer<PassT>::handleJumpintoIfImp(BlockT *headBlk,
-                                                BlockT *trueBlk,
-                                                BlockT *falseBlk) {
-  int num = 0;
-  BlockT *downBlk;
-
-  //trueBlk could be the common post dominator
-  downBlk = trueBlk;
-
-  if (DEBUGME) {
-    errs() << "handleJumpintoIfImp head = BB" << headBlk->getNumber()
-           << " true = BB" << trueBlk->getNumber()
-           << ", numSucc=" << trueBlk->succ_size()
-           << " false = BB" << falseBlk->getNumber() << "\n";
-  }
-
-  while (downBlk) {
-    if (DEBUGME) {
-      errs() << "check down = BB" << downBlk->getNumber();
-    }
-
-    if (singlePathTo(falseBlk, downBlk) == SinglePath_InPath) {
-      if (DEBUGME) {
-        errs() << " working\n";
-      }
-
-      num += cloneOnSideEntryTo(headBlk, trueBlk, downBlk);
-      num += cloneOnSideEntryTo(headBlk, falseBlk, downBlk);
-
-      numClonedBlock += num;
-      num += serialPatternMatch(*headBlk->succ_begin());
-      num += serialPatternMatch(*(++headBlk->succ_begin()));
-      num += ifPatternMatch(headBlk);
-      assert(num > 0);
-
-      break;
-    }
-    if (DEBUGME) {
-      errs() << " not working\n";
-    }
-    downBlk = (downBlk->succ_size() == 1) ? (*downBlk->succ_begin()) : NULL;
-  } // walk down the postDomTree
-
-  return num;
-} //handleJumpintoIf
-
-template<class PassT>
-void CFGStructurizer<PassT>::showImproveSimpleJumpintoIf(BlockT *headBlk,
-                                                         BlockT *trueBlk,
-                                                         BlockT *falseBlk,
-                                                         BlockT *landBlk,
-                                                         bool detail) {
-  errs() << "head = BB" << headBlk->getNumber()
-         << " size = " << headBlk->size();
-  if (detail) {
-    errs() << "\n";
-    headBlk->print(errs());
-    errs() << "\n";
-  }
-
-  if (trueBlk) {
-    errs() << ", true = BB" << trueBlk->getNumber() << " size = "
-           << trueBlk->size() << " numPred = " << trueBlk->pred_size();
-    if (detail) {
-      errs() << "\n";
-      trueBlk->print(errs());
-      errs() << "\n";
-    }
-  }
-  if (falseBlk) {
-    errs() << ", false = BB" << falseBlk->getNumber() << " size = "
-           << falseBlk->size() << " numPred = " << falseBlk->pred_size();
-    if (detail) {
-      errs() << "\n";
-      falseBlk->print(errs());
-      errs() << "\n";
-    }
-  }
-  if (landBlk) {
-    errs() << ", land = BB" << landBlk->getNumber() << " size = "
-           << landBlk->size() << " numPred = " << landBlk->pred_size();
-    if (detail) {
-      errs() << "\n";
-      landBlk->print(errs());
-      errs() << "\n";
-    }
-  }
-
-    errs() << "\n";
-} //showImproveSimpleJumpintoIf
-
-template<class PassT>
-int CFGStructurizer<PassT>::improveSimpleJumpintoIf(BlockT *headBlk,
-                                                    BlockT *trueBlk,
-                                                    BlockT *falseBlk,
-                                                    BlockT **plandBlk) {
-  bool migrateTrue = false;
-  bool migrateFalse = false;
-
-  BlockT *landBlk = *plandBlk;
-
-  assert((trueBlk == NULL || trueBlk->succ_size() <= 1)
-         && (falseBlk == NULL || falseBlk->succ_size() <= 1));
-
-  if (trueBlk == falseBlk) {
-    return 0;
-  }
-
-  migrateTrue = needMigrateBlock(trueBlk);
-  migrateFalse = needMigrateBlock(falseBlk);
-
-  if (!migrateTrue && !migrateFalse) {
-    return 0;
-  }
-
-  // If we need to migrate either trueBlk and falseBlk, migrate the rest that
-  // have more than one predecessors.  without doing this, its predecessor
-  // rather than headBlk will have undefined value in initReg.
-  if (!migrateTrue && trueBlk && trueBlk->pred_size() > 1) {
-    migrateTrue = true;
-  }
-  if (!migrateFalse && falseBlk && falseBlk->pred_size() > 1) {
-    migrateFalse = true;
-  }
-
-  if (DEBUGME) {
-    errs() << "before improveSimpleJumpintoIf: ";
-    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
-  }
+  DEBUG(
+    dbgs() << "before improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
+  );
 
   // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
   //
@@ -1186,205 +1341,192 @@ int CFGStructurizer<PassT>::improveSimpleJumpintoIf(BlockT *headBlk,
   // add initReg = initVal to headBlk
 
   const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-  unsigned initReg =
-    funcRep->getRegInfo().createVirtualRegister(I32RC);
-  if (!migrateTrue || !migrateFalse) {
-    int initVal = migrateTrue ? 0 : 1;
-    CFGTraits::insertAssignInstrBefore(headBlk, passRep, initReg, initVal);
+  if (!MigrateTrue || !MigrateFalse) {
+    // XXX: We have an opportunity here to optimize the "branch into if" case
+    // here.  Branch into if looks like this:
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //             /      \           |
+    // diamond_false        diamond_true
+    //             \      /
+    //               done
+    //
+    // The diamond_head block begins the "if" and the diamond_true block
+    // is the block being "branched into".
+    //
+    // If MigrateTrue is true, then TrueBB is the block being "branched into"
+    // and if MigrateFalse is true, then FalseBB is the block being
+    // "branched into"
+    // 
+    // Here is the pseudo code for how I think the optimization should work:
+    // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head.
+    // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
+    // 3. Move the branch instruction from diamond_head into its own basic
+    //    block (new_block).
+    // 4. Add an unconditional branch from diamond_head to new_block
+    // 5. Replace the branch instruction in branch_from with an unconditional
+    //    branch to new_block.  If branch_from has multiple predecessors, then
+    //    we need to replace the True/False block in the branch
+    //    instruction instead of replacing it.
+    // 6. Change the condition of the branch instruction in new_block from
+    //    COND to (COND || GPR0)
+    //
+    // In order insert these MOV instruction, we will need to use the
+    // RegisterScavenger.  Usually liveness stops being tracked during
+    // the late machine optimization passes, however if we implement
+    // bool TargetRegisterInfo::requiresRegisterScavenging(
+    //                                                const MachineFunction &MF)
+    // and have it return true, liveness will be tracked correctly 
+    // by generic optimization passes.  We will also need to make sure that
+    // all of our target-specific passes that run after regalloc and before
+    // the CFGStructurizer track liveness and we will need to modify this pass
+    // to correctly track liveness.
+    //
+    // After the above changes, the new CFG should look like this:
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //                       \     /
+    //                      new_block
+    //                      /      |
+    //         diamond_false        diamond_true
+    //                      \      /
+    //                        done
+    //
+    // Without this optimization, we are forced to duplicate the diamond_true
+    // block and we will end up with a CFG like this:
+    //
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //             /      \                   |
+    // diamond_false        diamond_true      diamond_true (duplicate)
+    //             \      /                   |
+    //               done --------------------|
+    //
+    // Duplicating diamond_true can be very costly especially if it has a
+    // lot of instructions.
+    return 0;
   }
 
-  int numNewBlk = 0;
-
-  if (landBlk == NULL) {
-    landBlk = funcRep->CreateMachineBasicBlock();
-    funcRep->push_back(landBlk);  //insert to function
-
-    if (trueBlk) {
-      trueBlk->addSuccessor(landBlk);
-    } else {
-      headBlk->addSuccessor(landBlk);
-    }
-
-    if (falseBlk) {
-      falseBlk->addSuccessor(landBlk);
-    } else {
-      headBlk->addSuccessor(landBlk);
-    }
-
-    numNewBlk ++;
-  }
+  int NumNewBlk = 0;
 
-  bool landBlkHasOtherPred = (landBlk->pred_size() > 2);
+  bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
 
   //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
-  typename BlockT::iterator insertPos =
-    CFGTraits::getInstrPos
-    (landBlk, CFGTraits::insertInstrBefore(landBlk, AMDGPU::ENDIF, passRep));
-
-  if (landBlkHasOtherPred) {
-    unsigned immReg =
-      funcRep->getRegInfo().createVirtualRegister(I32RC);
-    CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 2);
-    unsigned cmpResReg =
-      funcRep->getRegInfo().createVirtualRegister(I32RC);
-
-    CFGTraits::insertCompareInstrBefore(landBlk, insertPos, passRep, cmpResReg,
-                                        initReg, immReg);
-    CFGTraits::insertCondBranchBefore(landBlk, insertPos,
-                                      AMDGPU::IF_PREDICATE_SET, passRep,
-                                      cmpResReg, DebugLoc());
-  }
-
-  CFGTraits::insertCondBranchBefore(landBlk, insertPos, AMDGPU::IF_PREDICATE_SET,
-                                    passRep, initReg, DebugLoc());
-
-  if (migrateTrue) {
-    migrateInstruction(trueBlk, landBlk, insertPos);
+  MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF);
+
+  if (LandBlkHasOtherPred) {
+    llvm_unreachable("Extra register needed to handle CFG");
+    unsigned CmpResReg =
+      HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+    llvm_unreachable("Extra compare instruction needed to handle CFG");
+    insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET,
+        CmpResReg, DebugLoc());
+  }
+
+  // XXX: We are running this after RA, so creating virtual registers will
+  // cause an assertion failure in the PostRA scheduling pass.
+  unsigned InitReg =
+    HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+  insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
+      DebugLoc());
+
+  if (MigrateTrue) {
+    migrateInstruction(TrueMBB, LandBlk, I);
     // need to uncondionally insert the assignment to ensure a path from its
     // predecessor rather than headBlk has valid value in initReg if
     // (initVal != 1).
-    CFGTraits::insertAssignInstrBefore(trueBlk, passRep, initReg, 1);
+    llvm_unreachable("Extra register needed to handle CFG");
   }
-  CFGTraits::insertInstrBefore(insertPos, AMDGPU::ELSE, passRep);
+  insertInstrBefore(I, AMDGPU::ELSE);
 
-  if (migrateFalse) {
-    migrateInstruction(falseBlk, landBlk, insertPos);
+  if (MigrateFalse) {
+    migrateInstruction(FalseMBB, LandBlk, I);
     // need to uncondionally insert the assignment to ensure a path from its
     // predecessor rather than headBlk has valid value in initReg if
     // (initVal != 0)
-    CFGTraits::insertAssignInstrBefore(falseBlk, passRep, initReg, 0);
+    llvm_unreachable("Extra register needed to handle CFG");
   }
 
-  if (landBlkHasOtherPred) {
+  if (LandBlkHasOtherPred) {
     // add endif
-    CFGTraits::insertInstrBefore(insertPos, AMDGPU::ENDIF, passRep);
+    insertInstrBefore(I, AMDGPU::ENDIF);
 
     // put initReg = 2 to other predecessors of landBlk
-    for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
-         predIterEnd = landBlk->pred_end(); predIter != predIterEnd;
-         ++predIter) {
-      BlockT *curBlk = *predIter;
-      if (curBlk != trueBlk && curBlk != falseBlk) {
-        CFGTraits::insertAssignInstrBefore(curBlk, passRep, initReg, 2);
-      }
-    } //for
-  }
-  if (DEBUGME) {
-    errs() << "result from improveSimpleJumpintoIf: ";
-    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
-  }
-
-  // update landBlk
-  *plandBlk = landBlk;
-
-  return numNewBlk;
-} //improveSimpleJumpintoIf
-
-template<class PassT>
-void CFGStructurizer<PassT>::handleLoopbreak(BlockT *exitingBlk,
-                                              LoopT *exitingLoop,
-                                             BlockT *exitBlk,
-                                              LoopT *exitLoop,
-                                             BlockT *landBlk) {
-  if (DEBUGME) {
-    errs() << "Trying to break loop-depth = " << getLoopDepth(exitLoop)
-           << " from loop-depth = " << getLoopDepth(exitingLoop) << "\n";
-  }
-  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-
-  RegiT initReg = INVALIDREGNUM;
-  if (exitingLoop != exitLoop) {
-    initReg = static_cast<int>
-      (funcRep->getRegInfo().createVirtualRegister(I32RC));
-    assert(initReg != INVALIDREGNUM);
-    addLoopBreakInitReg(exitLoop, initReg);
-    while (exitingLoop != exitLoop && exitingLoop) {
-      addLoopBreakOnReg(exitingLoop, initReg);
-      exitingLoop = exitingLoop->getParentLoop();
+    for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(),
+         PE = LandBlk->pred_end(); PI != PE; ++PI) {
+      MachineBasicBlock *MBB = *PI;
+      if (MBB != TrueMBB && MBB != FalseMBB)
+        llvm_unreachable("Extra register needed to handle CFG");
     }
-    assert(exitingLoop == exitLoop);
   }
+  DEBUG(
+    dbgs() << "result from improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
+  );
 
-  mergeLoopbreakBlock(exitingBlk, exitBlk, landBlk, initReg);
+  // update landBlk
+  *LandMBBPtr = LandBlk;
 
-} //handleLoopbreak
+  return NumNewBlk;
+}
 
-template<class PassT>
-void CFGStructurizer<PassT>::handleLoopcontBlock(BlockT *contingBlk,
-                                                  LoopT *contingLoop,
-                                                 BlockT *contBlk,
-                                                  LoopT *contLoop) {
-  if (DEBUGME) {
-    errs() << "loopcontPattern cont = BB" << contingBlk->getNumber()
-           << " header = BB" << contBlk->getNumber() << "\n";
+void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock *ContingMBB,
+    MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
+    MachineLoop *ContLoop) {
+  DEBUG(dbgs() << "loopcontPattern cont = BB" << ContingMBB->getNumber()
+               << " header = BB" << ContMBB->getNumber() << "\n";
+        dbgs() << "Trying to continue loop-depth = "
+               << getLoopDepth(ContLoop)
+               << " from loop-depth = " << getLoopDepth(ContingLoop) << "\n";);
+  settleLoopcontBlock(ContingMBB, ContMBB);
+}
 
-    errs() << "Trying to continue loop-depth = "
-           << getLoopDepth(contLoop)
-           << " from loop-depth = " << getLoopDepth(contingLoop) << "\n";
-  }
+void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
+    MachineBasicBlock *SrcMBB) {
+  DEBUG(
+    dbgs() << "serialPattern BB" << DstMBB->getNumber()
+           << " <= BB" << SrcMBB->getNumber() << "\n";
+  );
+  DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end());
 
-  RegiT initReg = INVALIDREGNUM;
-  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-  if (contingLoop != contLoop) {
-    initReg = static_cast<int>
-      (funcRep->getRegInfo().createVirtualRegister(I32RC));
-    assert(initReg != INVALIDREGNUM);
-    addLoopContInitReg(contLoop, initReg);
-    while (contingLoop && contingLoop->getParentLoop() != contLoop) {
-      addLoopBreakOnReg(contingLoop, initReg);  //not addLoopContOnReg
-      contingLoop = contingLoop->getParentLoop();
-    }
-    assert(contingLoop && contingLoop->getParentLoop() == contLoop);
-    addLoopContOnReg(contingLoop, initReg);
-  }
+  DstMBB->removeSuccessor(SrcMBB);
+  cloneSuccessorList(DstMBB, SrcMBB);
 
-  settleLoopcontBlock(contingBlk, contBlk, initReg);
-} //handleLoopcontBlock
+  removeSuccessor(SrcMBB);
+  MLI->removeBlock(SrcMBB);
+  retireBlock(SrcMBB);
+}
 
-template<class PassT>
-void CFGStructurizer<PassT>::mergeSerialBlock(BlockT *dstBlk, BlockT *srcBlk) {
-  if (DEBUGME) {
-    errs() << "serialPattern BB" << dstBlk->getNumber()
-           << " <= BB" << srcBlk->getNumber() << "\n";
-  }
-  dstBlk->splice(dstBlk->end(), srcBlk, srcBlk->begin(), srcBlk->end());
-
-  dstBlk->removeSuccessor(srcBlk);
-  CFGTraits::cloneSuccessorList(dstBlk, srcBlk);
-
-  removeSuccessor(srcBlk);
-  retireBlock(dstBlk, srcBlk);
-} //mergeSerialBlock
-
-template<class PassT>
-void CFGStructurizer<PassT>::mergeIfthenelseBlock(InstrT *branchInstr,
-                                                  BlockT *curBlk,
-                                                  BlockT *trueBlk,
-                                                  BlockT *falseBlk,
-                                                  BlockT *landBlk) {
-  if (DEBUGME) {
-    errs() << "ifPattern BB" << curBlk->getNumber();
-    errs() << "{  ";
-    if (trueBlk) {
-      errs() << "BB" << trueBlk->getNumber();
-    }
-    errs() << "  } else ";
-    errs() << "{  ";
-    if (falseBlk) {
-      errs() << "BB" << falseBlk->getNumber();
-    }
-    errs() << "  }\n ";
-    errs() << "landBlock: ";
-    if (landBlk == NULL) {
-      errs() << "NULL";
+void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
+    MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
+    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
+  assert (TrueMBB);
+  DEBUG(
+    dbgs() << "ifPattern BB" << MBB->getNumber();
+    dbgs() << "{  ";
+    if (TrueMBB) {
+      dbgs() << "BB" << TrueMBB->getNumber();
+    }
+    dbgs() << "  } else ";
+    dbgs() << "{  ";
+    if (FalseMBB) {
+      dbgs() << "BB" << FalseMBB->getNumber();
+    }
+    dbgs() << "  }\n ";
+    dbgs() << "landBlock: ";
+    if (!LandMBB) {
+      dbgs() << "NULL";
     } else {
-      errs() << "BB" << landBlk->getNumber();
+      dbgs() << "BB" << LandMBB->getNumber();
     }
-    errs() << "\n";
-  }
+    dbgs() << "\n";
+  );
 
-  int oldOpcode = branchInstr->getOpcode();
-  DebugLoc branchDL = branchInstr->getDebugLoc();
+  int OldOpcode = BranchMI->getOpcode();
+  DebugLoc BranchDL = BranchMI->getDebugLoc();
 
 //    transform to
 //    if cond
@@ -1394,1645 +1536,382 @@ void CFGStructurizer<PassT>::mergeIfthenelseBlock(InstrT *branchInstr,
 //    endif
 //    landBlk
 
-  typename BlockT::iterator branchInstrPos =
-    CFGTraits::getInstrPos(curBlk, branchInstr);
-  CFGTraits::insertCondBranchBefore(branchInstrPos,
-                                    CFGTraits::getBranchNzeroOpcode(oldOpcode),
-                                    passRep,
-                                    branchDL);
-
-  if (trueBlk) {
-    curBlk->splice(branchInstrPos, trueBlk, trueBlk->begin(), trueBlk->end());
-    curBlk->removeSuccessor(trueBlk);
-    if (landBlk && trueBlk->succ_size()!=0) {
-      trueBlk->removeSuccessor(landBlk);
-    }
-    retireBlock(curBlk, trueBlk);
-  }
-  CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ELSE, passRep);
-
-  if (falseBlk) {
-    curBlk->splice(branchInstrPos, falseBlk, falseBlk->begin(),
-                   falseBlk->end());
-    curBlk->removeSuccessor(falseBlk);
-    if (landBlk && falseBlk->succ_size() != 0) {
-      falseBlk->removeSuccessor(landBlk);
-    }
-    retireBlock(curBlk, falseBlk);
-  }
-  CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ENDIF, passRep);
-
-  branchInstr->eraseFromParent();
-
-  if (landBlk && trueBlk && falseBlk) {
-    curBlk->addSuccessor(landBlk);
-  }
-
-} //mergeIfthenelseBlock
-
-template<class PassT>
-void CFGStructurizer<PassT>::mergeLooplandBlock(BlockT *dstBlk,
-                                                LoopLandInfo *loopLand) {
-  BlockT *landBlk = loopLand->landBlk;
-
-  if (DEBUGME) {
-    errs() << "loopPattern header = BB" << dstBlk->getNumber()
-           << " land = BB" << landBlk->getNumber() << "\n";
-  }
-
-  // Loop contInitRegs are init at the beginning of the loop.
-  for (typename std::set<RegiT>::const_iterator iter =
-         loopLand->contInitRegs.begin(),
-       iterEnd = loopLand->contInitRegs.end(); iter != iterEnd; ++iter) {
-    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
-  }
-
-  /* we last inserterd the DebugLoc in the
-   * BREAK_LOGICALZ_i32 or AMDGPU::BREAK_LOGICALNZ statement in the current dstBlk.
-   * search for the DebugLoc in the that statement.
-   * if not found, we have to insert the empty/default DebugLoc */
-  InstrT *loopBreakInstr = CFGTraits::getLoopBreakInstr(dstBlk);
-  DebugLoc DLBreak = (loopBreakInstr) ? loopBreakInstr->getDebugLoc() : DebugLoc();
-
-  CFGTraits::insertInstrBefore(dstBlk, AMDGPU::WHILELOOP, passRep, DLBreak);
-  // Loop breakInitRegs are init before entering the loop.
-  for (typename std::set<RegiT>::const_iterator iter =
-         loopLand->breakInitRegs.begin(),
-       iterEnd = loopLand->breakInitRegs.end(); iter != iterEnd; ++iter) {
-    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
-  }
-  // Loop endbranchInitRegs are init before entering the loop.
-  for (typename std::set<RegiT>::const_iterator iter =
-         loopLand->endbranchInitRegs.begin(),
-       iterEnd = loopLand->endbranchInitRegs.end(); iter != iterEnd; ++iter) {
-    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
-  }
+  MachineBasicBlock::iterator I = BranchMI;
+  insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode),
+      BranchDL);
 
-  /* we last inserterd the DebugLoc in the continue statement in the current dstBlk
-   * search for the DebugLoc in the continue statement.
-   * if not found, we have to insert the empty/default DebugLoc */
-  InstrT *continueInstr = CFGTraits::getContinueInstr(dstBlk);
-  DebugLoc DLContinue = (continueInstr) ? continueInstr->getDebugLoc() : DebugLoc();
-
-  CFGTraits::insertInstrEnd(dstBlk, AMDGPU::ENDLOOP, passRep, DLContinue);
-  // Loop breakOnRegs are check after the ENDLOOP: break the loop outside this
-  // loop.
-  for (typename std::set<RegiT>::const_iterator iter =
-         loopLand->breakOnRegs.begin(),
-       iterEnd = loopLand->breakOnRegs.end(); iter != iterEnd; ++iter) {
-    CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::PREDICATED_BREAK, passRep,
-                                   *iter);
+  if (TrueMBB) {
+    MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end());
+    MBB->removeSuccessor(TrueMBB);
+    if (LandMBB && TrueMBB->succ_size()!=0)
+      TrueMBB->removeSuccessor(LandMBB);
+    retireBlock(TrueMBB);
+    MLI->removeBlock(TrueMBB);
   }
 
-  // Loop contOnRegs are check after the ENDLOOP: cont the loop outside this
-  // loop.
-  for (std::set<RegiT>::const_iterator iter = loopLand->contOnRegs.begin(),
-       iterEnd = loopLand->contOnRegs.end(); iter != iterEnd; ++iter) {
-    CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::CONTINUE_LOGICALNZ_i32,
-                                   passRep, *iter);
+  if (FalseMBB) {
+    insertInstrBefore(I, AMDGPU::ELSE);
+    MBB->splice(I, FalseMBB, FalseMBB->begin(),
+                   FalseMBB->end());
+    MBB->removeSuccessor(FalseMBB);
+    if (LandMBB && FalseMBB->succ_size() != 0)
+      FalseMBB->removeSuccessor(LandMBB);
+    retireBlock(FalseMBB);
+    MLI->removeBlock(FalseMBB);
   }
+  insertInstrBefore(I, AMDGPU::ENDIF);
 
-  dstBlk->splice(dstBlk->end(), landBlk, landBlk->begin(), landBlk->end());
+  BranchMI->eraseFromParent();
 
-  for (typename BlockT::succ_iterator iter = landBlk->succ_begin(),
-       iterEnd = landBlk->succ_end(); iter != iterEnd; ++iter) {
-    dstBlk->addSuccessor(*iter);  // *iter's predecessor is also taken care of.
-  }
-
-  removeSuccessor(landBlk);
-  retireBlock(dstBlk, landBlk);
-} //mergeLooplandBlock
+  if (LandMBB && TrueMBB && FalseMBB)
+    MBB->addSuccessor(LandMBB);
 
-template<class PassT>
-void CFGStructurizer<PassT>::reversePredicateSetter(typename BlockT::iterator I) {
-  while (I--) {
-    if (I->getOpcode() == AMDGPU::PRED_X) {
-      switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) {
-      case OPCODE_IS_ZERO_INT:
-        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO_INT);
-        return;
-      case OPCODE_IS_NOT_ZERO_INT:
-        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO_INT);
-        return;
-      case OPCODE_IS_ZERO:
-        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO);
-        return;
-      case OPCODE_IS_NOT_ZERO:
-        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO);
-        return;
-      default:
-        assert(0 && "PRED_X Opcode invalid!");
-      }
-    }
-  }
 }
 
-template<class PassT>
-void CFGStructurizer<PassT>::mergeLoopbreakBlock(BlockT *exitingBlk,
-                                                 BlockT *exitBlk,
-                                                 BlockT *exitLandBlk,
-                                                 RegiT  setReg) {
-  if (DEBUGME) {
-    errs() << "loopbreakPattern exiting = BB" << exitingBlk->getNumber()
-           << " exit = BB" << exitBlk->getNumber()
-           << " land = BB" << exitLandBlk->getNumber() << "\n";
-  }
-
-  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(exitingBlk);
-  assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
-
-  DebugLoc DL = branchInstr->getDebugLoc();
+void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
+    MachineBasicBlock *LandMBB) {
+  DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
+               << " land = BB" << LandMBB->getNumber() << "\n";);
 
-  BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
-
-  //    transform exitingBlk to
-  //    if ( ) {
-  //       exitBlk (if exitBlk != exitLandBlk)
-  //       setReg = 1
-  //       break
-  //    }endif
-  //    successor = {orgSuccessor(exitingBlk) - exitBlk}
-
-  typename BlockT::iterator branchInstrPos =
-    CFGTraits::getInstrPos(exitingBlk, branchInstr);
-
-  if (exitBlk == exitLandBlk && setReg == INVALIDREGNUM) {
-    //break_logical
+  insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
+  insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
+  DstBlk->addSuccessor(LandMBB);
+  DstBlk->removeSuccessor(DstBlk);
+}
 
-    if (trueBranch != exitBlk) {
-      reversePredicateSetter(branchInstrPos);
-    }
-    CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL);
-  } else {
-    if (trueBranch != exitBlk) {
-      reversePredicateSetter(branchInstr);
-    }
-    CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL);
-    if (exitBlk != exitLandBlk) {
-      //splice is insert-before ...
-      exitingBlk->splice(branchInstrPos, exitBlk, exitBlk->begin(),
-                         exitBlk->end());
-    }
-    if (setReg != INVALIDREGNUM) {
-      CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
-    }
-    CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::BREAK, passRep);
-  } //if_logical
 
+void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
+    MachineBasicBlock *LandMBB) {
+  DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber()
+               << " land = BB" << LandMBB->getNumber() << "\n";);
+  MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB);
+  assert(BranchMI && isCondBranch(BranchMI));
+  DebugLoc DL = BranchMI->getDebugLoc();
+  MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI);
+  MachineBasicBlock::iterator I = BranchMI;
+  if (TrueBranch != LandMBB)
+    reversePredicateSetter(I);
+  insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL);
+  insertInstrBefore(I, AMDGPU::BREAK);
+  insertInstrBefore(I, AMDGPU::ENDIF);
   //now branchInst can be erase safely
-  branchInstr->eraseFromParent();
-
+  BranchMI->eraseFromParent();
   //now take care of successors, retire blocks
-  exitingBlk->removeSuccessor(exitBlk);
-  if (exitBlk != exitLandBlk) {
-    //splice is insert-before ...
-    exitBlk->removeSuccessor(exitLandBlk);
-    retireBlock(exitingBlk, exitBlk);
-  }
-
-} //mergeLoopbreakBlock
-
-template<class PassT>
-void CFGStructurizer<PassT>::settleLoopcontBlock(BlockT *contingBlk,
-                                                 BlockT *contBlk,
-                                                 RegiT   setReg) {
-  if (DEBUGME) {
-    errs() << "settleLoopcontBlock conting = BB"
-           << contingBlk->getNumber()
-           << ", cont = BB" << contBlk->getNumber() << "\n";
-  }
-
-  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(contingBlk);
-  if (branchInstr) {
-    assert(CFGTraits::isCondBranch(branchInstr));
-    typename BlockT::iterator branchInstrPos =
-      CFGTraits::getInstrPos(contingBlk, branchInstr);
-    BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
-    int oldOpcode = branchInstr->getOpcode();
-    DebugLoc DL = branchInstr->getDebugLoc();
-
-    //    transform contingBlk to
-    //     if () {
-    //          move instr after branchInstr
-    //          continue
-    //        or
-    //          setReg = 1
-    //          break
-    //     }endif
-    //     successor = {orgSuccessor(contingBlk) - loopHeader}
-
-    bool useContinueLogical = 
-      (setReg == INVALIDREGNUM && (&*contingBlk->rbegin()) == branchInstr);
-
-    if (useContinueLogical == false) {
-      int branchOpcode =
-        trueBranch == contBlk ? CFGTraits::getBranchNzeroOpcode(oldOpcode)
-                              : CFGTraits::getBranchZeroOpcode(oldOpcode);
-
-      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
-
-      if (setReg != INVALIDREGNUM) {
-        CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
-        // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
-        CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, DL);
-      } else {
-        // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
-        CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, DL);
-      }
+  ExitingMBB->removeSuccessor(LandMBB);
+}
 
-      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::ENDIF, passRep, DL);
+void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
+    MachineBasicBlock *ContMBB) {
+  DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
+               << ContingMBB->getNumber()
+               << ", cont = BB" << ContMBB->getNumber() << "\n";);
+
+  MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB);
+  if (MI) {
+    assert(isCondBranch(MI));
+    MachineBasicBlock::iterator I = MI;
+    MachineBasicBlock *TrueBranch = getTrueBranch(MI);
+    int OldOpcode = MI->getOpcode();
+    DebugLoc DL = MI->getDebugLoc();
+
+    bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI);
+
+    if (UseContinueLogical == false) {
+      int BranchOpcode =
+          TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) :
+          getBranchZeroOpcode(OldOpcode);
+      insertCondBranchBefore(I, BranchOpcode, DL);
+      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
+      insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL);
+      insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL);
     } else {
-      int branchOpcode =
-        trueBranch == contBlk ? CFGTraits::getContinueNzeroOpcode(oldOpcode)
-                              : CFGTraits::getContinueZeroOpcode(oldOpcode);
-
-      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
+      int BranchOpcode =
+          TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) :
+          getContinueZeroOpcode(OldOpcode);
+      insertCondBranchBefore(I, BranchOpcode, DL);
     }
 
-    branchInstr->eraseFromParent();
+    MI->eraseFromParent();
   } else {
     // if we've arrived here then we've already erased the branch instruction
-    // travel back up the basic block to see the last reference of our debug location
-    // we've just inserted that reference here so it should be representative
-    if (setReg != INVALIDREGNUM) {
-      CFGTraits::insertAssignInstrBefore(contingBlk, passRep, setReg, 1);
-      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
-      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
-    } else {
-      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
-      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
-    }
-  } //else
-
-} //settleLoopcontBlock
-
-// BBs in exitBlkSet are determined as in break-path for loopRep,
-// before we can put code for BBs as inside loop-body for loopRep
-// check whether those BBs are determined as cont-BB for parentLoopRep
-// earlier.
-// If so, generate a new BB newBlk
-//    (1) set newBlk common successor of BBs in exitBlkSet
-//    (2) change the continue-instr in BBs in exitBlkSet to break-instr
-//    (3) generate continue-instr in newBlk
-//
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::relocateLoopcontBlock(LoopT *parentLoopRep,
-                                              LoopT *loopRep,
-                                              std::set<BlockT *> &exitBlkSet,
-                                              BlockT *exitLandBlk) {
-  std::set<BlockT *> endBlkSet;
-
-
-
-  for (typename std::set<BlockT *>::const_iterator iter = exitBlkSet.begin(),
-       iterEnd = exitBlkSet.end();
-       iter != iterEnd; ++iter) {
-    BlockT *exitBlk = *iter;
-    BlockT *endBlk = singlePathEnd(exitBlk, exitLandBlk);
-
-    if (endBlk == NULL || CFGTraits::getContinueInstr(endBlk) == NULL)
-      return NULL;
-
-    endBlkSet.insert(endBlk);
-  }
-
-  BlockT *newBlk = funcRep->CreateMachineBasicBlock();
-  funcRep->push_back(newBlk);  //insert to function
-  CFGTraits::insertInstrEnd(newBlk, AMDGPU::CONTINUE, passRep);
-  SHOWNEWBLK(newBlk, "New continue block: ");
-
-  for (typename std::set<BlockT*>::const_iterator iter = endBlkSet.begin(),
-       iterEnd = endBlkSet.end();
-       iter != iterEnd; ++iter) {
-      BlockT *endBlk = *iter;
-      InstrT *contInstr = CFGTraits::getContinueInstr(endBlk);
-      if (contInstr) {
-        contInstr->eraseFromParent();
-      }
-      endBlk->addSuccessor(newBlk);
-      if (DEBUGME) {
-        errs() << "Add new continue Block to BB"
-               << endBlk->getNumber() << " successors\n";
-      }
+    // travel back up the basic block to see the last reference of our debug
+    // location we've just inserted that reference here so it should be
+    // representative insertEnd to ensure phi-moves, if exist, go before the
+    // continue-instr.
+    insertInstrEnd(ContingMBB, AMDGPU::CONTINUE,
+        getLastDebugLocInBB(ContingMBB));
   }
+}
 
-  return newBlk;
-} //relocateLoopcontBlock
-
-
-// LoopEndbranchBlock is a BB created by the CFGStructurizer to use as
-// LoopLandBlock. This BB branch on the loop endBranchInit register to the
-// pathes corresponding to the loop exiting branches.
-
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::addLoopEndbranchBlock(LoopT *loopRep,
-                                              BlockTSmallerVector &exitingBlks,
-                                              BlockTSmallerVector &exitBlks) {
-  const AMDGPUInstrInfo *tii =
-             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
-  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-
-  RegiT endBranchReg = static_cast<int>
-    (funcRep->getRegInfo().createVirtualRegister(I32RC));
-  assert(endBranchReg >= 0);
-
-  // reg = 0 before entering the loop
-  addLoopEndbranchInitReg(loopRep, endBranchReg);
-
-  uint32_t numBlks = static_cast<uint32_t>(exitingBlks.size());
-  assert(numBlks >=2 && numBlks == exitBlks.size());
-
-  BlockT *preExitingBlk = exitingBlks[0];
-  BlockT *preExitBlk = exitBlks[0];
-  BlockT *preBranchBlk = funcRep->CreateMachineBasicBlock();
-  funcRep->push_back(preBranchBlk);  //insert to function
-  SHOWNEWBLK(preBranchBlk, "New loopEndbranch block: ");
-
-  BlockT *newLandBlk = preBranchBlk;
+int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) {
+  int Cloned = 0;
+  assert(PreMBB->isSuccessor(SrcMBB));
+  while (SrcMBB && SrcMBB != DstMBB) {
+    assert(SrcMBB->succ_size() == 1);
+    if (SrcMBB->pred_size() > 1) {
+      SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB);
+      ++Cloned;
+    }
 
-      CFGTraits::replaceInstrUseOfBlockWith(preExitingBlk, preExitBlk,
-        newLandBlk);
-  preExitingBlk->removeSuccessor(preExitBlk);
-  preExitingBlk->addSuccessor(newLandBlk);
+    PreMBB = SrcMBB;
+    SrcMBB = *SrcMBB->succ_begin();
+  }
 
-  //it is redundant to add reg = 0 to exitingBlks[0]
+  return Cloned;
+}
 
-  // For 1..n th exiting path (the last iteration handles two pathes) create the
-  // branch to the previous path and the current path.
-  for (uint32_t i = 1; i < numBlks; ++i) {
-    BlockT *curExitingBlk = exitingBlks[i];
-    BlockT *curExitBlk = exitBlks[i];
-    BlockT *curBranchBlk;
+MachineBasicBlock *
+AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
+    MachineBasicBlock *PredMBB) {
+  assert(PredMBB->isSuccessor(MBB) &&
+         "succBlk is not a prececessor of curBlk");
 
-    if (i == numBlks - 1) {
-      curBranchBlk = curExitBlk;
-    } else {
-      curBranchBlk = funcRep->CreateMachineBasicBlock();
-      funcRep->push_back(curBranchBlk);  //insert to function
-      SHOWNEWBLK(curBranchBlk, "New loopEndbranch block: ");
-    }
+  MachineBasicBlock *CloneMBB = clone(MBB);  //clone instructions
+  replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
+  //srcBlk, oldBlk, newBlk
 
-    // Add reg = i to exitingBlks[i].
-    CFGTraits::insertAssignInstrBefore(curExitingBlk, passRep,
-                                       endBranchReg, i);
-
-    // Remove the edge (exitingBlks[i] exitBlks[i]) add new edge
-    // (exitingBlks[i], newLandBlk).
-    CFGTraits::replaceInstrUseOfBlockWith(curExitingBlk, curExitBlk,
-                                          newLandBlk);
-    curExitingBlk->removeSuccessor(curExitBlk);
-    curExitingBlk->addSuccessor(newLandBlk);
-
-    // add to preBranchBlk the branch instruction:
-    // if (endBranchReg == preVal)
-    //    preExitBlk
-    // else
-    //    curBranchBlk
-    //
-    // preValReg = i - 1
+  PredMBB->removeSuccessor(MBB);
+  PredMBB->addSuccessor(CloneMBB);
 
-  DebugLoc DL;
-  RegiT preValReg = static_cast<int>
-    (funcRep->getRegInfo().createVirtualRegister(I32RC));
+  // add all successor to cloneBlk
+  cloneSuccessorList(CloneMBB, MBB);
 
-  preBranchBlk->insert(preBranchBlk->begin(),
-                       tii->getMovImmInstr(preBranchBlk->getParent(), preValReg,
-                       i - 1));
+  numClonedInstr += MBB->size();
 
-  // condResReg = (endBranchReg == preValReg)
-    RegiT condResReg = static_cast<int>
-      (funcRep->getRegInfo().createVirtualRegister(I32RC));
-    BuildMI(preBranchBlk, DL, tii->get(tii->getIEQOpcode()), condResReg)
-      .addReg(endBranchReg).addReg(preValReg);
+  DEBUG(
+    dbgs() << "Cloned block: " << "BB"
+           << MBB->getNumber() << "size " << MBB->size() << "\n";
+  );
 
-    BuildMI(preBranchBlk, DL, tii->get(AMDGPU::BRANCH_COND_i32))
-      .addMBB(preExitBlk).addReg(condResReg);
+  SHOWNEWBLK(CloneMBB, "result of Cloned block: ");
 
-    preBranchBlk->addSuccessor(preExitBlk);
-    preBranchBlk->addSuccessor(curBranchBlk);
+  return CloneMBB;
+}
 
-    // Update preExitingBlk, preExitBlk, preBranchBlk.
-    preExitingBlk = curExitingBlk;
-    preExitBlk = curExitBlk;
-    preBranchBlk = curBranchBlk;
+void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
+    MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) {
+  MachineBasicBlock::iterator SpliceEnd;
+  //look for the input branchinstr, not the AMDGPU branchinstr
+  MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
+  if (!BranchMI) {
+    DEBUG(
+      dbgs() << "migrateInstruction don't see branch instr\n" ;
+    );
+    SpliceEnd = SrcMBB->end();
+  } else {
+    DEBUG(
+      dbgs() << "migrateInstruction see branch instr\n" ;
+      BranchMI->dump();
+    );
+    SpliceEnd = BranchMI;
+  }
+  DEBUG(
+    dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size()
+      << "srcSize = " << SrcMBB->size() << "\n";
+  );
 
-  }  //end for 1 .. n blocks
+  //splice insert before insertPos
+  DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);
 
-  return newLandBlk;
-} //addLoopEndbranchBlock
+  DEBUG(
+    dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
+      << "srcSize = " << SrcMBB->size() << "\n";
+  );
+}
 
-template<class PassT>
-typename CFGStructurizer<PassT>::PathToKind
-CFGStructurizer<PassT>::singlePathTo(BlockT *srcBlk, BlockT *dstBlk,
-                                     bool allowSideEntry) {
-  assert(dstBlk);
+MachineBasicBlock *
+AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
+  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+  MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch();
+  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
 
-  if (srcBlk == dstBlk) {
-    return SinglePath_InPath;
-  }
+  if (!LoopHeader || !LoopLatch)
+    return NULL;
+  MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch);
+  // Is LoopRep an infinite loop ?
+  if (!BranchMI || !isUncondBranch(BranchMI))
+    return NULL;
 
-  while (srcBlk && srcBlk->succ_size() == 1) {
-    srcBlk = *srcBlk->succ_begin();
-    if (srcBlk == dstBlk) {
-      return SinglePath_InPath;
-    }
+  MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
+  FuncRep->push_back(DummyExitBlk);  //insert to function
+  SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
+  DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
+  MachineBasicBlock::iterator I = BranchMI;
+  unsigned ImmReg = FuncRep->getRegInfo().createVirtualRegister(I32RC);
+  llvm_unreachable("Extra register needed to handle CFG");
+  MachineInstr *NewMI = insertInstrBefore(I, AMDGPU::BRANCH_COND_i32);
+  MachineInstrBuilder MIB(*FuncRep, NewMI);
+  MIB.addMBB(LoopHeader);
+  MIB.addReg(ImmReg, false);
+  SHOWNEWINSTR(NewMI);
+  BranchMI->eraseFromParent();
+  LoopLatch->addSuccessor(DummyExitBlk);
+
+  return DummyExitBlk;
+}
 
-    if (!allowSideEntry && srcBlk->pred_size() > 1) {
-      return Not_SinglePath;
-    }
-  }
+void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
+  MachineInstr *BranchMI;
 
-  if (srcBlk && srcBlk->succ_size()==0) {
-    return SinglePath_NotInPath;
+  // I saw two unconditional branch in one basic block in example
+  // test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
+  while ((BranchMI = getLoopendBlockBranchInstr(MBB))
+          && isUncondBranch(BranchMI)) {
+    DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump(););
+    BranchMI->eraseFromParent();
   }
+}
 
-  return Not_SinglePath;
-} //singlePathTo
-
-// If there is a single path from srcBlk to dstBlk, return the last block before
-// dstBlk If there is a single path from srcBlk->end without dstBlk, return the
-// last block in the path Otherwise, return NULL
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::singlePathEnd(BlockT *srcBlk, BlockT *dstBlk,
-                                      bool allowSideEntry) {
-  assert(dstBlk);
-
-  if (srcBlk == dstBlk) {
-    return srcBlk;
-  }
+void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
+    MachineBasicBlock *MBB) {
+  if (MBB->succ_size() != 2)
+    return;
+  MachineBasicBlock *MBB1 = *MBB->succ_begin();
+  MachineBasicBlock *MBB2 = *llvm::next(MBB->succ_begin());
+  if (MBB1 != MBB2)
+    return;
+
+  MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
+  assert(BranchMI && isCondBranch(BranchMI));
+  DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump(););
+  BranchMI->eraseFromParent();
+  SHOWNEWBLK(MBB1, "Removing redundant successor");
+  MBB->removeSuccessor(MBB1);
+}
 
-  if (srcBlk->succ_size() == 0) {
-    return srcBlk;
+void AMDGPUCFGStructurizer::addDummyExitBlock(
+    SmallVectorImpl<MachineBasicBlock*> &RetMBB) {
+  MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
+  FuncRep->push_back(DummyExitBlk);  //insert to function
+  insertInstrEnd(DummyExitBlk, AMDGPU::RETURN);
+
+  for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(),
+       E = RetMBB.end(); It != E; ++It) {
+    MachineBasicBlock *MBB = *It;
+    MachineInstr *MI = getReturnInstr(MBB);
+    if (MI)
+      MI->eraseFromParent();
+    MBB->addSuccessor(DummyExitBlk);
+    DEBUG(
+      dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber()
+             << " successors\n";
+    );
   }
+  SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: ");
+}
 
-  while (srcBlk && srcBlk->succ_size() == 1) {
-    BlockT *preBlk = srcBlk;
-
-    srcBlk = *srcBlk->succ_begin();
-    if (srcBlk == NULL) {
-      return preBlk;
-    }
+void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) {
+  while (MBB->succ_size())
+    MBB->removeSuccessor(*MBB->succ_begin());
+}
 
-    if (!allowSideEntry && srcBlk->pred_size() > 1) {
-      return NULL;
-    }
-  }
+void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB,
+    int SccNum) {
+  BlockInformation *&srcBlkInfo = BlockInfoMap[MBB];
+  if (!srcBlkInfo)
+    srcBlkInfo = new BlockInformation();
+  srcBlkInfo->SccNum = SccNum;
+}
 
-  if (srcBlk && srcBlk->succ_size()==0) {
-    return srcBlk;
-  }
+void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
+  DEBUG(
+        dbgs() << "Retiring BB" << MBB->getNumber() << "\n";
+  );
 
-  return NULL;
+  BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB];
 
-} //singlePathEnd
-
-template<class PassT>
-int CFGStructurizer<PassT>::cloneOnSideEntryTo(BlockT *preBlk, BlockT *srcBlk,
-                                               BlockT *dstBlk) {
-  int cloned = 0;
-  assert(preBlk->isSuccessor(srcBlk));
-  while (srcBlk && srcBlk != dstBlk) {
-    assert(srcBlk->succ_size() == 1);
-    if (srcBlk->pred_size() > 1) {
-      srcBlk = cloneBlockForPredecessor(srcBlk, preBlk);
-      ++cloned;
-    }
+  if (!SrcBlkInfo)
+    SrcBlkInfo = new BlockInformation();
 
-    preBlk = srcBlk;
-    srcBlk = *srcBlk->succ_begin();
-  }
+  SrcBlkInfo->IsRetired = true;
+  assert(MBB->succ_size() == 0 && MBB->pred_size() == 0
+         && "can't retire block yet");
+}
 
-  return cloned;
-} //cloneOnSideEntryTo
+void AMDGPUCFGStructurizer::setLoopLandBlock(MachineLoop *loopRep,
+    MachineBasicBlock *MBB) {
+  MachineBasicBlock *&TheEntry = LLInfoMap[loopRep];
+  if (!MBB) {
+    MBB = FuncRep->CreateMachineBasicBlock();
+    FuncRep->push_back(MBB);  //insert to function
+    SHOWNEWBLK(MBB, "DummyLandingBlock for loop without break: ");
+  }
+  TheEntry = MBB;
+  DEBUG(
+    dbgs() << "setLoopLandBlock loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  landing-block = BB" << MBB->getNumber() << "\n";
+  );
+}
 
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::cloneBlockForPredecessor(BlockT *curBlk,
-                                                 BlockT *predBlk) {
-  assert(predBlk->isSuccessor(curBlk) &&
-         "succBlk is not a prececessor of curBlk");
+MachineBasicBlock *
+AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1,
+    MachineBasicBlock *MBB2) {
 
-  BlockT *cloneBlk = CFGTraits::clone(curBlk);  //clone instructions
-  CFGTraits::replaceInstrUseOfBlockWith(predBlk, curBlk, cloneBlk);
-  //srcBlk, oldBlk, newBlk
+  if (PDT->dominates(MBB1, MBB2))
+    return MBB1;
+  if (PDT->dominates(MBB2, MBB1))
+    return MBB2;
 
-  predBlk->removeSuccessor(curBlk);
-  predBlk->addSuccessor(cloneBlk);
+  MachineDomTreeNode *Node1 = PDT->getNode(MBB1);
+  MachineDomTreeNode *Node2 = PDT->getNode(MBB2);
 
-  // add all successor to cloneBlk
-  CFGTraits::cloneSuccessorList(cloneBlk, curBlk);
+  // Handle newly cloned node.
+  if (!Node1 && MBB1->succ_size() == 1)
+    return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2);
+  if (!Node2 && MBB2->succ_size() == 1)
+    return findNearestCommonPostDom(MBB1, *MBB2->succ_begin());
 
-  numClonedInstr += curBlk->size();
+  if (!Node1 || !Node2)
+    return NULL;
 
-  if (DEBUGME) {
-    errs() << "Cloned block: " << "BB"
-           << curBlk->getNumber() << "size " << curBlk->size() << "\n";
+  Node1 = Node1->getIDom();
+  while (Node1) {
+    if (PDT->dominates(Node1, Node2))
+      return Node1->getBlock();
+    Node1 = Node1->getIDom();
   }
 
-  SHOWNEWBLK(cloneBlk, "result of Cloned block: ");
-
-  return cloneBlk;
-} //cloneBlockForPredecessor
+  return NULL;
+}
 
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::exitingBlock2ExitBlock(LoopT *loopRep,
-                                               BlockT *exitingBlk) {
-  BlockT *exitBlk = NULL;
+MachineBasicBlock *
+AMDGPUCFGStructurizer::findNearestCommonPostDom(
+    std::set<MachineBasicBlock *> &MBBs) {
+  MachineBasicBlock *CommonDom;
+  std::set<MachineBasicBlock *>::const_iterator It = MBBs.begin();
+  std::set<MachineBasicBlock *>::const_iterator E = MBBs.end();
+  for (CommonDom = *It; It != E && CommonDom; ++It) {
+    MachineBasicBlock *MBB = *It;
+    if (MBB != CommonDom)
+      CommonDom = findNearestCommonPostDom(MBB, CommonDom);
+  }
+
+  DEBUG(
+    dbgs() << "Common post dominator for exit blocks is ";
+    if (CommonDom)
+          dbgs() << "BB" << CommonDom->getNumber() << "\n";
+    else
+      dbgs() << "NULL\n";
+  );
+
+  return CommonDom;
+}
 
-  for (typename BlockT::succ_iterator iterSucc = exitingBlk->succ_begin(),
-       iterSuccEnd = exitingBlk->succ_end();
-       iterSucc != iterSuccEnd; ++iterSucc) {
-    BlockT *curBlk = *iterSucc;
-    if (!loopRep->contains(curBlk)) {
-      assert(exitBlk == NULL);
-      exitBlk = curBlk;
-    }
-  }
+char AMDGPUCFGStructurizer::ID = 0;
 
-  assert(exitBlk != NULL);
+} // end anonymous namespace
 
-  return exitBlk;
-} //exitingBlock2ExitBlock
 
-template<class PassT>
-void CFGStructurizer<PassT>::migrateInstruction(BlockT *srcBlk,
-                                                BlockT *dstBlk,
-                                                InstrIterator insertPos) {
-  InstrIterator spliceEnd;
-  //look for the input branchinstr, not the AMDGPU branchinstr
-  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
-  if (branchInstr == NULL) {
-    if (DEBUGME) {
-      errs() << "migrateInstruction don't see branch instr\n" ;
-    }
-    spliceEnd = srcBlk->end();
-  } else {
-    if (DEBUGME) {
-      errs() << "migrateInstruction see branch instr\n" ;
-      branchInstr->dump();
-    }
-    spliceEnd = CFGTraits::getInstrPos(srcBlk, branchInstr);
-  }
-  if (DEBUGME) {
-    errs() << "migrateInstruction before splice dstSize = " << dstBlk->size()
-      << "srcSize = " << srcBlk->size() << "\n";
-  }
-
-  //splice insert before insertPos
-  dstBlk->splice(insertPos, srcBlk, srcBlk->begin(), spliceEnd);
-
-  if (DEBUGME) {
-    errs() << "migrateInstruction after splice dstSize = " << dstBlk->size()
-      << "srcSize = " << srcBlk->size() << "\n";
-  }
-} //migrateInstruction
-
-// normalizeInfiniteLoopExit change
-//   B1:
-//        uncond_br LoopHeader
-//
-// to
-//   B1:
-//        cond_br 1 LoopHeader dummyExit
-// and return the newly added dummy exit block
-// 
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::normalizeInfiniteLoopExit(LoopT* LoopRep) {
-  BlockT *loopHeader;
-  BlockT *loopLatch;
-  loopHeader = LoopRep->getHeader();
-  loopLatch = LoopRep->getLoopLatch();
-  BlockT *dummyExitBlk = NULL;
-  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-  if (loopHeader!=NULL && loopLatch!=NULL) {
-    InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(loopLatch);
-    if (branchInstr!=NULL && CFGTraits::isUncondBranch(branchInstr)) {
-      dummyExitBlk = funcRep->CreateMachineBasicBlock();
-      funcRep->push_back(dummyExitBlk);  //insert to function
-      SHOWNEWBLK(dummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
-
-      if (DEBUGME) errs() << "Old branch instr: " << *branchInstr << "\n";
-
-      typename BlockT::iterator insertPos =
-        CFGTraits::getInstrPos(loopLatch, branchInstr);
-      unsigned immReg =
-        funcRep->getRegInfo().createVirtualRegister(I32RC);
-      CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 1);
-      InstrT *newInstr = 
-        CFGTraits::insertInstrBefore(insertPos, AMDGPU::BRANCH_COND_i32, passRep);
-      MachineInstrBuilder MIB(*funcRep, newInstr);
-      MIB.addMBB(loopHeader);
-      MIB.addReg(immReg, false);
-
-      SHOWNEWINSTR(newInstr);
-
-      branchInstr->eraseFromParent();
-      loopLatch->addSuccessor(dummyExitBlk);
-    }
-  }
-
-  return dummyExitBlk;
-} //normalizeInfiniteLoopExit
-
-template<class PassT>
-void CFGStructurizer<PassT>::removeUnconditionalBranch(BlockT *srcBlk) {
-  InstrT *branchInstr;
-
-  // I saw two unconditional branch in one basic block in example
-  // test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
-  while ((branchInstr = CFGTraits::getLoopendBlockBranchInstr(srcBlk))
-          && CFGTraits::isUncondBranch(branchInstr)) {
-    if (DEBUGME) {
-          errs() << "Removing unconditional branch instruction" ;
-      branchInstr->dump();
-    }
-    branchInstr->eraseFromParent();
-  }
-} //removeUnconditionalBranch
-
-template<class PassT>
-void CFGStructurizer<PassT>::removeRedundantConditionalBranch(BlockT *srcBlk) {
-  if (srcBlk->succ_size() == 2) {
-    BlockT *blk1 = *srcBlk->succ_begin();
-    BlockT *blk2 = *(++srcBlk->succ_begin());
-
-    if (blk1 == blk2) {
-      InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
-      assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
-      if (DEBUGME) {
-        errs() << "Removing unneeded conditional branch instruction" ;
-        branchInstr->dump();
-      }
-      branchInstr->eraseFromParent();
-      SHOWNEWBLK(blk1, "Removing redundant successor");
-      srcBlk->removeSuccessor(blk1);
-    }
-  }
-} //removeRedundantConditionalBranch
-
-template<class PassT>
-void CFGStructurizer<PassT>::addDummyExitBlock(SmallVector<BlockT*,
-                                               DEFAULT_VEC_SLOTS> &retBlks) {
-  BlockT *dummyExitBlk = funcRep->CreateMachineBasicBlock();
-  funcRep->push_back(dummyExitBlk);  //insert to function
-  CFGTraits::insertInstrEnd(dummyExitBlk, AMDGPU::RETURN, passRep);
-
-  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator iter =
-         retBlks.begin(),
-       iterEnd = retBlks.end(); iter != iterEnd; ++iter) {
-    BlockT *curBlk = *iter;
-    InstrT *curInstr = CFGTraits::getReturnInstr(curBlk);
-    if (curInstr) {
-      curInstr->eraseFromParent();
-    }
-    curBlk->addSuccessor(dummyExitBlk);
-    if (DEBUGME) {
-      errs() << "Add dummyExitBlock to BB" << curBlk->getNumber()
-             << " successors\n";
-    }
-  } //for
-
-  SHOWNEWBLK(dummyExitBlk, "DummyExitBlock: ");
-} //addDummyExitBlock
-
-template<class PassT>
-void CFGStructurizer<PassT>::removeSuccessor(BlockT *srcBlk) {
-  while (srcBlk->succ_size()) {
-    srcBlk->removeSuccessor(*srcBlk->succ_begin());
-  }
-}
-
-template<class PassT>
-void CFGStructurizer<PassT>::recordSccnum(BlockT *srcBlk, int sccNum) {
-  BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
-
-  if (srcBlkInfo == NULL) {
-    srcBlkInfo = new BlockInfo();
-  }
-
-  srcBlkInfo->sccNum = sccNum;
-}
-
-template<class PassT>
-int CFGStructurizer<PassT>::getSCCNum(BlockT *srcBlk) {
-  BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
-  return srcBlkInfo ? srcBlkInfo->sccNum : INVALIDSCCNUM;
-}
-
-template<class PassT>
-void CFGStructurizer<PassT>::retireBlock(BlockT *dstBlk, BlockT *srcBlk) {
-  if (DEBUGME) {
-        errs() << "Retiring BB" << srcBlk->getNumber() << "\n";
-  }
-
-  BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
-
-  if (srcBlkInfo == NULL) {
-    srcBlkInfo = new BlockInfo();
-  }
-
-  srcBlkInfo->isRetired = true;
-  assert(srcBlk->succ_size() == 0 && srcBlk->pred_size() == 0
-         && "can't retire block yet");
-}
-
-template<class PassT>
-bool CFGStructurizer<PassT>::isRetiredBlock(BlockT *srcBlk) {
-  BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
-  return (srcBlkInfo && srcBlkInfo->isRetired);
-}
-
-template<class PassT>
-bool CFGStructurizer<PassT>::isActiveLoophead(BlockT *curBlk) {
-  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
-  while (loopRep && loopRep->getHeader() == curBlk) {
-    LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
-
-    if(loopLand == NULL)
-      return true;
-
-    BlockT *landBlk = loopLand->landBlk;
-    assert(landBlk);
-    if (!isRetiredBlock(landBlk)) {
-      return true;
-    }
-
-    loopRep = loopRep->getParentLoop();
-  }
-
-  return false;
-} //isActiveLoophead
-
-template<class PassT>
-bool CFGStructurizer<PassT>::needMigrateBlock(BlockT *blk) {
-  const unsigned blockSizeThreshold = 30;
-  const unsigned cloneInstrThreshold = 100;
-
-  bool multiplePreds = blk && (blk->pred_size() > 1);
-
-  if(!multiplePreds)
-    return false;
-
-  unsigned blkSize = blk->size();
-  return ((blkSize > blockSizeThreshold)
-          && (blkSize * (blk->pred_size() - 1) > cloneInstrThreshold));
-} //needMigrateBlock
-
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::recordLoopLandBlock(LoopT *loopRep, BlockT *landBlk,
-                                            BlockTSmallerVector &exitBlks,
-                                            std::set<BlockT *> &exitBlkSet) {
-  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> inpathBlks;  //in exit path blocks
-
-  for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
-       predIterEnd = landBlk->pred_end();
-       predIter != predIterEnd; ++predIter) {
-    BlockT *curBlk = *predIter;
-    if (loopRep->contains(curBlk) || exitBlkSet.count(curBlk)) {
-      inpathBlks.push_back(curBlk);
-    }
-  } //for
-
-  //if landBlk has predecessors that are not in the given loop,
-  //create a new block
-  BlockT *newLandBlk = landBlk;
-  if (inpathBlks.size() != landBlk->pred_size()) {
-    newLandBlk = funcRep->CreateMachineBasicBlock();
-    funcRep->push_back(newLandBlk);  //insert to function
-    newLandBlk->addSuccessor(landBlk);
-    for (typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::iterator iter =
-         inpathBlks.begin(),
-         iterEnd = inpathBlks.end(); iter != iterEnd; ++iter) {
-      BlockT *curBlk = *iter;
-      CFGTraits::replaceInstrUseOfBlockWith(curBlk, landBlk, newLandBlk);
-      //srcBlk, oldBlk, newBlk
-      curBlk->removeSuccessor(landBlk);
-      curBlk->addSuccessor(newLandBlk);
-    }
-    for (size_t i = 0, tot = exitBlks.size(); i < tot; ++i) {
-      if (exitBlks[i] == landBlk) {
-        exitBlks[i] = newLandBlk;
-      }
-    }
-    SHOWNEWBLK(newLandBlk, "NewLandingBlock: ");
-  }
-
-  setLoopLandBlock(loopRep, newLandBlk);
-
-  return newLandBlk;
-} // recordLoopbreakLand
-
-template<class PassT>
-void CFGStructurizer<PassT>::setLoopLandBlock(LoopT *loopRep, BlockT *blk) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-  assert(theEntry->landBlk == NULL);
-
-  if (blk == NULL) {
-    blk = funcRep->CreateMachineBasicBlock();
-    funcRep->push_back(blk);  //insert to function
-    SHOWNEWBLK(blk, "DummyLandingBlock for loop without break: ");
-  }
-
-  theEntry->landBlk = blk;
-
-  if (DEBUGME) {
-    errs() << "setLoopLandBlock loop-header = BB"
-           << loopRep->getHeader()->getNumber()
-           << "  landing-block = BB" << blk->getNumber() << "\n";
-  }
-} // setLoopLandBlock
-
-template<class PassT>
-void CFGStructurizer<PassT>::addLoopBreakOnReg(LoopT *loopRep, RegiT regNum) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-
-  theEntry->breakOnRegs.insert(regNum);
-
-  if (DEBUGME) {
-    errs() << "addLoopBreakOnReg loop-header = BB"
-           << loopRep->getHeader()->getNumber()
-           << "  regNum = " << regNum << "\n";
-  }
-} // addLoopBreakOnReg
-
-template<class PassT>
-void CFGStructurizer<PassT>::addLoopContOnReg(LoopT *loopRep, RegiT regNum) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-  theEntry->contOnRegs.insert(regNum);
-
-  if (DEBUGME) {
-    errs() << "addLoopContOnReg loop-header = BB"
-           << loopRep->getHeader()->getNumber()
-           << "  regNum = " << regNum << "\n";
-  }
-} // addLoopContOnReg
-
-template<class PassT>
-void CFGStructurizer<PassT>::addLoopBreakInitReg(LoopT *loopRep, RegiT regNum) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-  theEntry->breakInitRegs.insert(regNum);
-
-  if (DEBUGME) {
-    errs() << "addLoopBreakInitReg loop-header = BB"
-           << loopRep->getHeader()->getNumber()
-           << "  regNum = " << regNum << "\n";
-  }
-} // addLoopBreakInitReg
-
-template<class PassT>
-void CFGStructurizer<PassT>::addLoopContInitReg(LoopT *loopRep, RegiT regNum) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-  theEntry->contInitRegs.insert(regNum);
-
-  if (DEBUGME) {
-    errs() << "addLoopContInitReg loop-header = BB"
-           << loopRep->getHeader()->getNumber()
-           << "  regNum = " << regNum << "\n";
-  }
-} // addLoopContInitReg
-
-template<class PassT>
-void CFGStructurizer<PassT>::addLoopEndbranchInitReg(LoopT *loopRep,
-                                                     RegiT regNum) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-  theEntry->endbranchInitRegs.insert(regNum);
-
-  if (DEBUGME) {
-        errs() << "addLoopEndbranchInitReg loop-header = BB"
-      << loopRep->getHeader()->getNumber()
-      << "  regNum = " << regNum << "\n";
-  }
-} // addLoopEndbranchInitReg
-
-template<class PassT>
-typename CFGStructurizer<PassT>::LoopLandInfo *
-CFGStructurizer<PassT>::getLoopLandInfo(LoopT *loopRep) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  return theEntry;
-} // getLoopLandInfo
-
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::getLoopLandBlock(LoopT *loopRep) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  return theEntry ? theEntry->landBlk : NULL;
-} // getLoopLandBlock
-
-
-template<class PassT>
-bool CFGStructurizer<PassT>::hasBackEdge(BlockT *curBlk) {
-  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
-  if (loopRep == NULL)
-    return false;
-
-  BlockT *loopHeader = loopRep->getHeader();
-
-  return curBlk->isSuccessor(loopHeader);
-
-} //hasBackEdge
-
-template<class PassT>
-unsigned CFGStructurizer<PassT>::getLoopDepth(LoopT *loopRep) {
-  return loopRep ? loopRep->getLoopDepth() : 0;
-} //getLoopDepth
-
-template<class PassT>
-int CFGStructurizer<PassT>::countActiveBlock
-(typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterStart,
- typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterEnd) {
-  int count = 0;
-  while (iterStart != iterEnd) {
-    if (!isRetiredBlock(*iterStart)) {
-      ++count;
-    }
-    ++iterStart;
-  }
-
-  return count;
-} //countActiveBlock
-
-// This is work around solution for findNearestCommonDominator not avaiable to
-// post dom a proper fix should go to Dominators.h.
-
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT*
-CFGStructurizer<PassT>::findNearestCommonPostDom(BlockT *blk1, BlockT *blk2) {
-
-  if (postDomTree->dominates(blk1, blk2)) {
-    return blk1;
-  }
-  if (postDomTree->dominates(blk2, blk1)) {
-    return blk2;
-  }
-
-  DomTreeNodeT *node1 = postDomTree->getNode(blk1);
-  DomTreeNodeT *node2 = postDomTree->getNode(blk2);
-
-  // Handle newly cloned node.
-  if (node1 == NULL && blk1->succ_size() == 1) {
-    return findNearestCommonPostDom(*blk1->succ_begin(), blk2);
-  }
-  if (node2 == NULL && blk2->succ_size() == 1) {
-    return findNearestCommonPostDom(blk1, *blk2->succ_begin());
-  }
-
-  if (node1 == NULL || node2 == NULL) {
-    return NULL;
-  }
-
-  node1 = node1->getIDom();
-  while (node1) {
-    if (postDomTree->dominates(node1, node2)) {
-      return node1->getBlock();
-    }
-    node1 = node1->getIDom();
-  }
-
-  return NULL;
-}
-
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::findNearestCommonPostDom
-(typename std::set<BlockT *> &blks) {
-  BlockT *commonDom;
-  typename std::set<BlockT *>::const_iterator iter = blks.begin();
-  typename std::set<BlockT *>::const_iterator iterEnd = blks.end();
-  for (commonDom = *iter; iter != iterEnd && commonDom != NULL; ++iter) {
-    BlockT *curBlk = *iter;
-    if (curBlk != commonDom) {
-      commonDom = findNearestCommonPostDom(curBlk, commonDom);
-    }
-  }
-
-  if (DEBUGME) {
-    errs() << "Common post dominator for exit blocks is ";
-    if (commonDom) {
-          errs() << "BB" << commonDom->getNumber() << "\n";
-    } else {
-      errs() << "NULL\n";
-    }
-  }
-
-  return commonDom;
-} //findNearestCommonPostDom
-
-} // end anonymous namespace
-
-//todo: move-end
-
-
-//===----------------------------------------------------------------------===//
-//
-// CFGStructurizer for AMDGPU
-//
-//===----------------------------------------------------------------------===//
-
-
-namespace {
-class AMDGPUCFGStructurizer : public MachineFunctionPass {
-public:
-  typedef MachineInstr              InstructionType;
-  typedef MachineFunction           FunctionType;
-  typedef MachineBasicBlock         BlockType;
-  typedef MachineLoopInfo           LoopinfoType;
-  typedef MachineDominatorTree      DominatortreeType;
-  typedef MachinePostDominatorTree  PostDominatortreeType;
-  typedef MachineDomTreeNode        DomTreeNodeType;
-  typedef MachineLoop               LoopType;
-
-protected:
-  TargetMachine &TM;
-
-public:
-  AMDGPUCFGStructurizer(char &pid, TargetMachine &tm);
-  const TargetInstrInfo *getTargetInstrInfo() const;
-  const AMDGPURegisterInfo *getTargetRegisterInfo() const;
-};
-
-} // end anonymous namespace
-AMDGPUCFGStructurizer::AMDGPUCFGStructurizer(char &pid, TargetMachine &tm)
-  : MachineFunctionPass(pid), TM(tm) {
-}
-
-const TargetInstrInfo *AMDGPUCFGStructurizer::getTargetInstrInfo() const {
-  return TM.getInstrInfo();
-}
-
-const AMDGPURegisterInfo *AMDGPUCFGStructurizer::getTargetRegisterInfo() const {
-  return static_cast<const AMDGPURegisterInfo *>(TM.getRegisterInfo());
-}
-
-//===----------------------------------------------------------------------===//
-//
-// CFGPrepare
-//
-//===----------------------------------------------------------------------===//
-
-
-namespace {
-class AMDGPUCFGPrepare : public AMDGPUCFGStructurizer {
-public:
-  static char ID;
-
-public:
-  AMDGPUCFGPrepare(TargetMachine &tm);
-
-  virtual const char *getPassName() const;
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
-
-  bool runOnMachineFunction(MachineFunction &F);
-};
-
-char AMDGPUCFGPrepare::ID = 0;
-} // end anonymous namespace
-
-AMDGPUCFGPrepare::AMDGPUCFGPrepare(TargetMachine &tm)
-  : AMDGPUCFGStructurizer(ID, tm )  {
-}
-const char *AMDGPUCFGPrepare::getPassName() const {
-  return "AMD IL Control Flow Graph Preparation Pass";
-}
-
-void AMDGPUCFGPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addPreserved<MachineFunctionAnalysis>();
-  AU.addRequired<MachineFunctionAnalysis>();
-  AU.addRequired<MachineDominatorTree>();
-  AU.addRequired<MachinePostDominatorTree>();
-  AU.addRequired<MachineLoopInfo>();
-}
-
-//===----------------------------------------------------------------------===//
-//
-// CFGPerform
-//
-//===----------------------------------------------------------------------===//
-
-
-namespace {
-class AMDGPUCFGPerform : public AMDGPUCFGStructurizer {
-public:
-  static char ID;
-
-public:
-  AMDGPUCFGPerform(TargetMachine &tm);
-  virtual const char *getPassName() const;
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
-  bool runOnMachineFunction(MachineFunction &F);
-};
-
-char AMDGPUCFGPerform::ID = 0;
-} // end anonymous namespace
-
-  AMDGPUCFGPerform::AMDGPUCFGPerform(TargetMachine &tm)
-: AMDGPUCFGStructurizer(ID, tm) {
-}
-
-const char *AMDGPUCFGPerform::getPassName() const {
-  return "AMD IL Control Flow Graph structurizer Pass";
-}
-
-void AMDGPUCFGPerform::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addPreserved<MachineFunctionAnalysis>();
-  AU.addRequired<MachineFunctionAnalysis>();
-  AU.addRequired<MachineDominatorTree>();
-  AU.addRequired<MachinePostDominatorTree>();
-  AU.addRequired<MachineLoopInfo>();
-}
-
-//===----------------------------------------------------------------------===//
-//
-// CFGStructTraits<AMDGPUCFGStructurizer>
-//
-//===----------------------------------------------------------------------===//
-
-namespace {
-// this class is tailor to the AMDGPU backend
-template<>
-struct CFGStructTraits<AMDGPUCFGStructurizer> {
-  typedef int RegiT;
-
-  static int getBranchNzeroOpcode(int oldOpcode) {
-    switch(oldOpcode) {
-    case AMDGPU::JUMP_COND:
-    case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
-    case AMDGPU::BRANCH_COND_i32:
-    case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
-    default:
-      assert(0 && "internal error");
-    }
-    return -1;
-  }
-
-  static int getBranchZeroOpcode(int oldOpcode) {
-    switch(oldOpcode) {
-    case AMDGPU::JUMP_COND:
-    case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
-    case AMDGPU::BRANCH_COND_i32:
-    case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
-    default:
-      assert(0 && "internal error");
-    }
-    return -1;
-  }
-
-  static int getContinueNzeroOpcode(int oldOpcode) {
-    switch(oldOpcode) {
-    case AMDGPU::JUMP_COND:
-    case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
-    default:
-      assert(0 && "internal error");
-    };
-    return -1;
-  }
-
-  static int getContinueZeroOpcode(int oldOpcode) {
-    switch(oldOpcode) {
-    case AMDGPU::JUMP_COND:
-    case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
-    default:
-      assert(0 && "internal error");
-    }
-    return -1;
-  }
-
-  static MachineBasicBlock *getTrueBranch(MachineInstr *instr) {
-    return instr->getOperand(0).getMBB();
-  }
-
-  static void setTrueBranch(MachineInstr *instr, MachineBasicBlock *blk) {
-    instr->getOperand(0).setMBB(blk);
-  }
-
-  static MachineBasicBlock *
-  getFalseBranch(MachineBasicBlock *blk, MachineInstr *instr) {
-    assert(blk->succ_size() == 2);
-    MachineBasicBlock *trueBranch = getTrueBranch(instr);
-    MachineBasicBlock::succ_iterator iter = blk->succ_begin();
-    MachineBasicBlock::succ_iterator iterNext = iter;
-    ++iterNext;
-
-    return (*iter == trueBranch) ? *iterNext : *iter;
-  }
-
-  static bool isCondBranch(MachineInstr *instr) {
-    switch (instr->getOpcode()) {
-      case AMDGPU::JUMP_COND:
-      case AMDGPU::BRANCH_COND_i32:
-      case AMDGPU::BRANCH_COND_f32:
-      break;
-    default:
-      return false;
-    }
-    return true;
-  }
-
-  static bool isUncondBranch(MachineInstr *instr) {
-    switch (instr->getOpcode()) {
-    case AMDGPU::JUMP:
-    case AMDGPU::BRANCH:
-      return true;
-    default:
-      return false;
-    }
-    return true;
-  }
-
-  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *blk) {
-    //get DebugLoc from the first MachineBasicBlock instruction with debug info
-    DebugLoc DL;
-    for (MachineBasicBlock::iterator iter = blk->begin(); iter != blk->end(); ++iter) {
-      MachineInstr *instr = &(*iter);
-      if (instr->getDebugLoc().isUnknown() == false) {
-        DL = instr->getDebugLoc();
-      }
-    }
-    return DL;
-  }
-
-  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *blk) {
-    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
-    MachineInstr *instr = &*iter;
-    if (instr && (isCondBranch(instr) || isUncondBranch(instr))) {
-      return instr;
-    }
-    return NULL;
-  }
-
-  // The correct naming for this is getPossibleLoopendBlockBranchInstr.
-  //
-  // BB with backward-edge could have move instructions after the branch
-  // instruction.  Such move instruction "belong to" the loop backward-edge.
-  //
-  static MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *blk) {
-    const AMDGPUInstrInfo * TII = static_cast<const AMDGPUInstrInfo *>(
-                                  blk->getParent()->getTarget().getInstrInfo());
-
-    for (MachineBasicBlock::reverse_iterator iter = blk->rbegin(),
-         iterEnd = blk->rend(); iter != iterEnd; ++iter) {
-      // FIXME: Simplify
-      MachineInstr *instr = &*iter;
-      if (instr) {
-        if (isCondBranch(instr) || isUncondBranch(instr)) {
-          return instr;
-        } else if (!TII->isMov(instr->getOpcode())) {
-          break;
-        }
-      }
-    }
-    return NULL;
-  }
-
-  static MachineInstr *getReturnInstr(MachineBasicBlock *blk) {
-    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
-    if (iter != blk->rend()) {
-      MachineInstr *instr = &(*iter);
-      if (instr->getOpcode() == AMDGPU::RETURN) {
-        return instr;
-      }
-    }
-    return NULL;
-  }
-
-  static MachineInstr *getContinueInstr(MachineBasicBlock *blk) {
-    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
-    if (iter != blk->rend()) {
-      MachineInstr *instr = &(*iter);
-      if (instr->getOpcode() == AMDGPU::CONTINUE) {
-        return instr;
-      }
-    }
-    return NULL;
-  }
-
-  static MachineInstr *getLoopBreakInstr(MachineBasicBlock *blk) {
-    for (MachineBasicBlock::iterator iter = blk->begin(); (iter != blk->end()); ++iter) {
-      MachineInstr *instr = &(*iter);
-      if (instr->getOpcode() == AMDGPU::PREDICATED_BREAK) {
-        return instr;
-      }
-    }
-    return NULL;
-  }
-
-  static bool isReturnBlock(MachineBasicBlock *blk) {
-    MachineInstr *instr = getReturnInstr(blk);
-    bool isReturn = (blk->succ_size() == 0);
-    if (instr) {
-      assert(isReturn);
-    } else if (isReturn) {
-      if (DEBUGME) {
-        errs() << "BB" << blk->getNumber()
-               <<" is return block without RETURN instr\n";
-      }
-    }
-
-    return  isReturn;
-  }
-
-  static MachineBasicBlock::iterator
-  getInstrPos(MachineBasicBlock *blk, MachineInstr *instr) {
-    assert(instr->getParent() == blk && "instruction doesn't belong to block");
-    MachineBasicBlock::iterator iter = blk->begin();
-    MachineBasicBlock::iterator iterEnd = blk->end();
-    while (&(*iter) != instr && iter != iterEnd) {
-      ++iter;
-    }
-
-    assert(iter != iterEnd);
-    return iter;
-  }//getInstrPos
-
-  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
-                                         AMDGPUCFGStructurizer *passRep) {
-    return insertInstrBefore(blk,newOpcode,passRep,DebugLoc());
-  } //insertInstrBefore
-
-  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
-                                         AMDGPUCFGStructurizer *passRep, DebugLoc DL) {
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-    MachineInstr *newInstr =
-      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
-
-    MachineBasicBlock::iterator res;
-    if (blk->begin() != blk->end()) {
-      blk->insert(blk->begin(), newInstr);
-    } else {
-      blk->push_back(newInstr);
-    }
-
-    SHOWNEWINSTR(newInstr);
-
-    return newInstr;
-  } //insertInstrBefore
-
-  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
-                             AMDGPUCFGStructurizer *passRep) {
-    insertInstrEnd(blk,newOpcode,passRep,DebugLoc());
-  } //insertInstrEnd
-
-  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
-                             AMDGPUCFGStructurizer *passRep, DebugLoc DL) {
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-   MachineInstr *newInstr = blk->getParent()
-      ->CreateMachineInstr(tii->get(newOpcode), DL);
-
-    blk->push_back(newInstr);
-    //assume the instruction doesn't take any reg operand ...
-
-    SHOWNEWINSTR(newInstr);
-  } //insertInstrEnd
-
-  static MachineInstr *insertInstrBefore(MachineBasicBlock::iterator instrPos,
-                                         int newOpcode, 
-                                         AMDGPUCFGStructurizer *passRep) {
-    MachineInstr *oldInstr = &(*instrPos);
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-    MachineBasicBlock *blk = oldInstr->getParent();
-    MachineInstr *newInstr =
-      blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
-                                           DebugLoc());
-
-    blk->insert(instrPos, newInstr);
-    //assume the instruction doesn't take any reg operand ...
-
-    SHOWNEWINSTR(newInstr);
-    return newInstr;
-  } //insertInstrBefore
-
-  static void insertCondBranchBefore(MachineBasicBlock::iterator instrPos,
-                                     int newOpcode,
-                                     AMDGPUCFGStructurizer *passRep,
-                                     DebugLoc DL) {
-    MachineInstr *oldInstr = &(*instrPos);
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-    MachineBasicBlock *blk = oldInstr->getParent();
-    MachineFunction *MF = blk->getParent();
-    MachineInstr *newInstr = MF->CreateMachineInstr(tii->get(newOpcode), DL);
-
-    blk->insert(instrPos, newInstr);
-    MachineInstrBuilder MIB(*MF, newInstr);
-    MIB.addReg(oldInstr->getOperand(1).getReg(), false);
-
-    SHOWNEWINSTR(newInstr);
-    //erase later oldInstr->eraseFromParent();
-  } //insertCondBranchBefore
-
-  static void insertCondBranchBefore(MachineBasicBlock *blk,
-                                     MachineBasicBlock::iterator insertPos,
-                                     int newOpcode,
-                                     AMDGPUCFGStructurizer *passRep,
-                                     RegiT regNum,
-                                     DebugLoc DL) {
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-    MachineFunction *MF = blk->getParent();
-
-    MachineInstr *newInstr = MF->CreateMachineInstr(tii->get(newOpcode), DL);
-
-    //insert before
-    blk->insert(insertPos, newInstr);
-    MachineInstrBuilder(*MF, newInstr).addReg(regNum, false);
-
-    SHOWNEWINSTR(newInstr);
-  } //insertCondBranchBefore
-
-  static void insertCondBranchEnd(MachineBasicBlock *blk,
-                                  int newOpcode,
-                                  AMDGPUCFGStructurizer *passRep,
-                                  RegiT regNum) {
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-    MachineFunction *MF = blk->getParent();
-    MachineInstr *newInstr =
-      MF->CreateMachineInstr(tii->get(newOpcode), DebugLoc());
-
-    blk->push_back(newInstr);
-    MachineInstrBuilder(*MF, newInstr).addReg(regNum, false);
-
-    SHOWNEWINSTR(newInstr);
-  } //insertCondBranchEnd
-
-
-  static void insertAssignInstrBefore(MachineBasicBlock::iterator instrPos,
-                                      AMDGPUCFGStructurizer *passRep,
-                                      RegiT regNum, int regVal) {
-    MachineInstr *oldInstr = &(*instrPos);
-    const AMDGPUInstrInfo *tii =
-             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
-    MachineBasicBlock *blk = oldInstr->getParent();
-    MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
-                                                 regVal);
-    blk->insert(instrPos, newInstr);
-
-    SHOWNEWINSTR(newInstr);
-  } //insertAssignInstrBefore
-
-  static void insertAssignInstrBefore(MachineBasicBlock *blk,
-                                      AMDGPUCFGStructurizer *passRep,
-                                      RegiT regNum, int regVal) {
-    const AMDGPUInstrInfo *tii =
-             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
-
-    MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
-                                                 regVal);
-    if (blk->begin() != blk->end()) {
-      blk->insert(blk->begin(), newInstr);
-    } else {
-      blk->push_back(newInstr);
-    }
-
-    SHOWNEWINSTR(newInstr);
-
-  } //insertInstrBefore
-
-  static void insertCompareInstrBefore(MachineBasicBlock *blk,
-                                       MachineBasicBlock::iterator instrPos,
-                                       AMDGPUCFGStructurizer *passRep,
-                                       RegiT dstReg, RegiT src1Reg,
-                                       RegiT src2Reg) {
-    const AMDGPUInstrInfo *tii =
-             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
-    MachineFunction *MF = blk->getParent();
-    MachineInstr *newInstr =
-      MF->CreateMachineInstr(tii->get(tii->getIEQOpcode()), DebugLoc());
-
-    MachineInstrBuilder MIB(*MF, newInstr);
-    MIB.addReg(dstReg, RegState::Define); //set target
-    MIB.addReg(src1Reg); //set src value
-    MIB.addReg(src2Reg); //set src value
-
-    blk->insert(instrPos, newInstr);
-    SHOWNEWINSTR(newInstr);
-
-  } //insertCompareInstrBefore
-
-  static void cloneSuccessorList(MachineBasicBlock *dstBlk,
-                                 MachineBasicBlock *srcBlk) {
-    for (MachineBasicBlock::succ_iterator iter = srcBlk->succ_begin(),
-         iterEnd = srcBlk->succ_end(); iter != iterEnd; ++iter) {
-      dstBlk->addSuccessor(*iter);  // *iter's predecessor is also taken care of
-    }
-  } //cloneSuccessorList
-
-  static MachineBasicBlock *clone(MachineBasicBlock *srcBlk) {
-    MachineFunction *func = srcBlk->getParent();
-    MachineBasicBlock *newBlk = func->CreateMachineBasicBlock();
-    func->push_back(newBlk);  //insert to function
-    for (MachineBasicBlock::iterator iter = srcBlk->begin(),
-         iterEnd = srcBlk->end();
-         iter != iterEnd; ++iter) {
-      MachineInstr *instr = func->CloneMachineInstr(iter);
-      newBlk->push_back(instr);
-    }
-    return newBlk;
-  }
-
-  //MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose because
-  //the AMDGPU instruction is not recognized as terminator fix this and retire
-  //this routine
-  static void replaceInstrUseOfBlockWith(MachineBasicBlock *srcBlk,
-                                         MachineBasicBlock *oldBlk,
-                                         MachineBasicBlock *newBlk) {
-    MachineInstr *branchInstr = getLoopendBlockBranchInstr(srcBlk);
-    if (branchInstr && isCondBranch(branchInstr) &&
-        getTrueBranch(branchInstr) == oldBlk) {
-      setTrueBranch(branchInstr, newBlk);
-    }
-  }
-
-  static void wrapup(MachineBasicBlock *entryBlk) {
-    assert((!entryBlk->getParent()->getJumpTableInfo()
-            || entryBlk->getParent()->getJumpTableInfo()->isEmpty())
-           && "found a jump table");
-
-     //collect continue right before endloop
-     SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> contInstr;
-     MachineBasicBlock::iterator pre = entryBlk->begin();
-     MachineBasicBlock::iterator iterEnd = entryBlk->end();
-     MachineBasicBlock::iterator iter = pre;
-     while (iter != iterEnd) {
-       if (pre->getOpcode() == AMDGPU::CONTINUE
-           && iter->getOpcode() == AMDGPU::ENDLOOP) {
-         contInstr.push_back(pre);
-       }
-       pre = iter;
-       ++iter;
-     } //end while
-
-     //delete continue right before endloop
-     for (unsigned i = 0; i < contInstr.size(); ++i) {
-        contInstr[i]->eraseFromParent();
-     }
-
-     // TODO to fix up jump table so later phase won't be confused.  if
-     // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
-     // there isn't such an interface yet.  alternatively, replace all the other
-     // blocks in the jump table with the entryBlk //}
-
-  } //wrapup
-
-  static MachineDominatorTree *getDominatorTree(AMDGPUCFGStructurizer &pass) {
-    return &pass.getAnalysis<MachineDominatorTree>();
-  }
-
-  static MachinePostDominatorTree*
-  getPostDominatorTree(AMDGPUCFGStructurizer &pass) {
-    return &pass.getAnalysis<MachinePostDominatorTree>();
-  }
-
-  static MachineLoopInfo *getLoopInfo(AMDGPUCFGStructurizer &pass) {
-    return &pass.getAnalysis<MachineLoopInfo>();
-  }
-}; // template class CFGStructTraits
-} // end anonymous namespace
-
-// createAMDGPUCFGPreparationPass- Returns a pass
-FunctionPass *llvm::createAMDGPUCFGPreparationPass(TargetMachine &tm) {
-  return new AMDGPUCFGPrepare(tm);
-}
-
-bool AMDGPUCFGPrepare::runOnMachineFunction(MachineFunction &func) {
-  return CFGStructurizer<AMDGPUCFGStructurizer>().prepare(func, *this,
-                                                       getTargetRegisterInfo());
-}
-
-// createAMDGPUCFGStructurizerPass- Returns a pass
-FunctionPass *llvm::createAMDGPUCFGStructurizerPass(TargetMachine &tm) {
-  return new AMDGPUCFGPerform(tm);
-}
+INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer",
+                      "AMDGPU CFG Structurizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(AMDGPUCFGStructurizer, "amdgpustructurizer",
+                      "AMDGPU CFG Structurizer", false, false)
 
-bool AMDGPUCFGPerform::runOnMachineFunction(MachineFunction &func) {
-  return CFGStructurizer<AMDGPUCFGStructurizer>().run(func, *this,
-                                                      getTargetRegisterInfo());
+FunctionPass *llvm::createAMDGPUCFGStructurizerPass() {
+  return new AMDGPUCFGStructurizer();
 }