Uniformize the names of type predicates: rather than having isFloatTy and

[oota-llvm.git] / lib / Transforms / Scalar / JumpThreading.cpp
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp

index a380857235b6bee059f5559095d509f5541e429e..8f21aacbdd6f86c026b68a178e71af7a028aec6b 100644 (file)
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -29,6 +29,7 @@
  #include "llvm/ADT/SmallSet.h"
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/Debug.h"
+#include "llvm/Support/ValueHandle.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;
  
@@ -88,20 +89,21 @@ namespace {
      bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl<BasicBlock*> &PredBBs,
                      BasicBlock *SuccBB);
      bool DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
-                                          BasicBlock *PredBB);
+                                  const SmallVectorImpl<BasicBlock *> &PredBBs);
      
      typedef SmallVectorImpl<std::pair<ConstantInt*,
                                        BasicBlock*> > PredValueInfo;
      
      bool ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,
                                           PredValueInfo &Result);
-    bool ProcessThreadableEdges(Instruction *CondInst, BasicBlock *BB);
+    bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB);
      
      
      bool ProcessBranchOnDuplicateCond(BasicBlock *PredBB, BasicBlock *DestBB);
      bool ProcessSwitchOnDuplicateCond(BasicBlock *PredBB, BasicBlock *DestBB);
  
-    bool ProcessJumpOnPHI(PHINode *PN);
+    bool ProcessBranchOnPHI(PHINode *PN);
+    bool ProcessBranchOnXOR(BinaryOperator *BO);
      
      bool SimplifyPartiallyRedundantLoad(LoadInst *LI);
    };
@@ -117,16 +119,15 @@ FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); }
  /// runOnFunction - Top level algorithm.
  ///
  bool JumpThreading::runOnFunction(Function &F) {
-  DEBUG(errs() << "Jump threading on function '" << F.getName() << "'\n");
+  DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
    TD = getAnalysisIfAvailable<TargetData>();
    LVI = EnableLVI ? &getAnalysis<LazyValueInfo>() : 0;
    
    FindLoopHeaders(F);
    
-  bool AnotherIteration = true, EverChanged = false;
-  while (AnotherIteration) {
-    AnotherIteration = false;
-    bool Changed = false;
+  bool Changed, EverChanged = false;
+  do {
+    Changed = false;
      for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
        BasicBlock *BB = I;
        // Thread all of the branches we can over this block. 
@@ -139,7 +140,7 @@ bool JumpThreading::runOnFunction(Function &F) {
        // edges which simplifies the CFG.
        if (pred_begin(BB) == pred_end(BB) &&
            BB != &BB->getParent()->getEntryBlock()) {
-        DEBUG(errs() << "  JT: Deleting dead block '" << BB->getName()
+        DEBUG(dbgs() << "  JT: Deleting dead block '" << BB->getName()
                << "' with terminator: " << *BB->getTerminator() << '\n');
          LoopHeaders.erase(BB);
          DeleteDeadBlock(BB);
@@ -158,20 +159,25 @@ bool JumpThreading::runOnFunction(Function &F) {
            if (BBI->isTerminator()) {
              // Since TryToSimplifyUncondBranchFromEmptyBlock may delete the
              // block, we have to make sure it isn't in the LoopHeaders set.  We
-            // reinsert afterward in the rare case when the block isn't deleted.
+            // reinsert afterward if needed.
              bool ErasedFromLoopHeaders = LoopHeaders.erase(BB);
+            BasicBlock *Succ = BI->getSuccessor(0);
              
-            if (TryToSimplifyUncondBranchFromEmptyBlock(BB))
+            if (TryToSimplifyUncondBranchFromEmptyBlock(BB)) {
                Changed = true;
-            else if (ErasedFromLoopHeaders)
+              // If we deleted BB and BB was the header of a loop, then the
+              // successor is now the header of the loop.
+              BB = Succ;
+            }
+            
+            if (ErasedFromLoopHeaders)
                LoopHeaders.insert(BB);
            }
          }
        }
      }
-    AnotherIteration = Changed;
      EverChanged |= Changed;
-  }
+  } while (Changed);
    
    LoopHeaders.clear();
    return EverChanged;
@@ -275,6 +281,12 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){
      /// predecessor based on its terminator.
      //
      if (LVI) {
+      // FIXME: change this to use the more-rich 'getPredicateOnEdge' method if
+      // "I" is a non-local compare-with-a-constant instruction.  This would be
+      // able to handle value inequalities better, for example if the compare is
+      // "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
+      // Perhaps getConstantOnEdge should be smart enough to do this?
+      
        for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
          // If the value is known by LazyValueInfo to be a constant in a
          // predecessor, use that information to try to thread this block.
@@ -324,13 +336,18 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){
        else
          InterestingVal = ConstantInt::getFalse(I->getContext());
        
-      // Scan for the sentinel.
+      // Scan for the sentinel.  If we find an undef, force it to the
+      // interesting value: x|undef -> true and x&undef -> false.
        for (unsigned i = 0, e = LHSVals.size(); i != e; ++i)
-        if (LHSVals[i].first == InterestingVal || LHSVals[i].first == 0)
+        if (LHSVals[i].first == InterestingVal || LHSVals[i].first == 0) {
            Result.push_back(LHSVals[i]);
+          Result.back().first = InterestingVal;
+        }
        for (unsigned i = 0, e = RHSVals.size(); i != e; ++i)
-        if (RHSVals[i].first == InterestingVal || RHSVals[i].first == 0)
+        if (RHSVals[i].first == InterestingVal || RHSVals[i].first == 0) {
            Result.push_back(RHSVals[i]);
+          Result.back().first = InterestingVal;
+        }
        return !Result.empty();
      }
      
@@ -344,8 +361,9 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){
  
        // Invert the known values.
        for (unsigned i = 0, e = Result.size(); i != e; ++i)
-        Result[i].first =
-          cast<ConstantInt>(ConstantExpr::getNot(Result[i].first));
+        if (Result[i].first)
+          Result[i].first =
+            cast<ConstantInt>(ConstantExpr::getNot(Result[i].first));
        return true;
      }
    }
@@ -362,7 +380,17 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){
          Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB);
          
          Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, TD);
-        if (Res == 0) continue;
+        if (Res == 0) {
+          if (!LVI || !isa<Constant>(RHS))
+            continue;
+          
+          LazyValueInfo::Tristate 
+            ResT = LVI->getPredicateOnEdge(Cmp->getPredicate(), LHS,
+                                           cast<Constant>(RHS), PredBB, BB);
+          if (ResT == LazyValueInfo::Unknown)
+            continue;
+          Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT);
+        }
          
          if (isa<UndefValue>(Res))
            Result.push_back(std::make_pair((ConstantInt*)0, PredBB));
@@ -377,6 +405,7 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){
      // If comparing a live-in value against a constant, see if we know the
      // live-in value on any predecessors.
      if (LVI && isa<Constant>(Cmp->getOperand(1)) &&
+        Cmp->getType()->isIntegerTy() && // Not vector compare.
          (!isa<Instruction>(Cmp->getOperand(0)) ||
           cast<Instruction>(Cmp->getOperand(0))->getParent() != BB)) {
        Constant *RHSCst = cast<Constant>(Cmp->getOperand(1));
@@ -384,14 +413,14 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){
        for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
          // If the value is known by LazyValueInfo to be a constant in a
          // predecessor, use that information to try to thread this block.
-        Constant *PredCst = LVI->getConstantOnEdge(Cmp->getOperand(0), *PI, BB);
-        if (PredCst == 0)
+        LazyValueInfo::Tristate
+          Res = LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0),
+                                        RHSCst, *PI, BB);
+        if (Res == LazyValueInfo::Unknown)
            continue;
-        
-        // Constant fold the compare.
-        Value *Res = SimplifyCmpInst(Cmp->getPredicate(), PredCst, RHSCst, TD);
-        if (isa<ConstantInt>(Res) || isa<UndefValue>(Res))
-          Result.push_back(std::make_pair(dyn_cast<ConstantInt>(Res), *PI));
+
+        Constant *ResC = ConstantInt::get(Cmp->getType(), Res);
+        Result.push_back(std::make_pair(cast<ConstantInt>(ResC), *PI));
        }
        
        return !Result.empty();
@@ -427,6 +456,12 @@ static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) {
  /// ProcessBlock - If there are any predecessors whose control can be threaded
  /// through to a successor, transform them now.
  bool JumpThreading::ProcessBlock(BasicBlock *BB) {
+  // If the block is trivially dead, just return and let the caller nuke it.
+  // This simplifies other transformations.
+  if (pred_begin(BB) == pred_end(BB) &&
+      BB != &BB->getParent()->getEntryBlock())
+    return false;
+  
    // If this block has a single predecessor, and if that pred has a single
    // successor, merge the blocks.  This encourages recursive jump threading
    // because now the condition in this block can be threaded through
@@ -465,7 +500,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
    // terminator to an unconditional branch.  This can occur due to threading in
    // other blocks.
    if (isa<ConstantInt>(Condition)) {
-    DEBUG(errs() << "  In block '" << BB->getName()
+    DEBUG(dbgs() << "  In block '" << BB->getName()
            << "' folding terminator: " << *BB->getTerminator() << '\n');
      ++NumFolds;
      ConstantFoldTerminator(BB);
@@ -484,7 +519,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
        RemovePredecessorAndSimplify(BBTerm->getSuccessor(i), BB, TD);
      }
      
-    DEBUG(errs() << "  In block '" << BB->getName()
+    DEBUG(dbgs() << "  In block '" << BB->getName()
            << "' folding undef terminator: " << *BBTerm << '\n');
      BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
      BBTerm->eraseFromParent();
@@ -498,7 +533,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
    //     br COND, BBX, BBY
    //  BBX:
    //     br COND, BBZ, BBW
-  if (!Condition->hasOneUse() && // Multiple uses.
+  if (!LVI &&
+      !Condition->hasOneUse() && // Multiple uses.
        (CondInst == 0 || CondInst->getParent() != BB)) { // Non-local definition.
      pred_iterator PI = pred_begin(BB), E = pred_end(BB);
      if (isa<BranchInst>(BB->getTerminator())) {
@@ -518,17 +554,18 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
    }
  
    // All the rest of our checks depend on the condition being an instruction.
-  if (CondInst == 0)
+  if (CondInst == 0) {
+    // FIXME: Unify this with code below.
+    if (LVI && ProcessThreadableEdges(Condition, BB))
+      return true;
      return false;
-  
-  // See if this is a phi node in the current block.
-  if (PHINode *PN = dyn_cast<PHINode>(CondInst))
-    if (PN->getParent() == BB)
-      return ProcessJumpOnPHI(PN);
+  }  
+    
    
    if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
-    if (!isa<PHINode>(CondCmp->getOperand(0)) ||
-        cast<PHINode>(CondCmp->getOperand(0))->getParent() != BB) {
+    if (!LVI &&
+        (!isa<PHINode>(CondCmp->getOperand(0)) ||
+         cast<PHINode>(CondCmp->getOperand(0))->getParent() != BB)) {
        // If we have a comparison, loop over the predecessors to see if there is
        // a condition with a lexically identical value.
        pred_iterator PI = pred_begin(BB), E = pred_end(BB);
@@ -553,13 +590,13 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
    // we see one, check to see if it's partially redundant.  If so, insert a PHI
    // which can then be used to thread the values.
    //
-  // This is particularly important because reg2mem inserts loads and stores all
-  // over the place, and this blocks jump threading if we don't zap them.
    Value *SimplifyValue = CondInst;
    if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
      if (isa<Constant>(CondCmp->getOperand(1)))
        SimplifyValue = CondCmp->getOperand(0);
    
+  // TODO: There are other places where load PRE would be profitable, such as
+  // more complex comparisons.
    if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue))
      if (SimplifyPartiallyRedundantLoad(LI))
        return true;
@@ -572,9 +609,21 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
    if (ProcessThreadableEdges(CondInst, BB))
      return true;
    
+  // If this is an otherwise-unfoldable branch on a phi node in the current
+  // block, see if we can simplify.
+  if (PHINode *PN = dyn_cast<PHINode>(CondInst))
+    if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+      return ProcessBranchOnPHI(PN);
+  
+  
+  // If this is an otherwise-unfoldable branch on a XOR, see if we can simplify.
+  if (CondInst->getOpcode() == Instruction::Xor &&
+      CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+    return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst));
+  
    
    // TODO: If we have: "br (X > 0)"  and we have a predecessor where we know
-  // "(X == 4)" thread through this block.
+  // "(X == 4)", thread through this block.
    
    return false;
  }
@@ -602,7 +651,7 @@ bool JumpThreading::ProcessBranchOnDuplicateCond(BasicBlock *PredBB,
    else if (PredBI->getSuccessor(0) != BB)
      BranchDir = false;
    else {
-    DEBUG(errs() << "  In block '" << PredBB->getName()
+    DEBUG(dbgs() << "  In block '" << PredBB->getName()
            << "' folding terminator: " << *PredBB->getTerminator() << '\n');
      ++NumFolds;
      ConstantFoldTerminator(PredBB);
@@ -614,7 +663,7 @@ bool JumpThreading::ProcessBranchOnDuplicateCond(BasicBlock *PredBB,
    // If the dest block has one predecessor, just fix the branch condition to a
    // constant and fold it.
    if (BB->getSinglePredecessor()) {
-    DEBUG(errs() << "  In block '" << BB->getName()
+    DEBUG(dbgs() << "  In block '" << BB->getName()
            << "' folding condition to '" << BranchDir << "': "
            << *BB->getTerminator() << '\n');
      ++NumFolds;
@@ -685,11 +734,16 @@ bool JumpThreading::ProcessSwitchOnDuplicateCond(BasicBlock *PredBB,
        if (PredSI->getSuccessor(PredCase) != DestBB &&
            DestSI->getSuccessor(i) != DestBB)
          continue;
+      
+      // Do not forward this if it already goes to this destination, this would
+      // be an infinite loop.
+      if (PredSI->getSuccessor(PredCase) == DestSucc)
+        continue;
  
        // Otherwise, we're safe to make the change.  Make sure that the edge from
        // DestSI to DestSucc is not critical and has no PHI nodes.
-      DEBUG(errs() << "FORWARDING EDGE " << *DestVal << "   FROM: " << *PredSI);
-      DEBUG(errs() << "THROUGH: " << *DestSI);
+      DEBUG(dbgs() << "FORWARDING EDGE " << *DestVal << "   FROM: " << *PredSI);
+      DEBUG(dbgs() << "THROUGH: " << *DestSI);
  
        // If the destination has PHI nodes, just split the edge for updating
        // simplicity.
@@ -727,7 +781,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
    Value *LoadedPtr = LI->getOperand(0);
  
    // If the loaded operand is defined in the LoadBB, it can't be available.
-  // FIXME: Could do PHI translation, that would be fun :)
+  // TODO: Could do simple PHI translation, that would be fun :)
    if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr))
      if (PtrOp->getParent() == LoadBB)
        return false;
@@ -736,8 +790,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
    // the entry to its block.
    BasicBlock::iterator BBIt = LI;
  
-  if (Value *AvailableVal = FindAvailableLoadedValue(LoadedPtr, LoadBB, 
-                                                     BBIt, 6)) {
+  if (Value *AvailableVal = 
+        FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, 6)) {
      // If the value of the load is locally available within the block, just use
      // it.  This frequently occurs for reg2mem'd allocas.
      //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n";
@@ -820,7 +874,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
      // Split them out to their own block.
      UnavailablePred =
        SplitBlockPredecessors(LoadBB, &PredsToSplit[0], PredsToSplit.size(),
-                             "thread-split", this);
+                             "thread-pre-split", this);
    }
    
    // If the value isn't available in all predecessors, then there will be
@@ -829,7 +883,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
    if (UnavailablePred) {
      assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
             "Can't handle critical edge here!");
-    Value *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr",
+    Value *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr", false,
+                                 LI->getAlignment(),
                                   UnavailablePred->getTerminator());
      AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
    }
@@ -927,27 +982,26 @@ FindMostPopularDest(BasicBlock *BB,
    return MostPopularDest;
  }
  
-bool JumpThreading::ProcessThreadableEdges(Instruction *CondInst,
-                                           BasicBlock *BB) {
+bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB) {
    // If threading this would thread across a loop header, don't even try to
    // thread the edge.
    if (LoopHeaders.count(BB))
      return false;
    
    SmallVector<std::pair<ConstantInt*, BasicBlock*>, 8> PredValues;
-  if (!ComputeValueKnownInPredecessors(CondInst, BB, PredValues))
+  if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues))
      return false;
    assert(!PredValues.empty() &&
           "ComputeValueKnownInPredecessors returned true with no values");
  
-  DEBUG(errs() << "IN BB: " << *BB;
+  DEBUG(dbgs() << "IN BB: " << *BB;
          for (unsigned i = 0, e = PredValues.size(); i != e; ++i) {
-          errs() << "  BB '" << BB->getName() << "': FOUND condition = ";
+          dbgs() << "  BB '" << BB->getName() << "': FOUND condition = ";
            if (PredValues[i].first)
-            errs() << *PredValues[i].first;
+            dbgs() << *PredValues[i].first;
            else
-            errs() << "UNDEF";
-          errs() << " for pred '" << PredValues[i].second->getName()
+            dbgs() << "UNDEF";
+          dbgs() << " for pred '" << PredValues[i].second->getName()
            << "'.\n";
          });
    
@@ -1031,36 +1085,135 @@ bool JumpThreading::ProcessThreadableEdges(Instruction *CondInst,
    return ThreadEdge(BB, PredsToFactor, MostPopularDest);
  }
  
-/// ProcessJumpOnPHI - We have a conditional branch or switch on a PHI node in
-/// the current block.  See if there are any simplifications we can do based on
-/// inputs to the phi node.
+/// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on
+/// a PHI node in the current block.  See if there are any simplifications we
+/// can do based on inputs to the phi node.
  /// 
-bool JumpThreading::ProcessJumpOnPHI(PHINode *PN) {
+bool JumpThreading::ProcessBranchOnPHI(PHINode *PN) {
    BasicBlock *BB = PN->getParent();
    
-  // If any of the predecessor blocks end in an unconditional branch, we can
-  // *duplicate* the jump into that block in order to further encourage jump
-  // threading and to eliminate cases where we have branch on a phi of an icmp
-  // (branch on icmp is much better).
-
-  // We don't want to do this tranformation for switches, because we don't
-  // really want to duplicate a switch.
-  if (isa<SwitchInst>(BB->getTerminator()))
-    return false;
+  // TODO: We could make use of this to do it once for blocks with common PHI
+  // values.
+  SmallVector<BasicBlock*, 1> PredBBs;
+  PredBBs.resize(1);
    
-  // Look for unconditional branch predecessors.
+  // If any of the predecessor blocks end in an unconditional branch, we can
+  // *duplicate* the conditional branch into that block in order to further
+  // encourage jump threading and to eliminate cases where we have branch on a
+  // phi of an icmp (branch on icmp is much better).
    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
      BasicBlock *PredBB = PN->getIncomingBlock(i);
      if (BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()))
-      if (PredBr->isUnconditional() &&
-          // Try to duplicate BB into PredBB.
-          DuplicateCondBranchOnPHIIntoPred(BB, PredBB))
-        return true;
+      if (PredBr->isUnconditional()) {
+        PredBBs[0] = PredBB;
+        // Try to duplicate BB into PredBB.
+        if (DuplicateCondBranchOnPHIIntoPred(BB, PredBBs))
+          return true;
+      }
    }
  
    return false;
  }
  
+/// ProcessBranchOnXOR - We have an otherwise unthreadable conditional branch on
+/// a xor instruction in the current block.  See if there are any
+/// simplifications we can do based on inputs to the xor.
+/// 
+bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) {
+  BasicBlock *BB = BO->getParent();
+  
+  // If either the LHS or RHS of the xor is a constant, don't do this
+  // optimization.
+  if (isa<ConstantInt>(BO->getOperand(0)) ||
+      isa<ConstantInt>(BO->getOperand(1)))
+    return false;
+  
+  // If the first instruction in BB isn't a phi, we won't be able to infer
+  // anything special about any particular predecessor.
+  if (!isa<PHINode>(BB->front()))
+    return false;
+  
+  // If we have a xor as the branch input to this block, and we know that the
+  // LHS or RHS of the xor in any predecessor is true/false, then we can clone
+  // the condition into the predecessor and fix that value to true, saving some
+  // logical ops on that path and encouraging other paths to simplify.
+  //
+  // This copies something like this:
+  //
+  //  BB:
+  //    %X = phi i1 [1],  [%X']
+  //    %Y = icmp eq i32 %A, %B
+  //    %Z = xor i1 %X, %Y
+  //    br i1 %Z, ...
+  //
+  // Into:
+  //  BB':
+  //    %Y = icmp ne i32 %A, %B
+  //    br i1 %Y, ...
+
+  SmallVector<std::pair<ConstantInt*, BasicBlock*>, 8> XorOpValues;
+  bool isLHS = true;
+  if (!ComputeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues)) {
+    assert(XorOpValues.empty());
+    if (!ComputeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues))
+      return false;
+    isLHS = false;
+  }
+  
+  assert(!XorOpValues.empty() &&
+         "ComputeValueKnownInPredecessors returned true with no values");
+
+  // Scan the information to see which is most popular: true or false.  The
+  // predecessors can be of the set true, false, or undef.
+  unsigned NumTrue = 0, NumFalse = 0;
+  for (unsigned i = 0, e = XorOpValues.size(); i != e; ++i) {
+    if (!XorOpValues[i].first) continue;  // Ignore undefs for the count.
+    if (XorOpValues[i].first->isZero())
+      ++NumFalse;
+    else
+      ++NumTrue;
+  }
+  
+  // Determine which value to split on, true, false, or undef if neither.
+  ConstantInt *SplitVal = 0;
+  if (NumTrue > NumFalse)
+    SplitVal = ConstantInt::getTrue(BB->getContext());
+  else if (NumTrue != 0 || NumFalse != 0)
+    SplitVal = ConstantInt::getFalse(BB->getContext());
+  
+  // Collect all of the blocks that this can be folded into so that we can
+  // factor this once and clone it once.
+  SmallVector<BasicBlock*, 8> BlocksToFoldInto;
+  for (unsigned i = 0, e = XorOpValues.size(); i != e; ++i) {
+    if (XorOpValues[i].first != SplitVal && XorOpValues[i].first != 0) continue;
+
+    BlocksToFoldInto.push_back(XorOpValues[i].second);
+  }
+  
+  // If we inferred a value for all of the predecessors, then duplication won't
+  // help us.  However, we can just replace the LHS or RHS with the constant.
+  if (BlocksToFoldInto.size() ==
+      cast<PHINode>(BB->front()).getNumIncomingValues()) {
+    if (SplitVal == 0) {
+      // If all preds provide undef, just nuke the xor, because it is undef too.
+      BO->replaceAllUsesWith(UndefValue::get(BO->getType()));
+      BO->eraseFromParent();
+    } else if (SplitVal->isZero()) {
+      // If all preds provide 0, replace the xor with the other input.
+      BO->replaceAllUsesWith(BO->getOperand(isLHS));
+      BO->eraseFromParent();
+    } else {
+      // If all preds provide 1, set the computed value to 1.
+      BO->setOperand(!isLHS, SplitVal);
+    }
+    
+    return true;
+  }
+  
+  // Try to duplicate BB into PredBB.
+  return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto);
+}
+
  
  /// AddPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new
  /// predecessor to the PHIBB block.  If it has PHI nodes, add entries for
@@ -1094,7 +1247,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
                                 BasicBlock *SuccBB) {
    // If threading to the same block as we come from, we would infinite loop.
    if (SuccBB == BB) {
-    DEBUG(errs() << "  Not threading across BB '" << BB->getName()
+    DEBUG(dbgs() << "  Not threading across BB '" << BB->getName()
            << "' - would thread to self!\n");
      return false;
    }
@@ -1102,7 +1255,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
    // If threading this would thread across a loop header, don't thread the edge.
    // See the comments above FindLoopHeaders for justifications and caveats.
    if (LoopHeaders.count(BB)) {
-    DEBUG(errs() << "  Not threading across loop header BB '" << BB->getName()
+    DEBUG(dbgs() << "  Not threading across loop header BB '" << BB->getName()
            << "' to dest BB '" << SuccBB->getName()
            << "' - it might create an irreducible loop!\n");
      return false;
@@ -1110,7 +1263,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
  
    unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB);
    if (JumpThreadCost > Threshold) {
-    DEBUG(errs() << "  Not threading BB '" << BB->getName()
+    DEBUG(dbgs() << "  Not threading BB '" << BB->getName()
            << "' - Cost is too high: " << JumpThreadCost << "\n");
      return false;
    }
@@ -1120,14 +1273,14 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
    if (PredBBs.size() == 1)
      PredBB = PredBBs[0];
    else {
-    DEBUG(errs() << "  Factoring out " << PredBBs.size()
+    DEBUG(dbgs() << "  Factoring out " << PredBBs.size()
            << " common predecessors.\n");
      PredBB = SplitBlockPredecessors(BB, &PredBBs[0], PredBBs.size(),
                                      ".thr_comm", this);
    }
    
    // And finally, do it!
-  DEBUG(errs() << "  Threading edge from '" << PredBB->getName() << "' to '"
+  DEBUG(dbgs() << "  Threading edge from '" << PredBB->getName() << "' to '"
          << SuccBB->getName() << "' with cost: " << JumpThreadCost
          << ", across block:\n    "
          << *BB << "\n");
@@ -1196,7 +1349,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
      if (UsesToRename.empty())
        continue;
      
-    DEBUG(errs() << "JT: Renaming non-local uses of: " << *I << "\n");
+    DEBUG(dbgs() << "JT: Renaming non-local uses of: " << *I << "\n");
  
      // We found a use of I outside of BB.  Rename all uses of I that are outside
      // its block to be uses of the appropriate PHI node etc.  See ValuesInBlocks
@@ -1207,7 +1360,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
      
      while (!UsesToRename.empty())
        SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
-    DEBUG(errs() << "\n");
+    DEBUG(dbgs() << "\n");
    }
    
    
@@ -1224,20 +1377,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
    // At this point, the IR is fully up to date and consistent.  Do a quick scan
    // over the new instructions and zap any that are constants or dead.  This
    // frequently happens because of phi translation.
-  BI = NewBB->begin();
-  for (BasicBlock::iterator E = NewBB->end(); BI != E; ) {
-    Instruction *Inst = BI++;
-    
-    if (Value *V = SimplifyInstruction(Inst, TD)) {
-      WeakVH BIHandle(BI);
-      ReplaceAndSimplifyAllUses(Inst, V, TD);
-      if (BIHandle == 0)
-        BI = NewBB->begin();
-      continue;
-    }
-    
-    RecursivelyDeleteTriviallyDeadInstructions(Inst);
-  }
+  SimplifyInstructionsInBlock(NewBB, TD);
    
    // Threaded an edge!
    ++NumThreads;
@@ -1250,30 +1390,52 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
  /// improves the odds that the branch will be on an analyzable instruction like
  /// a compare.
  bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
-                                                     BasicBlock *PredBB) {
+                                 const SmallVectorImpl<BasicBlock *> &PredBBs) {
+  assert(!PredBBs.empty() && "Can't handle an empty set");
+
    // If BB is a loop header, then duplicating this block outside the loop would
    // cause us to transform this into an irreducible loop, don't do this.
    // See the comments above FindLoopHeaders for justifications and caveats.
    if (LoopHeaders.count(BB)) {
-    DEBUG(errs() << "  Not duplicating loop header '" << BB->getName()
-          << "' into predecessor block '" << PredBB->getName()
+    DEBUG(dbgs() << "  Not duplicating loop header '" << BB->getName()
+          << "' into predecessor block '" << PredBBs[0]->getName()
            << "' - it might create an irreducible loop!\n");
      return false;
    }
    
    unsigned DuplicationCost = getJumpThreadDuplicationCost(BB);
    if (DuplicationCost > Threshold) {
-    DEBUG(errs() << "  Not duplicating BB '" << BB->getName()
+    DEBUG(dbgs() << "  Not duplicating BB '" << BB->getName()
            << "' - Cost is too high: " << DuplicationCost << "\n");
      return false;
    }
    
+  // And finally, do it!  Start by factoring the predecessors if needed.
+  BasicBlock *PredBB;
+  if (PredBBs.size() == 1)
+    PredBB = PredBBs[0];
+  else {
+    DEBUG(dbgs() << "  Factoring out " << PredBBs.size()
+          << " common predecessors.\n");
+    PredBB = SplitBlockPredecessors(BB, &PredBBs[0], PredBBs.size(),
+                                    ".thr_comm", this);
+  }
+  
    // Okay, we decided to do this!  Clone all the instructions in BB onto the end
    // of PredBB.
-  DEBUG(errs() << "  Duplicating block '" << BB->getName() << "' into end of '"
+  DEBUG(dbgs() << "  Duplicating block '" << BB->getName() << "' into end of '"
          << PredBB->getName() << "' to eliminate branch on phi.  Cost: "
          << DuplicationCost << " block is:" << *BB << "\n");
    
+  // Unless PredBB ends with an unconditional branch, split the edge so that we
+  // can just clone the bits from BB into the end of the new PredBB.
+  BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
+  
+  if (OldPredBranch == 0 || !OldPredBranch->isUnconditional()) {
+    PredBB = SplitEdge(PredBB, BB, this);
+    OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
+  }
+  
    // We are going to have to map operands from the original BB block into the
    // PredBB block.  Evaluate PHI nodes in BB.
    DenseMap<Instruction*, Value*> ValueMapping;
@@ -1282,15 +1444,10 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
    for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
      ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
    
-  BranchInst *OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
-  
    // Clone the non-phi instructions of BB into PredBB, keeping track of the
    // mapping and using it to remap operands in the cloned instructions.
    for (; BI != BB->end(); ++BI) {
      Instruction *New = BI->clone();
-    New->setName(BI->getName());
-    PredBB->getInstList().insert(OldPredBranch, New);
-    ValueMapping[BI] = New;
      
      // Remap operands to patch up intra-block references.
      for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
@@ -1299,6 +1456,19 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
          if (I != ValueMapping.end())
            New->setOperand(i, I->second);
        }
+
+    // If this instruction can be simplified after the operands are updated,
+    // just use the simplified value instead.  This frequently happens due to
+    // phi translation.
+    if (Value *IV = SimplifyInstruction(New, TD)) {
+      delete New;
+      ValueMapping[BI] = IV;
+    } else {
+      // Otherwise, insert the new instruction into the block.
+      New->setName(BI->getName());
+      PredBB->getInstList().insert(OldPredBranch, New);
+      ValueMapping[BI] = New;
+    }
    }
    
    // Check to see if the targets of the branch had PHI nodes. If so, we need to
@@ -1334,7 +1504,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
      if (UsesToRename.empty())
        continue;
      
-    DEBUG(errs() << "JT: Renaming non-local uses of: " << *I << "\n");
+    DEBUG(dbgs() << "JT: Renaming non-local uses of: " << *I << "\n");
      
      // We found a use of I outside of BB.  Rename all uses of I that are outside
      // its block to be uses of the appropriate PHI node etc.  See ValuesInBlocks
@@ -1345,7 +1515,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
      
      while (!UsesToRename.empty())
        SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
-    DEBUG(errs() << "\n");
+    DEBUG(dbgs() << "\n");
    }
    
    // PredBB no longer jumps to BB, remove entries in the PHI node for the edge