Instruction *visitStoreInst(StoreInst &SI);
Instruction *visitBranchInst(BranchInst &BI);
Instruction *visitSwitchInst(SwitchInst &SI);
+ Instruction *visitInsertElementInst(InsertElementInst &IE);
Instruction *visitExtractElementInst(ExtractElementInst &EI);
+ Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
// visitInstruction - Specify what to return for unhandled instructions...
Instruction *visitInstruction(Instruction &I) { return 0; }
Value *InsertCastBefore(Value *V, const Type *Ty, Instruction &Pos) {
if (V->getType() == Ty) return V;
+ if (Constant *CV = dyn_cast<Constant>(V))
+ return ConstantExpr::getCast(CV, Ty);
+
Instruction *C = new CastInst(V, Ty, V->getName(), &Pos);
WorkList.push_back(C);
return C;
Old->replaceAllUsesWith(New);
if (Instruction *I = dyn_cast<Instruction>(Old))
WorkList.push_back(I);
+ if (Instruction *I = dyn_cast<Instruction>(New))
+ WorkList.push_back(I);
return true;
}
// operators.
bool SimplifyCommutative(BinaryOperator &I);
- bool SimplifyDemandedBits(Value *V, uint64_t Mask, unsigned Depth = 0);
+ bool SimplifyDemandedBits(Value *V, uint64_t Mask,
+ uint64_t &KnownZero, uint64_t &KnownOne,
+ unsigned Depth = 0);
// FoldOpIntoPhi - Given a binary operator or cast instruction which has a
// PHI node as operand #0, see if we can fold the instruction into the PHI
ConstantInt::get(C->getType(), 1)));
}
+/// GetConstantInType - Return a ConstantInt with the specified type and value.
+///
+/// Val is interpreted as the low bits of the value; for signed types it is
+/// sign-extended from the type's bit width so the value handed to
+/// ConstantSInt::get is in range for the type.
+static ConstantIntegral *GetConstantInType(const Type *Ty, uint64_t Val) {
+  if (Ty->isUnsigned())
+    return ConstantUInt::get(Ty, Val);
+  else if (Ty->getTypeID() == Type::BoolTyID)
+    return ConstantBool::get(Val);     // Bool: any non-zero Val means 'true'.
+  // Signed type: sign-extend Val from the type's width out to 64 bits by
+  // shifting the value up against bit 63 and arithmetic-shifting back down.
+  int64_t SVal = Val;
+  SVal <<= 64-Ty->getPrimitiveSizeInBits();
+  SVal >>= 64-Ty->getPrimitiveSizeInBits();
+  return ConstantSInt::get(Ty, SVal);
+}
+
+
/// ComputeMaskedBits - Determine which of the bits specified in Mask are
/// known to be either zero or one and return them in the KnownZero/KnownOne
/// bitsets. This code only analyzes bits in Mask, in order to short-circuit
// this won't lose us code quality.
if (ConstantIntegral *CI = dyn_cast<ConstantIntegral>(V)) {
// We know all of the bits for a constant!
- KnownOne = CI->getZExtValue();
+ KnownOne = CI->getZExtValue() & Mask;
KnownZero = ~KnownOne & Mask;
return;
}
return; // Limit search depth.
uint64_t KnownZero2, KnownOne2;
- if (Instruction *I = dyn_cast<Instruction>(V)) {
- switch (I->getOpcode()) {
- case Instruction::And:
- // If either the LHS or the RHS are Zero, the result is zero.
- ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
- Mask &= ~KnownZero;
- ComputeMaskedBits(I->getOperand(0), Mask, KnownZero2, KnownOne2, Depth+1);
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
-
- // Output known-1 bits are only known if set in both the LHS & RHS.
- KnownOne &= KnownOne2;
- // Output known-0 are known to be clear if zero in either the LHS | RHS.
- KnownZero |= KnownZero2;
- return;
- case Instruction::Or:
- ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
- ComputeMaskedBits(I->getOperand(0), Mask, KnownZero2, KnownOne2, Depth+1);
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
-
- // Output known-0 bits are only known if clear in both the LHS & RHS.
- KnownZero &= KnownZero2;
- // Output known-1 are known to be set if set in either the LHS | RHS.
- KnownOne |= KnownOne2;
- return;
- case Instruction::Xor: {
- ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
- ComputeMaskedBits(I->getOperand(0), Mask, KnownZero2, KnownOne2, Depth+1);
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
-
- // Output known-0 bits are known if clear or set in both the LHS & RHS.
- uint64_t KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
- // Output known-1 are known to be set if set in only one of the LHS, RHS.
- KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
- KnownZero = KnownZeroOut;
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return;
+
+ Mask &= V->getType()->getIntegralTypeMask();
+
+ switch (I->getOpcode()) {
+ case Instruction::And:
+ // If either the LHS or the RHS are Zero, the result is zero.
+ ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
+ Mask &= ~KnownZero;
+ ComputeMaskedBits(I->getOperand(0), Mask, KnownZero2, KnownOne2, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // Output known-1 bits are only known if set in both the LHS & RHS.
+ KnownOne &= KnownOne2;
+ // Output known-0 are known to be clear if zero in either the LHS | RHS.
+ KnownZero |= KnownZero2;
+ return;
+ case Instruction::Or:
+ ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
+ Mask &= ~KnownOne;
+ ComputeMaskedBits(I->getOperand(0), Mask, KnownZero2, KnownOne2, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ KnownZero &= KnownZero2;
+ // Output known-1 are known to be set if set in either the LHS | RHS.
+ KnownOne |= KnownOne2;
+ return;
+ case Instruction::Xor: {
+ ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
+ ComputeMaskedBits(I->getOperand(0), Mask, KnownZero2, KnownOne2, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // Output known-0 bits are known if clear or set in both the LHS & RHS.
+ uint64_t KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
+ // Output known-1 are known to be set if set in only one of the LHS, RHS.
+ KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
+ KnownZero = KnownZeroOut;
+ return;
+ }
+ case Instruction::Select:
+ ComputeMaskedBits(I->getOperand(2), Mask, KnownZero, KnownOne, Depth+1);
+ ComputeMaskedBits(I->getOperand(1), Mask, KnownZero2, KnownOne2, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // Only known if known in both the LHS and RHS.
+ KnownOne &= KnownOne2;
+ KnownZero &= KnownZero2;
+ return;
+ case Instruction::Cast: {
+ const Type *SrcTy = I->getOperand(0)->getType();
+ if (!SrcTy->isIntegral()) return;
+
+ // If this is an integer truncate or noop, just look in the input.
+ if (SrcTy->getPrimitiveSizeInBits() >=
+ I->getType()->getPrimitiveSizeInBits()) {
+ ComputeMaskedBits(I->getOperand(0), Mask, KnownZero, KnownOne, Depth+1);
return;
}
- case Instruction::Select:
- ComputeMaskedBits(I->getOperand(2), Mask, KnownZero, KnownOne, Depth+1);
- ComputeMaskedBits(I->getOperand(1), Mask, KnownZero2, KnownOne2, Depth+1);
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
- // Only known if known in both the LHS and RHS.
- KnownOne &= KnownOne2;
- KnownZero &= KnownZero2;
- return;
- case Instruction::Cast: {
- const Type *SrcTy = I->getOperand(0)->getType();
- if (!SrcTy->isIntegral()) return;
+ // Sign or Zero extension. Compute the bits in the result that are not
+ // present in the input.
+ uint64_t NotIn = ~SrcTy->getIntegralTypeMask();
+ uint64_t NewBits = I->getType()->getIntegralTypeMask() & NotIn;
- // If this is an integer truncate or noop, just look in the input.
- if (SrcTy->getPrimitiveSizeInBits() >=
- I->getType()->getPrimitiveSizeInBits()) {
- ComputeMaskedBits(I->getOperand(0), Mask, KnownZero, KnownOne, Depth+1);
- return;
- }
+ // Handle zero extension.
+ if (!SrcTy->isSigned()) {
+ Mask &= SrcTy->getIntegralTypeMask();
+ ComputeMaskedBits(I->getOperand(0), Mask, KnownZero, KnownOne, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ // The top bits are known to be zero.
+ KnownZero |= NewBits;
+ } else {
+ // Sign extension.
+ Mask &= SrcTy->getIntegralTypeMask();
+ ComputeMaskedBits(I->getOperand(0), Mask, KnownZero, KnownOne, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- // Sign or Zero extension. Compute the bits in the result that are not
- // present in the input.
- uint64_t NotIn = ~SrcTy->getIntegralTypeMask();
- uint64_t NewBits = I->getType()->getIntegralTypeMask() & NotIn;
-
- // Handle zero extension.
- if (!SrcTy->isSigned()) {
- Mask &= SrcTy->getIntegralTypeMask();
- ComputeMaskedBits(I->getOperand(0), Mask, KnownZero, KnownOne, Depth+1);
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- // The top bits are known to be zero.
+ // If the sign bit of the input is known set or clear, then we know the
+ // top bits of the result.
+ uint64_t InSignBit = 1ULL << (SrcTy->getPrimitiveSizeInBits()-1);
+ if (KnownZero & InSignBit) { // Input sign bit known zero
KnownZero |= NewBits;
- } else {
- // Sign extension.
- Mask &= SrcTy->getIntegralTypeMask();
- ComputeMaskedBits(I->getOperand(0), Mask, KnownZero, KnownOne, Depth+1);
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
-
- // If the sign bit of the input is known set or clear, then we know the
- // top bits of the result.
- uint64_t InSignBit = 1ULL << (SrcTy->getPrimitiveSizeInBits()-1);
- if (KnownZero & InSignBit) { // Input sign bit known zero
- KnownZero |= NewBits;
- KnownOne &= ~NewBits;
- } else if (KnownOne & InSignBit) { // Input sign bit known set
- KnownOne |= NewBits;
- KnownZero &= ~NewBits;
- } else { // Input sign bit unknown
- KnownZero &= ~NewBits;
- KnownOne &= ~NewBits;
- }
+ KnownOne &= ~NewBits;
+ } else if (KnownOne & InSignBit) { // Input sign bit known set
+ KnownOne |= NewBits;
+ KnownZero &= ~NewBits;
+ } else { // Input sign bit unknown
+ KnownZero &= ~NewBits;
+ KnownOne &= ~NewBits;
}
+ }
+ return;
+ }
+ case Instruction::Shl:
+ // (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0
+ if (ConstantUInt *SA = dyn_cast<ConstantUInt>(I->getOperand(1))) {
+ Mask >>= SA->getValue();
+ ComputeMaskedBits(I->getOperand(0), Mask, KnownZero, KnownOne, Depth+1);
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero <<= SA->getValue();
+ KnownOne <<= SA->getValue();
+ KnownZero |= (1ULL << SA->getValue())-1; // low bits known zero.
return;
}
- case Instruction::Shl:
- // (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0
- if (ConstantUInt *SA = dyn_cast<ConstantUInt>(I->getOperand(1))) {
- Mask >> SA->getValue();
- ComputeMaskedBits(I->getOperand(0), Mask, KnownZero, KnownOne, Depth+1);
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- KnownZero <<= SA->getValue();
- KnownOne <<= SA->getValue();
- KnownZero |= (1ULL << SA->getValue())-1; // low bits known zero.
- return;
- }
- break;
- case Instruction::Shr:
- // (ushr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0
- if (ConstantUInt *SA = dyn_cast<ConstantUInt>(I->getOperand(1))) {
- // Compute the new bits that are at the top now.
- uint64_t HighBits = (1ULL << SA->getValue())-1;
- HighBits <<= I->getType()->getPrimitiveSizeInBits()-SA->getValue();
+ break;
+ case Instruction::Shr:
+ // (ushr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0
+ if (ConstantUInt *SA = dyn_cast<ConstantUInt>(I->getOperand(1))) {
+ // Compute the new bits that are at the top now.
+ uint64_t HighBits = (1ULL << SA->getValue())-1;
+ HighBits <<= I->getType()->getPrimitiveSizeInBits()-SA->getValue();
+
+ if (I->getType()->isUnsigned()) { // Unsigned shift right.
+ Mask <<= SA->getValue();
+ ComputeMaskedBits(I->getOperand(0), Mask, KnownZero,KnownOne,Depth+1);
+ assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?");
+ KnownZero >>= SA->getValue();
+ KnownOne >>= SA->getValue();
+ KnownZero |= HighBits; // high bits known zero.
+ } else {
+ Mask <<= SA->getValue();
+ ComputeMaskedBits(I->getOperand(0), Mask, KnownZero,KnownOne,Depth+1);
+ assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?");
+ KnownZero >>= SA->getValue();
+ KnownOne >>= SA->getValue();
- if (I->getType()->isUnsigned()) { // Unsigned shift right.
- Mask << SA->getValue();
- ComputeMaskedBits(I->getOperand(0), Mask, KnownZero,KnownOne,Depth+1);
- assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?");
- KnownZero >>= SA->getValue();
- KnownOne >>= SA->getValue();
- KnownZero |= HighBits; // high bits known zero.
- } else {
- Mask << SA->getValue();
- ComputeMaskedBits(I->getOperand(0), Mask, KnownZero,KnownOne,Depth+1);
- assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?");
- KnownZero >>= SA->getValue();
- KnownOne >>= SA->getValue();
-
- // Handle the sign bits.
- uint64_t SignBit = 1ULL << (I->getType()->getPrimitiveSizeInBits()-1);
- SignBit >>= SA->getValue(); // Adjust to where it is now in the mask.
-
- if (KnownZero & SignBit) { // New bits are known zero.
- KnownZero |= HighBits;
- } else if (KnownOne & SignBit) { // New bits are known one.
- KnownOne |= HighBits;
- }
+ // Handle the sign bits.
+ uint64_t SignBit = 1ULL << (I->getType()->getPrimitiveSizeInBits()-1);
+ SignBit >>= SA->getValue(); // Adjust to where it is now in the mask.
+
+ if (KnownZero & SignBit) { // New bits are known zero.
+ KnownZero |= HighBits;
+ } else if (KnownOne & SignBit) { // New bits are known one.
+ KnownOne |= HighBits;
}
- return;
}
- break;
+ return;
}
+ break;
}
}
return (KnownZero & Mask) == Mask;
}
-/// SimplifyDemandedBits - Look at V. At this point, we know that only the Mask
-/// bits of the result of V are ever used downstream. If we can use this
-/// information to simplify V, return V and set NewVal to the new value we
-/// should use in V's place.
-bool InstCombiner::SimplifyDemandedBits(Value *V, uint64_t Mask,
+/// ShrinkDemandedConstant - Check to see if the specified operand of the
+/// specified instruction is a constant integer. If so, check to see if there
+/// are any bits set in the constant that are not demanded. If so, shrink the
+/// constant and return true.
+///
+/// Returns false (leaving I untouched) when the operand is not a ConstantInt
+/// or already has no bits outside Demanded.
+static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
+                                   uint64_t Demanded) {
+  // Only constant-integer operands can be shrunk.
+  ConstantInt *OpC = dyn_cast<ConstantInt>(I->getOperand(OpNo));
+  if (!OpC) return false;
+
+  // If there are no bits set that aren't demanded, nothing to do.
+  if ((~Demanded & OpC->getZExtValue()) == 0)
+    return false;
+
+  // This is producing any bits that are not needed, shrink the RHS.
+  // The replacement constant keeps only the demanded bits of the original.
+  uint64_t Val = Demanded & OpC->getZExtValue();
+  I->setOperand(OpNo, GetConstantInType(OpC->getType(), Val));
+  return true;
+}
+
+// ComputeSignedMinMaxValuesFromKnownBits - Given a signed integer type and a
+// set of known zero and one bits, compute the maximum and minimum values that
+// could have the specified known zero and known one bits, returning them in
+// min/max.  Min and Max are returned sign-extended to 64 bits.
+static void ComputeSignedMinMaxValuesFromKnownBits(const Type *Ty,
+                                                   uint64_t KnownZero,
+                                                   uint64_t KnownOne,
+                                                   int64_t &Min, int64_t &Max) {
+  uint64_t TypeBits = Ty->getIntegralTypeMask();
+  // Bits of the type that are neither known-zero nor known-one.
+  uint64_t UnknownBits = ~(KnownZero|KnownOne) & TypeBits;
+
+  uint64_t SignBit = 1ULL << (Ty->getPrimitiveSizeInBits()-1);
+
+  // The minimum value is when all unknown bits are zeros, EXCEPT for the sign
+  // bit if it is unknown.
+  Min = KnownOne;
+  Max = KnownOne|UnknownBits;
+
+  if (SignBit & UnknownBits) { // Sign bit is unknown
+    // For a signed type, setting the sign bit minimizes the value and
+    // clearing it maximizes the value, so invert it in each bound.
+    Min |= SignBit;
+    Max &= ~SignBit;
+  }
+
+  // Sign extend the min/max values.
+  int ShAmt = 64-Ty->getPrimitiveSizeInBits();
+  Min = (Min << ShAmt) >> ShAmt;
+  Max = (Max << ShAmt) >> ShAmt;
+}
+
+// ComputeUnsignedMinMaxValuesFromKnownBits - Given an unsigned integer type and
+// a set of known zero and one bits, compute the maximum and minimum values that
+// could have the specified known zero and known one bits, returning them in
+// min/max.
+static void ComputeUnsignedMinMaxValuesFromKnownBits(const Type *Ty,
+                                                     uint64_t KnownZero,
+                                                     uint64_t KnownOne,
+                                                     uint64_t &Min,
+                                                     uint64_t &Max) {
+  uint64_t TypeBits = Ty->getIntegralTypeMask();
+  // Bits of the type that are neither known-zero nor known-one.
+  uint64_t UnknownBits = ~(KnownZero|KnownOne) & TypeBits;
+
+  // The minimum value is when the unknown bits are all zeros.
+  Min = KnownOne;
+  // The maximum value is when the unknown bits are all ones.
+  Max = KnownOne|UnknownBits;
+}
+
+
+/// SimplifyDemandedBits - Look at V. At this point, we know that only the
+/// DemandedMask bits of the result of V are ever used downstream. If we can
+/// use this information to simplify V, do so and return true. Otherwise,
+/// analyze the expression and return a mask of KnownOne and KnownZero bits for
+/// the expression (used to simplify the caller). The KnownZero/One bits may
+/// only be accurate for those bits in the DemandedMask.
+bool InstCombiner::SimplifyDemandedBits(Value *V, uint64_t DemandedMask,
+ uint64_t &KnownZero, uint64_t &KnownOne,
unsigned Depth) {
+ if (ConstantIntegral *CI = dyn_cast<ConstantIntegral>(V)) {
+ // We know all of the bits for a constant!
+ KnownOne = CI->getZExtValue() & DemandedMask;
+ KnownZero = ~KnownOne & DemandedMask;
+ return false;
+ }
+
+ KnownZero = KnownOne = 0;
if (!V->hasOneUse()) { // Other users may use these bits.
- if (Depth != 0) // Not at the root.
+ if (Depth != 0) { // Not at the root.
+ // Just compute the KnownZero/KnownOne bits to simplify things downstream.
+ ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth);
return false;
+ }
// If this is the root being simplified, allow it to have multiple uses,
- // just set the Mask to all bits.
- Mask = V->getType()->getIntegralTypeMask();
- } else if (Mask == 0) { // Not demanding any bits from V.
+ // just set the DemandedMask to all bits.
+ DemandedMask = V->getType()->getIntegralTypeMask();
+ } else if (DemandedMask == 0) { // Not demanding any bits from V.
if (V != UndefValue::get(V->getType()))
return UpdateValueUsesWith(V, UndefValue::get(V->getType()));
return false;
Instruction *I = dyn_cast<Instruction>(V);
if (!I) return false; // Only analyze instructions.
+ DemandedMask &= V->getType()->getIntegralTypeMask();
+
+ uint64_t KnownZero2, KnownOne2;
switch (I->getOpcode()) {
default: break;
case Instruction::And:
- if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
- // Only demanding an intersection of the bits.
- if (SimplifyDemandedBits(I->getOperand(0), RHS->getRawValue() & Mask,
- Depth+1))
- return true;
- if (~Mask & RHS->getZExtValue()) {
- // If this is producing any bits that are not needed, simplify the RHS.
- uint64_t Val = Mask & RHS->getZExtValue();
- Constant *RHS =
- ConstantUInt::get(I->getType()->getUnsignedVersion(), Val);
- if (I->getType()->isSigned())
- RHS = ConstantExpr::getCast(RHS, I->getType());
- I->setOperand(1, RHS);
- return UpdateValueUsesWith(I, I);
+ // If either the LHS or the RHS are Zero, the result is zero.
+ if (SimplifyDemandedBits(I->getOperand(1), DemandedMask,
+ KnownZero, KnownOne, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+
+ // If something is known zero on the RHS, the bits aren't demanded on the
+ // LHS.
+ if (SimplifyDemandedBits(I->getOperand(0), DemandedMask & ~KnownZero,
+ KnownZero2, KnownOne2, Depth+1))
+ return true;
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // If all of the demanded bits are known one on one side, return the other.
+ // These bits cannot contribute to the result of the 'and'.
+ if ((DemandedMask & ~KnownZero2 & KnownOne) == (DemandedMask & ~KnownZero2))
+ return UpdateValueUsesWith(I, I->getOperand(0));
+ if ((DemandedMask & ~KnownZero & KnownOne2) == (DemandedMask & ~KnownZero))
+ return UpdateValueUsesWith(I, I->getOperand(1));
+
+ // If all of the demanded bits in the inputs are known zeros, return zero.
+ if ((DemandedMask & (KnownZero|KnownZero2)) == DemandedMask)
+ return UpdateValueUsesWith(I, Constant::getNullValue(I->getType()));
+
+ // If the RHS is a constant, see if we can simplify it.
+ if (ShrinkDemandedConstant(I, 1, DemandedMask & ~KnownZero2))
+ return UpdateValueUsesWith(I, I);
+
+ // Output known-1 bits are only known if set in both the LHS & RHS.
+ KnownOne &= KnownOne2;
+ // Output known-0 are known to be clear if zero in either the LHS | RHS.
+ KnownZero |= KnownZero2;
+ break;
+ case Instruction::Or:
+ if (SimplifyDemandedBits(I->getOperand(1), DemandedMask,
+ KnownZero, KnownOne, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ if (SimplifyDemandedBits(I->getOperand(0), DemandedMask & ~KnownOne,
+ KnownZero2, KnownOne2, Depth+1))
+ return true;
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // If all of the demanded bits are known zero on one side, return the other.
+ // These bits cannot contribute to the result of the 'or'.
+ if ((DemandedMask & ~KnownOne2 & KnownZero) == (DemandedMask & ~KnownOne2))
+ return UpdateValueUsesWith(I, I->getOperand(0));
+ if ((DemandedMask & ~KnownOne & KnownZero2) == (DemandedMask & ~KnownOne))
+ return UpdateValueUsesWith(I, I->getOperand(1));
+
+ // If all of the potentially set bits on one side are known to be set on
+ // the other side, just use the 'other' side.
+ if ((DemandedMask & (~KnownZero) & KnownOne2) ==
+ (DemandedMask & (~KnownZero)))
+ return UpdateValueUsesWith(I, I->getOperand(0));
+ if ((DemandedMask & (~KnownZero2) & KnownOne) ==
+ (DemandedMask & (~KnownZero2)))
+ return UpdateValueUsesWith(I, I->getOperand(1));
+
+ // If the RHS is a constant, see if we can simplify it.
+ if (ShrinkDemandedConstant(I, 1, DemandedMask))
+ return UpdateValueUsesWith(I, I);
+
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ KnownZero &= KnownZero2;
+ // Output known-1 are known to be set if set in either the LHS | RHS.
+ KnownOne |= KnownOne2;
+ break;
+ case Instruction::Xor: {
+ if (SimplifyDemandedBits(I->getOperand(1), DemandedMask,
+ KnownZero, KnownOne, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ if (SimplifyDemandedBits(I->getOperand(0), DemandedMask,
+ KnownZero2, KnownOne2, Depth+1))
+ return true;
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // If all of the demanded bits are known zero on one side, return the other.
+ // These bits cannot contribute to the result of the 'xor'.
+ if ((DemandedMask & KnownZero) == DemandedMask)
+ return UpdateValueUsesWith(I, I->getOperand(0));
+ if ((DemandedMask & KnownZero2) == DemandedMask)
+ return UpdateValueUsesWith(I, I->getOperand(1));
+
+ // Output known-0 bits are known if clear or set in both the LHS & RHS.
+ uint64_t KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
+ // Output known-1 are known to be set if set in only one of the LHS, RHS.
+ uint64_t KnownOneOut = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
+
+ // If all of the unknown bits are known to be zero on one side or the other
+ // (but not both) turn this into an *inclusive* or.
+ // e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
+ if (uint64_t UnknownBits = DemandedMask & ~(KnownZeroOut|KnownOneOut)) {
+ if ((UnknownBits & (KnownZero|KnownZero2)) == UnknownBits) {
+ Instruction *Or =
+ BinaryOperator::createOr(I->getOperand(0), I->getOperand(1),
+ I->getName());
+ InsertNewInstBefore(Or, *I);
+ return UpdateValueUsesWith(I, Or);
}
}
- // Walk the LHS and the RHS.
- return SimplifyDemandedBits(I->getOperand(0), Mask, Depth+1) ||
- SimplifyDemandedBits(I->getOperand(1), Mask, Depth+1);
- case Instruction::Or:
- case Instruction::Xor:
- if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
- // If none of the [x]or'd in bits are demanded, don't both with the [x]or.
- if ((Mask & RHS->getRawValue()) == 0)
- return UpdateValueUsesWith(I, I->getOperand(0));
-
- // Otherwise, for an OR, we only demand those bits not set by the OR.
- if (I->getOpcode() == Instruction::Or)
- Mask &= ~RHS->getRawValue();
- return SimplifyDemandedBits(I->getOperand(0), Mask, Depth+1);
- }
- // Walk the LHS and the RHS.
- return SimplifyDemandedBits(I->getOperand(0), Mask, Depth+1) ||
- SimplifyDemandedBits(I->getOperand(1), Mask, Depth+1);
+
+ // If all of the demanded bits on one side are known, and all of the set
+ // bits on that side are also known to be set on the other side, turn this
+ // into an AND, as we know the bits will be cleared.
+ // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
+ if ((DemandedMask & (KnownZero|KnownOne)) == DemandedMask) { // all known
+ if ((KnownOne & KnownOne2) == KnownOne) {
+ Constant *AndC = GetConstantInType(I->getType(),
+ ~KnownOne & DemandedMask);
+ Instruction *And =
+ BinaryOperator::createAnd(I->getOperand(0), AndC, "tmp");
+ InsertNewInstBefore(And, *I);
+ return UpdateValueUsesWith(I, And);
+ }
+ }
+
+ // If the RHS is a constant, see if we can simplify it.
+ // FIXME: for XOR, we prefer to force bits to 1 if they will make a -1.
+ if (ShrinkDemandedConstant(I, 1, DemandedMask))
+ return UpdateValueUsesWith(I, I);
+
+ KnownZero = KnownZeroOut;
+ KnownOne = KnownOneOut;
+ break;
+ }
+ case Instruction::Select:
+ if (SimplifyDemandedBits(I->getOperand(2), DemandedMask,
+ KnownZero, KnownOne, Depth+1))
+ return true;
+ if (SimplifyDemandedBits(I->getOperand(1), DemandedMask,
+ KnownZero2, KnownOne2, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // If the operands are constants, see if we can simplify them.
+ if (ShrinkDemandedConstant(I, 1, DemandedMask))
+ return UpdateValueUsesWith(I, I);
+ if (ShrinkDemandedConstant(I, 2, DemandedMask))
+ return UpdateValueUsesWith(I, I);
+
+ // Only known if known in both the LHS and RHS.
+ KnownOne &= KnownOne2;
+ KnownZero &= KnownZero2;
+ break;
case Instruction::Cast: {
const Type *SrcTy = I->getOperand(0)->getType();
- if (SrcTy == Type::BoolTy)
- return SimplifyDemandedBits(I->getOperand(0), Mask&1, Depth+1);
+ if (!SrcTy->isIntegral()) return false;
+
+ // If this is an integer truncate or noop, just look in the input.
+ if (SrcTy->getPrimitiveSizeInBits() >=
+ I->getType()->getPrimitiveSizeInBits()) {
+ if (SimplifyDemandedBits(I->getOperand(0), DemandedMask,
+ KnownZero, KnownOne, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ break;
+ }
- if (!SrcTy->isInteger()) return false;
-
- unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
- // If this is a sign-extend, treat specially.
- if (SrcTy->isSigned() &&
- SrcBits < I->getType()->getPrimitiveSizeInBits()) {
- // If none of the top bits are demanded, convert this into an unsigned
- // extend instead of a sign extend.
- if ((Mask & ((1ULL << SrcBits)-1)) == 0) {
+ // Sign or Zero extension. Compute the bits in the result that are not
+ // present in the input.
+ uint64_t NotIn = ~SrcTy->getIntegralTypeMask();
+ uint64_t NewBits = I->getType()->getIntegralTypeMask() & NotIn;
+
+ // Handle zero extension.
+ if (!SrcTy->isSigned()) {
+ DemandedMask &= SrcTy->getIntegralTypeMask();
+ if (SimplifyDemandedBits(I->getOperand(0), DemandedMask,
+ KnownZero, KnownOne, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ // The top bits are known to be zero.
+ KnownZero |= NewBits;
+ } else {
+ // Sign extension.
+ uint64_t InSignBit = 1ULL << (SrcTy->getPrimitiveSizeInBits()-1);
+ int64_t InputDemandedBits = DemandedMask & SrcTy->getIntegralTypeMask();
+
+ // If any of the sign extended bits are demanded, we know that the sign
+ // bit is demanded.
+ if (NewBits & DemandedMask)
+ InputDemandedBits |= InSignBit;
+
+ if (SimplifyDemandedBits(I->getOperand(0), InputDemandedBits,
+ KnownZero, KnownOne, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+
+ // If the sign bit of the input is known set or clear, then we know the
+ // top bits of the result.
+
+ // If the input sign bit is known zero, or if the NewBits are not demanded
+ // convert this into a zero extension.
+ if ((KnownZero & InSignBit) || (NewBits & ~DemandedMask) == NewBits) {
// Convert to unsigned first.
Instruction *NewVal;
NewVal = new CastInst(I->getOperand(0), SrcTy->getUnsignedVersion(),
I->getOperand(0)->getName());
InsertNewInstBefore(NewVal, *I);
+ // Then cast that to the destination type.
NewVal = new CastInst(NewVal, I->getType(), I->getName());
InsertNewInstBefore(NewVal, *I);
return UpdateValueUsesWith(I, NewVal);
+ } else if (KnownOne & InSignBit) { // Input sign bit known set
+ KnownOne |= NewBits;
+ KnownZero &= ~NewBits;
+ } else { // Input sign bit unknown
+ KnownZero &= ~NewBits;
+ KnownOne &= ~NewBits;
}
-
- // Otherwise, the high-bits *are* demanded. This means that the code
- // implicitly demands computation of the sign bit of the input, make sure
- // we explicitly include it in Mask.
- Mask |= 1ULL << (SrcBits-1);
}
-
- // If this is an extension, the top bits are ignored.
- Mask &= SrcTy->getIntegralTypeMask();
- return SimplifyDemandedBits(I->getOperand(0), Mask, Depth+1);
+ break;
}
- case Instruction::Select:
- // Simplify the T and F values if they are not demanded.
- return SimplifyDemandedBits(I->getOperand(2), Mask, Depth+1) ||
- SimplifyDemandedBits(I->getOperand(1), Mask, Depth+1);
case Instruction::Shl:
- // We only demand the low bits of the input.
- if (ConstantUInt *SA = dyn_cast<ConstantUInt>(I->getOperand(1)))
- return SimplifyDemandedBits(I->getOperand(0), Mask >> SA->getValue(),
- Depth+1);
+ if (ConstantUInt *SA = dyn_cast<ConstantUInt>(I->getOperand(1))) {
+ if (SimplifyDemandedBits(I->getOperand(0), DemandedMask >> SA->getValue(),
+ KnownZero, KnownOne, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero <<= SA->getValue();
+ KnownOne <<= SA->getValue();
+ KnownZero |= (1ULL << SA->getValue())-1; // low bits known zero.
+ }
break;
case Instruction::Shr:
- // We only demand the high bits of the input.
- if (I->getType()->isUnsigned())
- if (ConstantUInt *SA = dyn_cast<ConstantUInt>(I->getOperand(1))) {
- Mask <<= SA->getValue();
- Mask &= I->getType()->getIntegralTypeMask();
- return SimplifyDemandedBits(I->getOperand(0), Mask, Depth+1);
+ if (ConstantUInt *SA = dyn_cast<ConstantUInt>(I->getOperand(1))) {
+ unsigned ShAmt = SA->getValue();
+
+ // Compute the new bits that are at the top now.
+ uint64_t HighBits = (1ULL << ShAmt)-1;
+ HighBits <<= I->getType()->getPrimitiveSizeInBits() - ShAmt;
+ uint64_t TypeMask = I->getType()->getIntegralTypeMask();
+ if (I->getType()->isUnsigned()) { // Unsigned shift right.
+ if (SimplifyDemandedBits(I->getOperand(0),
+ (DemandedMask << ShAmt) & TypeMask,
+ KnownZero, KnownOne, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero &= TypeMask;
+ KnownOne &= TypeMask;
+ KnownZero >>= ShAmt;
+ KnownOne >>= ShAmt;
+ KnownZero |= HighBits; // high bits known zero.
+ } else { // Signed shift right.
+ if (SimplifyDemandedBits(I->getOperand(0),
+ (DemandedMask << ShAmt) & TypeMask,
+ KnownZero, KnownOne, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero &= TypeMask;
+ KnownOne &= TypeMask;
+ KnownZero >>= SA->getValue();
+ KnownOne >>= SA->getValue();
+
+ // Handle the sign bits.
+ uint64_t SignBit = 1ULL << (I->getType()->getPrimitiveSizeInBits()-1);
+ SignBit >>= SA->getValue(); // Adjust to where it is now in the mask.
+
+ // If the input sign bit is known to be zero, or if none of the top bits
+ // are demanded, turn this into an unsigned shift right.
+ if ((KnownZero & SignBit) || (HighBits & ~DemandedMask) == HighBits) {
+ // Convert the input to unsigned.
+ Instruction *NewVal;
+ NewVal = new CastInst(I->getOperand(0),
+ I->getType()->getUnsignedVersion(),
+ I->getOperand(0)->getName());
+ InsertNewInstBefore(NewVal, *I);
+ // Perform the unsigned shift right.
+ NewVal = new ShiftInst(Instruction::Shr, NewVal, SA, I->getName());
+ InsertNewInstBefore(NewVal, *I);
+ // Then cast that to the destination type.
+ NewVal = new CastInst(NewVal, I->getType(), I->getName());
+ InsertNewInstBefore(NewVal, *I);
+ return UpdateValueUsesWith(I, NewVal);
+ } else if (KnownOne & SignBit) { // New bits are known one.
+ KnownOne |= HighBits;
+ }
}
- // FIXME: handle signed shr, demanding the appropriate bits. If the top
- // bits aren't demanded, strength reduce to a logical SHR instead.
+ }
break;
}
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if ((DemandedMask & (KnownZero|KnownOne)) == DemandedMask)
+ return UpdateValueUsesWith(I, GetConstantInType(I->getType(), KnownOne));
return false;
}
if (Op1F->getValue() == 1.0)
return ReplaceInstUsesWith(I, Op0); // Eliminate 'mul double %X, 1.0'
}
+
+ if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0))
+ if (Op0I->getOpcode() == Instruction::Add && Op0I->hasOneUse() &&
+ isa<ConstantInt>(Op0I->getOperand(1))) {
+ // Canonicalize (X+C1)*C2 -> X*C2+C1*C2.
+ Instruction *Add = BinaryOperator::createMul(Op0I->getOperand(0),
+ Op1, "tmp");
+ InsertNewInstBefore(Add, I);
+ Value *C1C2 = ConstantExpr::getMul(Op1,
+ cast<Constant>(Op0I->getOperand(1)));
+ return BinaryOperator::createAdd(Add, C1C2);
+
+ }
// Try to fold constant mul into select arguments.
if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
}
+/// GetFactor - If we can prove that the specified value is at least a multiple
+/// of some factor, return that factor.
+static Constant *GetFactor(Value *V) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V))
+ return CI;
+
+ // Unless we can be tricky, we know this is a multiple of 1.
+ Constant *Result = ConstantInt::get(V->getType(), 1);
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return Result;
+
+ if (I->getOpcode() == Instruction::Mul) {
+ // Handle multiplies by a constant, etc.
+ return ConstantExpr::getMul(GetFactor(I->getOperand(0)),
+ GetFactor(I->getOperand(1)));
+ } else if (I->getOpcode() == Instruction::Shl) {
+ // (X<<C) -> X * (1 << C)
+ if (Constant *ShRHS = dyn_cast<Constant>(I->getOperand(1))) {
+ ShRHS = ConstantExpr::getShl(Result, ShRHS);
+ return ConstantExpr::getMul(GetFactor(I->getOperand(0)), ShRHS);
+ }
+ } else if (I->getOpcode() == Instruction::And) {
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ // X & 0xFFF0 is known to be a multiple of 16.
+ unsigned Zeros = CountTrailingZeros_64(RHS->getZExtValue());
+ if (Zeros != V->getType()->getPrimitiveSizeInBits())
+ return ConstantExpr::getShl(Result,
+ ConstantUInt::get(Type::UByteTy, Zeros));
+ }
+ } else if (I->getOpcode() == Instruction::Cast) {
+ Value *Op = I->getOperand(0);
+ // Only handle int->int casts.
+ if (!Op->getType()->isInteger()) return Result;
+ return ConstantExpr::getCast(GetFactor(Op), V->getType());
+ }
+ return Result;
+}
+
Instruction *InstCombiner::visitRem(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // 0 % X == 0, we don't need to preserve faults!
+ if (Constant *LHS = dyn_cast<Constant>(Op0))
+ if (LHS->isNullValue())
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+ if (isa<UndefValue>(Op0)) // undef % X -> 0
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+ if (isa<UndefValue>(Op1))
+ return ReplaceInstUsesWith(I, Op1); // X % undef -> undef
+
if (I.getType()->isSigned()) {
if (Value *RHSNeg = dyn_castNegVal(Op1))
if (!isa<ConstantSInt>(RHSNeg) ||
}
}
- if (isa<UndefValue>(Op0)) // undef % X -> 0
- return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
- if (isa<UndefValue>(Op1))
- return ReplaceInstUsesWith(I, Op1); // X % undef -> undef
-
if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+ // X % 0 == undef, we don't need to preserve faults!
+ if (RHS->equalsInt(0))
+ return ReplaceInstUsesWith(I, UndefValue::get(I.getType()));
+
if (RHS->equalsInt(1)) // X % 1 == 0
return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
// Check to see if this is an unsigned remainder with an exact power of 2,
// if so, convert to a bitwise and.
if (ConstantUInt *C = dyn_cast<ConstantUInt>(RHS))
- if (uint64_t Val = C->getValue()) // Don't break X % 0 (divide by zero)
- if (!(Val & (Val-1))) // Power of 2
- return BinaryOperator::createAnd(Op0,
- ConstantUInt::get(I.getType(), Val-1));
+ if (isPowerOf2_64(C->getValue()))
+ return BinaryOperator::createAnd(Op0, SubOne(C));
- if (!RHS->isNullValue()) {
- if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+ if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) {
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) {
if (Instruction *R = FoldOpIntoSelect(I, SI, this))
return R;
- if (isa<PHINode>(Op0))
+ } else if (isa<PHINode>(Op0I)) {
if (Instruction *NV = FoldOpIntoPhi(I))
return NV;
+ }
+
+ // X*C1%C2 --> 0 iff C1%C2 == 0
+ if (ConstantExpr::getRem(GetFactor(Op0I), RHS)->isNullValue())
+ return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
}
}
- // If this is 'urem X, (Cond ? C1, C2)' where C1&C2 are powers of two,
- // transform this into: '(Cond ? (urem X, C1) : (urem X, C2))'.
- if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
- if (ConstantUInt *STO = dyn_cast<ConstantUInt>(SI->getOperand(1)))
- if (ConstantUInt *SFO = dyn_cast<ConstantUInt>(SI->getOperand(2))) {
- if (STO->getValue() == 0) { // Couldn't be this argument.
- I.setOperand(1, SFO);
- return &I;
- } else if (SFO->getValue() == 0) {
- I.setOperand(1, STO);
- return &I;
- }
-
- if (!(STO->getValue() & (STO->getValue()-1)) &&
- !(SFO->getValue() & (SFO->getValue()-1))) {
- Value *TrueAnd = InsertNewInstBefore(BinaryOperator::createAnd(Op0,
- SubOne(STO), SI->getName()+".t"), I);
- Value *FalseAnd = InsertNewInstBefore(BinaryOperator::createAnd(Op0,
- SubOne(SFO), SI->getName()+".f"), I);
- return new SelectInst(SI->getOperand(0), TrueAnd, FalseAnd);
- }
- }
-
- // 0 % X == 0, we don't need to preserve faults!
- if (ConstantInt *LHS = dyn_cast<ConstantInt>(Op0))
- if (LHS->equalsInt(0))
- return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
-
-
if (Instruction *RHSI = dyn_cast<Instruction>(I.getOperand(1))) {
// Turn A % (C << N), where C is 2^k, into A & ((C << N)-1) [urem only].
if (I.getType()->isUnsigned() &&
return BinaryOperator::createAnd(Op0, Add);
}
}
+
+ // If this is 'urem X, (Cond ? C1 : C2)' where C1&C2 are powers of two,
+ // transform this into: '(Cond ? (urem X, C1) : (urem X, C2))'.
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ if (ConstantUInt *STO = dyn_cast<ConstantUInt>(SI->getOperand(1)))
+ if (ConstantUInt *SFO = dyn_cast<ConstantUInt>(SI->getOperand(2))) {
+ if (STO->getValue() == 0) { // Couldn't be this argument.
+ I.setOperand(1, SFO);
+ return &I;
+ } else if (SFO->getValue() == 0) {
+ I.setOperand(1, STO);
+ return &I;
+ }
+
+ if (isPowerOf2_64(STO->getValue()) && isPowerOf2_64(SFO->getValue())){
+ Value *TrueAnd = InsertNewInstBefore(BinaryOperator::createAnd(Op0,
+ SubOne(STO), SI->getName()+".t"), I);
+ Value *FalseAnd = InsertNewInstBefore(BinaryOperator::createAnd(Op0,
+ SubOne(SFO), SI->getName()+".f"), I);
+ return new SelectInst(SI->getOperand(0), TrueAnd, FalseAnd);
+ }
+ }
}
return 0;
if (Op0 == Op1)
return ReplaceInstUsesWith(I, Op1);
- // See if we can simplify any instructions used by the LHS whose sole
+ // See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
- if (SimplifyDemandedBits(&I, I.getType()->getIntegralTypeMask()))
+ uint64_t KnownZero, KnownOne;
+ if (!isa<PackedType>(I.getType()) &&
+ SimplifyDemandedBits(&I, I.getType()->getIntegralTypeMask(),
+ KnownZero, KnownOne))
return &I;
if (ConstantIntegral *AndRHS = dyn_cast<ConstantIntegral>(Op1)) {
uint64_t AndRHSMask = AndRHS->getZExtValue();
uint64_t TypeMask = Op0->getType()->getIntegralTypeMask();
-
- if (AndRHSMask == TypeMask) // and X, -1 == X
- return ReplaceInstUsesWith(I, Op0);
- else if (AndRHSMask == 0) // and X, 0 == 0
- return ReplaceInstUsesWith(I, AndRHS);
-
- // and (and X, c1), c2 -> and (x, c1&c2). Handle this case here, before
- // calling ComputeMaskedNonZeroBits, to avoid inefficient cases where we
- // traipse through many levels of ands.
- {
- Value *X = 0; ConstantInt *C1 = 0;
- if (match(Op0, m_And(m_Value(X), m_ConstantInt(C1))))
- return BinaryOperator::createAnd(X, ConstantExpr::getAnd(C1, AndRHS));
- }
-
- // Figure out which of the input bits are not known to be zero, and which
- // bits are known to be zero.
- uint64_t KnownZeroBits, KnownOneBits;
- ComputeMaskedBits(Op0, TypeMask, KnownZeroBits, KnownOneBits);
-
- // If the mask is not masking out any bits (i.e. all of the zeros in the
- // mask are already known to be zero), there is no reason to do the and in
- // the first place.
uint64_t NotAndRHS = AndRHSMask^TypeMask;
- if ((NotAndRHS & KnownZeroBits) == NotAndRHS)
- return ReplaceInstUsesWith(I, Op0);
-
- // If the AND'd bits are all known, turn this AND into a constant.
- if ((AndRHSMask & (KnownOneBits|KnownZeroBits)) == AndRHSMask) {
- Constant *NewRHS = ConstantUInt::get(Type::ULongTy,
- AndRHSMask & KnownOneBits);
- return ReplaceInstUsesWith(I, ConstantExpr::getCast(NewRHS, I.getType()));
- }
-
- // If the AND mask contains bits that are known zero, remove them. A
- // special case is when there are no bits in common, in which case we
- // implicitly turn this into an AND X, 0, which is later simplified into 0.
- if ((AndRHSMask & ~KnownZeroBits) != AndRHSMask) {
- Constant *NewRHS =
- ConstantUInt::get(Type::ULongTy, AndRHSMask & ~KnownZeroBits);
- I.setOperand(1, ConstantExpr::getCast(NewRHS, I.getType()));
- return &I;
- }
// Optimize a variety of ((val OP C1) & C2) combinations...
if (isa<BinaryOperator>(Op0) || isa<ShiftInst>(Op0)) {
switch (Op0I->getOpcode()) {
case Instruction::Xor:
case Instruction::Or:
- // (X ^ V) & C2 --> (X & C2) iff (V & C2) == 0
- // (X | V) & C2 --> (X & C2) iff (V & C2) == 0
- if (MaskedValueIsZero(Op0LHS, AndRHSMask))
- return BinaryOperator::createAnd(Op0RHS, AndRHS);
- if (MaskedValueIsZero(Op0RHS, AndRHSMask))
- return BinaryOperator::createAnd(Op0LHS, AndRHS);
-
// If the mask is only needed on one incoming arm, push it up.
if (Op0I->hasOneUse()) {
if (MaskedValueIsZero(Op0LHS, NotAndRHS)) {
}
break;
- case Instruction::And:
- // (X & V) & C2 --> 0 iff (V & C2) == 0
- if (MaskedValueIsZero(Op0LHS, AndRHSMask) ||
- MaskedValueIsZero(Op0RHS, AndRHSMask))
- return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
- break;
case Instruction::Add:
// ((A & N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == AndRHS.
// ((A | N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0
return ReplaceInstUsesWith(I, AndRHS);
}
}
-
-
- // If this is an integer sign or zero extension instruction.
- if (SrcTy->isIntegral() &&
- SrcTy->getPrimitiveSizeInBits() <
- CI->getType()->getPrimitiveSizeInBits()) {
-
- if (SrcTy->isUnsigned()) {
- // See if this and is clearing out bits that are known to be zero
- // anyway (due to the zero extension).
- Constant *Mask = ConstantIntegral::getAllOnesValue(SrcTy);
- Mask = ConstantExpr::getZeroExtend(Mask, CI->getType());
- Constant *Result = ConstantExpr::getAnd(Mask, AndRHS);
- if (Result == Mask) // The "and" isn't doing anything, remove it.
- return ReplaceInstUsesWith(I, CI);
- if (Result != AndRHS) { // Reduce the and RHS constant.
- I.setOperand(1, Result);
- return &I;
- }
-
- } else {
- if (CI->hasOneUse() && SrcTy->isInteger()) {
- // We can only do this if all of the sign bits brought in are masked
- // out. Compute this by first getting 0000011111, then inverting
- // it.
- Constant *Mask = ConstantIntegral::getAllOnesValue(SrcTy);
- Mask = ConstantExpr::getZeroExtend(Mask, CI->getType());
- Mask = ConstantExpr::getNot(Mask); // 1's in the new bits.
- if (ConstantExpr::getAnd(Mask, AndRHS)->isNullValue()) {
- // If the and is clearing all of the sign bits, change this to a
- // zero extension cast. To do this, cast the cast input to
- // unsigned, then to the requested size.
- Value *CastOp = CI->getOperand(0);
- Instruction *NC =
- new CastInst(CastOp, CastOp->getType()->getUnsignedVersion(),
- CI->getName()+".uns");
- NC = InsertNewInstBefore(NC, I);
- // Finally, insert a replacement for CI.
- NC = new CastInst(NC, CI->getType(), CI->getName());
- CI->setName("");
- NC = InsertNewInstBefore(NC, I);
- WorkList.push_back(CI); // Delete CI later.
- I.setOperand(0, NC);
- return &I; // The AND operand was modified.
- }
- }
- }
- }
}
// Try to fold constant and into select arguments.
InsertNewInstBefore(Or, I);
return BinaryOperator::createNot(Or);
}
+
+ {
+ Value *A = 0, *B = 0;
+ ConstantInt *C1 = 0, *C2 = 0;
+ if (match(Op0, m_Or(m_Value(A), m_Value(B))))
+ if (A == Op1 || B == Op1) // (A | ?) & A --> A
+ return ReplaceInstUsesWith(I, Op1);
+ if (match(Op1, m_Or(m_Value(A), m_Value(B))))
+ if (A == Op0 || B == Op0) // A & (A | ?) --> A
+ return ReplaceInstUsesWith(I, Op0);
+
+ if (Op0->hasOneUse() &&
+ match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
+ if (A == Op1) { // (A^B)&A -> A&(A^B)
+ I.swapOperands(); // Simplify below
+ std::swap(Op0, Op1);
+ } else if (B == Op1) { // (A^B)&B -> B&(B^A)
+ cast<BinaryOperator>(Op0)->swapOperands();
+ I.swapOperands(); // Simplify below
+ std::swap(Op0, Op1);
+ }
+ }
+ if (Op1->hasOneUse() &&
+ match(Op1, m_Xor(m_Value(A), m_Value(B)))) {
+ if (B == Op0) { // B&(A^B) -> B&(B^A)
+ cast<BinaryOperator>(Op1)->swapOperands();
+ std::swap(A, B);
+ }
+ if (A == Op0) { // A&(A^B) -> A & ~B
+ Instruction *NotB = BinaryOperator::createNot(B, "tmp");
+ InsertNewInstBefore(NotB, I);
+ return BinaryOperator::createAnd(A, NotB);
+ }
+ }
+ }
+
if (SetCondInst *RHS = dyn_cast<SetCondInst>(Op1)) {
// (setcc1 A, B) & (setcc2 A, B) --> (setcc3 A, B)
}
}
+ // fold (and (cast A), (cast B)) -> (cast (and A, B))
+ if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
+ if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
+ if (Op0C->getOperand(0)->getType() == Op1C->getOperand(0)->getType() &&
+ Op0C->getOperand(0)->getType()->isIntegral()) {
+ Instruction *NewOp = BinaryOperator::createAnd(Op0C->getOperand(0),
+ Op1C->getOperand(0),
+ I.getName());
+ InsertNewInstBefore(NewOp, I);
+ return new CastInst(NewOp, I.getType());
+ }
+ }
+
return Changed ? &I : 0;
}
return ReplaceInstUsesWith(I, // X | undef -> -1
ConstantIntegral::getAllOnesValue(I.getType()));
- // or X, X = X or X, 0 == X
- if (Op0 == Op1 || Op1 == Constant::getNullValue(I.getType()))
+ // or X, X = X
+ if (Op0 == Op1)
return ReplaceInstUsesWith(I, Op0);
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ uint64_t KnownZero, KnownOne;
+ if (!isa<PackedType>(I.getType()) &&
+ SimplifyDemandedBits(&I, I.getType()->getIntegralTypeMask(),
+ KnownZero, KnownOne))
+ return &I;
+
// or X, -1 == -1
if (ConstantIntegral *RHS = dyn_cast<ConstantIntegral>(Op1)) {
- // If X is known to only contain bits that already exist in RHS, just
- // replace this instruction with RHS directly.
- if (MaskedValueIsZero(Op0,
- RHS->getZExtValue()^RHS->getType()->getIntegralTypeMask()))
- return ReplaceInstUsesWith(I, RHS);
-
ConstantInt *C1 = 0; Value *X = 0;
// (X & C1) | C2 --> (X | C2) & (C1|C2)
if (match(Op0, m_And(m_Value(X), m_ConstantInt(C1))) && isOnlyUse(Op0)) {
}
}
}
+
+ // fold (or (cast A), (cast B)) -> (cast (or A, B))
+ if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
+ if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
+ if (Op0C->getOperand(0)->getType() == Op1C->getOperand(0)->getType() &&
+ Op0C->getOperand(0)->getType()->isIntegral()) {
+ Instruction *NewOp = BinaryOperator::createOr(Op0C->getOperand(0),
+ Op1C->getOperand(0),
+ I.getName());
+ InsertNewInstBefore(NewOp, I);
+ return new CastInst(NewOp, I.getType());
+ }
+ }
+
return Changed ? &I : 0;
}
assert(Result == &I && "AssociativeOpt didn't work?");
return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
}
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ uint64_t KnownZero, KnownOne;
+ if (!isa<PackedType>(I.getType()) &&
+ SimplifyDemandedBits(&I, I.getType()->getIntegralTypeMask(),
+ KnownZero, KnownOne))
+ return &I;
- if (ConstantIntegral *RHS = dyn_cast<ConstantIntegral>(Op1)) {
- // xor X, 0 == X
- if (RHS->isNullValue())
- return ReplaceInstUsesWith(I, Op0);
-
+ if (ConstantIntegral *RHS = dyn_cast<ConstantIntegral>(Op1)) {
if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
// xor (setcc A, B), true = not (setcc A, B) = setncc A, B
if (SetCondInst *SCI = dyn_cast<SetCondInst>(Op0I))
}
if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1)))
- switch (Op0I->getOpcode()) {
- case Instruction::Add:
+ if (Op0I->getOpcode() == Instruction::Add) {
// ~(X-c) --> (-c-1)-X
if (RHS->isAllOnesValue()) {
Constant *NegOp0CI = ConstantExpr::getNeg(Op0CI);
ConstantInt::get(I.getType(), 1)),
Op0I->getOperand(0));
}
- break;
- case Instruction::And:
- // (X & C1) ^ C2 --> (X & C1) | C2 iff (C1&C2) == 0
- if (ConstantExpr::getAnd(RHS, Op0CI)->isNullValue())
- return BinaryOperator::createOr(Op0, RHS);
- break;
- case Instruction::Or:
- // (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
- if (ConstantExpr::getAnd(RHS, Op0CI) == RHS)
- return BinaryOperator::createAnd(Op0, ConstantExpr::getNot(RHS));
- break;
- default: break;
+ } else if (Op0I->getOpcode() == Instruction::Or) {
+ // (X|C1)^C2 -> X^(C1|C2) iff X&~C1 == 0
+ if (MaskedValueIsZero(Op0I->getOperand(0), Op0CI->getZExtValue())) {
+ Constant *NewRHS = ConstantExpr::getOr(Op0CI, RHS);
+ // Anything in both C1 and C2 is known to be zero, remove it from
+ // NewRHS.
+ Constant *CommonBits = ConstantExpr::getAnd(Op0CI, RHS);
+ NewRHS = ConstantExpr::getAnd(NewRHS,
+ ConstantExpr::getNot(CommonBits));
+ WorkList.push_back(Op0I);
+ I.setOperand(0, Op0I->getOperand(0));
+ I.setOperand(1, NewRHS);
+ return &I;
+ }
}
}
return ReplaceInstUsesWith(I,
ConstantIntegral::getAllOnesValue(I.getType()));
- if (Instruction *Op1I = dyn_cast<Instruction>(Op1))
+ if (BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1))
if (Op1I->getOpcode() == Instruction::Or) {
if (Op1I->getOperand(0) == Op0) { // B^(B|A) == (A|B)^B
- cast<BinaryOperator>(Op1I)->swapOperands();
+ Op1I->swapOperands();
I.swapOperands();
std::swap(Op0, Op1);
} else if (Op1I->getOperand(1) == Op0) { // B^(A|B) == (A|B)^B
- I.swapOperands();
+ I.swapOperands(); // Simplified below.
std::swap(Op0, Op1);
}
} else if (Op1I->getOpcode() == Instruction::Xor) {
return ReplaceInstUsesWith(I, Op1I->getOperand(1));
else if (Op0 == Op1I->getOperand(1)) // A^(B^A) == B
return ReplaceInstUsesWith(I, Op1I->getOperand(0));
+ } else if (Op1I->getOpcode() == Instruction::And && Op1I->hasOneUse()) {
+ if (Op1I->getOperand(0) == Op0) // A^(A&B) -> A^(B&A)
+ Op1I->swapOperands();
+ if (Op0 == Op1I->getOperand(1)) { // A^(B&A) -> (B&A)^A
+ I.swapOperands(); // Simplified below.
+ std::swap(Op0, Op1);
+ }
}
- if (Instruction *Op0I = dyn_cast<Instruction>(Op0))
+ if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0))
if (Op0I->getOpcode() == Instruction::Or && Op0I->hasOneUse()) {
if (Op0I->getOperand(0) == Op1) // (B|A)^B == (A|B)^B
- cast<BinaryOperator>(Op0I)->swapOperands();
+ Op0I->swapOperands();
if (Op0I->getOperand(1) == Op1) { // (A|B)^B == A & ~B
- Value *NotB = InsertNewInstBefore(BinaryOperator::createNot(Op1,
- Op1->getName()+".not"), I);
+ Instruction *NotB = BinaryOperator::createNot(Op1, "tmp");
+ InsertNewInstBefore(NotB, I);
return BinaryOperator::createAnd(Op0I->getOperand(0), NotB);
}
} else if (Op0I->getOpcode() == Instruction::Xor) {
return ReplaceInstUsesWith(I, Op0I->getOperand(1));
else if (Op1 == Op0I->getOperand(1)) // (B^A)^A == B
return ReplaceInstUsesWith(I, Op0I->getOperand(0));
+ } else if (Op0I->getOpcode() == Instruction::And && Op0I->hasOneUse()) {
+ if (Op0I->getOperand(0) == Op1) // (A&B)^A -> (B&A)^A
+ Op0I->swapOperands();
+ if (Op0I->getOperand(1) == Op1 && // (B&A)^A == ~B & A
+ !isa<ConstantInt>(Op1)) { // Canonical form is (B&C)^C
+ Instruction *N = BinaryOperator::createNot(Op0I->getOperand(0), "tmp");
+ InsertNewInstBefore(N, I);
+ return BinaryOperator::createAnd(N, Op1);
+ }
}
- // (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
- ConstantInt *C1 = 0, *C2 = 0;
- if (match(Op0, m_And(m_Value(), m_ConstantInt(C1))) &&
- match(Op1, m_And(m_Value(), m_ConstantInt(C2))) &&
- ConstantExpr::getAnd(C1, C2)->isNullValue())
- return BinaryOperator::createOr(Op0, Op1);
-
// (setcc1 A, B) ^ (setcc2 A, B) --> (setcc3 A, B)
if (SetCondInst *RHS = dyn_cast<SetCondInst>(I.getOperand(1)))
if (Instruction *R = AssociativeOpt(I, FoldSetCCLogical(*this, RHS)))
return R;
+ // fold (xor (cast A), (cast B)) -> (cast (xor A, B))
+ if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
+ if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
+ if (Op0C->getOperand(0)->getType() == Op1C->getOperand(0)->getType() &&
+ Op0C->getOperand(0)->getType()->isIntegral()) {
+ Instruction *NewOp = BinaryOperator::createXor(Op0C->getOperand(0),
+ Op1C->getOperand(0),
+ I.getName());
+ InsertNewInstBefore(NewOp, I);
+ return new CastInst(NewOp, I.getType());
+ }
+ }
+
return Changed ? &I : 0;
}
if (I.getOpcode() == Instruction::SetGE)
return BinaryOperator::createSetGT(Op0, SubOne(CI));
+
+ // See if we can fold the comparison based on bits known to be zero or one
+ // in the input.
+ uint64_t KnownZero, KnownOne;
+ if (SimplifyDemandedBits(Op0, Ty->getIntegralTypeMask(),
+ KnownZero, KnownOne, 0))
+ return &I;
+
+ // Given the known and unknown bits, compute a range that the LHS could be
+ // in.
+ if (KnownOne | KnownZero) {
+ if (Ty->isUnsigned()) { // Unsigned comparison.
+ uint64_t Min, Max;
+ uint64_t RHSVal = CI->getZExtValue();
+ ComputeUnsignedMinMaxValuesFromKnownBits(Ty, KnownZero, KnownOne,
+ Min, Max);
+ switch (I.getOpcode()) { // LE/GE have been folded already.
+ default: assert(0 && "Unknown setcc opcode!");
+ case Instruction::SetEQ:
+ if (Max < RHSVal || Min > RHSVal)
+ return ReplaceInstUsesWith(I, ConstantBool::False);
+ break;
+ case Instruction::SetNE:
+ if (Max < RHSVal || Min > RHSVal)
+ return ReplaceInstUsesWith(I, ConstantBool::True);
+ break;
+ case Instruction::SetLT:
+ if (Max < RHSVal) return ReplaceInstUsesWith(I, ConstantBool::True);
+ if (Min > RHSVal) return ReplaceInstUsesWith(I, ConstantBool::False);
+ break;
+ case Instruction::SetGT:
+ if (Min > RHSVal) return ReplaceInstUsesWith(I, ConstantBool::True);
+ if (Max < RHSVal) return ReplaceInstUsesWith(I, ConstantBool::False);
+ break;
+ }
+ } else { // Signed comparison.
+ int64_t Min, Max;
+ int64_t RHSVal = CI->getSExtValue();
+ ComputeSignedMinMaxValuesFromKnownBits(Ty, KnownZero, KnownOne,
+ Min, Max);
+ switch (I.getOpcode()) { // LE/GE have been folded already.
+ default: assert(0 && "Unknown setcc opcode!");
+ case Instruction::SetEQ:
+ if (Max < RHSVal || Min > RHSVal)
+ return ReplaceInstUsesWith(I, ConstantBool::False);
+ break;
+ case Instruction::SetNE:
+ if (Max < RHSVal || Min > RHSVal)
+ return ReplaceInstUsesWith(I, ConstantBool::True);
+ break;
+ case Instruction::SetLT:
+ if (Max < RHSVal) return ReplaceInstUsesWith(I, ConstantBool::True);
+ if (Min > RHSVal) return ReplaceInstUsesWith(I, ConstantBool::False);
+ break;
+ case Instruction::SetGT:
+ if (Min > RHSVal) return ReplaceInstUsesWith(I, ConstantBool::True);
+ if (Max < RHSVal) return ReplaceInstUsesWith(I, ConstantBool::False);
+ break;
+ }
+ }
+ }
+
+
if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
switch (LHSI->getOpcode()) {
case Instruction::And:
// happens a LOT in code produced by the C front-end, for bitfield
// access.
ShiftInst *Shift = dyn_cast<ShiftInst>(LHSI->getOperand(0));
+ ConstantInt *AndCST = cast<ConstantInt>(LHSI->getOperand(1));
+
+ // Check to see if there is a noop-cast between the shift and the and.
+ if (!Shift) {
+ if (CastInst *CI = dyn_cast<CastInst>(LHSI->getOperand(0)))
+ if (CI->getOperand(0)->getType()->isIntegral() &&
+ CI->getOperand(0)->getType()->getPrimitiveSizeInBits() ==
+ CI->getType()->getPrimitiveSizeInBits())
+ Shift = dyn_cast<ShiftInst>(CI->getOperand(0));
+ }
+
ConstantUInt *ShAmt;
ShAmt = Shift ? dyn_cast<ConstantUInt>(Shift->getOperand(1)) : 0;
- ConstantInt *AndCST = cast<ConstantInt>(LHSI->getOperand(1));
- const Type *Ty = LHSI->getType();
+ const Type *Ty = Shift ? Shift->getType() : 0; // Type of the shift.
+ const Type *AndTy = AndCST->getType(); // Type of the and.
// We can fold this as long as we can't shift unknown bits
// into the mask. This can only happen with signed shift
// rights, as they sign-extend.
if (ShAmt) {
bool CanFold = Shift->getOpcode() != Instruction::Shr ||
- Shift->getType()->isUnsigned();
+ Ty->isUnsigned();
if (!CanFold) {
// To test for the bad case of the signed shr, see if any
// of the bits shifted in could be tested after the mask.
Constant *OShAmt = ConstantUInt::get(Type::UByteTy, ShAmtVal);
Constant *ShVal =
- ConstantExpr::getShl(ConstantInt::getAllOnesValue(Ty), OShAmt);
+ ConstantExpr::getShl(ConstantInt::getAllOnesValue(AndTy),
+ OShAmt);
if (ConstantExpr::getAnd(ShVal, AndCST)->isNullValue())
CanFold = true;
}
else
NewAndCST = ConstantExpr::getShl(AndCST, ShAmt);
LHSI->setOperand(1, NewAndCST);
- LHSI->setOperand(0, Shift->getOperand(0));
+ if (AndTy == Ty)
+ LHSI->setOperand(0, Shift->getOperand(0));
+ else {
+ Value *NewCast = InsertCastBefore(Shift->getOperand(0), AndTy,
+ *Shift);
+ LHSI->setOperand(0, NewCast);
+ }
WorkList.push_back(Shift); // Shift is dead.
AddUsesToWorkList(I);
return &I;
if (Instruction *R = visitSetCondInstWithCastAndCast(I))
return R;
}
+
+ if (I.getOpcode() == Instruction::SetNE ||
+ I.getOpcode() == Instruction::SetEQ) {
+ Value *A, *B;
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
+ (A == Op1 || B == Op1)) {
+ // (A^B) == A -> B == 0
+ Value *OtherVal = A == Op1 ? B : A;
+ return BinaryOperator::create(I.getOpcode(), OtherVal,
+ Constant::getNullValue(A->getType()));
+ } else if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
+ (A == Op0 || B == Op0)) {
+ // A == (A^B) -> B == 0
+ Value *OtherVal = A == Op0 ? B : A;
+ return BinaryOperator::create(I.getOpcode(), OtherVal,
+ Constant::getNullValue(A->getType()));
+ } else if (match(Op0, m_Sub(m_Value(A), m_Value(B))) && A == Op1) {
+ // (A-B) == A -> B == 0
+ return BinaryOperator::create(I.getOpcode(), B,
+ Constant::getNullValue(B->getType()));
+ } else if (match(Op1, m_Sub(m_Value(A), m_Value(B))) && A == Op0) {
+ // A == (A-B) -> B == 0
+ return BinaryOperator::create(I.getOpcode(), B,
+ Constant::getNullValue(B->getType()));
+ }
+ }
return Changed ? &I : 0;
}
if (Op1 == Constant::getNullValue(Type::UByteTy) ||
Op0 == Constant::getNullValue(Op0->getType()))
return ReplaceInstUsesWith(I, Op0);
-
+
if (isa<UndefValue>(Op0)) { // undef >>s X -> undef
if (!isLeftShift && I.getType()->isSigned())
return ReplaceInstUsesWith(I, Op0);
bool isSignedShift = Op0->getType()->isSigned();
bool isUnsignedShift = !isSignedShift;
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ uint64_t KnownZero, KnownOne;
+ if (SimplifyDemandedBits(&I, I.getType()->getIntegralTypeMask(),
+ KnownZero, KnownOne))
+ return &I;
+
// shl uint X, 32 = 0 and shr ubyte Y, 9 = 0, ... just don't eliminate shr
// of a signed value.
//
return NV;
if (Op0->hasOneUse()) {
- // If this is a SHL of a sign-extending cast, see if we can turn the input
- // into a zero extending cast (a simple strength reduction).
- if (CastInst *CI = dyn_cast<CastInst>(Op0)) {
- const Type *SrcTy = CI->getOperand(0)->getType();
- if (isLeftShift && SrcTy->isInteger() && SrcTy->isSigned() &&
- SrcTy->getPrimitiveSizeInBits() <
- CI->getType()->getPrimitiveSizeInBits()) {
- // We can change it to a zero extension if we are shifting out all of
- // the sign extended bits. To check this, form a mask of all of the
- // sign extend bits, then shift them left and see if we have anything
- // left.
- Constant *Mask = ConstantIntegral::getAllOnesValue(SrcTy); // 1111
- Mask = ConstantExpr::getZeroExtend(Mask, CI->getType()); // 00001111
- Mask = ConstantExpr::getNot(Mask); // 1's in the sign bits: 11110000
- if (ConstantExpr::getShl(Mask, Op1)->isNullValue()) {
- // If the shift is nuking all of the sign bits, change this to a
- // zero extension cast. To do this, cast the cast input to
- // unsigned, then to the requested size.
- Value *CastOp = CI->getOperand(0);
- Instruction *NC =
- new CastInst(CastOp, CastOp->getType()->getUnsignedVersion(),
- CI->getName()+".uns");
- NC = InsertNewInstBefore(NC, I);
- // Finally, insert a replacement for CI.
- NC = new CastInst(NC, CI->getType(), CI->getName());
- CI->setName("");
- NC = InsertNewInstBefore(NC, I);
- WorkList.push_back(CI); // Delete CI later.
- I.setOperand(0, NC);
- return &I; // The SHL operand was modified.
- }
- }
- }
-
if (BinaryOperator *Op0BO = dyn_cast<BinaryOperator>(Op0)) {
// Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
Value *V1, *V2;
// this case, C1 == C2 and C1 is 8, 16, or 32.
if (ShiftAmt1 == ShiftAmt2) {
const Type *SExtType = 0;
- switch (ShiftAmt1) {
+ switch (Op0->getType()->getPrimitiveSizeInBits() - ShiftAmt1) {
case 8 : SExtType = Type::SByteTy; break;
case 16: SExtType = Type::ShortTy; break;
case 32: SExtType = Type::IntTy; break;
SrcTy->getPrimitiveSize() < MidTy->getPrimitiveSize())
return true;
+ // Packed type conversions don't modify bits.
+ if (isa<PackedType>(SrcTy) && isa<PackedType>(MidTy) &&isa<PackedType>(DstTy))
+ return true;
+
return false;
}
// See if we can simplify any instructions used by the LHS whose sole
// purpose is to compute bits we don't care about.
- if (CI.getType()->isInteger() && CI.getOperand(0)->getType()->isIntegral() &&
- SimplifyDemandedBits(&CI, CI.getType()->getIntegralTypeMask()))
- return &CI;
+ if (CI.getType()->isInteger() && CI.getOperand(0)->getType()->isIntegral()) {
+ uint64_t KnownZero, KnownOne;
+ if (SimplifyDemandedBits(&CI, CI.getType()->getIntegralTypeMask(),
+ KnownZero, KnownOne))
+ return &CI;
+ }
// If casting the result of a getelementptr instruction with no offset, turn
// this into a cast of the original pointer!
if (isa<PHINode>(Src))
if (Instruction *NV = FoldOpIntoPhi(CI))
return NV;
+
+ // If the source and destination are pointers, and this cast is equivalent to
+ // a getelementptr X, 0, 0, 0... turn it into the appropriate getelementptr.
+ // This can enhance SROA and other transforms that want type-safe pointers.
+ if (const PointerType *DstPTy = dyn_cast<PointerType>(CI.getType()))
+ if (const PointerType *SrcPTy = dyn_cast<PointerType>(Src->getType())) {
+ const Type *DstTy = DstPTy->getElementType();
+ const Type *SrcTy = SrcPTy->getElementType();
+
+ Constant *ZeroUInt = Constant::getNullValue(Type::UIntTy);
+ unsigned NumZeros = 0;
+ while (SrcTy != DstTy &&
+ isa<CompositeType>(SrcTy) && !isa<PointerType>(SrcTy)) {
+ SrcTy = cast<CompositeType>(SrcTy)->getTypeAtIndex(ZeroUInt);
+ ++NumZeros;
+ }
+ // If we found a path from the src to dest, create the getelementptr now.
+ if (SrcTy == DstTy) {
+ std::vector<Value*> Idxs(NumZeros+1, ZeroUInt);
+ return new GetElementPtrInst(Src, Idxs);
+ }
+ }
+
// If the source value is an instruction with only this use, we can attempt to
// propagate the cast into the instruction. Also, only handle integral types
// for now.
}
break;
+ case Instruction::SetEQ:
case Instruction::SetNE:
+ // See if we are just checking for a seteq of a single bit and casting it
+ // to an integer. If so, shift the bit to the appropriate place then
+ // cast to integer to avoid the comparison.
if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
- if (Op1C->getRawValue() == 0) {
- // If the input only has the low bit set, simplify directly.
- Constant *Not1 =
- ConstantExpr::getNot(ConstantInt::get(Op0->getType(), 1));
- // cast (X != 0) to int --> X if X&~1 == 0
- if (MaskedValueIsZero(Op0,
- cast<ConstantIntegral>(Not1)->getZExtValue())) {
- if (CI.getType() == Op0->getType())
- return ReplaceInstUsesWith(CI, Op0);
- else
- return new CastInst(Op0, CI.getType());
- }
-
- // If the input is an and with a single bit, shift then simplify.
- ConstantInt *AndRHS;
- if (match(Op0, m_And(m_Value(), m_ConstantInt(AndRHS))))
- if (AndRHS->getRawValue() &&
- (AndRHS->getRawValue() & (AndRHS->getRawValue()-1)) == 0) {
- unsigned ShiftAmt = Log2_64(AndRHS->getRawValue());
+ uint64_t Op1CV = Op1C->getZExtValue();
+ // cast (X == 0) to int --> X^1 iff X has only the low bit set.
+ // cast (X == 0) to int --> (X>>1)^1 iff X has only the 2nd bit set.
+ // cast (X == 1) to int --> X iff X has only the low bit set.
+ // cast (X == 2) to int --> X>>1 iff X has only the 2nd bit set.
+ // cast (X != 0) to int --> X iff X has only the low bit set.
+ // cast (X != 0) to int --> X>>1 iff X has only the 2nd bit set.
+ // cast (X != 1) to int --> X^1 iff X has only the low bit set.
+ // cast (X != 2) to int --> (X>>1)^1 iff X has only the 2nd bit set.
+ if (Op1CV == 0 || isPowerOf2_64(Op1CV)) {
+ // If Op1C is some other power of two, convert:
+ uint64_t KnownZero, KnownOne;
+ uint64_t TypeMask = Op1->getType()->getIntegralTypeMask();
+ ComputeMaskedBits(Op0, TypeMask, KnownZero, KnownOne);
+
+ if (isPowerOf2_64(KnownZero^TypeMask)) { // Exactly one possible 1?
+ bool isSetNE = SrcI->getOpcode() == Instruction::SetNE;
+ if (Op1CV && (Op1CV != (KnownZero^TypeMask))) {
+ // (X&4) == 2 --> false
+ // (X&4) != 2 --> true
+ Constant *Res = ConstantBool::get(isSetNE);
+ Res = ConstantExpr::getCast(Res, CI.getType());
+ return ReplaceInstUsesWith(CI, Res);
+ }
+
+ unsigned ShiftAmt = Log2_64(KnownZero^TypeMask);
+ Value *In = Op0;
+ if (ShiftAmt) {
// Perform an unsigned shr by shiftamt. Convert input to
// unsigned if it is signed.
- Value *In = Op0;
if (In->getType()->isSigned())
In = InsertNewInstBefore(new CastInst(In,
In->getType()->getUnsignedVersion(), In->getName()),CI);
// Insert the shift to put the result in the low bit.
In = InsertNewInstBefore(new ShiftInst(Instruction::Shr, In,
- ConstantInt::get(Type::UByteTy, ShiftAmt),
- In->getName()+".lobit"), CI);
- if (CI.getType() == In->getType())
- return ReplaceInstUsesWith(CI, In);
- else
- return new CastInst(In, CI.getType());
+ ConstantInt::get(Type::UByteTy, ShiftAmt),
+ In->getName()+".lobit"), CI);
}
- }
- }
- break;
- case Instruction::SetEQ:
- // We if we are just checking for a seteq of a single bit and casting it
- // to an integer. If so, shift the bit to the appropriate place then
- // cast to integer to avoid the comparison.
- if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
- // Is Op1C a power of two or zero?
- if ((Op1C->getRawValue() & Op1C->getRawValue()-1) == 0) {
- // cast (X == 1) to int -> X iff X has only the low bit set.
- if (Op1C->getRawValue() == 1) {
- Constant *Not1 =
- ConstantExpr::getNot(ConstantInt::get(Op0->getType(), 1));
- if (MaskedValueIsZero(Op0,
- cast<ConstantIntegral>(Not1)->getZExtValue())) {
- if (CI.getType() == Op0->getType())
- return ReplaceInstUsesWith(CI, Op0);
- else
- return new CastInst(Op0, CI.getType());
+
+ if ((Op1CV != 0) == isSetNE) { // Toggle the low bit.
+ Constant *One = ConstantInt::get(In->getType(), 1);
+ In = BinaryOperator::createXor(In, One, "tmp");
+ InsertNewInstBefore(cast<Instruction>(In), CI);
}
+
+ if (CI.getType() == In->getType())
+ return ReplaceInstUsesWith(CI, In);
+ else
+ return new CastInst(In, CI.getType());
}
}
}
}
if (OtherAddOp) {
- // So at this point we know we have:
- // select C, (add X, Y), (sub X, ?)
- // We can do the transform profitably if either 'Y' = '?' or '?' is
- // a constant.
- if (SubOp->getOperand(1) == AddOp ||
- isa<Constant>(SubOp->getOperand(1))) {
- Value *NegVal;
- if (Constant *C = dyn_cast<Constant>(SubOp->getOperand(1))) {
- NegVal = ConstantExpr::getNeg(C);
- } else {
- NegVal = InsertNewInstBefore(
- BinaryOperator::createNeg(SubOp->getOperand(1)), SI);
- }
+ // So at this point we know we have (Y -> OtherAddOp):
+ // select C, (add X, Y), (sub X, Z)
+ Value *NegVal; // Compute -Z
+ if (Constant *C = dyn_cast<Constant>(SubOp->getOperand(1))) {
+ NegVal = ConstantExpr::getNeg(C);
+ } else {
+ NegVal = InsertNewInstBefore(
+ BinaryOperator::createNeg(SubOp->getOperand(1), "tmp"), SI);
+ }
- Value *NewTrueOp = OtherAddOp;
- Value *NewFalseOp = NegVal;
- if (AddOp != TI)
- std::swap(NewTrueOp, NewFalseOp);
- Instruction *NewSel =
- new SelectInst(CondVal, NewTrueOp,NewFalseOp,SI.getName()+".p");
+ Value *NewTrueOp = OtherAddOp;
+ Value *NewFalseOp = NegVal;
+ if (AddOp != TI)
+ std::swap(NewTrueOp, NewFalseOp);
+ Instruction *NewSel =
+ new SelectInst(CondVal, NewTrueOp,NewFalseOp,SI.getName()+".p");
- NewSel = InsertNewInstBefore(NewSel, SI);
- return BinaryOperator::createAdd(SubOp->getOperand(0), NewSel);
- }
+ NewSel = InsertNewInstBefore(NewSel, SI);
+ return BinaryOperator::createAdd(SubOp->getOperand(0), NewSel);
}
}
}
return 0;
}
+/// GetKnownAlignment - If the specified pointer has an alignment that we can
+/// determine, return it, otherwise return 0.
+static unsigned GetKnownAlignment(Value *V, TargetData *TD) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) {
+ unsigned Align = GV->getAlignment();
+ if (Align == 0 && TD)
+ Align = TD->getTypeAlignment(GV->getType()->getElementType());
+ return Align;
+ } else if (AllocationInst *AI = dyn_cast<AllocationInst>(V)) {
+ unsigned Align = AI->getAlignment();
+ if (Align == 0 && TD) {
+ if (isa<AllocaInst>(AI))
+ Align = TD->getTypeAlignment(AI->getType()->getElementType());
+ else if (isa<MallocInst>(AI)) {
+ // Malloc returns maximally aligned memory.
+ Align = TD->getTypeAlignment(AI->getType()->getElementType());
+ Align = std::max(Align, (unsigned)TD->getTypeAlignment(Type::DoubleTy));
+ Align = std::max(Align, (unsigned)TD->getTypeAlignment(Type::LongTy));
+ }
+ }
+ return Align;
+ } else if (isa<CastInst>(V) ||
+ (isa<ConstantExpr>(V) &&
+ cast<ConstantExpr>(V)->getOpcode() == Instruction::Cast)) {
+ User *CI = cast<User>(V);
+ if (isa<PointerType>(CI->getOperand(0)->getType()))
+ return GetKnownAlignment(CI->getOperand(0), TD);
+ return 0;
+ } else if (isa<GetElementPtrInst>(V) ||
+ (isa<ConstantExpr>(V) &&
+ cast<ConstantExpr>(V)->getOpcode()==Instruction::GetElementPtr)) {
+ User *GEPI = cast<User>(V);
+ unsigned BaseAlignment = GetKnownAlignment(GEPI->getOperand(0), TD);
+ if (BaseAlignment == 0) return 0;
+
+ // If all indexes are zero, it is just the alignment of the base pointer.
+ bool AllZeroOperands = true;
+ for (unsigned i = 1, e = GEPI->getNumOperands(); i != e; ++i)
+ if (!isa<Constant>(GEPI->getOperand(i)) ||
+ !cast<Constant>(GEPI->getOperand(i))->isNullValue()) {
+ AllZeroOperands = false;
+ break;
+ }
+ if (AllZeroOperands)
+ return BaseAlignment;
+
+ // Otherwise, if the base alignment is >= the alignment we expect for the
+ // base pointer type, then we know that the resultant pointer is aligned at
+ // least as much as its type requires.
+ if (!TD) return 0;
+
+ const Type *BasePtrTy = GEPI->getOperand(0)->getType();
+ if (TD->getTypeAlignment(cast<PointerType>(BasePtrTy)->getElementType())
+ <= BaseAlignment) {
+ const Type *GEPTy = GEPI->getType();
+ return TD->getTypeAlignment(cast<PointerType>(GEPTy)->getElementType());
+ }
+ return 0;
+ }
+ return 0;
+}
+
/// visitCallInst - CallInst simplification. This mostly only handles folding
/// of intrinsic instructions. For normal calls, it allows visitCallSite to do
if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
if (NumBytes->isNullValue()) return EraseInstFromFunction(CI);
- // FIXME: Increase alignment here.
-
if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
if (CI->getRawValue() == 1) {
// Replace the instruction with just byte operations. We would
// If we have a memmove and the source operation is a constant global,
// then the source and dest pointers can't alias, so we can change this
// into a call to memcpy.
- if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(II))
+ if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(II)) {
if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
if (GVSrc->isConstant()) {
Module *M = CI.getParent()->getParent()->getParent();
- Function *MemCpy = M->getOrInsertFunction("llvm.memcpy",
+ const char *Name;
+ if (CI.getCalledFunction()->getFunctionType()->getParamType(3) ==
+ Type::UIntTy)
+ Name = "llvm.memcpy.i32";
+ else
+ Name = "llvm.memcpy.i64";
+ Function *MemCpy = M->getOrInsertFunction(Name,
CI.getCalledFunction()->getFunctionType());
CI.setOperand(0, MemCpy);
Changed = true;
}
+ }
- if (Changed) return II;
- } else if (DbgStopPointInst *SPI = dyn_cast<DbgStopPointInst>(II)) {
- // If this stoppoint is at the same source location as the previous
- // stoppoint in the chain, it is not needed.
- if (DbgStopPointInst *PrevSPI =
- dyn_cast<DbgStopPointInst>(SPI->getChain()))
- if (SPI->getLineNo() == PrevSPI->getLineNo() &&
- SPI->getColNo() == PrevSPI->getColNo()) {
- SPI->replaceAllUsesWith(PrevSPI);
- return EraseInstFromFunction(CI);
+ // If we can determine a pointer alignment that is bigger than currently
+ // set, update the alignment.
+ if (isa<MemCpyInst>(MI) || isa<MemMoveInst>(MI)) {
+ unsigned Alignment1 = GetKnownAlignment(MI->getOperand(1), TD);
+ unsigned Alignment2 = GetKnownAlignment(MI->getOperand(2), TD);
+ unsigned Align = std::min(Alignment1, Alignment2);
+ if (MI->getAlignment()->getRawValue() < Align) {
+ MI->setAlignment(ConstantUInt::get(Type::UIntTy, Align));
+ Changed = true;
}
+ } else if (isa<MemSetInst>(MI)) {
+ unsigned Alignment = GetKnownAlignment(MI->getDest(), TD);
+ if (MI->getAlignment()->getRawValue() < Alignment) {
+ MI->setAlignment(ConstantUInt::get(Type::UIntTy, Alignment));
+ Changed = true;
+ }
+ }
+
+ if (Changed) return II;
} else {
switch (II->getIntrinsicID()) {
default: break;
+ case Intrinsic::ppc_altivec_lvx:
+ case Intrinsic::ppc_altivec_lvxl:
+ case Intrinsic::x86_sse_loadu_ps:
+ case Intrinsic::x86_sse2_loadu_pd:
+ case Intrinsic::x86_sse2_loadu_dq:
+ // Turn PPC lvx -> load if the pointer is known aligned.
+ // Turn X86 loadups -> load if the pointer is known aligned.
+ if (GetKnownAlignment(II->getOperand(1), TD) >= 16) {
+ Value *Ptr = InsertCastBefore(II->getOperand(1),
+ PointerType::get(II->getType()), CI);
+ return new LoadInst(Ptr);
+ }
+ break;
+ case Intrinsic::ppc_altivec_stvx:
+ case Intrinsic::ppc_altivec_stvxl:
+ // Turn stvx -> store if the pointer is known aligned.
+ if (GetKnownAlignment(II->getOperand(2), TD) >= 16) {
+ const Type *OpPtrTy = PointerType::get(II->getOperand(1)->getType());
+ Value *Ptr = InsertCastBefore(II->getOperand(2), OpPtrTy, CI);
+ return new StoreInst(II->getOperand(1), Ptr);
+ }
+ break;
+ case Intrinsic::x86_sse_storeu_ps:
+ case Intrinsic::x86_sse2_storeu_pd:
+ case Intrinsic::x86_sse2_storeu_dq:
+ case Intrinsic::x86_sse2_storel_dq:
+ // Turn X86 storeu -> store if the pointer is known aligned.
+ if (GetKnownAlignment(II->getOperand(1), TD) >= 16) {
+ const Type *OpPtrTy = PointerType::get(II->getOperand(2)->getType());
+ Value *Ptr = InsertCastBefore(II->getOperand(1), OpPtrTy, CI);
+ return new StoreInst(II->getOperand(2), Ptr);
+ }
+ break;
+ case Intrinsic::ppc_altivec_vperm:
+ // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
+ if (ConstantPacked *Mask = dyn_cast<ConstantPacked>(II->getOperand(3))) {
+ assert(Mask->getNumOperands() == 16 && "Bad type for intrinsic!");
+
+ // Check that all of the elements are integer constants or undefs.
+ bool AllEltsOk = true;
+ for (unsigned i = 0; i != 16; ++i) {
+ if (!isa<ConstantInt>(Mask->getOperand(i)) &&
+ !isa<UndefValue>(Mask->getOperand(i))) {
+ AllEltsOk = false;
+ break;
+ }
+ }
+
+ if (AllEltsOk) {
+ // Cast the input vectors to byte vectors.
+ Value *Op0 = InsertCastBefore(II->getOperand(1), Mask->getType(), CI);
+ Value *Op1 = InsertCastBefore(II->getOperand(2), Mask->getType(), CI);
+ Value *Result = UndefValue::get(Op0->getType());
+
+ // Only extract each element once.
+ Value *ExtractedElts[32];
+ memset(ExtractedElts, 0, sizeof(ExtractedElts));
+
+ for (unsigned i = 0; i != 16; ++i) {
+ if (isa<UndefValue>(Mask->getOperand(i)))
+ continue;
+ unsigned Idx =cast<ConstantInt>(Mask->getOperand(i))->getRawValue();
+ Idx &= 31; // Match the hardware behavior.
+
+ if (ExtractedElts[Idx] == 0) {
+ Instruction *Elt =
+ new ExtractElementInst(Idx < 16 ? Op0 : Op1,
+ ConstantUInt::get(Type::UIntTy, Idx&15),
+ "tmp");
+ InsertNewInstBefore(Elt, CI);
+ ExtractedElts[Idx] = Elt;
+ }
+
+ // Insert this value into the result vector.
+ Result = new InsertElementInst(Result, ExtractedElts[Idx],
+ ConstantUInt::get(Type::UIntTy, i),
+ "tmp");
+ InsertNewInstBefore(cast<Instruction>(Result), CI);
+ }
+ return new CastInst(Result, CI.getType());
+ }
+ }
+ break;
+
case Intrinsic::stackrestore: {
// If the save is right next to the restore, remove the restore. This can
// happen when variable allocas are DCE'd.
// Check to see if we are changing the return type...
if (OldRetTy != FT->getReturnType()) {
if (Callee->isExternal() &&
- !OldRetTy->isLosslesslyConvertibleTo(FT->getReturnType()) &&
- !Caller->use_empty())
+ !(OldRetTy->isLosslesslyConvertibleTo(FT->getReturnType()) ||
+ (isa<PointerType>(FT->getReturnType()) &&
+ TD->getIntPtrType()->isLosslesslyConvertibleTo(OldRetTy)))
+ && !Caller->use_empty())
return false; // Cannot transform this return value...
// If the callsite is an invoke instruction, and the return value is used by
if (const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) {
const Type *SrcPTy = SrcTy->getElementType();
- if (DestPTy->isInteger() || isa<PointerType>(DestPTy)) {
+ if (DestPTy->isInteger() || isa<PointerType>(DestPTy) ||
+ isa<PackedType>(DestPTy)) {
// If the source is an array, the code below will not succeed. Check to
// see if a trivial 'gep P, 0, 0' will help matters. Only do this for
// constants.
SrcPTy = SrcTy->getElementType();
}
- if ((SrcPTy->isInteger() || isa<PointerType>(SrcPTy)) &&
+ if ((SrcPTy->isInteger() || isa<PointerType>(SrcPTy) ||
+ isa<PackedType>(SrcPTy)) &&
// Do not allow turning this into a load of an integer, which is then
// casted to a pointer, this pessimizes pointer analysis a lot.
(isa<PointerType>(SrcPTy) == isa<PointerType>(LI.getType())) &&
return 0;
}
-Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
- if (ConstantAggregateZero *C =
- dyn_cast<ConstantAggregateZero>(EI.getOperand(0))) {
- // If packed val is constant 0, replace extract with scalar 0
- const Type *Ty = cast<PackedType>(C->getType())->getElementType();
- EI.replaceAllUsesWith(Constant::getNullValue(Ty));
- return ReplaceInstUsesWith(EI, Constant::getNullValue(Ty));
+/// CheapToScalarize - Return true if the value is cheaper to scalarize than it
+/// is to leave as a vector operation.
+static bool CheapToScalarize(Value *V, bool isConstant) {
+ if (isa<ConstantAggregateZero>(V))
+ return true;
+ if (ConstantPacked *C = dyn_cast<ConstantPacked>(V)) {
+ if (isConstant) return true;
+ // If all elts are the same, we can extract.
+ Constant *Op0 = C->getOperand(0);
+ for (unsigned i = 1; i < C->getNumOperands(); ++i)
+ if (C->getOperand(i) != Op0)
+ return false;
+ return true;
+ }
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+
+ // Insert element gets simplified to the inserted element or is deleted if
+ // this is a constant idx extract element and it's a constant idx insertelt.
+ if (I->getOpcode() == Instruction::InsertElement && isConstant &&
+ isa<ConstantInt>(I->getOperand(2)))
+ return true;
+ if (I->getOpcode() == Instruction::Load && I->hasOneUse())
+ return true;
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I))
+ if (BO->hasOneUse() &&
+ (CheapToScalarize(BO->getOperand(0), isConstant) ||
+ CheapToScalarize(BO->getOperand(1), isConstant)))
+ return true;
+
+ return false;
+}
+
+/// FindScalarElement - Given a vector and an element number, see if the scalar
+/// value is already around as a register, for example if it were inserted then
+/// extracted from the vector.
+static Value *FindScalarElement(Value *V, unsigned EltNo) {
+ assert(isa<PackedType>(V->getType()) && "Not looking at a vector?");
+ const PackedType *PTy = cast<PackedType>(V->getType());
+ unsigned Width = PTy->getNumElements();
+ if (EltNo >= Width) // Out of range access.
+ return UndefValue::get(PTy->getElementType());
+
+ if (isa<UndefValue>(V))
+ return UndefValue::get(PTy->getElementType());
+ else if (isa<ConstantAggregateZero>(V))
+ return Constant::getNullValue(PTy->getElementType());
+ else if (ConstantPacked *CP = dyn_cast<ConstantPacked>(V))
+ return CP->getOperand(EltNo);
+ else if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) {
+ // If this is an insert to a variable element, we don't know what it is.
+ if (!isa<ConstantUInt>(III->getOperand(2))) return 0;
+ unsigned IIElt = cast<ConstantUInt>(III->getOperand(2))->getValue();
+
+ // If this is an insert to the element we are looking for, return the
+ // inserted value.
+ if (EltNo == IIElt) return III->getOperand(1);
+
+ // Otherwise, the insertelement doesn't modify the value, recurse on its
+ // vector input.
+ return FindScalarElement(III->getOperand(0), EltNo);
+ } else if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V)) {
+ if (isa<ConstantAggregateZero>(SVI->getOperand(2))) {
+ return FindScalarElement(SVI->getOperand(0), 0);
+ } else if (ConstantPacked *CP =
+ dyn_cast<ConstantPacked>(SVI->getOperand(2))) {
+ if (isa<UndefValue>(CP->getOperand(EltNo)))
+ return UndefValue::get(PTy->getElementType());
+ unsigned InEl = cast<ConstantUInt>(CP->getOperand(EltNo))->getValue();
+ if (InEl < Width)
+ return FindScalarElement(SVI->getOperand(0), InEl);
+ else
+ return FindScalarElement(SVI->getOperand(1), InEl - Width);
+ }
}
+
+ // Otherwise, we don't know.
+ return 0;
+}
+
+Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
+
+ // If packed val is undef, replace extract with scalar undef.
+ if (isa<UndefValue>(EI.getOperand(0)))
+ return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+
+ // If packed val is constant 0, replace extract with scalar 0.
+ if (isa<ConstantAggregateZero>(EI.getOperand(0)))
+ return ReplaceInstUsesWith(EI, Constant::getNullValue(EI.getType()));
+
if (ConstantPacked *C = dyn_cast<ConstantPacked>(EI.getOperand(0))) {
// If packed val is constant with uniform operands, replace EI
// with that operand
- Constant *op0 = cast<Constant>(C->getOperand(0));
+ Constant *op0 = C->getOperand(0);
for (unsigned i = 1; i < C->getNumOperands(); ++i)
- if (C->getOperand(i) != op0) return 0;
- return ReplaceInstUsesWith(EI, op0);
+ if (C->getOperand(i) != op0) {
+ op0 = 0;
+ break;
+ }
+ if (op0)
+ return ReplaceInstUsesWith(EI, op0);
+ }
+
+ // If extracting a specified index from the vector, see if we can recursively
+ // find a previously computed scalar that was inserted into the vector.
+ if (ConstantUInt *IdxC = dyn_cast<ConstantUInt>(EI.getOperand(1))) {
+ if (Value *Elt = FindScalarElement(EI.getOperand(0), IdxC->getValue()))
+ return ReplaceInstUsesWith(EI, Elt);
}
+
if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0)))
if (I->hasOneUse()) {
// Push extractelement into predecessor operation if legal and
// profitable to do so
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
- if (!isa<Constant>(BO->getOperand(0)) &&
- !isa<Constant>(BO->getOperand(1)))
- return 0;
- ExtractElementInst *newEI0 =
- new ExtractElementInst(BO->getOperand(0), EI.getOperand(1),
- EI.getName());
- ExtractElementInst *newEI1 =
- new ExtractElementInst(BO->getOperand(1), EI.getOperand(1),
- EI.getName());
- InsertNewInstBefore(newEI0, EI);
- InsertNewInstBefore(newEI1, EI);
- return BinaryOperator::create(BO->getOpcode(), newEI0, newEI1);
- }
- switch(I->getOpcode()) {
- case Instruction::Load: {
+ bool isConstantElt = isa<ConstantInt>(EI.getOperand(1));
+ if (CheapToScalarize(BO, isConstantElt)) {
+ ExtractElementInst *newEI0 =
+ new ExtractElementInst(BO->getOperand(0), EI.getOperand(1),
+ EI.getName()+".lhs");
+ ExtractElementInst *newEI1 =
+ new ExtractElementInst(BO->getOperand(1), EI.getOperand(1),
+ EI.getName()+".rhs");
+ InsertNewInstBefore(newEI0, EI);
+ InsertNewInstBefore(newEI1, EI);
+ return BinaryOperator::create(BO->getOpcode(), newEI0, newEI1);
+ }
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
Value *Ptr = InsertCastBefore(I->getOperand(0),
PointerType::get(EI.getType()), EI);
GetElementPtrInst *GEP =
I->getName() + ".gep");
InsertNewInstBefore(GEP, EI);
return new LoadInst(GEP);
+ } else if (InsertElementInst *IE = dyn_cast<InsertElementInst>(I)) {
+ // Extracting the inserted element?
+ if (IE->getOperand(2) == EI.getOperand(1))
+ return ReplaceInstUsesWith(EI, IE->getOperand(1));
+ // If the inserted and extracted elements are constants, they must not
+ // be the same value, extract from the pre-inserted value instead.
+ if (isa<Constant>(IE->getOperand(2)) &&
+ isa<Constant>(EI.getOperand(1))) {
+ AddUsesToWorkList(EI);
+ EI.setOperand(0, IE->getOperand(0));
+ return &EI;
+ }
}
- default:
- return 0;
+ }
+ return 0;
+}
+
+/// CollectSingleShuffleElements - If V is a shuffle of values that ONLY returns
+/// elements from either LHS or RHS, return the shuffle mask and true.
+/// Otherwise, return false.
+static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
+ std::vector<Constant*> &Mask) {
+ assert(V->getType() == LHS->getType() && V->getType() == RHS->getType() &&
+ "Invalid CollectSingleShuffleElements");
+ unsigned NumElts = cast<PackedType>(V->getType())->getNumElements();
+
+ if (isa<UndefValue>(V)) {
+ Mask.assign(NumElts, UndefValue::get(Type::UIntTy));
+ return true;
+ } else if (V == LHS) {
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(ConstantUInt::get(Type::UIntTy, i));
+ return true;
+ } else if (V == RHS) {
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(ConstantUInt::get(Type::UIntTy, i+NumElts));
+ return true;
+ } else if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+ // If this is an insert of an extract from some other vector, include it.
+ Value *VecOp = IEI->getOperand(0);
+ Value *ScalarOp = IEI->getOperand(1);
+ Value *IdxOp = IEI->getOperand(2);
+
+ if (!isa<ConstantInt>(IdxOp))
+ return false;
+ unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getRawValue();
+
+ if (isa<UndefValue>(ScalarOp)) { // inserting undef into vector.
+ // Okay, we can handle this if the vector we are inserting into is
+ // transitively ok.
+ if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+ // If so, update the mask to reflect the inserted undef.
+ Mask[InsertedIdx] = UndefValue::get(Type::UIntTy);
+ return true;
+ }
+ } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){
+ if (isa<ConstantInt>(EI->getOperand(1)) &&
+ EI->getOperand(0)->getType() == V->getType()) {
+ unsigned ExtractedIdx =
+ cast<ConstantInt>(EI->getOperand(1))->getRawValue();
+
+ // This must be extracting from either LHS or RHS.
+ if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {
+ // Okay, we can handle this if the vector we are inserting into is
+ // transitively ok.
+ if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+ // If so, update the mask to reflect the inserted value.
+ if (EI->getOperand(0) == LHS) {
+ Mask[InsertedIdx & (NumElts-1)] =
+ ConstantUInt::get(Type::UIntTy, ExtractedIdx);
+ } else {
+ assert(EI->getOperand(0) == RHS);
+ Mask[InsertedIdx & (NumElts-1)] =
+ ConstantUInt::get(Type::UIntTy, ExtractedIdx+NumElts);
+
+ }
+ return true;
+ }
+ }
+ }
+ }
+ }
+ // TODO: Handle shufflevector here!
+
+ return false;
+}
+
+/// CollectShuffleElements - We are building a shuffle of V, using RHS as the
+/// RHS of the shuffle instruction, if it is not null. Return a shuffle mask
+/// that computes V and the LHS value of the shuffle.
+static Value *CollectShuffleElements(Value *V, std::vector<Constant*> &Mask,
+ Value *&RHS) {
+ assert(isa<PackedType>(V->getType()) &&
+ (RHS == 0 || V->getType() == RHS->getType()) &&
+ "Invalid shuffle!");
+ unsigned NumElts = cast<PackedType>(V->getType())->getNumElements();
+
+ if (isa<UndefValue>(V)) {
+ Mask.assign(NumElts, UndefValue::get(Type::UIntTy));
+ return V;
+ } else if (isa<ConstantAggregateZero>(V)) {
+ Mask.assign(NumElts, ConstantUInt::get(Type::UIntTy, 0));
+ return V;
+ } else if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+ // If this is an insert of an extract from some other vector, include it.
+ Value *VecOp = IEI->getOperand(0);
+ Value *ScalarOp = IEI->getOperand(1);
+ Value *IdxOp = IEI->getOperand(2);
+
+ if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
+ if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) &&
+ EI->getOperand(0)->getType() == V->getType()) {
+ unsigned ExtractedIdx =
+ cast<ConstantInt>(EI->getOperand(1))->getRawValue();
+ unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getRawValue();
+
+ // Either the extracted from or inserted into vector must be RHSVec,
+ // otherwise we'd end up with a shuffle of three inputs.
+ if (EI->getOperand(0) == RHS || RHS == 0) {
+ RHS = EI->getOperand(0);
+ Value *V = CollectShuffleElements(VecOp, Mask, RHS);
+ Mask[InsertedIdx & (NumElts-1)] =
+ ConstantUInt::get(Type::UIntTy, NumElts+ExtractedIdx);
+ return V;
+ }
+
+ if (VecOp == RHS) {
+ Value *V = CollectShuffleElements(EI->getOperand(0), Mask, RHS);
+ // Everything but the extracted element is replaced with the RHS.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (i != InsertedIdx)
+ Mask[i] = ConstantUInt::get(Type::UIntTy, NumElts+i);
+ }
+ return V;
+ }
+
+ // If this insertelement is a chain that comes from exactly these two
+ // vectors, return the vector and the effective shuffle.
+ if (CollectSingleShuffleElements(IEI, EI->getOperand(0), RHS, Mask))
+ return EI->getOperand(0);
+
+ }
+ }
+ }
+ // TODO: Handle shufflevector here!
+
+ // Otherwise, can't do anything fancy. Return an identity vector.
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(ConstantUInt::get(Type::UIntTy, i));
+ return V;
+}
+
+Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
+ Value *VecOp = IE.getOperand(0);
+ Value *ScalarOp = IE.getOperand(1);
+ Value *IdxOp = IE.getOperand(2);
+
+ // If the inserted element was extracted from some other vector, and if the
+ // indexes are constant, try to turn this into a shufflevector operation.
+ if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
+ if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) &&
+ EI->getOperand(0)->getType() == IE.getType()) {
+ unsigned NumVectorElts = IE.getType()->getNumElements();
+ unsigned ExtractedIdx=cast<ConstantInt>(EI->getOperand(1))->getRawValue();
+ unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getRawValue();
+
+ if (ExtractedIdx >= NumVectorElts) // Out of range extract.
+ return ReplaceInstUsesWith(IE, VecOp);
+
+ if (InsertedIdx >= NumVectorElts) // Out of range insert.
+ return ReplaceInstUsesWith(IE, UndefValue::get(IE.getType()));
+
+ // If we are extracting a value from a vector, then inserting it right
+ // back into the same place, just use the input vector.
+ if (EI->getOperand(0) == VecOp && ExtractedIdx == InsertedIdx)
+ return ReplaceInstUsesWith(IE, VecOp);
+
+ // We could theoretically do this for ANY input. However, doing so could
+ // turn chains of insertelement instructions into a chain of shufflevector
+ // instructions, and right now we do not merge shufflevectors. As such,
+ // only do this in a situation where it is clear that there is benefit.
+ if (isa<UndefValue>(VecOp) || isa<ConstantAggregateZero>(VecOp)) {
+ // Turn this into shuffle(EIOp0, VecOp, Mask). The result has all of
+ // the values of VecOp, except the one read from EIOp0.
+ // Build a new shuffle mask.
+ std::vector<Constant*> Mask;
+ if (isa<UndefValue>(VecOp))
+ Mask.assign(NumVectorElts, UndefValue::get(Type::UIntTy));
+ else {
+ assert(isa<ConstantAggregateZero>(VecOp) && "Unknown thing");
+ Mask.assign(NumVectorElts, ConstantUInt::get(Type::UIntTy,
+ NumVectorElts));
+ }
+ Mask[InsertedIdx] = ConstantUInt::get(Type::UIntTy, ExtractedIdx);
+ return new ShuffleVectorInst(EI->getOperand(0), VecOp,
+ ConstantPacked::get(Mask));
+ }
+
+ // If this insertelement isn't used by some other insertelement, turn it
+ // (and any insertelements it points to), into one big shuffle.
+ if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.use_back())) {
+ std::vector<Constant*> Mask;
+ Value *RHS = 0;
+ Value *LHS = CollectShuffleElements(&IE, Mask, RHS);
+ if (RHS == 0) RHS = UndefValue::get(LHS->getType());
+ // We now have a shuffle of LHS, RHS, Mask.
+ return new ShuffleVectorInst(LHS, RHS, ConstantPacked::get(Mask));
}
}
+ }
+
return 0;
}
+Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+ Value *LHS = SVI.getOperand(0);
+ Value *RHS = SVI.getOperand(1);
+ Constant *Mask = cast<Constant>(SVI.getOperand(2));
+
+ bool MadeChange = false;
+
+ if (isa<UndefValue>(Mask))
+ return ReplaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));
+
+ // TODO: If we have shuffle(x, undef, mask) and any elements of mask refer to
+ // the undef, change them to undefs.
+
+ // Canonicalize shuffle(x,x) -> shuffle(x,undef)
+ if (LHS == RHS) {
+ if (isa<UndefValue>(LHS)) {
+ // shuffle(undef,undef,mask) -> undef.
+ return ReplaceInstUsesWith(SVI, LHS);
+ }
+
+ if (!isa<ConstantAggregateZero>(Mask)) {
+ // Remap any references to RHS to use LHS.
+ ConstantPacked *CP = cast<ConstantPacked>(Mask);
+ std::vector<Constant*> Elts;
+ for (unsigned i = 0, e = CP->getNumOperands(); i != e; ++i) {
+ Elts.push_back(CP->getOperand(i));
+ if (isa<UndefValue>(CP->getOperand(i)))
+ continue;
+ unsigned MV = cast<ConstantInt>(CP->getOperand(i))->getRawValue();
+ if (MV >= e)
+ Elts.back() = ConstantUInt::get(Type::UIntTy, MV & (e-1));
+ }
+ Mask = ConstantPacked::get(Elts);
+ }
+ SVI.setOperand(1, UndefValue::get(RHS->getType()));
+ SVI.setOperand(2, Mask);
+ MadeChange = true;
+ }
+
+ // Canonicalize shuffle(undef,x,mask) -> shuffle(x, undef,mask').
+ if (isa<UndefValue>(LHS)) {
+ // shuffle(undef,x,<0,0,0,0>) -> undef.
+ if (isa<ConstantAggregateZero>(Mask))
+ return ReplaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));
+
+ ConstantPacked *CPM = cast<ConstantPacked>(Mask);
+ std::vector<Constant*> Elts;
+ for (unsigned i = 0, e = CPM->getNumOperands(); i != e; ++i) {
+ if (isa<UndefValue>(CPM->getOperand(i)))
+ Elts.push_back(CPM->getOperand(i));
+ else {
+ unsigned EltNo = cast<ConstantUInt>(CPM->getOperand(i))->getRawValue();
+ if (EltNo >= e)
+ Elts.push_back(ConstantUInt::get(Type::UIntTy, EltNo-e));
+ else // Referring to the undef.
+ Elts.push_back(UndefValue::get(Type::UIntTy));
+ }
+ }
+ return new ShuffleVectorInst(RHS, LHS, ConstantPacked::get(Elts));
+ }
+
+ if (ConstantPacked *CP = dyn_cast<ConstantPacked>(Mask)) {
+ bool isLHSID = true, isRHSID = true;
+
+ // Analyze the shuffle.
+ for (unsigned i = 0, e = CP->getNumOperands(); i != e; ++i) {
+ if (isa<UndefValue>(CP->getOperand(i)))
+ continue;
+ unsigned MV = cast<ConstantInt>(CP->getOperand(i))->getRawValue();
+
+ // Is this an identity shuffle of the LHS value?
+ isLHSID &= (MV == i);
+
+ // Is this an identity shuffle of the RHS value?
+ isRHSID &= (MV-e == i);
+ }
+
+ // Eliminate identity shuffles.
+ if (isLHSID) return ReplaceInstUsesWith(SVI, LHS);
+ if (isRHSID) return ReplaceInstUsesWith(SVI, RHS);
+ }
+
+ return MadeChange ? &SVI : 0;
+}
+
+
+
void InstCombiner::removeFromWorkList(Instruction *I) {
WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), I),
WorkList.end());