Implement InstCombine/cast.ll:test29

[oota-llvm.git] / lib / Transforms / Scalar / InstructionCombining.cpp
diff --git a/lib/Transforms/Scalar/InstructionCombining.cpp b/lib/Transforms/Scalar/InstructionCombining.cpp

index 4247d8a6e295606fa58fdd389545e86b71289ff3..2c8f6eda4955a2582e9ff7b05da2bec90eff7475 100644 (file)
--- a/lib/Transforms/Scalar/InstructionCombining.cpp
+++ b/lib/Transforms/Scalar/InstructionCombining.cpp
@@ -137,7 +137,9 @@ namespace {
      Instruction *visitStoreInst(StoreInst &SI);
      Instruction *visitBranchInst(BranchInst &BI);
      Instruction *visitSwitchInst(SwitchInst &SI);
+    Instruction *visitInsertElementInst(InsertElementInst &IE);
      Instruction *visitExtractElementInst(ExtractElementInst &EI);
+    Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
  
      // visitInstruction - Specify what to return for unhandled instructions...
      Instruction *visitInstruction(Instruction &I) { return 0; }
@@ -165,6 +167,9 @@ namespace {
      Value *InsertCastBefore(Value *V, const Type *Ty, Instruction &Pos) {
        if (V->getType() == Ty) return V;
  
+      if (Constant *CV = dyn_cast<Constant>(V))
+        return ConstantExpr::getCast(CV, Ty);
+      
        Instruction *C = new CastInst(V, Ty, V->getName(), &Pos);
        WorkList.push_back(C);
        return C;
@@ -451,6 +456,8 @@ static void ComputeMaskedBits(Value *V, uint64_t Mask, uint64_t &KnownZero,
    Instruction *I = dyn_cast<Instruction>(V);
    if (!I) return;
  
+  Mask &= V->getType()->getIntegralTypeMask();
+  
    switch (I->getOpcode()) {
    case Instruction::And:
      // If either the LHS or the RHS are Zero, the result is zero.
@@ -708,6 +715,8 @@ bool InstCombiner::SimplifyDemandedBits(Value *V, uint64_t DemandedMask,
    Instruction *I = dyn_cast<Instruction>(V);
    if (!I) return false;        // Only analyze instructions.
  
+  DemandedMask &= V->getType()->getIntegralTypeMask();
+  
    uint64_t KnownZero2, KnownOne2;
    switch (I->getOpcode()) {
    default: break;
@@ -757,9 +766,9 @@ bool InstCombiner::SimplifyDemandedBits(Value *V, uint64_t DemandedMask,
      
      // If all of the demanded bits are known zero on one side, return the other.
      // These bits cannot contribute to the result of the 'or'.
-    if ((DemandedMask & ~KnownOne2 & KnownZero) == DemandedMask & ~KnownOne2)
+    if ((DemandedMask & ~KnownOne2 & KnownZero) == (DemandedMask & ~KnownOne2))
        return UpdateValueUsesWith(I, I->getOperand(0));
-    if ((DemandedMask & ~KnownOne & KnownZero2) == DemandedMask & ~KnownOne)
+    if ((DemandedMask & ~KnownOne & KnownZero2) == (DemandedMask & ~KnownOne))
        return UpdateValueUsesWith(I, I->getOperand(1));
  
      // If all of the potentially set bits on one side are known to be set on
@@ -767,9 +776,9 @@ bool InstCombiner::SimplifyDemandedBits(Value *V, uint64_t DemandedMask,
      if ((DemandedMask & (~KnownZero) & KnownOne2) == 
          (DemandedMask & (~KnownZero)))
        return UpdateValueUsesWith(I, I->getOperand(0));
-      if ((DemandedMask & (~KnownZero2) & KnownOne) == 
-          (DemandedMask & (~KnownZero2)))
-        return UpdateValueUsesWith(I, I->getOperand(1));
+    if ((DemandedMask & (~KnownZero2) & KnownOne) == 
+        (DemandedMask & (~KnownZero2)))
+      return UpdateValueUsesWith(I, I->getOperand(1));
          
      // If the RHS is a constant, see if we can simplify it.
      if (ShrinkDemandedConstant(I, 1, DemandedMask))
@@ -889,15 +898,21 @@ bool InstCombiner::SimplifyDemandedBits(Value *V, uint64_t DemandedMask,
        KnownZero |= NewBits;
      } else {
        // Sign extension.
-      if (SimplifyDemandedBits(I->getOperand(0),
-                               DemandedMask & SrcTy->getIntegralTypeMask(),
+      uint64_t InSignBit = 1ULL << (SrcTy->getPrimitiveSizeInBits()-1);
+      int64_t InputDemandedBits = DemandedMask & SrcTy->getIntegralTypeMask();
+
+      // If any of the sign extended bits are demanded, we know that the sign
+      // bit is demanded.
+      if (NewBits & DemandedMask)
+        InputDemandedBits |= InSignBit;
+      
+      if (SimplifyDemandedBits(I->getOperand(0), InputDemandedBits,
                                 KnownZero, KnownOne, Depth+1))
          return true;
        assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
        
        // If the sign bit of the input is known set or clear, then we know the
        // top bits of the result.
-      uint64_t InSignBit = 1ULL << (SrcTy->getPrimitiveSizeInBits()-1);
  
        // If the input sign bit is known zero, or if the NewBits are not demanded
        // convert this into a zero extension.
@@ -1616,6 +1631,19 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
        if (Op1F->getValue() == 1.0)
          return ReplaceInstUsesWith(I, Op0);  // Eliminate 'mul double %X, 1.0'
      }
+    
+    if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0))
+      if (Op0I->getOpcode() == Instruction::Add && Op0I->hasOneUse() &&
+          isa<ConstantInt>(Op0I->getOperand(1))) {
+        // Canonicalize (X+C1)*C2 -> X*C2+C1*C2.
+        Instruction *Add = BinaryOperator::createMul(Op0I->getOperand(0),
+                                                     Op1, "tmp");
+        InsertNewInstBefore(Add, I);
+        Value *C1C2 = ConstantExpr::getMul(Op1, 
+                                           cast<Constant>(Op0I->getOperand(1)));
+        return BinaryOperator::createAdd(Add, C1C2);
+        
+      }
  
      // Try to fold constant mul into select arguments.
      if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
@@ -1806,8 +1834,58 @@ Instruction *InstCombiner::visitDiv(BinaryOperator &I) {
  }
  
  
+/// GetFactor - If we can prove that the specified value is at least a multiple
+/// of some factor, return that factor.
+static Constant *GetFactor(Value *V) {
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(V))
+    return CI;
+  
+  // Unless we can be tricky, we know this is a multiple of 1.
+  Constant *Result = ConstantInt::get(V->getType(), 1);
+  
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return Result;
+  
+  if (I->getOpcode() == Instruction::Mul) {
+    // Handle multiplies by a constant, etc.
+    return ConstantExpr::getMul(GetFactor(I->getOperand(0)),
+                                GetFactor(I->getOperand(1)));
+  } else if (I->getOpcode() == Instruction::Shl) {
+    // (X<<C) -> X * (1 << C)
+    if (Constant *ShRHS = dyn_cast<Constant>(I->getOperand(1))) {
+      ShRHS = ConstantExpr::getShl(Result, ShRHS);
+      return ConstantExpr::getMul(GetFactor(I->getOperand(0)), ShRHS);
+    }
+  } else if (I->getOpcode() == Instruction::And) {
+    if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      // X & 0xFFF0 is known to be a multiple of 16.
+      unsigned Zeros = CountTrailingZeros_64(RHS->getZExtValue());
+      if (Zeros != V->getType()->getPrimitiveSizeInBits())
+        return ConstantExpr::getShl(Result, 
+                                    ConstantUInt::get(Type::UByteTy, Zeros));
+    }
+  } else if (I->getOpcode() == Instruction::Cast) {
+    Value *Op = I->getOperand(0);
+    // Only handle int->int casts.
+    if (!Op->getType()->isInteger()) return Result;
+    return ConstantExpr::getCast(GetFactor(Op), V->getType());
+  }    
+  return Result;
+}
+
  Instruction *InstCombiner::visitRem(BinaryOperator &I) {
    Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+  
+  // 0 % X == 0, we don't need to preserve faults!
+  if (Constant *LHS = dyn_cast<Constant>(Op0))
+    if (LHS->isNullValue())
+      return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+  if (isa<UndefValue>(Op0))              // undef % X -> 0
+    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+  if (isa<UndefValue>(Op1))
+    return ReplaceInstUsesWith(I, Op1);  // X % undef -> undef
+  
    if (I.getType()->isSigned()) {
      if (Value *RHSNeg = dyn_castNegVal(Op1))
        if (!isa<ConstantSInt>(RHSNeg) ||
@@ -1836,62 +1914,35 @@ Instruction *InstCombiner::visitRem(BinaryOperator &I) {
      }
    }
  
-  if (isa<UndefValue>(Op0))              // undef % X -> 0
-    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
-  if (isa<UndefValue>(Op1))
-    return ReplaceInstUsesWith(I, Op1);  // X % undef -> undef
-
    if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+    // X % 0 == undef, we don't need to preserve faults!
+    if (RHS->equalsInt(0))
+      return ReplaceInstUsesWith(I, UndefValue::get(I.getType()));
+    
      if (RHS->equalsInt(1))  // X % 1 == 0
        return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
  
      // Check to see if this is an unsigned remainder with an exact power of 2,
      // if so, convert to a bitwise and.
      if (ConstantUInt *C = dyn_cast<ConstantUInt>(RHS))
-      if (uint64_t Val = C->getValue())    // Don't break X % 0 (divide by zero)
-        if (!(Val & (Val-1)))              // Power of 2
-          return BinaryOperator::createAnd(Op0,
-                                         ConstantUInt::get(I.getType(), Val-1));
+      if (isPowerOf2_64(C->getValue()))
+        return BinaryOperator::createAnd(Op0, SubOne(C));
  
-    if (!RHS->isNullValue()) {
-      if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+    if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) {
+      if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) {
          if (Instruction *R = FoldOpIntoSelect(I, SI, this))
            return R;
-      if (isa<PHINode>(Op0))
+      } else if (isa<PHINode>(Op0I)) {
          if (Instruction *NV = FoldOpIntoPhi(I))
            return NV;
+      }
+      
+      // X*C1%C2 --> 0  iff  C1%C2 == 0
+      if (ConstantExpr::getRem(GetFactor(Op0I), RHS)->isNullValue())
+        return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
      }
    }
  
-  // If this is 'urem X, (Cond ? C1, C2)' where C1&C2 are powers of two,
-  // transform this into: '(Cond ? (urem X, C1) : (urem X, C2))'.
-  if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
-    if (ConstantUInt *STO = dyn_cast<ConstantUInt>(SI->getOperand(1)))
-      if (ConstantUInt *SFO = dyn_cast<ConstantUInt>(SI->getOperand(2))) {
-        if (STO->getValue() == 0) { // Couldn't be this argument.
-          I.setOperand(1, SFO);
-          return &I;
-        } else if (SFO->getValue() == 0) {
-          I.setOperand(1, STO);
-          return &I;
-        }
-
-        if (!(STO->getValue() & (STO->getValue()-1)) &&
-            !(SFO->getValue() & (SFO->getValue()-1))) {
-          Value *TrueAnd = InsertNewInstBefore(BinaryOperator::createAnd(Op0,
-                                         SubOne(STO), SI->getName()+".t"), I);
-          Value *FalseAnd = InsertNewInstBefore(BinaryOperator::createAnd(Op0,
-                                         SubOne(SFO), SI->getName()+".f"), I);
-          return new SelectInst(SI->getOperand(0), TrueAnd, FalseAnd);
-        }
-      }
-
-  // 0 % X == 0, we don't need to preserve faults!
-  if (ConstantInt *LHS = dyn_cast<ConstantInt>(Op0))
-    if (LHS->equalsInt(0))
-      return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
-
-  
    if (Instruction *RHSI = dyn_cast<Instruction>(I.getOperand(1))) {
      // Turn A % (C << N), where C is 2^k, into A & ((C << N)-1) [urem only].
      if (I.getType()->isUnsigned() && 
@@ -1905,6 +1956,28 @@ Instruction *InstCombiner::visitRem(BinaryOperator &I) {
          return BinaryOperator::createAnd(Op0, Add);
        }
      }
+    
+    // If this is 'urem X, (Cond ? C1, C2)' where C1&C2 are powers of two,
+    // transform this into: '(Cond ? (urem X, C1) : (urem X, C2))'.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+      if (ConstantUInt *STO = dyn_cast<ConstantUInt>(SI->getOperand(1)))
+        if (ConstantUInt *SFO = dyn_cast<ConstantUInt>(SI->getOperand(2))) {
+          if (STO->getValue() == 0) { // Couldn't be this argument.
+            I.setOperand(1, SFO);
+            return &I;
+          } else if (SFO->getValue() == 0) {
+            I.setOperand(1, STO);
+            return &I;
+          }
+          
+          if (isPowerOf2_64(STO->getValue()) && isPowerOf2_64(SFO->getValue())){
+            Value *TrueAnd = InsertNewInstBefore(BinaryOperator::createAnd(Op0,
+                                          SubOne(STO), SI->getName()+".t"), I);
+            Value *FalseAnd = InsertNewInstBefore(BinaryOperator::createAnd(Op0,
+                                          SubOne(SFO), SI->getName()+".f"), I);
+            return new SelectInst(SI->getOperand(0), TrueAnd, FalseAnd);
+          }
+        }
    }
    
    return 0;
@@ -2316,7 +2389,8 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
    // See if we can simplify any instructions used by the instruction whose sole 
    // purpose is to compute bits we don't care about.
    uint64_t KnownZero, KnownOne;
-  if (SimplifyDemandedBits(&I, I.getType()->getIntegralTypeMask(),
+  if (!isa<PackedType>(I.getType()) &&
+      SimplifyDemandedBits(&I, I.getType()->getIntegralTypeMask(),
                             KnownZero, KnownOne))
      return &I;
    
@@ -2433,6 +2507,42 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
      InsertNewInstBefore(Or, I);
      return BinaryOperator::createNot(Or);
    }
+  
+  {
+    Value *A = 0, *B = 0;
+    ConstantInt *C1 = 0, *C2 = 0;
+    if (match(Op0, m_Or(m_Value(A), m_Value(B))))
+      if (A == Op1 || B == Op1)    // (A | ?) & A  --> A
+        return ReplaceInstUsesWith(I, Op1);
+    if (match(Op1, m_Or(m_Value(A), m_Value(B))))
+      if (A == Op0 || B == Op0)    // A & (A | ?)  --> A
+        return ReplaceInstUsesWith(I, Op0);
+    
+    if (Op0->hasOneUse() &&
+        match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
+      if (A == Op1) {                                // (A^B)&A -> A&(A^B)
+        I.swapOperands();     // Simplify below
+        std::swap(Op0, Op1);
+      } else if (B == Op1) {                         // (A^B)&B -> B&(B^A)
+        cast<BinaryOperator>(Op0)->swapOperands();
+        I.swapOperands();     // Simplify below
+        std::swap(Op0, Op1);
+      }
+    }
+    if (Op1->hasOneUse() &&
+        match(Op1, m_Xor(m_Value(A), m_Value(B)))) {
+      if (B == Op0) {                                // B&(A^B) -> B&(B^A)
+        cast<BinaryOperator>(Op1)->swapOperands();
+        std::swap(A, B);
+      }
+      if (A == Op0) {                                // A&(A^B) -> A & ~B
+        Instruction *NotB = BinaryOperator::createNot(B, "tmp");
+        InsertNewInstBefore(NotB, I);
+        return BinaryOperator::createAnd(A, NotB);
+      }
+    }
+  }
+  
  
    if (SetCondInst *RHS = dyn_cast<SetCondInst>(Op1)) {
      // (setcc1 A, B) & (setcc2 A, B) --> (setcc3 A, B)
@@ -2530,6 +2640,19 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
          }
    }
  
+  // fold (and (cast A), (cast B)) -> (cast (and A, B))
+  if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
+    if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
+      if (Op0C->getOperand(0)->getType() == Op1C->getOperand(0)->getType() &&
+          Op0C->getOperand(0)->getType()->isIntegral()) {
+        Instruction *NewOp = BinaryOperator::createAnd(Op0C->getOperand(0),
+                                                       Op1C->getOperand(0),
+                                                       I.getName());
+        InsertNewInstBefore(NewOp, I);
+        return new CastInst(NewOp, I.getType());
+      }
+  }
+
    return Changed ? &I : 0;
  }
  
@@ -2548,7 +2671,8 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
    // See if we can simplify any instructions used by the instruction whose sole 
    // purpose is to compute bits we don't care about.
    uint64_t KnownZero, KnownOne;
-  if (SimplifyDemandedBits(&I, I.getType()->getIntegralTypeMask(),
+  if (!isa<PackedType>(I.getType()) &&
+      SimplifyDemandedBits(&I, I.getType()->getIntegralTypeMask(),
                             KnownZero, KnownOne))
      return &I;
    
@@ -2754,6 +2878,20 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
            }
          }
    }
+    
+  // fold (or (cast A), (cast B)) -> (cast (or A, B))
+  if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
+    if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
+      if (Op0C->getOperand(0)->getType() == Op1C->getOperand(0)->getType() &&
+          Op0C->getOperand(0)->getType()->isIntegral()) {
+        Instruction *NewOp = BinaryOperator::createOr(Op0C->getOperand(0),
+                                                      Op1C->getOperand(0),
+                                                      I.getName());
+        InsertNewInstBefore(NewOp, I);
+        return new CastInst(NewOp, I.getType());
+      }
+  }
+      
  
    return Changed ? &I : 0;
  }
@@ -2785,7 +2923,8 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
    // See if we can simplify any instructions used by the instruction whose sole 
    // purpose is to compute bits we don't care about.
    uint64_t KnownZero, KnownOne;
-  if (SimplifyDemandedBits(&I, I.getType()->getIntegralTypeMask(),
+  if (!isa<PackedType>(I.getType()) &&
+      SimplifyDemandedBits(&I, I.getType()->getIntegralTypeMask(),
                             KnownZero, KnownOne))
      return &I;
  
@@ -2828,6 +2967,20 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
                                               ConstantInt::get(I.getType(), 1)),
                                            Op0I->getOperand(0));
            }
+        } else if (Op0I->getOpcode() == Instruction::Or) {
+          // (X|C1)^C2 -> X^(C1|C2) iff X&~C1 == 0
+          if (MaskedValueIsZero(Op0I->getOperand(0), Op0CI->getZExtValue())) {
+            Constant *NewRHS = ConstantExpr::getOr(Op0CI, RHS);
+            // Anything in both C1 and C2 is known to be zero, remove it from
+            // NewRHS.
+            Constant *CommonBits = ConstantExpr::getAnd(Op0CI, RHS);
+            NewRHS = ConstantExpr::getAnd(NewRHS, 
+                                          ConstantExpr::getNot(CommonBits));
+            WorkList.push_back(Op0I);
+            I.setOperand(0, Op0I->getOperand(0));
+            I.setOperand(1, NewRHS);
+            return &I;
+          }
          }
      }
  
@@ -2850,14 +3003,14 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
        return ReplaceInstUsesWith(I,
                                  ConstantIntegral::getAllOnesValue(I.getType()));
  
-  if (Instruction *Op1I = dyn_cast<Instruction>(Op1))
+  if (BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1))
      if (Op1I->getOpcode() == Instruction::Or) {
        if (Op1I->getOperand(0) == Op0) {              // B^(B|A) == (A|B)^B
-        cast<BinaryOperator>(Op1I)->swapOperands();
+        Op1I->swapOperands();
          I.swapOperands();
          std::swap(Op0, Op1);
        } else if (Op1I->getOperand(1) == Op0) {       // B^(A|B) == (A|B)^B
-        I.swapOperands();
+        I.swapOperands();     // Simplified below.
          std::swap(Op0, Op1);
        }
      } else if (Op1I->getOpcode() == Instruction::Xor) {
@@ -2865,15 +3018,22 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
          return ReplaceInstUsesWith(I, Op1I->getOperand(1));
        else if (Op0 == Op1I->getOperand(1))                   // A^(B^A) == B
          return ReplaceInstUsesWith(I, Op1I->getOperand(0));
+    } else if (Op1I->getOpcode() == Instruction::And && Op1I->hasOneUse()) {
+      if (Op1I->getOperand(0) == Op0)                      // A^(A&B) -> A^(B&A)
+        Op1I->swapOperands();
+      if (Op0 == Op1I->getOperand(1)) {                    // A^(B&A) -> (B&A)^A
+        I.swapOperands();     // Simplified below.
+        std::swap(Op0, Op1);
+      }
      }
  
-  if (Instruction *Op0I = dyn_cast<Instruction>(Op0))
+  if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0))
      if (Op0I->getOpcode() == Instruction::Or && Op0I->hasOneUse()) {
        if (Op0I->getOperand(0) == Op1)                // (B|A)^B == (A|B)^B
-        cast<BinaryOperator>(Op0I)->swapOperands();
+        Op0I->swapOperands();
        if (Op0I->getOperand(1) == Op1) {              // (A|B)^B == A & ~B
-        Value *NotB = InsertNewInstBefore(BinaryOperator::createNot(Op1,
-                                                     Op1->getName()+".not"), I);
+        Instruction *NotB = BinaryOperator::createNot(Op1, "tmp");
+        InsertNewInstBefore(NotB, I);
          return BinaryOperator::createAnd(Op0I->getOperand(0), NotB);
        }
      } else if (Op0I->getOpcode() == Instruction::Xor) {
@@ -2881,6 +3041,15 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
          return ReplaceInstUsesWith(I, Op0I->getOperand(1));
        else if (Op1 == Op0I->getOperand(1))                   // (B^A)^A == B
          return ReplaceInstUsesWith(I, Op0I->getOperand(0));
+    } else if (Op0I->getOpcode() == Instruction::And && Op0I->hasOneUse()) {
+      if (Op0I->getOperand(0) == Op1)                      // (A&B)^A -> (B&A)^A
+        Op0I->swapOperands();
+      if (Op0I->getOperand(1) == Op1 &&                    // (B&A)^A == ~B & A
+          !isa<ConstantInt>(Op1)) {  // Canonical form is (B&C)^C
+        Instruction *N = BinaryOperator::createNot(Op0I->getOperand(0), "tmp");
+        InsertNewInstBefore(N, I);
+        return BinaryOperator::createAnd(N, Op1);
+      }
      }
  
    // (setcc1 A, B) ^ (setcc2 A, B) --> (setcc3 A, B)
@@ -2888,6 +3057,19 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
      if (Instruction *R = AssociativeOpt(I, FoldSetCCLogical(*this, RHS)))
        return R;
  
+  // fold (xor (cast A), (cast B)) -> (cast (xor A, B))
+  if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
+    if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
+      if (Op0C->getOperand(0)->getType() == Op1C->getOperand(0)->getType() &&
+          Op0C->getOperand(0)->getType()->isIntegral()) {
+        Instruction *NewOp = BinaryOperator::createXor(Op0C->getOperand(0),
+                                                       Op1C->getOperand(0),
+                                                       I.getName());
+        InsertNewInstBefore(NewOp, I);
+        return new CastInst(NewOp, I.getType());
+      }
+  }
+    
    return Changed ? &I : 0;
  }
  
@@ -3844,6 +4026,32 @@ Instruction *InstCombiner::visitSetCondInst(SetCondInst &I) {
        if (Instruction *R = visitSetCondInstWithCastAndCast(I))
          return R;
    }
+  
+  if (I.getOpcode() == Instruction::SetNE ||
+      I.getOpcode() == Instruction::SetEQ) {
+    Value *A, *B;
+    if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
+        (A == Op1 || B == Op1)) {
+      // (A^B) == A  ->  B == 0
+      Value *OtherVal = A == Op1 ? B : A;
+      return BinaryOperator::create(I.getOpcode(), OtherVal,
+                                    Constant::getNullValue(A->getType()));
+    } else if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
+               (A == Op0 || B == Op0)) {
+      // A == (A^B)  ->  B == 0
+      Value *OtherVal = A == Op0 ? B : A;
+      return BinaryOperator::create(I.getOpcode(), OtherVal,
+                                    Constant::getNullValue(A->getType()));
+    } else if (match(Op0, m_Sub(m_Value(A), m_Value(B))) && A == Op1) {
+      // (A-B) == A  ->  B == 0
+      return BinaryOperator::create(I.getOpcode(), B,
+                                    Constant::getNullValue(B->getType()));
+    } else if (match(Op1, m_Sub(m_Value(A), m_Value(B))) && A == Op0) {
+      // A == (A-B)  ->  B == 0
+      return BinaryOperator::create(I.getOpcode(), B,
+                                    Constant::getNullValue(B->getType()));
+    }
+  }
    return Changed ? &I : 0;
  }
  
@@ -4271,7 +4479,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantUInt *Op1,
        // this case, C1 == C2 and C1 is 8, 16, or 32.
        if (ShiftAmt1 == ShiftAmt2) {
          const Type *SExtType = 0;
-        switch (ShiftAmt1) {
+        switch (Op0->getType()->getPrimitiveSizeInBits() - ShiftAmt1) {
          case 8 : SExtType = Type::SByteTy; break;
          case 16: SExtType = Type::ShortTy; break;
          case 32: SExtType = Type::IntTy; break;
@@ -4380,6 +4588,10 @@ static bool isEliminableCastOfCast(const Type *SrcTy, const Type *MidTy,
        SrcTy->getPrimitiveSize() < MidTy->getPrimitiveSize())
      return true;
    
+  // Packed type conversions don't modify bits.
+  if (isa<PackedType>(SrcTy) && isa<PackedType>(MidTy) &&isa<PackedType>(DstTy))
+    return true;
+  
    return false;
  }
  
@@ -4646,7 +4858,30 @@ Instruction *InstCombiner::visitCastInst(CastInst &CI) {
    if (isa<PHINode>(Src))
      if (Instruction *NV = FoldOpIntoPhi(CI))
        return NV;
+  
+  // If the source and destination are pointers, and this cast is equivalent to
+  // a getelementptr X, 0, 0, 0...  turn it into the appropriate getelementptr.
+  // This can enhance SROA and other transforms that want type-safe pointers.
+  if (const PointerType *DstPTy = dyn_cast<PointerType>(CI.getType()))
+    if (const PointerType *SrcPTy = dyn_cast<PointerType>(Src->getType())) {
+      const Type *DstTy = DstPTy->getElementType();
+      const Type *SrcTy = SrcPTy->getElementType();
+      
+      Constant *ZeroUInt = Constant::getNullValue(Type::UIntTy);
+      unsigned NumZeros = 0;
+      while (SrcTy != DstTy && 
+             isa<CompositeType>(SrcTy) && !isa<PointerType>(SrcTy)) {
+        SrcTy = cast<CompositeType>(SrcTy)->getTypeAtIndex(ZeroUInt);
+        ++NumZeros;
+      }
  
+      // If we found a path from the src to dest, create the getelementptr now.
+      if (SrcTy == DstTy) {
+        std::vector<Value*> Idxs(NumZeros+1, ZeroUInt);
+        return new GetElementPtrInst(Src, Idxs);
+      }
+    }
+      
    // If the source value is an instruction with only this use, we can attempt to
    // propagate the cast into the instruction.  Also, only handle integral types
    // for now.
@@ -4719,63 +4954,61 @@ Instruction *InstCombiner::visitCastInst(CastInst &CI) {
          }
          break;
  
+      case Instruction::SetEQ:
        case Instruction::SetNE:
+        // We if we are just checking for a seteq of a single bit and casting it
+        // to an integer.  If so, shift the bit to the appropriate place then
+        // cast to integer to avoid the comparison.
          if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
-          if (Op1C->getRawValue() == 0) {
-            // If the input only has the low bit set, simplify directly.
-            Constant *Not1 =
-              ConstantExpr::getNot(ConstantInt::get(Op0->getType(), 1));
-            // cast (X != 0) to int  --> X if X&~1 == 0
-            if (MaskedValueIsZero(Op0, 
-                               cast<ConstantIntegral>(Not1)->getZExtValue())) {
-              if (CI.getType() == Op0->getType())
-                return ReplaceInstUsesWith(CI, Op0);
-              else
-                return new CastInst(Op0, CI.getType());
-            }
-
-            // If the input is an and with a single bit, shift then simplify.
-            ConstantInt *AndRHS;
-            if (match(Op0, m_And(m_Value(), m_ConstantInt(AndRHS))))
-              if (AndRHS->getRawValue() &&
-                  (AndRHS->getRawValue() & (AndRHS->getRawValue()-1)) == 0) {
-                unsigned ShiftAmt = Log2_64(AndRHS->getRawValue());
+          uint64_t Op1CV = Op1C->getZExtValue();
+          // cast (X == 0) to int --> X^1        iff X has only the low bit set.
+          // cast (X == 0) to int --> (X>>1)^1   iff X has only the 2nd bit set.
+          // cast (X == 1) to int --> X          iff X has only the low bit set.
+          // cast (X == 2) to int --> X>>1       iff X has only the 2nd bit set.
+          // cast (X != 0) to int --> X          iff X has only the low bit set.
+          // cast (X != 0) to int --> X>>1       iff X has only the 2nd bit set.
+          // cast (X != 1) to int --> X^1        iff X has only the low bit set.
+          // cast (X != 2) to int --> (X>>1)^1   iff X has only the 2nd bit set.
+          if (Op1CV == 0 || isPowerOf2_64(Op1CV)) {
+            // If Op1C some other power of two, convert:
+            uint64_t KnownZero, KnownOne;
+            uint64_t TypeMask = Op1->getType()->getIntegralTypeMask();
+            ComputeMaskedBits(Op0, TypeMask, KnownZero, KnownOne);
+            
+            if (isPowerOf2_64(KnownZero^TypeMask)) { // Exactly one possible 1?
+              bool isSetNE = SrcI->getOpcode() == Instruction::SetNE;
+              if (Op1CV && (Op1CV != (KnownZero^TypeMask))) {
+                // (X&4) == 2 --> false
+                // (X&4) != 2 --> true
+                Constant *Res = ConstantBool::get(isSetNE);
+                Res = ConstantExpr::getCast(Res, CI.getType());
+                return ReplaceInstUsesWith(CI, Res);
+              }
+              
+              unsigned ShiftAmt = Log2_64(KnownZero^TypeMask);
+              Value *In = Op0;
+              if (ShiftAmt) {
                  // Perform an unsigned shr by shiftamt.  Convert input to
                  // unsigned if it is signed.
-                Value *In = Op0;
                  if (In->getType()->isSigned())
                    In = InsertNewInstBefore(new CastInst(In,
                          In->getType()->getUnsignedVersion(), In->getName()),CI);
                  // Insert the shift to put the result in the low bit.
                  In = InsertNewInstBefore(new ShiftInst(Instruction::Shr, In,
-                                      ConstantInt::get(Type::UByteTy, ShiftAmt),
-                                                   In->getName()+".lobit"), CI);
-                if (CI.getType() == In->getType())
-                  return ReplaceInstUsesWith(CI, In);
-                else
-                  return new CastInst(In, CI.getType());
+                                     ConstantInt::get(Type::UByteTy, ShiftAmt),
+                                     In->getName()+".lobit"), CI);
                }
-          }
-        }
-        break;
-      case Instruction::SetEQ:
-        // We if we are just checking for a seteq of a single bit and casting it
-        // to an integer.  If so, shift the bit to the appropriate place then
-        // cast to integer to avoid the comparison.
-        if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
-          // Is Op1C a power of two or zero?
-          if ((Op1C->getRawValue() & Op1C->getRawValue()-1) == 0) {
-            // cast (X == 1) to int -> X iff X has only the low bit set.
-            if (Op1C->getRawValue() == 1) {
-              Constant *Not1 =
-                ConstantExpr::getNot(ConstantInt::get(Op0->getType(), 1));
-              if (MaskedValueIsZero(Op0, 
-                              cast<ConstantIntegral>(Not1)->getZExtValue())) {
-                if (CI.getType() == Op0->getType())
-                  return ReplaceInstUsesWith(CI, Op0);
-                else
-                  return new CastInst(Op0, CI.getType());
+              
+              if ((Op1CV != 0) == isSetNE) { // Toggle the low bit.
+                Constant *One = ConstantInt::get(In->getType(), 1);
+                In = BinaryOperator::createXor(In, One, "tmp");
+                InsertNewInstBefore(cast<Instruction>(In), CI);
                }
+              
+              if (CI.getType() == In->getType())
+                return ReplaceInstUsesWith(CI, In);
+              else
+                return new CastInst(In, CI.getType());
              }
            }
          }
@@ -5056,30 +5289,25 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
            }
  
            if (OtherAddOp) {
-            // So at this point we know we have:
-            //        select C, (add X, Y), (sub X, ?)
-            // We can do the transform profitably if either 'Y' = '?' or '?' is
-            // a constant.
-            if (SubOp->getOperand(1) == AddOp ||
-                isa<Constant>(SubOp->getOperand(1))) {
-              Value *NegVal;
-              if (Constant *C = dyn_cast<Constant>(SubOp->getOperand(1))) {
-                NegVal = ConstantExpr::getNeg(C);
-              } else {
-                NegVal = InsertNewInstBefore(
-                           BinaryOperator::createNeg(SubOp->getOperand(1)), SI);
-              }
+            // So at this point we know we have (Y -> OtherAddOp):
+            //        select C, (add X, Y), (sub X, Z)
+            Value *NegVal;  // Compute -Z
+            if (Constant *C = dyn_cast<Constant>(SubOp->getOperand(1))) {
+              NegVal = ConstantExpr::getNeg(C);
+            } else {
+              NegVal = InsertNewInstBefore(
+                    BinaryOperator::createNeg(SubOp->getOperand(1), "tmp"), SI);
+            }
  
-              Value *NewTrueOp = OtherAddOp;
-              Value *NewFalseOp = NegVal;
-              if (AddOp != TI)
-                std::swap(NewTrueOp, NewFalseOp);
-              Instruction *NewSel =
-                new SelectInst(CondVal, NewTrueOp,NewFalseOp,SI.getName()+".p");
+            Value *NewTrueOp = OtherAddOp;
+            Value *NewFalseOp = NegVal;
+            if (AddOp != TI)
+              std::swap(NewTrueOp, NewFalseOp);
+            Instruction *NewSel =
+              new SelectInst(CondVal, NewTrueOp,NewFalseOp,SI.getName()+".p");
  
-              NewSel = InsertNewInstBefore(NewSel, SI);
-              return BinaryOperator::createAdd(SubOp->getOperand(0), NewSel);
-            }
+            NewSel = InsertNewInstBefore(NewSel, SI);
+            return BinaryOperator::createAdd(SubOp->getOperand(0), NewSel);
            }
          }
        }
@@ -5155,6 +5383,68 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
    return 0;
  }
  
+/// GetKnownAlignment - If the specified pointer has an alignment that we can
+/// determine, return it, otherwise return 0.
+static unsigned GetKnownAlignment(Value *V, TargetData *TD) {
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) {
+    unsigned Align = GV->getAlignment();
+    if (Align == 0 && TD) 
+      Align = TD->getTypeAlignment(GV->getType()->getElementType());
+    return Align;
+  } else if (AllocationInst *AI = dyn_cast<AllocationInst>(V)) {
+    unsigned Align = AI->getAlignment();
+    if (Align == 0 && TD) {
+      if (isa<AllocaInst>(AI))
+        Align = TD->getTypeAlignment(AI->getType()->getElementType());
+      else if (isa<MallocInst>(AI)) {
+        // Malloc returns maximally aligned memory.
+        Align = TD->getTypeAlignment(AI->getType()->getElementType());
+        Align = std::max(Align, (unsigned)TD->getTypeAlignment(Type::DoubleTy));
+        Align = std::max(Align, (unsigned)TD->getTypeAlignment(Type::LongTy));
+      }
+    }
+    return Align;
+  } else if (isa<CastInst>(V) ||
+             (isa<ConstantExpr>(V) && 
+              cast<ConstantExpr>(V)->getOpcode() == Instruction::Cast)) {
+    User *CI = cast<User>(V);
+    if (isa<PointerType>(CI->getOperand(0)->getType()))
+      return GetKnownAlignment(CI->getOperand(0), TD);
+    return 0;
+  } else if (isa<GetElementPtrInst>(V) ||
+             (isa<ConstantExpr>(V) && 
+              cast<ConstantExpr>(V)->getOpcode()==Instruction::GetElementPtr)) {
+    User *GEPI = cast<User>(V);
+    unsigned BaseAlignment = GetKnownAlignment(GEPI->getOperand(0), TD);
+    if (BaseAlignment == 0) return 0;
+    
+    // If all indexes are zero, it is just the alignment of the base pointer.
+    bool AllZeroOperands = true;
+    for (unsigned i = 1, e = GEPI->getNumOperands(); i != e; ++i)
+      if (!isa<Constant>(GEPI->getOperand(i)) ||
+          !cast<Constant>(GEPI->getOperand(i))->isNullValue()) {
+        AllZeroOperands = false;
+        break;
+      }
+    if (AllZeroOperands)
+      return BaseAlignment;
+    
+    // Otherwise, if the base alignment is >= the alignment we expect for the
+    // base pointer type, then we know that the resultant pointer is aligned at
+    // least as much as its type requires.
+    if (!TD) return 0;
+
+    const Type *BasePtrTy = GEPI->getOperand(0)->getType();
+    if (TD->getTypeAlignment(cast<PointerType>(BasePtrTy)->getElementType())
+        <= BaseAlignment) {
+      const Type *GEPTy = GEPI->getType();
+      return TD->getTypeAlignment(cast<PointerType>(GEPTy)->getElementType());
+    }
+    return 0;
+  }
+  return 0;
+}
+
  
  /// visitCallInst - CallInst simplification.  This mostly only handles folding 
  /// of intrinsic instructions.  For normal calls, it allows visitCallSite to do
@@ -5173,8 +5463,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
      if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
        if (NumBytes->isNullValue()) return EraseInstFromFunction(CI);
  
-      // FIXME: Increase alignment here.
-
        if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
          if (CI->getRawValue() == 1) {
            // Replace the instruction with just byte operations.  We would
@@ -5186,30 +5474,129 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
      // If we have a memmove and the source operation is a constant global,
      // then the source and dest pointers can't alias, so we can change this
      // into a call to memcpy.
-    if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(II))
+    if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(II)) {
        if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
          if (GVSrc->isConstant()) {
            Module *M = CI.getParent()->getParent()->getParent();
-          Function *MemCpy = M->getOrInsertFunction("llvm.memcpy",
+          const char *Name;
+          if (CI.getCalledFunction()->getFunctionType()->getParamType(3) == 
+              Type::UIntTy)
+            Name = "llvm.memcpy.i32";
+          else
+            Name = "llvm.memcpy.i64";
+          Function *MemCpy = M->getOrInsertFunction(Name,
                                       CI.getCalledFunction()->getFunctionType());
            CI.setOperand(0, MemCpy);
            Changed = true;
          }
+    }
  
-    if (Changed) return II;
-  } else if (DbgStopPointInst *SPI = dyn_cast<DbgStopPointInst>(II)) {
-    // If this stoppoint is at the same source location as the previous
-    // stoppoint in the chain, it is not needed.
-    if (DbgStopPointInst *PrevSPI =
-        dyn_cast<DbgStopPointInst>(SPI->getChain()))
-      if (SPI->getLineNo() == PrevSPI->getLineNo() &&
-          SPI->getColNo() == PrevSPI->getColNo()) {
-        SPI->replaceAllUsesWith(PrevSPI);
-        return EraseInstFromFunction(CI);
+    // If we can determine a pointer alignment that is bigger than currently
+    // set, update the alignment.
+    if (isa<MemCpyInst>(MI) || isa<MemMoveInst>(MI)) {
+      unsigned Alignment1 = GetKnownAlignment(MI->getOperand(1), TD);
+      unsigned Alignment2 = GetKnownAlignment(MI->getOperand(2), TD);
+      unsigned Align = std::min(Alignment1, Alignment2);
+      if (MI->getAlignment()->getRawValue() < Align) {
+        MI->setAlignment(ConstantUInt::get(Type::UIntTy, Align));
+        Changed = true;
        }
+    } else if (isa<MemSetInst>(MI)) {
+      unsigned Alignment = GetKnownAlignment(MI->getDest(), TD);
+      if (MI->getAlignment()->getRawValue() < Alignment) {
+        MI->setAlignment(ConstantUInt::get(Type::UIntTy, Alignment));
+        Changed = true;
+      }
+    }
+          
+    if (Changed) return II;
    } else {
      switch (II->getIntrinsicID()) {
      default: break;
+    case Intrinsic::ppc_altivec_lvx:
+    case Intrinsic::ppc_altivec_lvxl:
+    case Intrinsic::x86_sse_loadu_ps:
+    case Intrinsic::x86_sse2_loadu_pd:
+    case Intrinsic::x86_sse2_loadu_dq:
+      // Turn PPC lvx     -> load if the pointer is known aligned.
+      // Turn X86 loadups -> load if the pointer is known aligned.
+      if (GetKnownAlignment(II->getOperand(1), TD) >= 16) {
+        Value *Ptr = InsertCastBefore(II->getOperand(1),
+                                      PointerType::get(II->getType()), CI);
+        return new LoadInst(Ptr);
+      }
+      break;
+    case Intrinsic::ppc_altivec_stvx:
+    case Intrinsic::ppc_altivec_stvxl:
+      // Turn stvx -> store if the pointer is known aligned.
+      if (GetKnownAlignment(II->getOperand(2), TD) >= 16) {
+        const Type *OpPtrTy = PointerType::get(II->getOperand(1)->getType());
+        Value *Ptr = InsertCastBefore(II->getOperand(2), OpPtrTy, CI);
+        return new StoreInst(II->getOperand(1), Ptr);
+      }
+      break;
+    case Intrinsic::x86_sse_storeu_ps:
+    case Intrinsic::x86_sse2_storeu_pd:
+    case Intrinsic::x86_sse2_storeu_dq:
+    case Intrinsic::x86_sse2_storel_dq:
+      // Turn X86 storeu -> store if the pointer is known aligned.
+      if (GetKnownAlignment(II->getOperand(1), TD) >= 16) {
+        const Type *OpPtrTy = PointerType::get(II->getOperand(2)->getType());
+        Value *Ptr = InsertCastBefore(II->getOperand(1), OpPtrTy, CI);
+        return new StoreInst(II->getOperand(2), Ptr);
+      }
+      break;
+    case Intrinsic::ppc_altivec_vperm:
+      // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
+      if (ConstantPacked *Mask = dyn_cast<ConstantPacked>(II->getOperand(3))) {
+        assert(Mask->getNumOperands() == 16 && "Bad type for intrinsic!");
+        
+        // Check that all of the elements are integer constants or undefs.
+        bool AllEltsOk = true;
+        for (unsigned i = 0; i != 16; ++i) {
+          if (!isa<ConstantInt>(Mask->getOperand(i)) && 
+              !isa<UndefValue>(Mask->getOperand(i))) {
+            AllEltsOk = false;
+            break;
+          }
+        }
+        
+        if (AllEltsOk) {
+          // Cast the input vectors to byte vectors.
+          Value *Op0 = InsertCastBefore(II->getOperand(1), Mask->getType(), CI);
+          Value *Op1 = InsertCastBefore(II->getOperand(2), Mask->getType(), CI);
+          Value *Result = UndefValue::get(Op0->getType());
+          
+          // Only extract each element once.
+          Value *ExtractedElts[32];
+          memset(ExtractedElts, 0, sizeof(ExtractedElts));
+          
+          for (unsigned i = 0; i != 16; ++i) {
+            if (isa<UndefValue>(Mask->getOperand(i)))
+              continue;
+            unsigned Idx =cast<ConstantInt>(Mask->getOperand(i))->getRawValue();
+            Idx &= 31;  // Match the hardware behavior.
+            
+            if (ExtractedElts[Idx] == 0) {
+              Instruction *Elt = 
+                new ExtractElementInst(Idx < 16 ? Op0 : Op1,
+                                       ConstantUInt::get(Type::UIntTy, Idx&15),
+                                       "tmp");
+              InsertNewInstBefore(Elt, CI);
+              ExtractedElts[Idx] = Elt;
+            }
+          
+            // Insert this value into the result vector.
+            Result = new InsertElementInst(Result, ExtractedElts[Idx],
+                                           ConstantUInt::get(Type::UIntTy, i),
+                                           "tmp");
+            InsertNewInstBefore(cast<Instruction>(Result), CI);
+          }
+          return new CastInst(Result, CI.getType());
+        }
+      }
+      break;
+
      case Intrinsic::stackrestore: {
        // If the save is right next to the restore, remove the restore.  This can
        // happen when variable allocas are DCE'd.
@@ -5338,8 +5725,10 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
    // Check to see if we are changing the return type...
    if (OldRetTy != FT->getReturnType()) {
      if (Callee->isExternal() &&
-        !OldRetTy->isLosslesslyConvertibleTo(FT->getReturnType()) &&
-        !Caller->use_empty())
+        !(OldRetTy->isLosslesslyConvertibleTo(FT->getReturnType()) ||
+          (isa<PointerType>(FT->getReturnType()) && 
+           TD->getIntPtrType()->isLosslesslyConvertibleTo(OldRetTy)))
+        && !Caller->use_empty())
        return false;   // Cannot transform this return value...
  
      // If the callsite is an invoke instruction, and the return value is used by
@@ -5985,7 +6374,8 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI) {
    if (const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) {
      const Type *SrcPTy = SrcTy->getElementType();
  
-    if (DestPTy->isInteger() || isa<PointerType>(DestPTy)) {
+    if (DestPTy->isInteger() || isa<PointerType>(DestPTy) || 
+        isa<PackedType>(DestPTy)) {
        // If the source is an array, the code below will not succeed.  Check to
        // see if a trivial 'gep P, 0, 0' will help matters.  Only do this for
        // constants.
@@ -5998,7 +6388,8 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI) {
              SrcPTy = SrcTy->getElementType();
            }
  
-      if ((SrcPTy->isInteger() || isa<PointerType>(SrcPTy)) &&
+      if ((SrcPTy->isInteger() || isa<PointerType>(SrcPTy) || 
+           isa<PackedType>(SrcPTy)) &&
            // Do not allow turning this into a load of an integer, which is then
            // casted to a pointer, this pessimizes pointer analysis a lot.
            (isa<PointerType>(SrcPTy) == isa<PointerType>(LI.getType())) &&
@@ -6434,42 +6825,134 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
    return 0;
  }
  
-Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
-  if (ConstantAggregateZero *C = 
-      dyn_cast<ConstantAggregateZero>(EI.getOperand(0))) {
-    // If packed val is constant 0, replace extract with scalar 0
-    const Type *Ty = cast<PackedType>(C->getType())->getElementType();
-    EI.replaceAllUsesWith(Constant::getNullValue(Ty));
-    return ReplaceInstUsesWith(EI, Constant::getNullValue(Ty));
+/// CheapToScalarize - Return true if the value is cheaper to scalarize than it
+/// is to leave as a vector operation.
+static bool CheapToScalarize(Value *V, bool isConstant) {
+  if (isa<ConstantAggregateZero>(V)) 
+    return true;
+  if (ConstantPacked *C = dyn_cast<ConstantPacked>(V)) {
+    if (isConstant) return true;
+    // If all elts are the same, we can extract.
+    Constant *Op0 = C->getOperand(0);
+    for (unsigned i = 1; i < C->getNumOperands(); ++i)
+      if (C->getOperand(i) != Op0)
+        return false;
+    return true;
    }
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
+  
+  // Insert element gets simplified to the inserted element or is deleted if
+  // this is constant idx extract element and its a constant idx insertelt.
+  if (I->getOpcode() == Instruction::InsertElement && isConstant &&
+      isa<ConstantInt>(I->getOperand(2)))
+    return true;
+  if (I->getOpcode() == Instruction::Load && I->hasOneUse())
+    return true;
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I))
+    if (BO->hasOneUse() &&
+        (CheapToScalarize(BO->getOperand(0), isConstant) ||
+         CheapToScalarize(BO->getOperand(1), isConstant)))
+      return true;
+  
+  return false;
+}
+
+/// FindScalarElement - Given a vector and an element number, see if the scalar
+/// value is already around as a register, for example if it were inserted then
+/// extracted from the vector.
+static Value *FindScalarElement(Value *V, unsigned EltNo) {
+  assert(isa<PackedType>(V->getType()) && "Not looking at a vector?");
+  const PackedType *PTy = cast<PackedType>(V->getType());
+  unsigned Width = PTy->getNumElements();
+  if (EltNo >= Width)  // Out of range access.
+    return UndefValue::get(PTy->getElementType());
+  
+  if (isa<UndefValue>(V))
+    return UndefValue::get(PTy->getElementType());
+  else if (isa<ConstantAggregateZero>(V))
+    return Constant::getNullValue(PTy->getElementType());
+  else if (ConstantPacked *CP = dyn_cast<ConstantPacked>(V))
+    return CP->getOperand(EltNo);
+  else if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) {
+    // If this is an insert to a variable element, we don't know what it is.
+    if (!isa<ConstantUInt>(III->getOperand(2))) return 0;
+    unsigned IIElt = cast<ConstantUInt>(III->getOperand(2))->getValue();
+    
+    // If this is an insert to the element we are looking for, return the
+    // inserted value.
+    if (EltNo == IIElt) return III->getOperand(1);
+    
+    // Otherwise, the insertelement doesn't modify the value, recurse on its
+    // vector input.
+    return FindScalarElement(III->getOperand(0), EltNo);
+  } else if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V)) {
+    if (isa<ConstantAggregateZero>(SVI->getOperand(2))) {
+      return FindScalarElement(SVI->getOperand(0), 0);
+    } else if (ConstantPacked *CP = 
+                   dyn_cast<ConstantPacked>(SVI->getOperand(2))) {
+      if (isa<UndefValue>(CP->getOperand(EltNo)))
+        return UndefValue::get(PTy->getElementType());
+      unsigned InEl = cast<ConstantUInt>(CP->getOperand(EltNo))->getValue();
+      if (InEl < Width)
+        return FindScalarElement(SVI->getOperand(0), InEl);
+      else
+        return FindScalarElement(SVI->getOperand(1), InEl - Width);
+    }
+  }
+  
+  // Otherwise, we don't know.
+  return 0;
+}
+
+Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
+
+  // If packed val is undef, replace extract with scalar undef.
+  if (isa<UndefValue>(EI.getOperand(0)))
+    return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+
+  // If packed val is constant 0, replace extract with scalar 0.
+  if (isa<ConstantAggregateZero>(EI.getOperand(0)))
+    return ReplaceInstUsesWith(EI, Constant::getNullValue(EI.getType()));
+  
    if (ConstantPacked *C = dyn_cast<ConstantPacked>(EI.getOperand(0))) {
      // If packed val is constant with uniform operands, replace EI
      // with that operand
-    Constant *op0 = cast<Constant>(C->getOperand(0));
+    Constant *op0 = C->getOperand(0);
      for (unsigned i = 1; i < C->getNumOperands(); ++i)
-      if (C->getOperand(i) != op0) return 0;
-    return ReplaceInstUsesWith(EI, op0);
+      if (C->getOperand(i) != op0) {
+        op0 = 0; 
+        break;
+      }
+    if (op0)
+      return ReplaceInstUsesWith(EI, op0);
+  }
+  
+  // If extracting a specified index from the vector, see if we can recursively
+  // find a previously computed scalar that was inserted into the vector.
+  if (ConstantUInt *IdxC = dyn_cast<ConstantUInt>(EI.getOperand(1))) {
+    if (Value *Elt = FindScalarElement(EI.getOperand(0), IdxC->getValue()))
+      return ReplaceInstUsesWith(EI, Elt);
    }
+  
    if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0)))
      if (I->hasOneUse()) {
        // Push extractelement into predecessor operation if legal and
        // profitable to do so
        if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
-        if (!isa<Constant>(BO->getOperand(0)) &&
-            !isa<Constant>(BO->getOperand(1)))
-          return 0;
-        ExtractElementInst *newEI0 = 
-          new ExtractElementInst(BO->getOperand(0), EI.getOperand(1),
-                                 EI.getName());
-        ExtractElementInst *newEI1 =
-          new ExtractElementInst(BO->getOperand(1), EI.getOperand(1),
-                                 EI.getName());
-        InsertNewInstBefore(newEI0, EI);
-        InsertNewInstBefore(newEI1, EI);
-        return BinaryOperator::create(BO->getOpcode(), newEI0, newEI1);
-      }
-      switch(I->getOpcode()) {
-      case Instruction::Load: {
+        bool isConstantElt = isa<ConstantInt>(EI.getOperand(1));
+        if (CheapToScalarize(BO, isConstantElt)) {
+          ExtractElementInst *newEI0 = 
+            new ExtractElementInst(BO->getOperand(0), EI.getOperand(1),
+                                   EI.getName()+".lhs");
+          ExtractElementInst *newEI1 =
+            new ExtractElementInst(BO->getOperand(1), EI.getOperand(1),
+                                   EI.getName()+".rhs");
+          InsertNewInstBefore(newEI0, EI);
+          InsertNewInstBefore(newEI1, EI);
+          return BinaryOperator::create(BO->getOpcode(), newEI0, newEI1);
+        }
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
          Value *Ptr = InsertCastBefore(I->getOperand(0),
                                        PointerType::get(EI.getType()), EI);
          GetElementPtrInst *GEP = 
@@ -6477,15 +6960,308 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
                                  I->getName() + ".gep");
          InsertNewInstBefore(GEP, EI);
          return new LoadInst(GEP);
+      } else if (InsertElementInst *IE = dyn_cast<InsertElementInst>(I)) {
+        // Extracting the inserted element?
+        if (IE->getOperand(2) == EI.getOperand(1))
+          return ReplaceInstUsesWith(EI, IE->getOperand(1));
+        // If the inserted and extracted elements are constants, they must not
+        // be the same value, extract from the pre-inserted value instead.
+        if (isa<Constant>(IE->getOperand(2)) &&
+            isa<Constant>(EI.getOperand(1))) {
+          AddUsesToWorkList(EI);
+          EI.setOperand(0, IE->getOperand(0));
+          return &EI;
+        }
        }
-      default:
-        return 0;
+    }
+  return 0;
+}
+
+/// CollectSingleShuffleElements - If V is a shuffle of values that ONLY returns
+/// elements from either LHS or RHS, return the shuffle mask and true. 
+/// Otherwise, return false.
+static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
+                                         std::vector<Constant*> &Mask) {
+  assert(V->getType() == LHS->getType() && V->getType() == RHS->getType() &&
+         "Invalid CollectSingleShuffleElements");
+  unsigned NumElts = cast<PackedType>(V->getType())->getNumElements();
+
+  if (isa<UndefValue>(V)) {
+    Mask.assign(NumElts, UndefValue::get(Type::UIntTy));
+    return true;
+  } else if (V == LHS) {
+    for (unsigned i = 0; i != NumElts; ++i)
+      Mask.push_back(ConstantUInt::get(Type::UIntTy, i));
+    return true;
+  } else if (V == RHS) {
+    for (unsigned i = 0; i != NumElts; ++i)
+      Mask.push_back(ConstantUInt::get(Type::UIntTy, i+NumElts));
+    return true;
+  } else if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+    // If this is an insert of an extract from some other vector, include it.
+    Value *VecOp    = IEI->getOperand(0);
+    Value *ScalarOp = IEI->getOperand(1);
+    Value *IdxOp    = IEI->getOperand(2);
+    
+    if (!isa<ConstantInt>(IdxOp))
+      return false;
+    unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getRawValue();
+    
+    if (isa<UndefValue>(ScalarOp)) {  // inserting undef into vector.
+      // Okay, we can handle this if the vector we are insertinting into is
+      // transitively ok.
+      if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+        // If so, update the mask to reflect the inserted undef.
+        Mask[InsertedIdx] = UndefValue::get(Type::UIntTy);
+        return true;
+      }      
+    } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){
+      if (isa<ConstantInt>(EI->getOperand(1)) &&
+          EI->getOperand(0)->getType() == V->getType()) {
+        unsigned ExtractedIdx =
+          cast<ConstantInt>(EI->getOperand(1))->getRawValue();
+        
+        // This must be extracting from either LHS or RHS.
+        if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {
+          // Okay, we can handle this if the vector we are insertinting into is
+          // transitively ok.
+          if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+            // If so, update the mask to reflect the inserted value.
+            if (EI->getOperand(0) == LHS) {
+              Mask[InsertedIdx & (NumElts-1)] = 
+                 ConstantUInt::get(Type::UIntTy, ExtractedIdx);
+            } else {
+              assert(EI->getOperand(0) == RHS);
+              Mask[InsertedIdx & (NumElts-1)] = 
+                ConstantUInt::get(Type::UIntTy, ExtractedIdx+NumElts);
+              
+            }
+            return true;
+          }
+        }
+      }
+    }
+  }
+  // TODO: Handle shufflevector here!
+  
+  return false;
+}
+
+/// CollectShuffleElements - We are building a shuffle of V, using RHS as the
+/// RHS of the shuffle instruction, if it is not null.  Return a shuffle mask
+/// that computes V and the LHS value of the shuffle.
+static Value *CollectShuffleElements(Value *V, std::vector<Constant*> &Mask,
+                                     Value *&RHS) {
+  assert(isa<PackedType>(V->getType()) && 
+         (RHS == 0 || V->getType() == RHS->getType()) &&
+         "Invalid shuffle!");
+  unsigned NumElts = cast<PackedType>(V->getType())->getNumElements();
+
+  if (isa<UndefValue>(V)) {
+    Mask.assign(NumElts, UndefValue::get(Type::UIntTy));
+    return V;
+  } else if (isa<ConstantAggregateZero>(V)) {
+    Mask.assign(NumElts, ConstantUInt::get(Type::UIntTy, 0));
+    return V;
+  } else if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+    // If this is an insert of an extract from some other vector, include it.
+    Value *VecOp    = IEI->getOperand(0);
+    Value *ScalarOp = IEI->getOperand(1);
+    Value *IdxOp    = IEI->getOperand(2);
+    
+    if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
+      if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) &&
+          EI->getOperand(0)->getType() == V->getType()) {
+        unsigned ExtractedIdx =
+          cast<ConstantInt>(EI->getOperand(1))->getRawValue();
+        unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getRawValue();
+        
+        // Either the extracted from or inserted into vector must be RHSVec,
+        // otherwise we'd end up with a shuffle of three inputs.
+        if (EI->getOperand(0) == RHS || RHS == 0) {
+          RHS = EI->getOperand(0);
+          Value *V = CollectShuffleElements(VecOp, Mask, RHS);
+          Mask[InsertedIdx & (NumElts-1)] = 
+            ConstantUInt::get(Type::UIntTy, NumElts+ExtractedIdx);
+          return V;
+        }
+        
+        if (VecOp == RHS) {
+          Value *V = CollectShuffleElements(EI->getOperand(0), Mask, RHS);
+          // Everything but the extracted element is replaced with the RHS.
+          for (unsigned i = 0; i != NumElts; ++i) {
+            if (i != InsertedIdx)
+              Mask[i] = ConstantUInt::get(Type::UIntTy, NumElts+i);
+          }
+          return V;
+        }
+        
+        // If this insertelement is a chain that comes from exactly these two
+        // vectors, return the vector and the effective shuffle.
+        if (CollectSingleShuffleElements(IEI, EI->getOperand(0), RHS, Mask))
+          return EI->getOperand(0);
+        
        }
      }
+  }
+  // TODO: Handle shufflevector here!
+  
+  // Otherwise, can't do anything fancy.  Return an identity vector.
+  for (unsigned i = 0; i != NumElts; ++i)
+    Mask.push_back(ConstantUInt::get(Type::UIntTy, i));
+  return V;
+}
+
+Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
+  Value *VecOp    = IE.getOperand(0);
+  Value *ScalarOp = IE.getOperand(1);
+  Value *IdxOp    = IE.getOperand(2);
+  
+  // If the inserted element was extracted from some other vector, and if the 
+  // indexes are constant, try to turn this into a shufflevector operation.
+  if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
+    if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) &&
+        EI->getOperand(0)->getType() == IE.getType()) {
+      unsigned NumVectorElts = IE.getType()->getNumElements();
+      unsigned ExtractedIdx=cast<ConstantInt>(EI->getOperand(1))->getRawValue();
+      unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getRawValue();
+      
+      if (ExtractedIdx >= NumVectorElts) // Out of range extract.
+        return ReplaceInstUsesWith(IE, VecOp);
+      
+      if (InsertedIdx >= NumVectorElts)  // Out of range insert.
+        return ReplaceInstUsesWith(IE, UndefValue::get(IE.getType()));
+      
+      // If we are extracting a value from a vector, then inserting it right
+      // back into the same place, just use the input vector.
+      if (EI->getOperand(0) == VecOp && ExtractedIdx == InsertedIdx)
+        return ReplaceInstUsesWith(IE, VecOp);      
+      
+      // We could theoretically do this for ANY input.  However, doing so could
+      // turn chains of insertelement instructions into a chain of shufflevector
+      // instructions, and right now we do not merge shufflevectors.  As such,
+      // only do this in a situation where it is clear that there is benefit.
+      if (isa<UndefValue>(VecOp) || isa<ConstantAggregateZero>(VecOp)) {
+        // Turn this into shuffle(EIOp0, VecOp, Mask).  The result has all of
+        // the values of VecOp, except then one read from EIOp0.
+        // Build a new shuffle mask.
+        std::vector<Constant*> Mask;
+        if (isa<UndefValue>(VecOp))
+          Mask.assign(NumVectorElts, UndefValue::get(Type::UIntTy));
+        else {
+          assert(isa<ConstantAggregateZero>(VecOp) && "Unknown thing");
+          Mask.assign(NumVectorElts, ConstantUInt::get(Type::UIntTy,
+                                                       NumVectorElts));
+        } 
+        Mask[InsertedIdx] = ConstantUInt::get(Type::UIntTy, ExtractedIdx);
+        return new ShuffleVectorInst(EI->getOperand(0), VecOp,
+                                     ConstantPacked::get(Mask));
+      }
+      
+      // If this insertelement isn't used by some other insertelement, turn it
+      // (and any insertelements it points to), into one big shuffle.
+      if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.use_back())) {
+        std::vector<Constant*> Mask;
+        Value *RHS = 0;
+        Value *LHS = CollectShuffleElements(&IE, Mask, RHS);
+        if (RHS == 0) RHS = UndefValue::get(LHS->getType());
+        // We now have a shuffle of LHS, RHS, Mask.
+        return new ShuffleVectorInst(LHS, RHS, ConstantPacked::get(Mask));
+      }
+    }
+  }
+
    return 0;
  }
  
  
+Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+  Value *LHS = SVI.getOperand(0);
+  Value *RHS = SVI.getOperand(1);
+  Constant *Mask = cast<Constant>(SVI.getOperand(2));
+
+  bool MadeChange = false;
+  
+  if (isa<UndefValue>(Mask))
+    return ReplaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));
+  
+  // TODO: If we have shuffle(x, undef, mask) and any elements of mask refer to
+  // the undef, change them to undefs.
+  
+  // Canonicalize shuffle(x,x) -> shuffle(x,undef)
+  if (LHS == RHS) {
+    if (isa<UndefValue>(LHS)) {
+      // shuffle(undef,undef,mask) -> undef.
+      return ReplaceInstUsesWith(SVI, LHS);
+    }
+    
+    if (!isa<ConstantAggregateZero>(Mask)) {
+      // Remap any references to RHS to use LHS.
+      ConstantPacked *CP = cast<ConstantPacked>(Mask);
+      std::vector<Constant*> Elts;
+      for (unsigned i = 0, e = CP->getNumOperands(); i != e; ++i) {
+        Elts.push_back(CP->getOperand(i));
+        if (isa<UndefValue>(CP->getOperand(i)))
+          continue;
+        unsigned MV = cast<ConstantInt>(CP->getOperand(i))->getRawValue();
+        if (MV >= e)
+          Elts.back() = ConstantUInt::get(Type::UIntTy, MV & (e-1));
+      }
+      Mask = ConstantPacked::get(Elts);
+    }
+    SVI.setOperand(1, UndefValue::get(RHS->getType()));
+    SVI.setOperand(2, Mask);
+    MadeChange = true;
+  }
+  
+  // Canonicalize shuffle(undef,x,mask) -> shuffle(x, undef,mask').
+  if (isa<UndefValue>(LHS)) {
+    // shuffle(undef,x,<0,0,0,0>) -> undef.
+    if (isa<ConstantAggregateZero>(Mask))
+      return ReplaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));
+    
+    ConstantPacked *CPM = cast<ConstantPacked>(Mask);
+    std::vector<Constant*> Elts;
+    for (unsigned i = 0, e = CPM->getNumOperands(); i != e; ++i) {
+      if (isa<UndefValue>(CPM->getOperand(i)))
+        Elts.push_back(CPM->getOperand(i));
+      else {
+        unsigned EltNo = cast<ConstantUInt>(CPM->getOperand(i))->getRawValue();
+        if (EltNo >= e)
+          Elts.push_back(ConstantUInt::get(Type::UIntTy, EltNo-e));
+        else               // Referring to the undef.
+          Elts.push_back(UndefValue::get(Type::UIntTy));
+      }
+    }
+    return new ShuffleVectorInst(RHS, LHS, ConstantPacked::get(Elts));
+  }
+  
+  if (ConstantPacked *CP = dyn_cast<ConstantPacked>(Mask)) {
+    bool isLHSID = true, isRHSID = true;
+    
+    // Analyze the shuffle.
+    for (unsigned i = 0, e = CP->getNumOperands(); i != e; ++i) {
+      if (isa<UndefValue>(CP->getOperand(i)))
+        continue;
+      unsigned MV = cast<ConstantInt>(CP->getOperand(i))->getRawValue();
+      
+      // Is this an identity shuffle of the LHS value?
+      isLHSID &= (MV == i);
+      
+      // Is this an identity shuffle of the RHS value?
+      isRHSID &= (MV-e == i);
+    }
+
+    // Eliminate identity shuffles.
+    if (isLHSID) return ReplaceInstUsesWith(SVI, LHS);
+    if (isRHSID) return ReplaceInstUsesWith(SVI, RHS);
+  }
+  
+  return MadeChange ? &SVI : 0;
+}
+
+
+
  void InstCombiner::removeFromWorkList(Instruction *I) {
    WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), I),
                   WorkList.end());