[InstCombine] canonicalize (bitcast (extractelement X)) --> (extractelement(bitcast X))

[oota-llvm.git] / lib / Transforms / InstCombine / InstCombineVectorOps.cpp
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp

index c0b9b2fc3e5dae2bef03a3e8736c11350612d512..8704fcc0fd178e97fb177b7d4250b985c23e77cc 100644 (file)
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -12,24 +12,31 @@
  //
  //===----------------------------------------------------------------------===//
  
-#include "InstCombine.h"
-#include "llvm/Support/PatternMatch.h"
+#include "InstCombineInternal.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace PatternMatch;
  
-/// CheapToScalarize - Return true if the value is cheaper to scalarize than it
-/// is to leave as a vector operation.  isConstant indicates whether we're
-/// extracting one known element.  If false we're extracting a variable index.
-static bool CheapToScalarize(Value *V, bool isConstant) {
+#define DEBUG_TYPE "instcombine"
+
+/// Return true if the value is cheaper to scalarize than it is to leave as a
+/// vector operation. isConstant indicates whether we're extracting one known
+/// element. If false we're extracting a variable index.
+static bool cheapToScalarize(Value *V, bool isConstant) {
    if (Constant *C = dyn_cast<Constant>(V)) {
      if (isConstant) return true;
  
      // If all elts are the same, we can extract it and use any of the values.
-    Constant *Op0 = C->getAggregateElement(0U);
-    for (unsigned i = 1, e = V->getType()->getVectorNumElements(); i != e; ++i)
-      if (C->getAggregateElement(i) != Op0)
-        return false;
-    return true;
+    if (Constant *Op0 = C->getAggregateElement(0U)) {
+      for (unsigned i = 1, e = V->getType()->getVectorNumElements(); i != e;
+           ++i)
+        if (C->getAggregateElement(i) != Op0)
+          return false;
+      return true;
+    }
    }
    Instruction *I = dyn_cast<Instruction>(V);
    if (!I) return false;
@@ -43,80 +50,30 @@ static bool CheapToScalarize(Value *V, bool isConstant) {
      return true;
    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I))
      if (BO->hasOneUse() &&
-        (CheapToScalarize(BO->getOperand(0), isConstant) ||
-         CheapToScalarize(BO->getOperand(1), isConstant)))
+        (cheapToScalarize(BO->getOperand(0), isConstant) ||
+         cheapToScalarize(BO->getOperand(1), isConstant)))
        return true;
    if (CmpInst *CI = dyn_cast<CmpInst>(I))
      if (CI->hasOneUse() &&
-        (CheapToScalarize(CI->getOperand(0), isConstant) ||
-         CheapToScalarize(CI->getOperand(1), isConstant)))
+        (cheapToScalarize(CI->getOperand(0), isConstant) ||
+         cheapToScalarize(CI->getOperand(1), isConstant)))
        return true;
  
    return false;
  }
  
-/// FindScalarElement - Given a vector and an element number, see if the scalar
-/// value is already around as a register, for example if it were inserted then
-/// extracted from the vector.
-static Value *FindScalarElement(Value *V, unsigned EltNo) {
-  assert(V->getType()->isVectorTy() && "Not looking at a vector?");
-  VectorType *VTy = cast<VectorType>(V->getType());
-  unsigned Width = VTy->getNumElements();
-  if (EltNo >= Width)  // Out of range access.
-    return UndefValue::get(VTy->getElementType());
-
-  if (Constant *C = dyn_cast<Constant>(V))
-    return C->getAggregateElement(EltNo);
-
-  if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) {
-    // If this is an insert to a variable element, we don't know what it is.
-    if (!isa<ConstantInt>(III->getOperand(2)))
-      return 0;
-    unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue();
-
-    // If this is an insert to the element we are looking for, return the
-    // inserted value.
-    if (EltNo == IIElt)
-      return III->getOperand(1);
-
-    // Otherwise, the insertelement doesn't modify the value, recurse on its
-    // vector input.
-    return FindScalarElement(III->getOperand(0), EltNo);
-  }
-
-  if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V)) {
-    unsigned LHSWidth = SVI->getOperand(0)->getType()->getVectorNumElements();
-    int InEl = SVI->getMaskValue(EltNo);
-    if (InEl < 0)
-      return UndefValue::get(VTy->getElementType());
-    if (InEl < (int)LHSWidth)
-      return FindScalarElement(SVI->getOperand(0), InEl);
-    return FindScalarElement(SVI->getOperand(1), InEl - LHSWidth);
-  }
-
-  // Extract a value from a vector add operation with a constant zero.
-  Value *Val = 0; Constant *Con = 0;
-  if (match(V, m_Add(m_Value(Val), m_Constant(Con)))) {
-    if (Con->getAggregateElement(EltNo)->isNullValue())
-      return FindScalarElement(Val, EltNo);
-  }
-
-  // Otherwise, we don't know.
-  return 0;
-}
-
  // If we have a PHI node with a vector type that has only 2 uses: feed
  // itself and be an operand of extractelement at a constant location,
  // try to replace the PHI of the vector type with a PHI of a scalar type.
  Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
    // Verify that the PHI node has exactly 2 uses. Otherwise return NULL.
    if (!PN->hasNUses(2))
-    return NULL;
+    return nullptr;
  
    // If so, it's known at this point that one operand is PHI and the other is
    // an extractelement node. Find the PHI user that is not the extractelement
    // node.
-  Value::use_iterator iu = PN->use_begin();
+  auto iu = PN->user_begin();
    Instruction *PHIUser = dyn_cast<Instruction>(*iu);
    if (PHIUser == cast<Instruction>(&EI))
      PHIUser = cast<Instruction>(*(++iu));
@@ -124,9 +81,9 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
    // Verify that this PHI user has one use, which is the PHI itself,
    // and that it is a binary operation which is cheap to scalarize.
    // otherwise return NULL.
-  if (!PHIUser->hasOneUse() || !(PHIUser->use_back() == PN) ||
-      !(isa<BinaryOperator>(PHIUser)) || !CheapToScalarize(PHIUser, true))
-    return NULL;
+  if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) ||
+      !(isa<BinaryOperator>(PHIUser)) || !cheapToScalarize(PHIUser, true))
+    return nullptr;
  
    // Create a scalar PHI node that will replace the vector PHI node
    // just before the current PHI node.
@@ -140,7 +97,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
      // If the operand is the PHI induction variable:
      if (PHIInVal == PHIUser) {
        // Scalarize the binary operation. Its first operand is the
-      // scalar PHI and the second operand is extracted from the other
+      // scalar PHI, and the second operand is extracted from the other
        // vector operand.
        BinaryOperator *B0 = cast<BinaryOperator>(PHIUser);
        unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0;
@@ -158,8 +115,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
        Instruction *pos = dyn_cast<Instruction>(PHIInVal);
        BasicBlock::iterator InsertPos;
        if (pos && !isa<PHINode>(pos)) {
-        InsertPos = pos;
-        ++InsertPos;
+        InsertPos = ++pos->getIterator();
        } else {
          InsertPos = inBB->getFirstInsertionPt();
        }
@@ -173,10 +129,14 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
  }
  
  Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
+  if (Value *V = SimplifyExtractElementInst(
+          EI.getVectorOperand(), EI.getIndexOperand(), DL, TLI, DT, AC))
+    return ReplaceInstUsesWith(EI, V);
+
    // If vector val is constant with all elements the same, replace EI with
    // that element.  We handle a known element # below.
    if (Constant *C = dyn_cast<Constant>(EI.getOperand(0)))
-    if (CheapToScalarize(C, false))
+    if (cheapToScalarize(C, false))
        return ReplaceInstUsesWith(EI, C->getAggregateElement(0U));
  
    // If extracting a specified index from the vector, see if we can recursively
@@ -185,10 +145,8 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
      unsigned IndexVal = IdxC->getZExtValue();
      unsigned VectorWidth = EI.getVectorOperandType()->getNumElements();
  
-    // If this is extracting an invalid index, turn this into undef, to avoid
-    // crashing the code below.
-    if (IndexVal >= VectorWidth)
-      return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+    // InstSimplify handles cases where the index is invalid.
+    assert(IndexVal < VectorWidth);
  
      // This instruction only demands the single element from the input vector.
      // If the input vector has a single use, simplify it based on this use
@@ -197,23 +155,20 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
        APInt UndefElts(VectorWidth, 0);
        APInt DemandedMask(VectorWidth, 0);
        DemandedMask.setBit(IndexVal);
-      if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0),
-                                                DemandedMask, UndefElts)) {
+      if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0), DemandedMask,
+                                                UndefElts)) {
          EI.setOperand(0, V);
          return &EI;
        }
      }
  
-    if (Value *Elt = FindScalarElement(EI.getOperand(0), IndexVal))
-      return ReplaceInstUsesWith(EI, Elt);
-
-    // If the this extractelement is directly using a bitcast from a vector of
+    // If this extractelement is directly using a bitcast from a vector of
      // the same number of elements, see if we can find the source element from
      // it.  In this case, we will end up needing to bitcast the scalars.
      if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) {
        if (VectorType *VT = dyn_cast<VectorType>(BCI->getOperand(0)->getType()))
          if (VT->getNumElements() == VectorWidth)
-          if (Value *Elt = FindScalarElement(BCI->getOperand(0), IndexVal))
+          if (Value *Elt = findScalarElement(BCI->getOperand(0), IndexVal))
              return new BitCastInst(Elt, EI.getType());
      }
  
@@ -228,10 +183,10 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
  
    if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0))) {
      // Push extractelement into predecessor operation if legal and
-    // profitable to do so
+    // profitable to do so.
      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
        if (I->hasOneUse() &&
-          CheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) {
+          cheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) {
          Value *newEI0 =
            Builder->CreateExtractElement(BO->getOperand(0), EI.getOperand(1),
                                          EI.getName()+".lhs");
@@ -274,8 +229,9 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
                                                             SrcIdx, false));
        }
      } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
-      // Canonicalize extractelement(cast) -> cast(extractelement)
-      // bitcasts can change the number of vector elements and they cost nothing
+      // Canonicalize extractelement(cast) -> cast(extractelement).
+      // Bitcasts can change the number of vector elements, and they cost
+      // nothing.
        if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
          Value *EE = Builder->CreateExtractElement(CI->getOperand(0),
                                                    EI.getIndexOperand());
@@ -289,7 +245,8 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
          // fight the vectorizer.
  
          // If we are extracting an element from a vector select or a select on
-        // vectors, a select on the scalars extracted from the vector arguments.
+        // vectors, create a select on the scalars extracted from the vector
+        // arguments.
          Value *TrueVal = SI->getTrueValue();
          Value *FalseVal = SI->getFalseValue();
  
@@ -316,15 +273,14 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
        }
      }
    }
-  return 0;
+  return nullptr;
  }
  
-/// CollectSingleShuffleElements - If V is a shuffle of values that ONLY returns
-/// elements from either LHS or RHS, return the shuffle mask and true.
-/// Otherwise, return false.
-static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
+/// If V is a shuffle of values that ONLY returns elements from either LHS or
+/// RHS, return the shuffle mask and true. Otherwise, return false.
+static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
                                           SmallVectorImpl<Constant*> &Mask) {
-  assert(V->getType() == LHS->getType() && V->getType() == RHS->getType() &&
+  assert(LHS->getType() == RHS->getType() &&
           "Invalid CollectSingleShuffleElements");
    unsigned NumElts = V->getType()->getVectorNumElements();
  
@@ -357,24 +313,24 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
      unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
  
      if (isa<UndefValue>(ScalarOp)) {  // inserting undef into vector.
-      // Okay, we can handle this if the vector we are insertinting into is
+      // We can handle this if the vector we are inserting into is
        // transitively ok.
-      if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+      if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
          // If so, update the mask to reflect the inserted undef.
          Mask[InsertedIdx] = UndefValue::get(Type::getInt32Ty(V->getContext()));
          return true;
        }
      } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){
-      if (isa<ConstantInt>(EI->getOperand(1)) &&
-          EI->getOperand(0)->getType() == V->getType()) {
+      if (isa<ConstantInt>(EI->getOperand(1))) {
          unsigned ExtractedIdx =
          cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+        unsigned NumLHSElts = LHS->getType()->getVectorNumElements();
  
          // This must be extracting from either LHS or RHS.
          if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {
-          // Okay, we can handle this if the vector we are insertinting into is
+          // We can handle this if the vector we are inserting into is
            // transitively ok.
-          if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+          if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
              // If so, update the mask to reflect the inserted value.
              if (EI->getOperand(0) == LHS) {
                Mask[InsertedIdx % NumElts] =
@@ -384,7 +340,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
                assert(EI->getOperand(0) == RHS);
                Mask[InsertedIdx % NumElts] =
                ConstantInt::get(Type::getInt32Ty(V->getContext()),
-                               ExtractedIdx+NumElts);
+                               ExtractedIdx + NumLHSElts);
              }
              return true;
            }
@@ -392,29 +348,36 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
        }
      }
    }
-  // TODO: Handle shufflevector here!
  
    return false;
  }
  
-/// CollectShuffleElements - We are building a shuffle of V, using RHS as the
-/// RHS of the shuffle instruction, if it is not null.  Return a shuffle mask
-/// that computes V and the LHS value of the shuffle.
-static Value *CollectShuffleElements(Value *V, SmallVectorImpl<Constant*> &Mask,
-                                     Value *&RHS) {
-  assert(V->getType()->isVectorTy() &&
-         (RHS == 0 || V->getType() == RHS->getType()) &&
-         "Invalid shuffle!");
+
+/// We are building a shuffle to create V, which is a sequence of insertelement,
+/// extractelement pairs. If PermittedRHS is set, then we must either use it or
+/// not rely on the second vector source. Return a std::pair containing the
+/// left and right vectors of the proposed shuffle (the right one may be
+/// nullptr), and set the Mask parameter as required.
+///
+/// Note: we intentionally don't try to fold earlier shuffles since they have
+/// often been chosen carefully to be efficiently implementable on the target.
+typedef std::pair<Value *, Value *> ShuffleOps;
+
+static ShuffleOps collectShuffleElements(Value *V,
+                                         SmallVectorImpl<Constant *> &Mask,
+                                         Value *PermittedRHS) {
+  assert(V->getType()->isVectorTy() && "Invalid shuffle!");
    unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
  
    if (isa<UndefValue>(V)) {
      Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext())));
-    return V;
+    return std::make_pair(
+        PermittedRHS ? UndefValue::get(PermittedRHS->getType()) : V, nullptr);
    }
  
    if (isa<ConstantAggregateZero>(V)) {
      Mask.assign(NumElts, ConstantInt::get(Type::getInt32Ty(V->getContext()),0));
-    return V;
+    return std::make_pair(V, nullptr);
    }
  
    if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
@@ -424,51 +387,94 @@ static Value *CollectShuffleElements(Value *V, SmallVectorImpl<Constant*> &Mask,
      Value *IdxOp    = IEI->getOperand(2);
  
      if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
-      if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) &&
-          EI->getOperand(0)->getType() == V->getType()) {
+      if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) {
          unsigned ExtractedIdx =
            cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
          unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
  
          // Either the extracted from or inserted into vector must be RHSVec,
          // otherwise we'd end up with a shuffle of three inputs.
-        if (EI->getOperand(0) == RHS || RHS == 0) {
-          RHS = EI->getOperand(0);
-          Value *V = CollectShuffleElements(VecOp, Mask, RHS);
+        if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) {
+          Value *RHS = EI->getOperand(0);
+          ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS);
+          assert(LR.second == nullptr || LR.second == RHS);
+
+          if (LR.first->getType() != RHS->getType()) {
+            // We tried our best, but we can't find anything compatible with RHS
+            // further up the chain. Return a trivial shuffle.
+            for (unsigned i = 0; i < NumElts; ++i)
+              Mask[i] = ConstantInt::get(Type::getInt32Ty(V->getContext()), i);
+            return std::make_pair(V, nullptr);
+          }
+
+          unsigned NumLHSElts = RHS->getType()->getVectorNumElements();
            Mask[InsertedIdx % NumElts] =
              ConstantInt::get(Type::getInt32Ty(V->getContext()),
-                             NumElts+ExtractedIdx);
-          return V;
+                             NumLHSElts+ExtractedIdx);
+          return std::make_pair(LR.first, RHS);
          }
  
-        if (VecOp == RHS) {
-          Value *V = CollectShuffleElements(EI->getOperand(0), Mask, RHS);
-          // Update Mask to reflect that `ScalarOp' has been inserted at
-          // position `InsertedIdx' within the vector returned by IEI.
-          Mask[InsertedIdx % NumElts] = Mask[ExtractedIdx];
-
-          // Everything but the extracted element is replaced with the RHS.
-          for (unsigned i = 0; i != NumElts; ++i) {
-            if (i != InsertedIdx)
-              Mask[i] = ConstantInt::get(Type::getInt32Ty(V->getContext()),
-                                         NumElts+i);
-          }
-          return V;
+        if (VecOp == PermittedRHS) {
+          // We've gone as far as we can: anything on the other side of the
+          // extractelement will already have been converted into a shuffle.
+          unsigned NumLHSElts =
+              EI->getOperand(0)->getType()->getVectorNumElements();
+          for (unsigned i = 0; i != NumElts; ++i)
+            Mask.push_back(ConstantInt::get(
+                Type::getInt32Ty(V->getContext()),
+                i == InsertedIdx ? ExtractedIdx : NumLHSElts + i));
+          return std::make_pair(EI->getOperand(0), PermittedRHS);
          }
  
          // If this insertelement is a chain that comes from exactly these two
          // vectors, return the vector and the effective shuffle.
-        if (CollectSingleShuffleElements(IEI, EI->getOperand(0), RHS, Mask))
-          return EI->getOperand(0);
+        if (EI->getOperand(0)->getType() == PermittedRHS->getType() &&
+            collectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS,
+                                         Mask))
+          return std::make_pair(EI->getOperand(0), PermittedRHS);
        }
      }
    }
-  // TODO: Handle shufflevector here!
  
-  // Otherwise, can't do anything fancy.  Return an identity vector.
+  // Otherwise, we can't do anything fancy. Return an identity vector.
    for (unsigned i = 0; i != NumElts; ++i)
      Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i));
-  return V;
+  return std::make_pair(V, nullptr);
+}
+
+/// Detect an insertvalue whose inserted element is overwritten further down a
+/// single-use chain of insertvalue instructions, for example:
+///  %0 = insertvalue { i8, i32 } undef, i8 %x, 0
+///  %1 = insertvalue { i8, i32 } %0,    i8 %y, 0
+/// %1 writes the same indices as %0, so %0 contributes nothing. All uses of
+/// such an instruction are replaced with its aggregate operand, which leaves:
+///  %0 = insertvalue { i8, i32 } undef, i8 %y, 0
+///
+/// Returns the result of ReplaceInstUsesWith when I is redundant, and nullptr
+/// otherwise.
+Instruction *InstCombiner::visitInsertValueInst(InsertValueInst &I) {
+  const ArrayRef<unsigned int> OwnIndices = I.getIndices();
+
+  // Follow the chain downwards: each link must have exactly one use, and that
+  // use must be another insertvalue taking the link as its aggregate operand.
+  // The walk covers at most 10 links.
+  Value *Link = &I;
+  for (unsigned Depth = 0; Depth < 10 && Link->hasOneUse(); ++Depth) {
+    User *SoleUser = Link->user_back();
+    auto *NextInsert = dyn_cast<InsertValueInst>(SoleUser);
+    if (!NextInsert || SoleUser->getOperand(0) != Link)
+      return nullptr;
+
+    // A later insert at the same indices overwrites what I inserted, so I can
+    // be bypassed in favour of its aggregate operand.
+    if (NextInsert->getIndices() == OwnIndices)
+      return ReplaceInstUsesWith(I, I.getOperand(0));
+
+    Link = NextInsert;
+  }
+
+  return nullptr;
+}
  
  Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
@@ -483,17 +489,18 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
    // If the inserted element was extracted from some other vector, and if the
    // indexes are constant, try to turn this into a shufflevector operation.
    if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
-    if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) &&
-        EI->getOperand(0)->getType() == IE.getType()) {
-      unsigned NumVectorElts = IE.getType()->getNumElements();
+    if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) {
+      unsigned NumInsertVectorElts = IE.getType()->getNumElements();
+      unsigned NumExtractVectorElts =
+          EI->getOperand(0)->getType()->getVectorNumElements();
        unsigned ExtractedIdx =
          cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
        unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
  
-      if (ExtractedIdx >= NumVectorElts) // Out of range extract.
+      if (ExtractedIdx >= NumExtractVectorElts) // Out of range extract.
          return ReplaceInstUsesWith(IE, VecOp);
  
-      if (InsertedIdx >= NumVectorElts)  // Out of range insert.
+      if (InsertedIdx >= NumInsertVectorElts)  // Out of range insert.
          return ReplaceInstUsesWith(IE, UndefValue::get(IE.getType()));
  
        // If we are extracting a value from a vector, then inserting it right
@@ -503,13 +510,19 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
  
        // If this insertelement isn't used by some other insertelement, turn it
        // (and any insertelements it points to), into one big shuffle.
-      if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.use_back())) {
+      if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.user_back())) {
          SmallVector<Constant*, 16> Mask;
-        Value *RHS = 0;
-        Value *LHS = CollectShuffleElements(&IE, Mask, RHS);
-        if (RHS == 0) RHS = UndefValue::get(LHS->getType());
-        // We now have a shuffle of LHS, RHS, Mask.
-        return new ShuffleVectorInst(LHS, RHS, ConstantVector::get(Mask));
+        ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr);
+
+        // The proposed shuffle may be trivial, in which case we shouldn't
+        // perform the combine.
+        if (LR.first != &IE && LR.second != &IE) {
+          // We now have a shuffle of LHS, RHS, Mask.
+          if (LR.second == nullptr)
+            LR.second = UndefValue::get(LR.first->getType());
+          return new ShuffleVectorInst(LR.first, LR.second,
+                                       ConstantVector::get(Mask));
+        }
        }
      }
    }
@@ -523,7 +536,7 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
      return &IE;
    }
  
-  return 0;
+  return nullptr;
  }
  
  /// Return true if we can evaluate the specified expression tree if the vector
@@ -575,8 +588,8 @@ static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask,
      case Instruction::FPTrunc:
      case Instruction::FPExt:
      case Instruction::GetElementPtr: {
-      for (int i = 0, e = I->getNumOperands(); i != e; ++i) {
-        if (!CanEvaluateShuffled(I->getOperand(i), Mask, Depth-1))
+      for (Value *Operand : I->operands()) {
+        if (!CanEvaluateShuffled(Operand, Mask, Depth-1))
            return false;
        }
        return true;
@@ -604,7 +617,7 @@ static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask,
  
  /// Rebuild a new instruction just like 'I' but with the new operands given.
  /// In the event of type mismatch, the type of the operands is correct.
-static Value *BuildNew(Instruction *I, ArrayRef<Value*> NewOps) {
+static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps) {
    // We don't want to use the IRBuilder here because we want the replacement
    // instructions to appear next to 'I', not the builder's insertion point.
    switch (I->getOpcode()) {
@@ -638,6 +651,8 @@ static Value *BuildNew(Instruction *I, ArrayRef<Value*> NewOps) {
        if (isa<PossiblyExactOperator>(BO)) {
          New->setIsExact(BO->isExact());
        }
+      if (isa<FPMathOperator>(BO))
+        New->copyFastMathFlags(I);
        return New;
      }
      case Instruction::ICmp:
@@ -669,7 +684,8 @@ static Value *BuildNew(Instruction *I, ArrayRef<Value*> NewOps) {
      case Instruction::GetElementPtr: {
        Value *Ptr = NewOps[0];
        ArrayRef<Value*> Idx = NewOps.slice(1);
-      GetElementPtrInst *GEP = GetElementPtrInst::Create(Ptr, Idx, "", I);
+      GetElementPtrInst *GEP = GetElementPtrInst::Create(
+          cast<GetElementPtrInst>(I)->getSourceElementType(), Ptr, Idx, "", I);
        GEP->setIsInBounds(cast<GetElementPtrInst>(I)->isInBounds());
        return GEP;
      }
@@ -744,7 +760,7 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
          NeedsRebuild |= (V != I->getOperand(i));
        }
        if (NeedsRebuild) {
-        return BuildNew(I, NewOps);
+        return buildNew(I, NewOps);
        }
        return I;
      }
@@ -776,10 +792,46 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
    llvm_unreachable("failed to reorder elements of vector instruction!");
  }
  
+/// Classify a shuffle mask as an identity selection of either shuffle input.
+///
+/// On return, isLHSID is true when every non-negative entry Mask[i] equals i,
+/// and isRHSID is true when every non-negative entry Mask[i] equals
+/// i + Mask.size(). Negative (undef) entries are skipped, so both flags stay
+/// true for a mask that contains only such entries.
+static void recognizeIdentityMask(const SmallVectorImpl<int> &Mask,
+                                  bool &isLHSID, bool &isRHSID) {
+  isRHSID = true;
+  isLHSID = true;
+
+  const unsigned NumMaskElts = Mask.size();
+  for (unsigned Idx = 0; Idx != NumMaskElts; ++Idx) {
+    // Undef entries are compatible with either identity.
+    if (Mask[Idx] < 0)
+      continue;
+
+    // An identity shuffle of the LHS value selects index Idx.
+    if (Mask[Idx] != (int)Idx)
+      isLHSID = false;
+
+    // An identity shuffle of the RHS value selects index Idx + NumMaskElts.
+    // The subtraction is done in unsigned arithmetic, so a smaller entry wraps
+    // around and cannot compare equal to Idx.
+    if (Mask[Idx] - NumMaskElts != Idx)
+      isRHSID = false;
+  }
+}
+
+// Returns true if Mask selects one contiguous, ascending run of elements that
+// lies entirely within the first operand of SVI, for example:
+//                 +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+//   Input:        |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP|
+//   Shuffles to:  |EE|FF|GG|HH|
+//                 +--+--+--+--+
+// Mask entries are compared as unsigned values, so a negative entry never
+// matches an expected index and makes the function return false.
+static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,
+                                       SmallVector<int, 16> &Mask) {
+  const unsigned NumLHSElts =
+      cast<VectorType>(SVI.getOperand(0)->getType())->getNumElements();
+  const unsigned NumMaskElts = Mask.size();
+  const unsigned FirstIdx = Mask.front();
+  const unsigned LastIdx = Mask.back();
+
+  // The end points must be ordered and the last one must fall inside LHS.
+  if (FirstIdx > LastIdx)
+    return false;
+  if (LastIdx >= NumLHSElts)
+    return false;
+  // The end points must be exactly as far apart as the mask is long.
+  if (LastIdx - FirstIdx != NumMaskElts - 1)
+    return false;
+
+  // Every entry must be the successor of the one before it.
+  unsigned ExpectedIdx = FirstIdx;
+  for (int Elt : Mask) {
+    if (static_cast<unsigned>(Elt) != ExpectedIdx)
+      return false;
+    ++ExpectedIdx;
+  }
+  return true;
+}
+
  Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
    Value *LHS = SVI.getOperand(0);
    Value *RHS = SVI.getOperand(1);
    SmallVector<int, 16> Mask = SVI.getShuffleMask();
+  Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
  
    bool MadeChange = false;
  
@@ -815,18 +867,17 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
      SmallVector<Constant*, 16> Elts;
      for (unsigned i = 0, e = LHSWidth; i != VWidth; ++i) {
        if (Mask[i] < 0) {
-        Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext())));
+        Elts.push_back(UndefValue::get(Int32Ty));
          continue;
        }
  
        if ((Mask[i] >= (int)e && isa<UndefValue>(RHS)) ||
            (Mask[i] <  (int)e && isa<UndefValue>(LHS))) {
          Mask[i] = -1;     // Turn into undef.
-        Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext())));
+        Elts.push_back(UndefValue::get(Int32Ty));
        } else {
          Mask[i] = Mask[i] % e;  // Force to LHS.
-        Elts.push_back(ConstantInt::get(Type::getInt32Ty(SVI.getContext()),
-                                        Mask[i]));
+        Elts.push_back(ConstantInt::get(Int32Ty, Mask[i]));
        }
      }
      SVI.setOperand(0, SVI.getOperand(1));
@@ -839,16 +890,8 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
  
    if (VWidth == LHSWidth) {
      // Analyze the shuffle, are the LHS or RHS and identity shuffles?
-    bool isLHSID = true, isRHSID = true;
-
-    for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
-      if (Mask[i] < 0) continue;  // Ignore undef values.
-      // Is this an identity shuffle of the LHS value?
-      isLHSID &= (Mask[i] == (int)i);
-
-      // Is this an identity shuffle of the RHS value?
-      isRHSID &= (Mask[i]-e == i);
-    }
+    bool isLHSID, isRHSID;
+    recognizeIdentityMask(Mask, isLHSID, isRHSID);
  
      // Eliminate identity shuffles.
      if (isLHSID) return ReplaceInstUsesWith(SVI, LHS);
@@ -860,6 +903,95 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
      return ReplaceInstUsesWith(SVI, V);
    }
  
+  // SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to
+  // a non-vector type. We can instead bitcast the original vector followed by
+  // an extract of the desired element:
+  //
+  //   %sroa = shufflevector <16 x i8> %in, <16 x i8> undef,
+  //                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  //   %1 = bitcast <4 x i8> %sroa to i32
+  // Becomes:
+  //   %bc = bitcast <16 x i8> %in to <4 x i32>
+  //   %ext = extractelement <4 x i32> %bc, i32 0
+  //
+  // If the shuffle is extracting a contiguous range of values from the input
+  // vector then each use which is a bitcast of the extracted size can be
+  // replaced. This will work if the vector types are compatible, and the begin
+  // index is aligned to a value in the casted vector type. If the begin index
+  // isn't aligned then we can shuffle the original vector (keeping the same
+  // vector type) before extracting.
+  //
+  // This code will bail out if the target type is fundamentally incompatible
+  // with vectors of the source type.
+  //
+  // Example of <16 x i8>, target type i32:
+  // Index range [4,8):         v-----------v Will work.
+  //                +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+  //     <16 x i8>: |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
+  //     <4 x i32>: |           |           |           |           |
+  //                +-----------+-----------+-----------+-----------+
+  // Index range [6,10):              ^-----------^ Needs an extra shuffle.
+  // Target type i40:           ^--------------^ Won't work, bail.
+  if (isShuffleExtractingFromLHS(SVI, Mask)) {
+    Value *V = LHS;
+    unsigned MaskElems = Mask.size();
+    unsigned BegIdx = Mask.front();
+    VectorType *SrcTy = cast<VectorType>(V->getType());
+    unsigned VecBitWidth = SrcTy->getBitWidth();
+    unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());
+    assert(SrcElemBitWidth && "vector elements must have a bitwidth");
+    unsigned SrcNumElems = SrcTy->getNumElements();
+    SmallVector<BitCastInst *, 8> BCs;
+    DenseMap<Type *, Value *> NewBCs;
+    for (User *U : SVI.users())
+      if (BitCastInst *BC = dyn_cast<BitCastInst>(U))
+        if (!BC->use_empty())
+          // Only visit bitcasts that weren't previously handled.
+          BCs.push_back(BC);
+    for (BitCastInst *BC : BCs) {
+      Type *TgtTy = BC->getDestTy();
+      unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy);
+      if (!TgtElemBitWidth)
+        continue;
+      unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth;
+      bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth;
+      bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth);
+      if (!VecBitWidthsEqual)
+        continue;
+      if (!VectorType::isValidElementType(TgtTy))
+        continue;
+      VectorType *CastSrcTy = VectorType::get(TgtTy, TgtNumElems);
+      if (!BegIsAligned) {
+        // Shuffle the input so [0,MaskElems) contains the output, and
+        // [MaskElems,SrcNumElems) is undef.
+        SmallVector<Constant *, 16> ShuffleMask(SrcNumElems,
+                                                UndefValue::get(Int32Ty));
+        for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I)
+          ShuffleMask[I] = ConstantInt::get(Int32Ty, Idx);
+        V = Builder->CreateShuffleVector(V, UndefValue::get(V->getType()),
+                                         ConstantVector::get(ShuffleMask),
+                                         SVI.getName() + ".extract");
+        BegIdx = 0;
+      }
+      unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth;
+      assert(SrcElemsPerTgtElem);
+      BegIdx /= SrcElemsPerTgtElem;
+      bool BCAlreadyExists = NewBCs.find(CastSrcTy) != NewBCs.end();
+      auto *NewBC =
+          BCAlreadyExists
+              ? NewBCs[CastSrcTy]
+              : Builder->CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc");
+      if (!BCAlreadyExists)
+        NewBCs[CastSrcTy] = NewBC;
+      auto *Ext = Builder->CreateExtractElement(
+          NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract");
+      // The shufflevector isn't being replaced: the bitcast that used it
+      // is. InstCombine will visit the newly-created instructions.
+      ReplaceInstUsesWith(*BC, Ext);
+      MadeChange = true;
+    }
+  }
+
    // If the LHS is a shufflevector itself, see if we can combine it with this
    // one without producing an unusual shuffle.
    // Cases that might be simplified:
@@ -907,16 +1039,16 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
    ShuffleVectorInst* RHSShuffle = dyn_cast<ShuffleVectorInst>(RHS);
    if (LHSShuffle)
      if (!isa<UndefValue>(LHSShuffle->getOperand(1)) && !isa<UndefValue>(RHS))
-      LHSShuffle = NULL;
+      LHSShuffle = nullptr;
    if (RHSShuffle)
      if (!isa<UndefValue>(RHSShuffle->getOperand(1)))
-      RHSShuffle = NULL;
+      RHSShuffle = nullptr;
    if (!LHSShuffle && !RHSShuffle)
-    return MadeChange ? &SVI : 0;
+    return MadeChange ? &SVI : nullptr;
  
-  Value* LHSOp0 = NULL;
-  Value* LHSOp1 = NULL;
-  Value* RHSOp0 = NULL;
+  Value* LHSOp0 = nullptr;
+  Value* LHSOp1 = nullptr;
+  Value* RHSOp0 = nullptr;
    unsigned LHSOp0Width = 0;
    unsigned RHSOp0Width = 0;
    if (LHSShuffle) {
@@ -948,11 +1080,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
    // case 4
    if (LHSOp0 == RHSOp0) {
      newLHS = LHSOp0;
-    newRHS = NULL;
+    newRHS = nullptr;
    }
  
    if (newLHS == LHS && newRHS == RHS)
-    return MadeChange ? &SVI : 0;
+    return MadeChange ? &SVI : nullptr;
  
    SmallVector<int, 16> LHSMask;
    SmallVector<int, 16> RHSMask;
@@ -1011,8 +1143,8 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
        // references from RHSOp0 to LHSOp0, so we don't need to shift the mask.
        // If newRHS == newLHS, we want to remap any references from newRHS to
        // newLHS so that we can properly identify splats that may occur due to
-      // obfuscation accross the two vectors.
-      if (eltMask >= 0 && newRHS != NULL && newLHS != newRHS)
+      // obfuscation across the two vectors.
+      if (eltMask >= 0 && newRHS != nullptr && newLHS != newRHS)
          eltMask += newLHSWidth;
      }
  
@@ -1030,7 +1162,6 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
    // or is a splat, do the replacement.
    if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) {
      SmallVector<Constant*, 16> Elts;
-    Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
      for (unsigned i = 0, e = newMask.size(); i != e; ++i) {
        if (newMask[i] < 0) {
          Elts.push_back(UndefValue::get(Int32Ty));
@@ -1038,10 +1169,17 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
          Elts.push_back(ConstantInt::get(Int32Ty, newMask[i]));
        }
      }
-    if (newRHS == NULL)
+    if (!newRHS)
        newRHS = UndefValue::get(newLHS->getType());
      return new ShuffleVectorInst(newLHS, newRHS, ConstantVector::get(Elts));
    }
  
-  return MadeChange ? &SVI : 0;
+  // If the result mask is an identity, replace uses of this instruction with
+  // corresponding argument.
+  bool isLHSID, isRHSID;
+  recognizeIdentityMask(newMask, isLHSID, isRHSID);
+  if (isLHSID && VWidth == LHSOp0Width) return ReplaceInstUsesWith(SVI, newLHS);
+  if (isRHSID && VWidth == RHSOp0Width) return ReplaceInstUsesWith(SVI, newRHS);
+
+  return MadeChange ? &SVI : nullptr;
  }