[InstCombine] canonicalize (bitcast (extractelement X)) --> (extractelement(bitcast X))

[oota-llvm.git] / lib / Transforms / InstCombine / InstCombineVectorOps.cpp
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp

index bd647d9e1cd9f31c8234098ad29a588652786504..8704fcc0fd178e97fb177b7d4250b985c23e77cc 100644 (file)
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -12,17 +12,20 @@
  //
  //===----------------------------------------------------------------------===//
  
-#include "InstCombine.h"
+#include "InstCombineInternal.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/VectorUtils.h"
  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace PatternMatch;
  
  #define DEBUG_TYPE "instcombine"
  
-/// CheapToScalarize - Return true if the value is cheaper to scalarize than it
-/// is to leave as a vector operation.  isConstant indicates whether we're
-/// extracting one known element.  If false we're extracting a variable index.
-static bool CheapToScalarize(Value *V, bool isConstant) {
+/// Return true if the value is cheaper to scalarize than it is to leave as a
+/// vector operation. isConstant indicates whether we're extracting one known
+/// element. If false we're extracting a variable index.
+static bool cheapToScalarize(Value *V, bool isConstant) {
    if (Constant *C = dyn_cast<Constant>(V)) {
      if (isConstant) return true;
  
@@ -47,68 +50,18 @@ static bool CheapToScalarize(Value *V, bool isConstant) {
      return true;
    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I))
      if (BO->hasOneUse() &&
-        (CheapToScalarize(BO->getOperand(0), isConstant) ||
-         CheapToScalarize(BO->getOperand(1), isConstant)))
+        (cheapToScalarize(BO->getOperand(0), isConstant) ||
+         cheapToScalarize(BO->getOperand(1), isConstant)))
        return true;
    if (CmpInst *CI = dyn_cast<CmpInst>(I))
      if (CI->hasOneUse() &&
-        (CheapToScalarize(CI->getOperand(0), isConstant) ||
-         CheapToScalarize(CI->getOperand(1), isConstant)))
+        (cheapToScalarize(CI->getOperand(0), isConstant) ||
+         cheapToScalarize(CI->getOperand(1), isConstant)))
        return true;
  
    return false;
  }
  
-/// FindScalarElement - Given a vector and an element number, see if the scalar
-/// value is already around as a register, for example if it were inserted then
-/// extracted from the vector.
-static Value *FindScalarElement(Value *V, unsigned EltNo) {
-  assert(V->getType()->isVectorTy() && "Not looking at a vector?");
-  VectorType *VTy = cast<VectorType>(V->getType());
-  unsigned Width = VTy->getNumElements();
-  if (EltNo >= Width)  // Out of range access.
-    return UndefValue::get(VTy->getElementType());
-
-  if (Constant *C = dyn_cast<Constant>(V))
-    return C->getAggregateElement(EltNo);
-
-  if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) {
-    // If this is an insert to a variable element, we don't know what it is.
-    if (!isa<ConstantInt>(III->getOperand(2)))
-      return nullptr;
-    unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue();
-
-    // If this is an insert to the element we are looking for, return the
-    // inserted value.
-    if (EltNo == IIElt)
-      return III->getOperand(1);
-
-    // Otherwise, the insertelement doesn't modify the value, recurse on its
-    // vector input.
-    return FindScalarElement(III->getOperand(0), EltNo);
-  }
-
-  if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V)) {
-    unsigned LHSWidth = SVI->getOperand(0)->getType()->getVectorNumElements();
-    int InEl = SVI->getMaskValue(EltNo);
-    if (InEl < 0)
-      return UndefValue::get(VTy->getElementType());
-    if (InEl < (int)LHSWidth)
-      return FindScalarElement(SVI->getOperand(0), InEl);
-    return FindScalarElement(SVI->getOperand(1), InEl - LHSWidth);
-  }
-
-  // Extract a value from a vector add operation with a constant zero.
-  Value *Val = nullptr; Constant *Con = nullptr;
-  if (match(V, m_Add(m_Value(Val), m_Constant(Con)))) {
-    if (Con->getAggregateElement(EltNo)->isNullValue())
-      return FindScalarElement(Val, EltNo);
-  }
-
-  // Otherwise, we don't know.
-  return nullptr;
-}
-
  // If we have a PHI node with a vector type that has only 2 uses: feed
  // itself and be an operand of extractelement at a constant location,
  // try to replace the PHI of the vector type with a PHI of a scalar type.
@@ -129,7 +82,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
    // and that it is a binary operation which is cheap to scalarize.
    // otherwise return NULL.
    if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) ||
-      !(isa<BinaryOperator>(PHIUser)) || !CheapToScalarize(PHIUser, true))
+      !(isa<BinaryOperator>(PHIUser)) || !cheapToScalarize(PHIUser, true))
      return nullptr;
  
    // Create a scalar PHI node that will replace the vector PHI node
@@ -144,7 +97,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
      // If the operand is the PHI induction variable:
      if (PHIInVal == PHIUser) {
        // Scalarize the binary operation. Its first operand is the
-      // scalar PHI and the second operand is extracted from the other
+      // scalar PHI, and the second operand is extracted from the other
        // vector operand.
        BinaryOperator *B0 = cast<BinaryOperator>(PHIUser);
        unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0;
@@ -162,8 +115,7 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
        Instruction *pos = dyn_cast<Instruction>(PHIInVal);
        BasicBlock::iterator InsertPos;
        if (pos && !isa<PHINode>(pos)) {
-        InsertPos = pos;
-        ++InsertPos;
+        InsertPos = ++pos->getIterator();
        } else {
          InsertPos = inBB->getFirstInsertionPt();
        }
@@ -177,10 +129,14 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
  }
  
  Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
+  if (Value *V = SimplifyExtractElementInst(
+          EI.getVectorOperand(), EI.getIndexOperand(), DL, TLI, DT, AC))
+    return ReplaceInstUsesWith(EI, V);
+
    // If vector val is constant with all elements the same, replace EI with
    // that element.  We handle a known element # below.
    if (Constant *C = dyn_cast<Constant>(EI.getOperand(0)))
-    if (CheapToScalarize(C, false))
+    if (cheapToScalarize(C, false))
        return ReplaceInstUsesWith(EI, C->getAggregateElement(0U));
  
    // If extracting a specified index from the vector, see if we can recursively
@@ -189,10 +145,8 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
      unsigned IndexVal = IdxC->getZExtValue();
      unsigned VectorWidth = EI.getVectorOperandType()->getNumElements();
  
-    // If this is extracting an invalid index, turn this into undef, to avoid
-    // crashing the code below.
-    if (IndexVal >= VectorWidth)
-      return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+    // InstSimplify handles cases where the index is invalid.
+    assert(IndexVal < VectorWidth);
  
      // This instruction only demands the single element from the input vector.
      // If the input vector has a single use, simplify it based on this use
@@ -201,23 +155,20 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
        APInt UndefElts(VectorWidth, 0);
        APInt DemandedMask(VectorWidth, 0);
        DemandedMask.setBit(IndexVal);
-      if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0),
-                                                DemandedMask, UndefElts)) {
+      if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0), DemandedMask,
+                                                UndefElts)) {
          EI.setOperand(0, V);
          return &EI;
        }
      }
  
-    if (Value *Elt = FindScalarElement(EI.getOperand(0), IndexVal))
-      return ReplaceInstUsesWith(EI, Elt);
-
-    // If the this extractelement is directly using a bitcast from a vector of
+    // If this extractelement is directly using a bitcast from a vector of
      // the same number of elements, see if we can find the source element from
      // it.  In this case, we will end up needing to bitcast the scalars.
      if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) {
        if (VectorType *VT = dyn_cast<VectorType>(BCI->getOperand(0)->getType()))
          if (VT->getNumElements() == VectorWidth)
-          if (Value *Elt = FindScalarElement(BCI->getOperand(0), IndexVal))
+          if (Value *Elt = findScalarElement(BCI->getOperand(0), IndexVal))
              return new BitCastInst(Elt, EI.getType());
      }
  
@@ -232,10 +183,10 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
  
    if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0))) {
      // Push extractelement into predecessor operation if legal and
-    // profitable to do so
+    // profitable to do so.
      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
        if (I->hasOneUse() &&
-          CheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) {
+          cheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) {
          Value *newEI0 =
            Builder->CreateExtractElement(BO->getOperand(0), EI.getOperand(1),
                                          EI.getName()+".lhs");
@@ -278,8 +229,9 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
                                                             SrcIdx, false));
        }
      } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
-      // Canonicalize extractelement(cast) -> cast(extractelement)
-      // bitcasts can change the number of vector elements and they cost nothing
+      // Canonicalize extractelement(cast) -> cast(extractelement).
+      // Bitcasts can change the number of vector elements, and they cost
+      // nothing.
        if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
          Value *EE = Builder->CreateExtractElement(CI->getOperand(0),
                                                    EI.getIndexOperand());
@@ -293,7 +245,8 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
          // fight the vectorizer.
  
          // If we are extracting an element from a vector select or a select on
-        // vectors, a select on the scalars extracted from the vector arguments.
+        // vectors, create a select on the scalars extracted from the vector
+        // arguments.
          Value *TrueVal = SI->getTrueValue();
          Value *FalseVal = SI->getFalseValue();
  
@@ -323,10 +276,9 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
    return nullptr;
  }
  
-/// CollectSingleShuffleElements - If V is a shuffle of values that ONLY returns
-/// elements from either LHS or RHS, return the shuffle mask and true.
-/// Otherwise, return false.
-static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
+/// If V is a shuffle of values that ONLY returns elements from either LHS or
+/// RHS, return the shuffle mask and true. Otherwise, return false.
+static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
                                           SmallVectorImpl<Constant*> &Mask) {
    assert(LHS->getType() == RHS->getType() &&
           "Invalid CollectSingleShuffleElements");
@@ -361,9 +313,9 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
      unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
  
      if (isa<UndefValue>(ScalarOp)) {  // inserting undef into vector.
-      // Okay, we can handle this if the vector we are insertinting into is
+      // We can handle this if the vector we are inserting into is
        // transitively ok.
-      if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+      if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
          // If so, update the mask to reflect the inserted undef.
          Mask[InsertedIdx] = UndefValue::get(Type::getInt32Ty(V->getContext()));
          return true;
@@ -376,9 +328,9 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
  
          // This must be extracting from either LHS or RHS.
          if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {
-          // Okay, we can handle this if the vector we are insertinting into is
+          // We can handle this if the vector we are inserting into is
            // transitively ok.
-          if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+          if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
              // If so, update the mask to reflect the inserted value.
              if (EI->getOperand(0) == LHS) {
                Mask[InsertedIdx % NumElts] =
@@ -403,7 +355,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
  
  /// We are building a shuffle to create V, which is a sequence of insertelement,
  /// extractelement pairs. If PermittedRHS is set, then we must either use it or
-/// not rely on the second vector source. Return an std::pair containing the
+/// not rely on the second vector source. Return a std::pair containing the
  /// left and right vectors of the proposed shuffle (or 0), and set the Mask
  /// parameter as required.
  ///
@@ -411,7 +363,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
  /// often been chosen carefully to be efficiently implementable on the target.
  typedef std::pair<Value *, Value *> ShuffleOps;
  
-static ShuffleOps CollectShuffleElements(Value *V,
+static ShuffleOps collectShuffleElements(Value *V,
                                           SmallVectorImpl<Constant *> &Mask,
                                           Value *PermittedRHS) {
    assert(V->getType()->isVectorTy() && "Invalid shuffle!");
@@ -444,7 +396,7 @@ static ShuffleOps CollectShuffleElements(Value *V,
          // otherwise we'd end up with a shuffle of three inputs.
          if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) {
            Value *RHS = EI->getOperand(0);
-          ShuffleOps LR = CollectShuffleElements(VecOp, Mask, RHS);
+          ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS);
            assert(LR.second == nullptr || LR.second == RHS);
  
            if (LR.first->getType() != RHS->getType()) {
@@ -477,14 +429,14 @@ static ShuffleOps CollectShuffleElements(Value *V,
          // If this insertelement is a chain that comes from exactly these two
          // vectors, return the vector and the effective shuffle.
          if (EI->getOperand(0)->getType() == PermittedRHS->getType() &&
-            CollectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS,
+            collectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS,
                                           Mask))
            return std::make_pair(EI->getOperand(0), PermittedRHS);
        }
      }
    }
  
-  // Otherwise, can't do anything fancy.  Return an identity vector.
+  // Otherwise, we can't do anything fancy. Return an identity vector.
    for (unsigned i = 0; i != NumElts; ++i)
      Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i));
    return std::make_pair(V, nullptr);
@@ -560,7 +512,7 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
        // (and any insertelements it points to), into one big shuffle.
        if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.user_back())) {
          SmallVector<Constant*, 16> Mask;
-        ShuffleOps LR = CollectShuffleElements(&IE, Mask, nullptr);
+        ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr);
  
          // The proposed shuffle may be trivial, in which case we shouldn't
          // perform the combine.
@@ -636,8 +588,8 @@ static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask,
      case Instruction::FPTrunc:
      case Instruction::FPExt:
      case Instruction::GetElementPtr: {
-      for (int i = 0, e = I->getNumOperands(); i != e; ++i) {
-        if (!CanEvaluateShuffled(I->getOperand(i), Mask, Depth-1))
+      for (Value *Operand : I->operands()) {
+        if (!CanEvaluateShuffled(Operand, Mask, Depth-1))
            return false;
        }
        return true;
@@ -665,7 +617,7 @@ static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask,
  
  /// Rebuild a new instruction just like 'I' but with the new operands given.
  /// In the event of type mismatch, the type of the operands is correct.
-static Value *BuildNew(Instruction *I, ArrayRef<Value*> NewOps) {
+static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps) {
    // We don't want to use the IRBuilder here because we want the replacement
    // instructions to appear next to 'I', not the builder's insertion point.
    switch (I->getOpcode()) {
@@ -732,7 +684,8 @@ static Value *BuildNew(Instruction *I, ArrayRef<Value*> NewOps) {
      case Instruction::GetElementPtr: {
        Value *Ptr = NewOps[0];
        ArrayRef<Value*> Idx = NewOps.slice(1);
-      GetElementPtrInst *GEP = GetElementPtrInst::Create(Ptr, Idx, "", I);
+      GetElementPtrInst *GEP = GetElementPtrInst::Create(
+          cast<GetElementPtrInst>(I)->getSourceElementType(), Ptr, Idx, "", I);
        GEP->setIsInBounds(cast<GetElementPtrInst>(I)->isInBounds());
        return GEP;
      }
@@ -807,7 +760,7 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
          NeedsRebuild |= (V != I->getOperand(i));
        }
        if (NeedsRebuild) {
-        return BuildNew(I, NewOps);
+        return buildNew(I, NewOps);
        }
        return I;
      }
@@ -839,7 +792,7 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
    llvm_unreachable("failed to reorder elements of vector instruction!");
  }
  
-static void RecognizeIdentityMask(const SmallVectorImpl<int> &Mask,
+static void recognizeIdentityMask(const SmallVectorImpl<int> &Mask,
                                    bool &isLHSID, bool &isRHSID) {
    isLHSID = isRHSID = true;
  
@@ -853,10 +806,32 @@ static void RecognizeIdentityMask(const SmallVectorImpl<int> &Mask,
    }
  }
  
+// Returns true if the shuffle is extracting a contiguous range of values from
+// LHS, for example:
+//                 +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+//   Input:        |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP|
+//   Shuffles to:  |EE|FF|GG|HH|
+//                 +--+--+--+--+
+static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,
+                                       SmallVector<int, 16> &Mask) {
+  unsigned LHSElems =
+      cast<VectorType>(SVI.getOperand(0)->getType())->getNumElements();
+  unsigned MaskElems = Mask.size();
+  unsigned BegIdx = Mask.front();
+  unsigned EndIdx = Mask.back();
+  if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1)
+    return false;
+  for (unsigned I = 0; I != MaskElems; ++I)
+    if (static_cast<unsigned>(Mask[I]) != BegIdx + I)
+      return false;
+  return true;
+}
+
  Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
    Value *LHS = SVI.getOperand(0);
    Value *RHS = SVI.getOperand(1);
    SmallVector<int, 16> Mask = SVI.getShuffleMask();
+  Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
  
    bool MadeChange = false;
  
@@ -892,18 +867,17 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
      SmallVector<Constant*, 16> Elts;
      for (unsigned i = 0, e = LHSWidth; i != VWidth; ++i) {
        if (Mask[i] < 0) {
-        Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext())));
+        Elts.push_back(UndefValue::get(Int32Ty));
          continue;
        }
  
        if ((Mask[i] >= (int)e && isa<UndefValue>(RHS)) ||
            (Mask[i] <  (int)e && isa<UndefValue>(LHS))) {
          Mask[i] = -1;     // Turn into undef.
-        Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext())));
+        Elts.push_back(UndefValue::get(Int32Ty));
        } else {
          Mask[i] = Mask[i] % e;  // Force to LHS.
-        Elts.push_back(ConstantInt::get(Type::getInt32Ty(SVI.getContext()),
-                                        Mask[i]));
+        Elts.push_back(ConstantInt::get(Int32Ty, Mask[i]));
        }
      }
      SVI.setOperand(0, SVI.getOperand(1));
@@ -917,7 +891,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
    if (VWidth == LHSWidth) {
      // Analyze the shuffle, are the LHS or RHS and identity shuffles?
      bool isLHSID, isRHSID;
-    RecognizeIdentityMask(Mask, isLHSID, isRHSID);
+    recognizeIdentityMask(Mask, isLHSID, isRHSID);
  
      // Eliminate identity shuffles.
      if (isLHSID) return ReplaceInstUsesWith(SVI, LHS);
@@ -929,6 +903,95 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
      return ReplaceInstUsesWith(SVI, V);
    }
  
+  // SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to
+  // a non-vector type. We can instead bitcast the original vector followed by
+  // an extract of the desired element:
+  //
+  //   %sroa = shufflevector <16 x i8> %in, <16 x i8> undef,
+  //                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  //   %1 = bitcast <4 x i8> %sroa to i32
+  // Becomes:
+  //   %bc = bitcast <16 x i8> %in to <4 x i32>
+  //   %ext = extractelement <4 x i32> %bc, i32 0
+  //
+  // If the shuffle is extracting a contiguous range of values from the input
+  // vector then each use which is a bitcast of the extracted size can be
+  // replaced. This will work if the vector types are compatible, and the begin
+  // index is aligned to a value in the casted vector type. If the begin index
+  // isn't aligned then we can shuffle the original vector (keeping the same
+  // vector type) before extracting.
+  //
+  // This code will bail out if the target type is fundamentally incompatible
+  // with vectors of the source type.
+  //
+  // Example of <16 x i8>, target type i32:
+  // Index range [4,8):         v-----------v Will work.
+  //                +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+  //     <16 x i8>: |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
+  //     <4 x i32>: |           |           |           |           |
+  //                +-----------+-----------+-----------+-----------+
+  // Index range [6,10):              ^-----------^ Needs an extra shuffle.
+  // Target type i40:           ^--------------^ Won't work, bail.
+  if (isShuffleExtractingFromLHS(SVI, Mask)) {
+    Value *V = LHS;
+    unsigned MaskElems = Mask.size();
+    unsigned BegIdx = Mask.front();
+    VectorType *SrcTy = cast<VectorType>(V->getType());
+    unsigned VecBitWidth = SrcTy->getBitWidth();
+    unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());
+    assert(SrcElemBitWidth && "vector elements must have a bitwidth");
+    unsigned SrcNumElems = SrcTy->getNumElements();
+    SmallVector<BitCastInst *, 8> BCs;
+    DenseMap<Type *, Value *> NewBCs;
+    for (User *U : SVI.users())
+      if (BitCastInst *BC = dyn_cast<BitCastInst>(U))
+        if (!BC->use_empty())
+          // Only visit bitcasts that weren't previously handled.
+          BCs.push_back(BC);
+    for (BitCastInst *BC : BCs) {
+      Type *TgtTy = BC->getDestTy();
+      unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy);
+      if (!TgtElemBitWidth)
+        continue;
+      unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth;
+      bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth;
+      bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth);
+      if (!VecBitWidthsEqual)
+        continue;
+      if (!VectorType::isValidElementType(TgtTy))
+        continue;
+      VectorType *CastSrcTy = VectorType::get(TgtTy, TgtNumElems);
+      if (!BegIsAligned) {
+        // Shuffle the input so [0,MaskElems) contains the output, and
+        // [MaskElems,SrcNumElems) is undef.
+        SmallVector<Constant *, 16> ShuffleMask(SrcNumElems,
+                                                UndefValue::get(Int32Ty));
+        for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I)
+          ShuffleMask[I] = ConstantInt::get(Int32Ty, Idx);
+        V = Builder->CreateShuffleVector(V, UndefValue::get(V->getType()),
+                                         ConstantVector::get(ShuffleMask),
+                                         SVI.getName() + ".extract");
+        BegIdx = 0;
+      }
+      unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth;
+      assert(SrcElemsPerTgtElem);
+      BegIdx /= SrcElemsPerTgtElem;
+      bool BCAlreadyExists = NewBCs.find(CastSrcTy) != NewBCs.end();
+      auto *NewBC =
+          BCAlreadyExists
+              ? NewBCs[CastSrcTy]
+              : Builder->CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc");
+      if (!BCAlreadyExists)
+        NewBCs[CastSrcTy] = NewBC;
+      auto *Ext = Builder->CreateExtractElement(
+          NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract");
+      // The shufflevector isn't being replaced: the bitcast that used it
+      // is. InstCombine will visit the newly-created instructions.
+      ReplaceInstUsesWith(*BC, Ext);
+      MadeChange = true;
+    }
+  }
+
    // If the LHS is a shufflevector itself, see if we can combine it with this
    // one without producing an unusual shuffle.
    // Cases that might be simplified:
@@ -1099,7 +1162,6 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
    // or is a splat, do the replacement.
    if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) {
      SmallVector<Constant*, 16> Elts;
-    Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
      for (unsigned i = 0, e = newMask.size(); i != e; ++i) {
        if (newMask[i] < 0) {
          Elts.push_back(UndefValue::get(Int32Ty));
@@ -1114,12 +1176,10 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
  
    // If the result mask is an identity, replace uses of this instruction with
    // corresponding argument.
-  if (VWidth == LHSWidth) {
-    bool isLHSID, isRHSID;
-    RecognizeIdentityMask(newMask, isLHSID, isRHSID);
-    if (isLHSID) return ReplaceInstUsesWith(SVI, newLHS);
-    if (isRHSID) return ReplaceInstUsesWith(SVI, newRHS);
-  }
+  bool isLHSID, isRHSID;
+  recognizeIdentityMask(newMask, isLHSID, isRHSID);
+  if (isLHSID && VWidth == LHSOp0Width) return ReplaceInstUsesWith(SVI, newLHS);
+  if (isRHSID && VWidth == RHSOp0Width) return ReplaceInstUsesWith(SVI, newRHS);
  
    return MadeChange ? &SVI : nullptr;
  }