From 9b6ca9304cd7deb0598b0c3d7488023bfffd29d9 Mon Sep 17 00:00:00 2001
From: Quentin Colombet
Date: Fri, 31 Oct 2014 17:52:53 +0000
Subject: [PATCH] [CodeGenPrepare] Move extractelement close to store if they can be combined.

This patch adds an optimization in CodeGenPrepare that moves an
extractelement right before a store when the target can combine them.
The optimization may promote any scalar operation to a vector operation
along the way to make that possible.

** Context **

Some targets use different register files for vector and scalar
operations. This means that transitioning from one domain to the other
may incur a copy from one register file to the other. These copies are
not coalescable and may be expensive. For example, according to the
scheduling model, on Cortex-A8 a vector-to-GPR move takes 20 cycles.

** Motivating Example **

Let us consider an example:
define void @foo(<2 x i32>* %addr1, i32* %dest) {
  %in1 = load <2 x i32>* %addr1, align 8
  %extract = extractelement <2 x i32> %in1, i32 1
  %out = or i32 %extract, 1
  store i32 %out, i32* %dest, align 4
  ret void
}

As it is, this IR generates the following assembly on armv7:
  vldr d16, [r0]            @ vector load
  vmov.32 r0, d16[1]        @ cross-register-file copy: 20 cycles
  orr r0, r0, #1            @ scalar bitwise or
  str r0, [r1]              @ scalar store
  bx lr

Whereas we could generate much faster code:
  vldr d16, [r0]            @ vector load
  vorr.i32 d16, #0x1        @ vector bitwise or
  vst1.32 {d16[1]}, [r1:32] @ vector extract + store
  bx lr

Half of the computation done in the vector is useless, but it allows us
to get rid of the expensive cross-register-file copy.

** Proposed Solution **

To avoid this cross-register-copy penalty, we promote the scalar
operations to vector operations. The penalty will be removed if we
manage to promote the whole chain of computation into the vector
domain. Currently, we do that only when the chain of computation ends
with a store and the target is able to combine an extract with a store.

Stores are the most likely candidates, because other instructions
produce values that would need to be promoted and thus extracted at
some point [1]. Moreover, it is common for targets to feature stores
that perform a vector extract (see AArch64 and X86 for instance).

The proposed implementation relies on the TargetTransformInfo to decide
whether or not it is beneficial to promote a chain of computation into
the vector domain. Unfortunately, this interface is rather inaccurate
for this level of detail, and although this optimization may be
beneficial for X86 and AArch64, the inaccuracy will lead to the
optimization being too aggressive. Basically, in TargetTransformInfo
everything that is legal has a cost of 1, whereas even when a vector
type is legal, a vector operation is usually slightly more expensive
than its scalar counterpart. That would lead to too many promotions
that may not be counterbalanced by the savings from removing the
cross-register-file copy. For instance, on AArch64 this penalty is
just 4 cycles.

For now, the optimization is enabled only for ARM prior to v8, since
those processors have a larger penalty on cross-register-file copies,
and the scope is limited to basic blocks. These two factors limit the
effects of the inaccuracy. Indeed, I did not want to build up a fancy
cost model with block frequency and everything on top of that.

[1] We can imagine targets that can combine an extractelement with
instructions other than stores.
If we want to go into that direction, the current interfaces must be augmented and, moreover, I think this becomes a global isel problem. Differential Revision: http://reviews.llvm.org/D5921 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@220978 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Target/TargetLowering.h | 10 +- lib/CodeGen/CodeGenPrepare.cpp | 380 ++++++++++++++++++++++++- lib/Target/ARM/ARMISelLowering.cpp | 29 ++ lib/Target/ARM/ARMISelLowering.h | 3 + test/CodeGen/ARM/vector-promotion.ll | 403 +++++++++++++++++++++++++++ 5 files changed, 823 insertions(+), 2 deletions(-) create mode 100644 test/CodeGen/ARM/vector-promotion.ll diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 95b050ac7b8..ffb44b36796 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -262,7 +262,15 @@ public: bool isMaskAndBranchFoldingLegal() const { return MaskAndBranchFoldingIsLegal; } - + + /// Return true if the target can combine store(extractelement VectorTy, + /// Idx). + /// \p Cost[out] gives the cost of that transformation when this is true. + virtual bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, + unsigned &Cost) const { + return false; + } + /// Return true if target supports floating point exceptions. bool hasFloatingPointExceptions() const { return HasFloatingPointExceptions; diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index ffff18c9ae0..a5b2c96ed28 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -63,6 +64,7 @@ STATISTIC(NumRetsDup, "Number of return instructions duplicated"); STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches"); +STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed"); static cl::opt DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), @@ -80,6 +82,14 @@ static cl::opt EnableAndCmpSinking( "enable-andcmp-sinking", cl::Hidden, cl::init(true), cl::desc("Enable sinkinig and/cmp into branches.")); +static cl::opt DisableStoreExtract( + "disable-cgp-store-extract", cl::Hidden, cl::init(false), + cl::desc("Disable store(extract) optimizations in CodeGenPrepare")); + +static cl::opt StressStoreExtract( + "stress-cgp-store-extract", cl::Hidden, cl::init(false), + cl::desc("Stress test store(extract) optimizations in CodeGenPrepare")); + namespace { typedef SmallPtrSet SetOfInstrs; typedef DenseMap InstrToOrigTy; @@ -89,6 +99,7 @@ typedef DenseMap InstrToOrigTy; /// transformation profitability. 
const TargetMachine *TM; const TargetLowering *TLI; + const TargetTransformInfo *TTI; const TargetLibraryInfo *TLInfo; DominatorTree *DT; @@ -118,7 +129,7 @@ typedef DenseMap InstrToOrigTy; public: static char ID; // Pass identification, replacement for typeid explicit CodeGenPrepare(const TargetMachine *TM = nullptr) - : FunctionPass(ID), TM(TM), TLI(nullptr) { + : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr) { initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -128,6 +139,7 @@ typedef DenseMap InstrToOrigTy; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved(); AU.addRequired(); + AU.addRequired(); } private: @@ -144,6 +156,7 @@ typedef DenseMap InstrToOrigTy; bool OptimizeExtUses(Instruction *I); bool OptimizeSelectInst(SelectInst *SI); bool OptimizeShuffleVectorInst(ShuffleVectorInst *SI); + bool OptimizeExtractElementInst(Instruction *Inst); bool DupRetToEnableTailCallOpts(BasicBlock *BB); bool PlaceDbgValues(Function &F); bool sinkAndCmp(Function &F); @@ -171,6 +184,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { if (TM) TLI = TM->getSubtargetImpl()->getTargetLowering(); TLInfo = &getAnalysis(); + TTI = &getAnalysis(); DominatorTreeWrapperPass *DTWP = getAnalysisIfAvailable(); DT = DTWP ? &DTWP->getDomTree() : nullptr; @@ -3168,6 +3182,367 @@ bool CodeGenPrepare::OptimizeShuffleVectorInst(ShuffleVectorInst *SVI) { return MadeChange; } +namespace { +/// \brief Helper class to promote a scalar operation to a vector one. +/// This class is used to move downward extractelement transition. +/// E.g., +/// a = vector_op <2 x i32> +/// b = extractelement <2 x i32> a, i32 0 +/// c = scalar_op b +/// store c +/// +/// => +/// a = vector_op <2 x i32> +/// c = vector_op a (equivalent to scalar_op on the related lane) +/// * d = extractelement <2 x i32> c, i32 0 +/// * store d +/// Assuming both extractelement and store can be combine, we get rid of the +/// transition. +class VectorPromoteHelper { + /// Used to perform some checks on the legality of vector operations. + const TargetLowering &TLI; + + /// Used to estimated the cost of the promoted chain. + const TargetTransformInfo &TTI; + + /// The transition being moved downwards. + Instruction *Transition; + /// The sequence of instructions to be promoted. + SmallVector InstsToBePromoted; + /// Cost of combining a store and an extract. + unsigned StoreExtractCombineCost; + /// Instruction that will be combined with the transition. + Instruction *CombineInst; + + /// \brief The instruction that represents the current end of the transition. + /// Since we are faking the promotion until we reach the end of the chain + /// of computation, we need a way to get the current end of the transition. + Instruction *getEndOfTransition() const { + if (InstsToBePromoted.empty()) + return Transition; + return InstsToBePromoted.back(); + } + + /// \brief Return the index of the original value in the transition. + /// E.g., for "extractelement <2 x i32> c, i32 1" the original value, + /// c, is at index 0. + unsigned getTransitionOriginalValueIdx() const { + assert(isa(Transition) && + "Other kind of transitions are not supported yet"); + return 0; + } + + /// \brief Return the index of the index in the transition. + /// E.g., for "extractelement <2 x i32> c, i32 0" the index + /// is at index 1. 
+ unsigned getTransitionIdx() const { + assert(isa(Transition) && + "Other kind of transitions are not supported yet"); + return 1; + } + + /// \brief Get the type of the transition. + /// This is the type of the original value. + /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the + /// transition is <2 x i32>. + Type *getTransitionType() const { + return Transition->getOperand(getTransitionOriginalValueIdx())->getType(); + } + + /// \brief Promote \p ToBePromoted by moving \p Def downward through. + /// I.e., we have the following sequence: + /// Def = Transition a to + /// b = ToBePromoted Def, ... + /// => + /// b = ToBePromoted a, ... + /// Def = Transition ToBePromoted to + void promoteImpl(Instruction *ToBePromoted); + + /// \brief Check whether or not it is profitable to promote all the + /// instructions enqueued to be promoted. + bool isProfitableToPromote() { + Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx()); + unsigned Index = isa(ValIdx) + ? cast(ValIdx)->getZExtValue() + : -1; + Type *PromotedType = getTransitionType(); + + StoreInst *ST = cast(CombineInst); + unsigned AS = ST->getPointerAddressSpace(); + unsigned Align = ST->getAlignment(); + // Check if this store is supported. + if (!TLI.allowsMisalignedMemoryAccesses( + EVT::getEVT(ST->getValueOperand()->getType()), AS, Align)) { + // If this is not supported, there is no way we can combine + // the extract with the store. + return false; + } + + // The scalar chain of computation has to pay for the transition + // scalar to vector. + // The vector chain has to account for the combining cost. + uint64_t ScalarCost = + TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index); + uint64_t VectorCost = StoreExtractCombineCost; + for (const auto &Inst : InstsToBePromoted) { + // Compute the cost. + // By construction, all instructions being promoted are arithmetic ones. + // Moreover, one argument is a constant that can be viewed as a splat + // constant. + Value *Arg0 = Inst->getOperand(0); + bool IsArg0Constant = isa(Arg0) || isa(Arg0) || + isa(Arg0); + TargetTransformInfo::OperandValueKind Arg0OVK = + IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue + : TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueKind Arg1OVK = + !IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue + : TargetTransformInfo::OK_AnyValue; + ScalarCost += TTI.getArithmeticInstrCost( + Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK); + VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType, + Arg0OVK, Arg1OVK); + } + DEBUG(dbgs() << "Estimated cost of computation to be promoted:\nScalar: " + << ScalarCost << "\nVector: " << VectorCost << '\n'); + return ScalarCost > VectorCost; + } + + /// \brief Generate a constant vector with \p Val with the same + /// number of elements as the transition. + /// \p UseSplat defines whether or not \p Val should be replicated + /// accross the whole vector. + /// In other words, if UseSplat == true, we generate , + /// otherwise we generate a vector with as many undef as possible: + /// where \p Val is only + /// used at the index of the extract. + Value *getConstantVector(Constant *Val, bool UseSplat) const { + unsigned ExtractIdx = UINT_MAX; + if (!UseSplat) { + // If we cannot determine where the constant must be, we have to + // use a splat constant. 
+ Value *ValExtractIdx = Transition->getOperand(getTransitionIdx()); + if (ConstantInt *CstVal = dyn_cast(ValExtractIdx)) + ExtractIdx = CstVal->getSExtValue(); + else + UseSplat = true; + } + + unsigned End = getTransitionType()->getVectorNumElements(); + if (UseSplat) + return ConstantVector::getSplat(End, Val); + + SmallVector ConstVec; + UndefValue *UndefVal = UndefValue::get(Val->getType()); + for (unsigned Idx = 0; Idx != End; ++Idx) { + if (Idx == ExtractIdx) + ConstVec.push_back(Val); + else + ConstVec.push_back(UndefVal); + } + return ConstantVector::get(ConstVec); + } + + /// \brief Check if promoting to a vector type an operand at \p OperandIdx + /// in \p Use can trigger undefined behavior. + static bool canCauseUndefinedBehavior(const Instruction *Use, + unsigned OperandIdx) { + // This is not safe to introduce undef when the operand is on + // the right hand side of a division-like instruction. + if (OperandIdx != 1) + return false; + switch (Use->getOpcode()) { + default: + return false; + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::SRem: + case Instruction::URem: + return true; + case Instruction::FDiv: + case Instruction::FRem: + return !Use->hasNoNaNs(); + } + llvm_unreachable(nullptr); + } + +public: + VectorPromoteHelper(const TargetLowering &TLI, const TargetTransformInfo &TTI, + Instruction *Transition, unsigned CombineCost) + : TLI(TLI), TTI(TTI), Transition(Transition), + StoreExtractCombineCost(CombineCost), CombineInst(nullptr) { + assert(Transition && "Do not know how to promote null"); + } + + /// \brief Check if we can promote \p ToBePromoted to \p Type. + bool canPromote(const Instruction *ToBePromoted) const { + // We could support CastInst too. + return isa(ToBePromoted); + } + + /// \brief Check if it is profitable to promote \p ToBePromoted + /// by moving downward the transition through. + bool shouldPromote(const Instruction *ToBePromoted) const { + // Promote only if all the operands can be statically expanded. + // Indeed, we do not want to introduce any new kind of transitions. + for (const Use &U : ToBePromoted->operands()) { + const Value *Val = U.get(); + if (Val == getEndOfTransition()) { + // If the use is a division and the transition is on the rhs, + // we cannot promote the operation, otherwise we may create a + // division by zero. + if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo())) + return false; + continue; + } + if (!isa(Val) && !isa(Val) && + !isa(Val)) + return false; + } + // Check that the resulting operation is legal. + int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode()); + if (!ISDOpcode) + return false; + return StressStoreExtract || + TLI.isOperationLegalOrCustom(ISDOpcode, + EVT::getEVT(getTransitionType(), true)); + } + + /// \brief Check whether or not \p Use can be combined + /// with the transition. + /// I.e., is it possible to do Use(Transition) => AnotherUse? + bool canCombine(const Instruction *Use) { return isa(Use); } + + /// \brief Record \p ToBePromoted as part of the chain to be promoted. + void enqueueForPromotion(Instruction *ToBePromoted) { + InstsToBePromoted.push_back(ToBePromoted); + } + + /// \brief Set the instruction that will be combined with the transition. + void recordCombineInstruction(Instruction *ToBeCombined) { + assert(canCombine(ToBeCombined) && "Unsupported instruction to combine"); + CombineInst = ToBeCombined; + } + + /// \brief Promote all the instructions enqueued for promotion if it is + /// is profitable. 
+ /// \return True if the promotion happened, false otherwise. + bool promote() { + // Check if there is something to promote. + // Right now, if we do not have anything to combine with, + // we assume the promotion is not profitable. + if (InstsToBePromoted.empty() || !CombineInst) + return false; + + // Check cost. + if (!StressStoreExtract && !isProfitableToPromote()) + return false; + + // Promote. + for (auto &ToBePromoted : InstsToBePromoted) + promoteImpl(ToBePromoted); + InstsToBePromoted.clear(); + return true; + } +}; +} // End of anonymous namespace. + +void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) { + // At this point, we know that all the operands of ToBePromoted but Def + // can be statically promoted. + // For Def, we need to use its parameter in ToBePromoted: + // b = ToBePromoted ty1 a + // Def = Transition ty1 b to ty2 + // Move the transition down. + // 1. Replace all uses of the promoted operation by the transition. + // = ... b => = ... Def. + assert(ToBePromoted->getType() == Transition->getType() && + "The type of the result of the transition does not match " + "the final type"); + ToBePromoted->replaceAllUsesWith(Transition); + // 2. Update the type of the uses. + // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def. + Type *TransitionTy = getTransitionType(); + ToBePromoted->mutateType(TransitionTy); + // 3. Update all the operands of the promoted operation with promoted + // operands. + // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a. + for (Use &U : ToBePromoted->operands()) { + Value *Val = U.get(); + Value *NewVal = nullptr; + if (Val == Transition) + NewVal = Transition->getOperand(getTransitionOriginalValueIdx()); + else if (isa(Val) || isa(Val) || + isa(Val)) { + // Use a splat constant if it is not safe to use undef. + NewVal = getConstantVector( + cast(Val), + isa(Val) || + canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo())); + } else + assert(0 && "Did you modified shouldPromote and forgot to update this?"); + ToBePromoted->setOperand(U.getOperandNo(), NewVal); + } + Transition->removeFromParent(); + Transition->insertAfter(ToBePromoted); + Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted); +} + +/// Some targets can do store(extractelement) with one instruction. +/// Try to push the extractelement towards the stores when the target +/// has this feature and this is profitable. +bool CodeGenPrepare::OptimizeExtractElementInst(Instruction *Inst) { + unsigned CombineCost = UINT_MAX; + if (DisableStoreExtract || !TLI || + (!StressStoreExtract && + !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(), + Inst->getOperand(1), CombineCost))) + return false; + + // At this point we know that Inst is a vector to scalar transition. + // Try to move it down the def-use chain, until: + // - We can combine the transition with its single use + // => we got rid of the transition. + // - We escape the current basic block + // => we would need to check that we are moving it at a cheaper place and + // we do not do that for now. + BasicBlock *Parent = Inst->getParent(); + DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n'); + VectorPromoteHelper VPH(*TLI, *TTI, Inst, CombineCost); + // If the transition has more than one use, assume this is not going to be + // beneficial. 
+  while (Inst->hasOneUse()) {
+    Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin());
+    DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');
+
+    if (ToBePromoted->getParent() != Parent) {
+      DEBUG(dbgs() << "Instruction to promote is in a different block ("
+                   << ToBePromoted->getParent()->getName()
+                   << ") than the transition (" << Parent->getName() << ").\n");
+      return false;
+    }
+
+    if (VPH.canCombine(ToBePromoted)) {
+      DEBUG(dbgs() << "Assume " << *Inst << '\n'
+                   << "will be combined with: " << *ToBePromoted << '\n');
+      VPH.recordCombineInstruction(ToBePromoted);
+      bool Changed = VPH.promote();
+      NumStoreExtractExposed += Changed;
+      return Changed;
+    }
+
+    DEBUG(dbgs() << "Try promoting.\n");
+    if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
+      return false;
+
+    DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");
+
+    VPH.enqueueForPromotion(ToBePromoted);
+    Inst = ToBePromoted;
+  }
+  return false;
+}
+
 bool CodeGenPrepare::OptimizeInst(Instruction *I) {
   if (PHINode *P = dyn_cast<PHINode>(I)) {
     // It is possible for very late stage optimizations (such as SimplifyCFG)
@@ -3262,6 +3637,9 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
   if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
     return OptimizeShuffleVectorInst(SVI);
 
+  if (isa<ExtractElementInst>(I))
+    return OptimizeExtractElementInst(I);
+
   return false;
 }
 
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index df3b016b3ac..fc7a02f5e6b 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -11123,6 +11123,35 @@ bool ARMTargetLowering::useLoadStackGuardNode() const {
   return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO;
 }
 
+bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                                  unsigned &Cost) const {
+  // If we do not have NEON, vector types are not natively supported.
+  if (!Subtarget->hasNEON())
+    return false;
+
+  // Floating-point values and vector values map to the same register file.
+  // Therefore, although we could do a store+extract on a vector type, it is
+  // better to leave floating-point values as floats, since we have more
+  // freedom in the addressing mode for those.
+  if (VectorTy->isFPOrFPVectorTy())
+    return false;
+
+  // If the index is unknown at compile time, this is very expensive to lower
+  // and it is not possible to combine the store with the extract.
+  if (!isa<ConstantInt>(Idx))
+    return false;
+
+  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
+  unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
+  // We can do a store + vector extract on any vector that fits perfectly in a
+  // D or Q register.
+ if (BitWidth == 64 || BitWidth == 128) { + Cost = 0; + return true; + } + return false; +} + Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index a3d316e80b6..89b0c31ac52 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -410,6 +410,9 @@ namespace llvm { bool useLoadStackGuardNode() const override; + bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, + unsigned &Cost) const override; + protected: std::pair findRepresentativeClass(MVT VT) const override; diff --git a/test/CodeGen/ARM/vector-promotion.ll b/test/CodeGen/ARM/vector-promotion.ll new file mode 100644 index 00000000000..42ceb60c47f --- /dev/null +++ b/test/CodeGen/ARM/vector-promotion.ll @@ -0,0 +1,403 @@ +; RUN: opt -codegenprepare -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon -S | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-NORMAL %s +; RUN: opt -codegenprepare -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-STRESS %s +; RUN: llc -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon | FileCheck --check-prefix=ASM %s + +; IR-BOTH-LABEL: @simpleOneInstructionPromotion +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], +; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR]], i32 1 +; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest +; IR-BOTH-NEXT: ret +; +; Make sure we got rid of any expensive vmov.32 instructions. +; ASM-LABEL: simpleOneInstructionPromotion: +; ASM: vldr [[LOAD:d[0-9]+]], [r0] +; ASM-NEXT: vorr.i32 [[LOAD]], #0x1 +; ASM-NEXT: vst1.32 {[[LOAD]][1]}, [r1:32] +; ASM-NEXT: bx +define void @simpleOneInstructionPromotion(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 1 + %out = or i32 %extract, 1 + store i32 %out, i32* %dest, align 4 + ret void +} + +; IR-BOTH-LABEL: @unsupportedInstructionForPromotion +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0 +; IR-BOTH-NEXT: [[CMP:%[a-zA-Z_0-9-]+]] = icmp eq i32 [[EXTRACT]], %in2 +; IR-BOTH-NEXT: store i1 [[CMP]], i1* %dest +; IR-BOTH-NEXT: ret +; +; ASM-LABEL: unsupportedInstructionForPromotion: +; ASM: vldr [[LOAD:d[0-9]+]], [r0] +; ASM: vmov.32 {{r[0-9]+}}, [[LOAD]] +; ASM: bx +define void @unsupportedInstructionForPromotion(<2 x i32>* %addr1, i32 %in2, i1* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 0 + %out = icmp eq i32 %extract, %in2 + store i1 %out, i1* %dest, align 4 + ret void +} + + +; IR-BOTH-LABEL: @unsupportedChainInDifferentBBs +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0 +; IR-BOTH-NEXT: br i1 %bool, label %bb2, label %end +; BB2 +; IR-BOTH: [[OR:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1 +; IR-BOTH-NEXT: store i32 [[OR]], i32* %dest, align 4 +; IR-BOTH: ret +; +; ASM-LABEL: unsupportedChainInDifferentBBs: +; ASM: vldrne [[LOAD:d[0-9]+]], [r0] +; ASM: vmovne.32 {{r[0-9]+}}, [[LOAD]] +; ASM: bx +define void @unsupportedChainInDifferentBBs(<2 x i32>* %addr1, i32* %dest, i1 %bool) { +bb1: + %in1 
= load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 0 + br i1 %bool, label %bb2, label %end +bb2: + %out = or i32 %extract, 1 + store i32 %out, i32* %dest, align 4 + br label %end +end: + ret void +} + +; IR-LABEL: @chainOfInstructionsToPromote +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; IR-BOTH-NEXT: [[VECTOR_OR1:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], +; IR-BOTH-NEXT: [[VECTOR_OR2:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR1]], +; IR-BOTH-NEXT: [[VECTOR_OR3:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR2]], +; IR-BOTH-NEXT: [[VECTOR_OR4:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR3]], +; IR-BOTH-NEXT: [[VECTOR_OR5:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR4]], +; IR-BOTH-NEXT: [[VECTOR_OR6:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR5]], +; IR-BOTH-NEXT: [[VECTOR_OR7:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR6]], +; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR7]], i32 0 +; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest +; IR-BOTH-NEXT: ret +; +; ASM-LABEL: chainOfInstructionsToPromote: +; ASM: vldr [[LOAD:d[0-9]+]], [r0] +; ASM-NOT: vmov.32 {{r[0-9]+}}, [[LOAD]] +; ASM: bx +define void @chainOfInstructionsToPromote(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 0 + %out1 = or i32 %extract, 1 + %out2 = or i32 %out1, 1 + %out3 = or i32 %out2, 1 + %out4 = or i32 %out3, 1 + %out5 = or i32 %out4, 1 + %out6 = or i32 %out5, 1 + %out7 = or i32 %out6, 1 + store i32 %out7, i32* %dest, align 4 + ret void +} + +; IR-BOTH-LABEL: @unsupportedMultiUses +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1 +; IR-BOTH-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1 +; IR-BOTH-NEXT: store i32 [[OR]], i32* %dest +; IR-BOTH-NEXT: ret i32 [[OR]] +; +; ASM-LABEL: unsupportedMultiUses: +; ASM: vldr [[LOAD:d[0-9]+]], [r0] +; ASM: vmov.32 {{r[0-9]+}}, [[LOAD]] +; ASM: bx +define i32 @unsupportedMultiUses(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 1 + %out = or i32 %extract, 1 + store i32 %out, i32* %dest, align 4 + ret i32 %out +} + +; Check that we promote we a splat constant when this is a division. +; The NORMAL mode does not promote anything as divisions are not legal. 
+; IR-BOTH-LABEL: @udivCase +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = udiv i32 [[EXTRACT]], 7 +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = udiv <2 x i32> [[LOAD]], +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest +; IR-BOTH-NEXT: ret +define void @udivCase(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 1 + %out = udiv i32 %extract, 7 + store i32 %out, i32* %dest, align 4 + ret void +} + +; IR-BOTH-LABEL: @uremCase +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = urem i32 [[EXTRACT]], 7 +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = urem <2 x i32> [[LOAD]], +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest +; IR-BOTH-NEXT: ret +define void @uremCase(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 1 + %out = urem i32 %extract, 7 + store i32 %out, i32* %dest, align 4 + ret void +} + +; IR-BOTH-LABEL: @sdivCase +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = sdiv i32 [[EXTRACT]], 7 +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = sdiv <2 x i32> [[LOAD]], +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest +; IR-BOTH-NEXT: ret +define void @sdivCase(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 1 + %out = sdiv i32 %extract, 7 + store i32 %out, i32* %dest, align 4 + ret void +} + +; IR-BOTH-LABEL: @sremCase +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = srem i32 [[EXTRACT]], 7 +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = srem <2 x i32> [[LOAD]], +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest +; IR-BOTH-NEXT: ret +define void @sremCase(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 1 + %out = srem i32 %extract, 7 + store i32 %out, i32* %dest, align 4 + ret void +} + +; IR-BOTH-LABEL: @fdivCase +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fdiv float [[EXTRACT]], 7.0 +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fdiv <2 x float> [[LOAD]], +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store float [[RES]], float* %dest +; IR-BOTH-NEXT: ret +define void @fdivCase(<2 x float>* %addr1, 
float* %dest) { + %in1 = load <2 x float>* %addr1, align 8 + %extract = extractelement <2 x float> %in1, i32 1 + %out = fdiv float %extract, 7.0 + store float %out, float* %dest, align 4 + ret void +} + +; IR-BOTH-LABEL: @fremCase +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem float [[EXTRACT]], 7.0 +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem <2 x float> [[LOAD]], +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store float [[RES]], float* %dest +; IR-BOTH-NEXT: ret +define void @fremCase(<2 x float>* %addr1, float* %dest) { + %in1 = load <2 x float>* %addr1, align 8 + %extract = extractelement <2 x float> %in1, i32 1 + %out = frem float %extract, 7.0 + store float %out, float* %dest, align 4 + ret void +} + +; Check that we do not promote when we may introduce undefined behavior +; like division by zero. +; IR-BOTH-LABEL: @undefDivCase +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1 +; IR-BOTH-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = udiv i32 7, [[EXTRACT]] +; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest +; IR-BOTH-NEXT: ret +define void @undefDivCase(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 1 + %out = udiv i32 7, %extract + store i32 %out, i32* %dest, align 4 + ret void +} + + +; Check that we do not promote when we may introduce undefined behavior +; like division by zero. +; IR-BOTH-LABEL: @undefRemCase +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1 +; IR-BOTH-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = srem i32 7, [[EXTRACT]] +; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest +; IR-BOTH-NEXT: ret +define void @undefRemCase(<2 x i32>* %addr1, i32* %dest) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 1 + %out = srem i32 7, %extract + store i32 %out, i32* %dest, align 4 + ret void +} + +; Check that we use an undef mask for undefined behavior if the fast-math +; flag is set. +; IR-BOTH-LABEL: @undefConstantFRemCaseWithFastMath +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float [[EXTRACT]], 7.0 +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem nnan <2 x float> [[LOAD]], +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store float [[RES]], float* %dest +; IR-BOTH-NEXT: ret +define void @undefConstantFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) { + %in1 = load <2 x float>* %addr1, align 8 + %extract = extractelement <2 x float> %in1, i32 1 + %out = frem nnan float %extract, 7.0 + store float %out, float* %dest, align 4 + ret void +} + +; Check that we use an undef mask for undefined behavior if the fast-math +; flag is set. 
+; IR-BOTH-LABEL: @undefVectorFRemCaseWithFastMath +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float 7.000000e+00, [[EXTRACT]] +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem nnan <2 x float> , [[LOAD]] +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store float [[RES]], float* %dest +; IR-BOTH-NEXT: ret +define void @undefVectorFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) { + %in1 = load <2 x float>* %addr1, align 8 + %extract = extractelement <2 x float> %in1, i32 1 + %out = frem nnan float 7.0, %extract + store float %out, float* %dest, align 4 + ret void +} + +; Check that we are able to promote floating point value. +; This requires the STRESS mode, as floating point value are +; not promote on armv7. +; IR-BOTH-LABEL: @simpleOneInstructionPromotionFloat +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fadd float [[EXTRACT]], 1.0 +; Vector version: +; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fadd <2 x float> [[LOAD]], +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1 +; +; IR-BOTH-NEXT: store float [[RES]], float* %dest +; IR-BOTH-NEXT: ret +define void @simpleOneInstructionPromotionFloat(<2 x float>* %addr1, float* %dest) { + %in1 = load <2 x float>* %addr1, align 8 + %extract = extractelement <2 x float> %in1, i32 1 + %out = fadd float %extract, 1.0 + store float %out, float* %dest, align 4 + ret void +} + +; Check that we correctly use a splat constant when we cannot +; determine at compile time the index of the extract. +; This requires the STRESS modes, as variable index are expensive +; to lower. +; IR-BOTH-LABEL: @simpleOneInstructionPromotionVariableIdx +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 %idx +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1 +; Vector version: +; IR-STRESS-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[OR]], i32 %idx +; +; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest +; IR-BOTH-NEXT: ret +define void @simpleOneInstructionPromotionVariableIdx(<2 x i32>* %addr1, i32* %dest, i32 %idx) { + %in1 = load <2 x i32>* %addr1, align 8 + %extract = extractelement <2 x i32> %in1, i32 %idx + %out = or i32 %extract, 1 + store i32 %out, i32* %dest, align 4 + ret void +} + +; Check a vector with more than 2 elements. +; This requires the STRESS mode because currently 'or v8i8' is not marked +; as legal or custom, althought the actual assembly is better if we were +; promoting it. 
+; IR-BOTH-LABEL: @simpleOneInstructionPromotion8x8 +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <8 x i8>* %addr1 +; Scalar version: +; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <8 x i8> [[LOAD]], i32 1 +; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i8 [[EXTRACT]], 1 +; Vector version: +; IR-STRESS-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <8 x i8> [[LOAD]], +; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <8 x i8> [[OR]], i32 1 +; +; IR-BOTH-NEXT: store i8 [[RES]], i8* %dest +; IR-BOTH-NEXT: ret +define void @simpleOneInstructionPromotion8x8(<8 x i8>* %addr1, i8* %dest) { + %in1 = load <8 x i8>* %addr1, align 8 + %extract = extractelement <8 x i8> %in1, i32 1 + %out = or i8 %extract, 1 + store i8 %out, i8* %dest, align 4 + ret void +} + +; Check that we optimized the sequence correctly when it can be +; lowered on a Q register. +; IR-BOTH-LABEL: @simpleOneInstructionPromotion +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>* %addr1 +; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <4 x i32> [[LOAD]], +; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[VECTOR_OR]], i32 1 +; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest +; IR-BOTH-NEXT: ret +; +; Make sure we got rid of any expensive vmov.32 instructions. +; ASM-LABEL: simpleOneInstructionPromotion4x32: +; ASM: vld1.64 {[[LOAD:d[0-9]+]], d{{[0-9]+}}}, [r0] +; The Q register used here must be [[LOAD]] / 2, but we cannot express that. +; ASM-NEXT: vorr.i32 q{{[[0-9]+}}, #0x1 +; ASM-NEXT: vst1.32 {[[LOAD]][1]}, [r1] +; ASM-NEXT: bx +define void @simpleOneInstructionPromotion4x32(<4 x i32>* %addr1, i32* %dest) { + %in1 = load <4 x i32>* %addr1, align 8 + %extract = extractelement <4 x i32> %in1, i32 1 + %out = or i32 %extract, 1 + store i32 %out, i32* %dest, align 1 + ret void +} -- 2.34.1
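
Note: a target other than ARM opts into this CodeGenPrepare transformation by
overriding the new TargetLowering::canCombineStoreAndExtract hook added above.
The sketch below is illustrative only and is not part of this patch:
MyTargetLowering is a hypothetical target, and the 64/128-bit width check
simply mirrors the ARM implementation.

  bool MyTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                                   unsigned &Cost) const {
    // Only combine when the lane index is a compile-time constant; a variable
    // index cannot be folded into the store.
    if (!isa<ConstantInt>(Idx))
      return false;

    // Hypothetical constraint: only vectors that fill exactly one 64-bit or
    // 128-bit vector register are supported.
    unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
    if (BitWidth != 64 && BitWidth != 128)
      return false;

    // Report the combined store+extract as free; CodeGenPrepare adds this
    // cost to the vector side of its profitability comparison.
    Cost = 0;
    return true;
  }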