From 5ab48a2f6966752615228fe49451f34fffb2a0a8 Mon Sep 17 00:00:00 2001
From: Hao Liu
Date: Mon, 15 Jun 2015 01:56:40 +0000
Subject: [PATCH] [AArch64] Revert r239711 again. We need to discuss how to
 share code between the AArch64 and ARM backends.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@239713 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64.h                  |   1 -
 .../AArch64/AArch64InterleavedAccess.cpp      | 392 ------------------
 lib/Target/AArch64/AArch64TargetMachine.cpp   |   8 -
 .../AArch64/AArch64TargetTransformInfo.cpp    |  12 -
 .../AArch64/AArch64TargetTransformInfo.h      |   5 -
 lib/Target/AArch64/CMakeLists.txt             |   1 -
 .../AArch64/aarch64-interleaved-accesses.ll   | 197 ---------
 7 files changed, 616 deletions(-)

diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 8c966c4af9a..21106c9ad29 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -38,7 +38,6 @@ FunctionPass *createAArch64LoadStoreOptimizationPass();
 ModulePass *createAArch64PromoteConstantPass();
 FunctionPass *createAArch64ConditionOptimizerPass();
 FunctionPass *createAArch64AddressTypePromotionPass();
-FunctionPass *createAArch64InterleavedAccessPass();
 FunctionPass *createAArch64A57FPLoadBalancing();
 FunctionPass *createAArch64A53Fix835769();
 
diff --git a/lib/Target/AArch64/AArch64InterleavedAccess.cpp b/lib/Target/AArch64/AArch64InterleavedAccess.cpp
index 4219abbf7ce..e69de29bb2d 100644
--- a/lib/Target/AArch64/AArch64InterleavedAccess.cpp
+++ b/lib/Target/AArch64/AArch64InterleavedAccess.cpp
@@ -1,392 +0,0 @@
-//=--------------------- AArch64InterleavedAccess.cpp ----------------------==//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the AArch64InterleavedAccess pass, which identifies
-// interleaved memory accesses and transforms them into AArch64 ldN/stN
-// intrinsics (N = 2, 3, 4).
-//
-// An interleaved load reads data from memory into several vectors,
-// de-interleaving the data by the interleave factor. An interleaved store
-// writes several vectors to memory, re-interleaving the data by the factor.
-// The interleave factor is equal to the number of vectors. The AArch64
-// backend supports interleave factors of 2, 3 and 4.
-//
-// E.g. Transform an interleaved load (Factor = 2):
-//        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
-//        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
-//        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
-//      Into:
-//        %ld2 = { <4 x i32>, <4 x i32> } call aarch64.neon.ld2(%ptr)
-//        %v0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
-//        %v1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
-//
-// E.g. Transform an interleaved store (Factor = 2):
-//        %i.vec = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>  ; Interleaved vec
-//        store <8 x i32> %i.vec, <8 x i32>* %ptr
-//      Into:
-//        %v0 = shuffle %i.vec, undef, <0, 1, 2, 3>
-//        %v1 = shuffle %i.vec, undef, <4, 5, 6, 7>
-//        call void aarch64.neon.st2(%v0, %v1, %ptr)
-//
-//===----------------------------------------------------------------------===//
-
-#include "AArch64.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "aarch64-interleaved-access"
-
-static const unsigned MIN_FACTOR = 2;
-static const unsigned MAX_FACTOR = 4;
-
-namespace llvm {
-static void initializeAArch64InterleavedAccessPass(PassRegistry &);
-}
-
-namespace {
-
-class AArch64InterleavedAccess : public FunctionPass {
-
-public:
-  static char ID;
-  AArch64InterleavedAccess() : FunctionPass(ID) {
-    initializeAArch64InterleavedAccessPass(*PassRegistry::getPassRegistry());
-  }
-
-  const char *getPassName() const override {
-    return "AArch64 Interleaved Access Pass";
-  }
-
-  bool runOnFunction(Function &F) override;
-
-private:
-  const DataLayout *DL;
-  Module *M;
-
-  /// \brief Transform an interleaved load into ldN intrinsic.
-  bool matchInterleavedLoad(ShuffleVectorInst *SVI,
-                            SmallSetVector<Instruction *, 32> &DeadInsts);
-
-  /// \brief Transform an interleaved store into stN intrinsic.
-  bool matchInterleavedStore(ShuffleVectorInst *SVI,
-                             SmallSetVector<Instruction *, 32> &DeadInsts);
-};
-} // end anonymous namespace.
-
-char AArch64InterleavedAccess::ID = 0;
-
-INITIALIZE_PASS_BEGIN(AArch64InterleavedAccess, DEBUG_TYPE,
-                      "AArch64 interleaved access Pass", false, false)
-INITIALIZE_PASS_END(AArch64InterleavedAccess, DEBUG_TYPE,
-                    "AArch64 interleaved access Pass", false, false)
-
-FunctionPass *llvm::createAArch64InterleavedAccessPass() {
-  return new AArch64InterleavedAccess();
-}
-
-/// \brief Get a ldN/stN intrinsic according to the Factor (2, 3, or 4).
-static Intrinsic::ID getLdNStNIntrinsic(unsigned Factor, bool IsLoad) {
-  static const Intrinsic::ID LoadInt[3] = {Intrinsic::aarch64_neon_ld2,
-                                           Intrinsic::aarch64_neon_ld3,
-                                           Intrinsic::aarch64_neon_ld4};
-  static const Intrinsic::ID StoreInt[3] = {Intrinsic::aarch64_neon_st2,
-                                            Intrinsic::aarch64_neon_st3,
-                                            Intrinsic::aarch64_neon_st4};
-
-  assert(Factor >= MIN_FACTOR && Factor <= MAX_FACTOR &&
-         "Invalid interleave factor");
-
-  if (IsLoad)
-    return LoadInt[Factor - 2];
-  else
-    return StoreInt[Factor - 2];
-}
-
-/// \brief Check if the mask is a DE-interleave mask of the given factor
-/// \p Factor like:
-///     <Index, Index+Factor, Index+2*Factor, ...>
-static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
-                                       unsigned &Index) {
-  // Check all potential start indices from 0 to (Factor - 1).
-  for (Index = 0; Index < Factor; Index++) {
-    unsigned i = 0;
-
-    // Check that elements are in ascending order by Factor.
-    for (; i < Mask.size(); i++)
-      if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Index + i * Factor)
-        break;
-
-    if (i == Mask.size())
-      return true;
-  }
-
-  return false;
-}
-
-/// \brief Check if the mask is a DE-interleave mask for an interleaved load.
-///
-/// E.g. DE-interleave masks (Factor = 2) could be:
-///     <0, 2, 4, 6> (mask of index 0 to extract even elements)
-///     <1, 3, 5, 7> (mask of index 1 to extract odd elements)
-static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
-                               unsigned &Index) {
-  unsigned NumElts = Mask.size();
-  if (NumElts < 2)
-    return false;
-
-  for (Factor = MIN_FACTOR; Factor <= MAX_FACTOR; Factor++)
-    if (isDeInterleaveMaskOfFactor(Mask, Factor, Index))
-      return true;
-
-  return false;
-}
-
-/// \brief Check if the given mask \p Mask is RE-interleaved mask of the given
-/// factor \p Factor.
-///
-/// I.e. <0, NumSubElts, ... , NumSubElts*(Factor - 1), 1, NumSubElts + 1, ...>
-static bool isReInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor) {
-  unsigned NumElts = Mask.size();
-  if (NumElts % Factor)
-    return false;
-
-  unsigned NumSubElts = NumElts / Factor;
-  if (!isPowerOf2_32(NumSubElts))
-    return false;
-
-  for (unsigned i = 0; i < NumSubElts; i++)
-    for (unsigned j = 0; j < Factor; j++)
-      if (Mask[i * Factor + j] >= 0 &&
-          static_cast<unsigned>(Mask[i * Factor + j]) != j * NumSubElts + i)
-        return false;
-
-  return true;
-}
-
-/// \brief Check if the mask is RE-interleave mask for an interleaved store.
-///
-/// E.g. The RE-interleave mask (Factor = 2) could be:
-///     <0, 4, 1, 5, 2, 6, 3, 7>
-static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor) {
-  if (Mask.size() < 4)
-    return false;
-
-  // Check potential Factors and return true if find a factor for the mask.
-  for (Factor = MIN_FACTOR; Factor <= MAX_FACTOR; Factor++)
-    if (isReInterleaveMaskOfFactor(Mask, Factor))
-      return true;
-
-  return false;
-}
-
-/// \brief Get a mask consisting of sequential integers starting from \p Start.
-///
-/// I.e. <Start, Start + 1, Start + 2, ...>
-static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
-                                   unsigned NumElts) {
-  SmallVector<Constant *, 16> Mask;
-  for (unsigned i = 0; i < NumElts; i++)
-    Mask.push_back(Builder.getInt32(Start + i));
-
-  return ConstantVector::get(Mask);
-}
-
-bool AArch64InterleavedAccess::matchInterleavedLoad(
-    ShuffleVectorInst *SVI, SmallSetVector<Instruction *, 32> &DeadInsts) {
-  if (DeadInsts.count(SVI))
-    return false;
-
-  LoadInst *LI = dyn_cast<LoadInst>(SVI->getOperand(0));
-  if (!LI || !LI->isSimple() || !isa<UndefValue>(SVI->getOperand(1)))
-    return false;
-
-  SmallVector<ShuffleVectorInst *, 4> Shuffles;
-
-  // Check if all users of this load are shufflevectors.
-  for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
-    ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(*UI);
-    if (!SV)
-      return false;
-
-    Shuffles.push_back(SV);
-  }
-
-  // Check if the type of the first shuffle is legal.
-  VectorType *VecTy = Shuffles[0]->getType();
-  unsigned TypeSize = DL->getTypeAllocSizeInBits(VecTy);
-  if (TypeSize != 64 && TypeSize != 128)
-    return false;
-
-  // Check if the mask of the first shuffle is strided and get the start index.
-  unsigned Factor, Index;
-  if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index))
-    return false;
-
-  // Holds the corresponding index for each strided shuffle.
-  SmallVector<unsigned, 4> Indices;
-  Indices.push_back(Index);
-
-  // Check if other shufflevectors are of the same type and factor.
-  for (unsigned i = 1; i < Shuffles.size(); i++) {
-    if (Shuffles[i]->getType() != VecTy)
-      return false;
-
-    unsigned Index;
-    if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor,
-                                    Index))
-      return false;
-
-    Indices.push_back(Index);
-  }
-
-  DEBUG(dbgs() << "Found an interleaved load:" << *LI << "\n");
-
-  // A pointer vector can not be the return type of the ldN intrinsics. Need to
-  // load integer vectors first and then convert to pointer vectors.
-  Type *EltTy = VecTy->getVectorElementType();
-  if (EltTy->isPointerTy())
-    VecTy = VectorType::get(DL->getIntPtrType(EltTy),
-                            VecTy->getVectorNumElements());
-
-  Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
-  Type *Tys[2] = {VecTy, PtrTy};
-  Function *LdNFunc =
-      Intrinsic::getDeclaration(M, getLdNStNIntrinsic(Factor, true), Tys);
-
-  IRBuilder<> Builder(LI);
-  Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);
-
-  CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");
-  DEBUG(dbgs() << "  Created:" << *LdN << "\n");
-
-  // Replace each strided shufflevector with the corresponding vector loaded
-  // by ldN.
-  for (unsigned i = 0; i < Shuffles.size(); i++) {
-    ShuffleVectorInst *SV = Shuffles[i];
-    unsigned Index = Indices[i];
-
-    Value *SubVec = Builder.CreateExtractValue(LdN, Index);
-
-    // Convert the integer vector to pointer vector if the element is pointer.
-    if (EltTy->isPointerTy())
-      SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
-
-    SV->replaceAllUsesWith(SubVec);
-
-    DEBUG(dbgs() << "  Replaced:" << *SV << "\n"
-                 << "      With:" << *SubVec << "\n");
-
-    // Avoid analyzing it twice.
-    DeadInsts.insert(SV);
-  }
-
-  // Mark this load as dead.
-  DeadInsts.insert(LI);
-  return true;
-}
-
-bool AArch64InterleavedAccess::matchInterleavedStore(
-    ShuffleVectorInst *SVI, SmallSetVector<Instruction *, 32> &DeadInsts) {
-  if (DeadInsts.count(SVI) || !SVI->hasOneUse())
-    return false;
-
-  StoreInst *SI = dyn_cast<StoreInst>(SVI->user_back());
-  if (!SI || !SI->isSimple())
-    return false;
-
-  // Check if the mask is interleaved and get the interleave factor.
-  unsigned Factor;
-  if (!isReInterleaveMask(SVI->getShuffleMask(), Factor))
-    return false;
-
-  VectorType *VecTy = SVI->getType();
-  unsigned NumSubElts = VecTy->getVectorNumElements() / Factor;
-  Type *EltTy = VecTy->getVectorElementType();
-  VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts);
-
-  // Skip illegal vector types.
-  unsigned TypeSize = DL->getTypeAllocSizeInBits(SubVecTy);
-  if (TypeSize != 64 && TypeSize != 128)
-    return false;
-
-  DEBUG(dbgs() << "Found an interleaved store:" << *SI << "\n");
-
-  Value *Op0 = SVI->getOperand(0);
-  Value *Op1 = SVI->getOperand(1);
-  IRBuilder<> Builder(SI);
-
-  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
-  // vectors to integer vectors.
-  if (EltTy->isPointerTy()) {
-    Type *IntTy = DL->getIntPtrType(EltTy);
-    unsigned NumOpElts =
-        dyn_cast<VectorType>(Op0->getType())->getVectorNumElements();
-
-    // The corresponding integer vector type of the same element size.
-    Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
-
-    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
-    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
-    SubVecTy = VectorType::get(IntTy, NumSubElts);
-  }
-
-  Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
-  Type *Tys[2] = {SubVecTy, PtrTy};
-  Function *StNFunc =
-      Intrinsic::getDeclaration(M, getLdNStNIntrinsic(Factor, false), Tys);
-
-  SmallVector<Value *, 5> Ops;
-
-  // Split the shufflevector operands into sub vectors for the new stN call.
-  for (unsigned i = 0; i < Factor; i++)
-    Ops.push_back(Builder.CreateShuffleVector(
-        Op0, Op1, getSequentialMask(Builder, NumSubElts * i, NumSubElts)));
-
-  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
-  CallInst *StN = Builder.CreateCall(StNFunc, Ops);
-
-  (void)StN; // silence warning.
-  DEBUG(dbgs() << "  Replaced:" << *SI << "'\n");
-  DEBUG(dbgs() << "      with:" << *StN << "\n");
-
-  // Mark this shufflevector and store as dead.
-  DeadInsts.insert(SI);
-  DeadInsts.insert(SVI);
-  return true;
-}
-
-bool AArch64InterleavedAccess::runOnFunction(Function &F) {
-  DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
-
-  M = F.getParent();
-  DL = &M->getDataLayout();
-
-  // Holds dead instructions that will be erased later.
-  SmallSetVector<Instruction *, 32> DeadInsts;
-  bool Changed = false;
-  for (auto &I : inst_range(F)) {
-    if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(&I)) {
-      Changed |= matchInterleavedLoad(SVI, DeadInsts);
-      Changed |= matchInterleavedStore(SVI, DeadInsts);
-    }
-  }
-
-  for (auto I : DeadInsts)
-    I->eraseFromParent();
-
-  return Changed;
-}
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index eb78c8c0452..29995903d11 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -67,11 +67,6 @@ EnableAtomicTidy("aarch64-atomic-cfg-tidy", cl::Hidden,
                           " to make use of cmpxchg flow-based information"),
                  cl::init(true));
 
-static cl::opt<bool> AArch64InterleavedAccessOpt(
-    "aarch64-interleaved-access-opt",
-    cl::desc("Optimize interleaved memory accesses in the AArch64 backend"),
-    cl::init(false), cl::Hidden);
-
 static cl::opt<bool>
 EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden,
                         cl::desc("Run early if-conversion"),
@@ -228,9 +223,6 @@ void AArch64PassConfig::addIRPasses() {
   if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy)
     addPass(createCFGSimplificationPass());
 
-  if (TM->getOptLevel() != CodeGenOpt::None && AArch64InterleavedAccessOpt)
-    addPass(createAArch64InterleavedAccessPass());
-
   TargetPassConfig::addIRPasses();
 
   if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index a51a0674c8f..ed27cf84bbb 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -407,18 +407,6 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
   return LT.first;
 }
 
-unsigned AArch64TTIImpl::getInterleavedMemoryOpCost(
-    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-    unsigned Alignment, unsigned AddressSpace) {
-  assert(isa<VectorType>(VecTy) && "Expect vector types");
-
-  if (Factor > 1 && Factor < 5 && isTypeLegal(VecTy))
-    return Factor;
-
-  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace);
-}
-
 unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
   unsigned Cost = 0;
   for (auto *I : Tys) {
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 4dabdadd8ee..25c22bcd58e 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -139,11 +139,6 @@ public:
 
   bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
 
-  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
-                                      unsigned Factor,
-                                      ArrayRef<unsigned> Indices,
-                                      unsigned Alignment,
-                                      unsigned AddressSpace);
 
   /// @}
 };
diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index 676a2482ed1..f26327ff84a 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -38,7 +38,6 @@ add_llvm_target(AArch64CodeGen
   AArch64PBQPRegAlloc.cpp
   AArch64RegisterInfo.cpp
   AArch64SelectionDAGInfo.cpp
-  AArch64InterleavedAccess.cpp
   AArch64StorePairSuppress.cpp
   AArch64Subtarget.cpp
   AArch64TargetMachine.cpp
diff --git a/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll b/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
index ad0af1ea220..e69de29bb2d 100644
--- a/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
+++ b/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
@@ -1,197 +0,0 @@
-; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic -aarch64-interleaved-access-opt=true < %s | FileCheck %s
-
-; CHECK-LABEL: load_factor2:
-; CHECK: ld2 { v0.8b, v1.8b }, [x0]
-define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
-  %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
-  %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-  %strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-  %add = add nsw <8 x i8> %strided.v0, %strided.v1
-  ret <8 x i8> %add
-}
-
-; CHECK-LABEL: load_delat3:
-; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
-define <4 x i32> @load_delat3(i32* %ptr) {
-  %base = bitcast i32* %ptr to <12 x i32>*
-  %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
-  %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-  %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-  %add = add nsw <4 x i32> %strided.v2, %strided.v1
-  ret <4 x i32> %add
-}
-
-; CHECK-LABEL: load_factor4:
-; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
-define <4 x i32> @load_factor4(i32* %ptr) {
-  %base = bitcast i32* %ptr to <16 x i32>*
-  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
-  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-  %add = add nsw <4 x i32> %strided.v0, %strided.v2
-  ret <4 x i32> %add
-}
-
-; CHECK-LABEL: store_factor2:
-; CHECK: st2 { v0.8b, v1.8b }, [x0]
-define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
-  %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-  store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
-  ret void
-}
-
-; CHECK-LABEL: store_factor3:
-; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
-define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
-  %base = bitcast i32* %ptr to <12 x i32>*
-  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
-  ret void
-}
-
-; CHECK-LABEL: store_factor4:
-; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
-define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-  %base = bitcast i32* %ptr to <16 x i32>*
-  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
-  ret void
-}
-
-; The following cases test that interleaved accesses of pointer vectors can be
-; matched to ldN/stN instructions.
-
-; CHECK-LABEL: load_ptrvec_factor2:
-; CHECK: ld2 { v0.2d, v1.2d }, [x0]
-define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
-  %base = bitcast i32** %ptr to <4 x i32*>*
-  %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
-  %strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
-  ret <2 x i32*> %strided.v0
-}
-
-; CHECK-LABEL: load_ptrvec_factor3:
-; CHECK: ld3 { v0.2d, v1.2d, v2.2d }, [x0]
-define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
-  %base = bitcast i32** %ptr to <6 x i32*>*
-  %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
-  %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
-  store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1
-  %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
-  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2
-  ret void
-}
-
-; CHECK-LABEL: load_ptrvec_factor4:
-; CHECK: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
-define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
-  %base = bitcast i32** %ptr to <8 x i32*>*
-  %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
-  %strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
-  %strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
-  store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1
-  store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2
-  ret void
-}
-
-; CHECK-LABEL: store_ptrvec_factor2:
-; CHECK: st2 { v0.2d, v1.2d }, [x0]
-define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
-  %base = bitcast i32** %ptr to <4 x i32*>*
-  %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-  store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4
-  ret void
-}
-
-; CHECK-LABEL: store_ptrvec_factor3:
-; CHECK: st3 { v0.2d, v1.2d, v2.2d }, [x0]
-define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
-  %base = bitcast i32** %ptr to <6 x i32*>*
-  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
-  store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4
-  ret void
-}
-
-; CHECK-LABEL: store_ptrvec_factor4:
-; CHECK: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
-define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
-  %base = bitcast i32* %ptr to <8 x i32*>*
-  %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-  store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4
-  ret void
-}
-
-; The following cases check that shuffle masks with undef indices can be
-; matched into ldN/stN instructions.
-
-; CHECK-LABEL: load_undef_mask_factor2:
-; CHECK: ld2 { v0.4s, v1.4s }, [x0]
-define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
-  %base = bitcast i32* %ptr to <8 x i32>*
-  %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
-  %strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
-  %strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
-  %add = add nsw <4 x i32> %strided.v0, %strided.v1
-  ret <4 x i32> %add
-}
-
-; CHECK-LABEL: load_undef_mask_factor3:
-; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
-define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
-  %base = bitcast i32* %ptr to <12 x i32>*
-  %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
-  %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
-  %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-  %add = add nsw <4 x i32> %strided.v2, %strided.v1
-  ret <4 x i32> %add
-}
-
-; CHECK-LABEL: load_undef_mask_factor4:
-; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
-define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
-  %base = bitcast i32* %ptr to <16 x i32>*
-  %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
-  %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
-  %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-  %add = add nsw <4 x i32> %strided.v0, %strided.v2
-  ret <4 x i32> %add
-}
-
-; CHECK-LABEL: store_undef_mask_factor2:
-; CHECK: st2 { v0.4s, v1.4s }, [x0]
-define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
-  %base = bitcast i32* %ptr to <8 x i32>*
-  %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 4, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
-  store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4
-  ret void
-}
-
-; CHECK-LABEL: store_undef_mask_factor3:
-; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
-define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
-  %base = bitcast i32* %ptr to <12 x i32>*
-  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-  store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
-  ret void
-}
-
-; CHECK-LABEL: store_undef_mask_factor4:
-; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
-define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-  %base = bitcast i32* %ptr to <16 x i32>*
-  %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-  store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
-  ret void
-}
-- 
2.34.1